comparison BSseeker2/bs_align/bs_pair_end.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
comparison
equal deleted inserted replaced
0:e6df770c0e58 1:8b26adf64adc
102 no_small_lines, 102 no_small_lines,
103 max_mismatch_no, 103 max_mismatch_no,
104 aligner_command, 104 aligner_command,
105 db_path, 105 db_path,
106 tmp_path, 106 tmp_path,
107 outfile, XS_pct, XS_count, show_multiple_hit=False): 107 outfile, XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
108 108
109 109
110 #---------------------------------------------------------------- 110 #----------------------------------------------------------------
111 adapter="" 111 adapter = ""
112 adapterA="" 112 adapterA = ""
113 adapterB="" 113 adapterB = ""
114 if adapter_file !="": 114 if adapter_file !="":
115 try : 115 try :
116 adapter_inf=open(adapter_file,"r") 116 adapter_inf=open(adapter_file,"r")
117 if asktag=="N": #<--- directional library 117 if asktag=="N": #<--- directional library
118 adapter=adapter_inf.readline() 118 adapter=adapter_inf.readline()
132 #---------------------------------------------------------------- 132 #----------------------------------------------------------------
133 133
134 logm("End 1 filename: %s"% main_read_file_1 ) 134 logm("End 1 filename: %s"% main_read_file_1 )
135 logm("End 2 filename: %s"% main_read_file_2 ) 135 logm("End 2 filename: %s"% main_read_file_2 )
136 logm("The first base (for mapping): %d"% cut1 ) 136 logm("The first base (for mapping): %d"% cut1 )
137 logm("The last base (for mapping): %d"% cut2 ) 137 logm("The last base (for mapping): %d"% cut2 )
138 138
139 logm("-------------------------------- " )
140 logm("Un-directional library: %s" % asktag )
141 logm("Path for short reads aligner: %s"% aligner_command + '\n') 139 logm("Path for short reads aligner: %s"% aligner_command + '\n')
142 logm("Reference genome library path: %s"% db_path ) 140 logm("Reference genome library path: %s"% db_path )
141
142 if asktag == "Y" :
143 logm("Un-directional library" )
144 else :
145 logm("Directional library")
143 logm("Number of mismatches allowed: %s"% max_mismatch_no ) 146 logm("Number of mismatches allowed: %s"% max_mismatch_no )
144 147
145 if adapter_file !="": 148 if adapter_file !="":
146 if asktag=="Y": 149 if asktag=="Y":
147 logm("Adapters to be removed from 3' of the reads:" ) 150 logm("Adapters to be removed from 3' of the reads:" )
180 #---- Stats ------------------------------------------------------------ 183 #---- Stats ------------------------------------------------------------
181 all_raw_reads=0 184 all_raw_reads=0
182 all_trimmed=0 185 all_trimmed=0
183 all_mapped=0 186 all_mapped=0
184 all_mapped_passed=0 187 all_mapped_passed=0
188 all_base_before_trim=0
189 all_base_after_trim=0
190 all_base_mapped=0
185 191
186 all_unmapped=0 192 all_unmapped=0
187 193
188 numbers_premapped_lst=[0,0,0,0] 194 numbers_premapped_lst=[0,0,0,0]
189 numbers_mapped_lst=[0,0,0,0] 195 numbers_mapped_lst=[0,0,0,0]
190 196
191 mC_lst=[0,0,0] 197 mC_lst=[0,0,0]
192 uC_lst=[0,0,0] 198 uC_lst=[0,0,0]
193 199
194 no_my_files=0 200 no_my_files=0
201 read_id_lst_1 = dict()
202 read_id_lst_2 = dict()
195 203
196 #---------------------------------------------------------------- 204 #----------------------------------------------------------------
197 print "== Start mapping ==" 205 print "== Start mapping =="
198 206
199 for read_file_1, read_file_2 in my_files: 207 for read_file_1, read_file_2 in my_files:
210 if asktag == "Y" : 218 if asktag == "Y" :
211 219
212 #---------------------------------------------------------------- 220 #----------------------------------------------------------------
213 outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id) 221 outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id)
214 outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id) 222 outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id)
215 outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id) 223 outfile_2RGA = tmp_d('Trimmed_FCT_2.fa'+random_id)
216 outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id) 224 outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id)
217 225
218 try : 226 try :
219 read_inf = open(tmp_d(read_file_1),"r") 227 if read_file_1.endswith(".gz") : # support input file ending with ".gz"
228 read_inf = gzip.open(tmp_d(read_file_1), "rb")
229 else :
230 read_inf = open(tmp_d(read_file_1),"r")
220 except IOError : 231 except IOError :
221 print "[Error] Cannot open file : %s", tmp_d(read_file_1) 232 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
222 exit(-1) 233 exit(-1)
223 oneline = read_inf.readline() 234 oneline = read_inf.readline()
224 l = oneline.split() 235 l = oneline.split()
225 input_format = "" 236 input_format = ""
226 237
227 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) 238 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
228 input_format="fastq" 239 input_format="fastq"
229 n_fastq=0
230 elif len(l)==1 and oneline[0]!=">": # pure sequences 240 elif len(l)==1 and oneline[0]!=">": # pure sequences
231 input_format="seq" 241 input_format="seq"
232 elif len(l)==11: # Illumina GAII qseq file 242 elif len(l)==11: # Illumina GAII qseq file
233 input_format="qseq" 243 input_format="qseq"
234 elif oneline[0]==">": # fasta 244 elif oneline[0]==">": # fasta
235 input_format="fasta" 245 input_format="fasta"
236 n_fasta=0
237 read_inf.close() 246 read_inf.close()
238 247
239 print "Detected data format: %s" % input_format 248 print "Detected data format: %s" % input_format
240 249
241 #---------------------------------------------------------------- 250 #----------------------------------------------------------------
242 read_file_list = [read_file_1, read_file_2] 251 read_file_list = [read_file_1, read_file_2]
243 outfile_FCT_list = [outfile_1FCT, outfile_2FCT] 252 outfile_FCT_list = [outfile_1FCT, outfile_2RGA]
244 outfile_RCT_list = [outfile_1RCT, outfile_2RCT] 253 outfile_RCT_list = [outfile_1RCT, outfile_2RCT]
245 n_list = [0, 0] 254 n_list = [0, 0]
246
247
248 for f in range(2): 255 for f in range(2):
249 read_file = read_file_list[f] 256 read_file = read_file_list[f]
250 outf_FCT = open(outfile_FCT_list[f], 'w') 257 outf_FCT = open(outfile_FCT_list[f], 'w')
251 outf_RCT = open(outfile_RCT_list[f], 'w') 258 outf_RGA = open(outfile_RCT_list[f], 'w')
252 original_bs_reads = original_bs_reads_lst[f] 259 original_bs_reads = original_bs_reads_lst[f]
253 n = n_list[f] 260 n = n_list[f]
254 261 read_id = ""
255 seq_id = ""
256 seq = "" 262 seq = ""
257 seq_ready = "N" 263 seq_ready = "N"
264 line_no = 0
258 for line in fileinput.input(tmp_d(read_file)): 265 for line in fileinput.input(tmp_d(read_file)):
259 l=line.split() 266 l = line.split()
267 line_no += 1
260 if input_format=="seq": 268 if input_format=="seq":
261 n+=1 269 n += 1
262 seq_id=str(n) 270 read_id = str(n)
263 seq_id=seq_id.zfill(12) 271 read_id = read_id.zfill(12)
264 seq=l[0] 272 if f == 1 :
265 seq_ready="Y" 273 read_id_lst_1[read_id] = read_id
274 else :
275 read_id_lst_2[read_id] = read_id
276 seq = l[0]
277 seq_ready = "Y"
266 elif input_format=="fastq": 278 elif input_format=="fastq":
267 m_fastq=math.fmod(n_fastq,4) 279 l_fastq = math.fmod(line_no, 4)
268 n_fastq+=1 280 if l_fastq==1 :
269 seq_ready="N" 281 all_raw_reads += 1
270 if m_fastq==0: 282 read_id = str(line_no/4+1).zfill(12)
271 n+=1 283 if f==1 :
272 seq_id=str(n) 284 read_id_lst_1[read_id] = l[0][1:]
273 seq_id=seq_id.zfill(12) 285 else :
274 seq="" 286 read_id_lst_2[read_id] = l[0][1:]
275 elif m_fastq==1: 287 seq_ready="N"
276 seq=l[0] 288 elif l_fastq==2 :
277 seq_ready="Y" 289 seq = l[0]
278 else: 290 seq_ready = "Y"
279 seq="" 291 else :
292 seq = ""
293 seq_ready = "N"
280 elif input_format=="qseq": 294 elif input_format=="qseq":
281 n+=1 295 all_raw_reads += 1
282 seq_id=str(n) 296 read_id = str(line_no)
283 seq_id=seq_id.zfill(12) 297 read_id = read_id.zfill(12)
284 seq=l[8] 298 if f == 1 :
285 seq_ready="Y" 299 read_id_lst_1[read_id] = l[0][1:]
300 else :
301 read_id_lst_2[read_id] = l[0][1:]
302 seq = l[8]
303 seq_ready = "Y"
286 elif input_format=="fasta": 304 elif input_format=="fasta":
287 m_fasta=math.fmod(n_fasta,2) 305 l_fasta = math.fmod(line_no,2)
288 n_fasta+=1 306 seq_ready = "N"
289 seq_ready="N" 307 if l_fasta==1:
290 if m_fasta==0: 308 all_raw_reads += 1
291 n+=1 309 read_id = str(line_no/2+1).zfill(12)
292 seq_id=l[0][1:] 310 if f==1 :
293 seq_id=seq_id.zfill(17) 311 read_id_lst_1[read_id] = l[0][1:]
294 seq="" 312 else :
295 elif m_fasta==1: 313 read_id_lst_2[read_id] = l[0][1:]
296 seq=l[0] 314 seq = ""
297 seq_ready="Y" 315 elif l_fasta==0 :
298 else: 316 seq = l[0]
299 seq="" 317 seq_ready = "Y"
300 #---------------------------------------------------------------- 318 #----------------------------------------------------------------
301 if seq_ready=="Y": 319 if seq_ready=="Y":
302 seq=seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52 320 seq = seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52
303 seq=seq.upper() 321 seq = seq.upper()
304 seq=seq.replace(".","N") 322 seq = seq.replace(".","N")
305 323
306 #--striping BS adapter from 3' read -------------------------------------------------------------- 324 #--striping BS adapter from 3' read --------------------------------------------------------------
307 if (adapterA !="") and (adapterB !=""): 325 all_base_before_trim += len(seq)
308 signature=adapterA[:6] 326 if f==0 and adapterA!="" :
309 if signature in seq: 327 new_read = RemoveAdapter(seq, adapterA, adapter_mismatch)
310 signature_pos=seq.index(signature) 328 if len(new_read)<len(seq) :
311 if seq[signature_pos:] in adapterA: 329 all_trimmed += 1
312 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) 330 seq = new_read
313 all_trimmed+=1 331 if f==1 and adapterB!="" :
314 else: 332 new_read = RemoveAdapter(seq, adapterB, adapter_mismatch)
315 signature=adapterB[:6] 333 if len(new_read)<len(seq) :
316 if signature in seq: 334 all_trimmed += 1
317 #print seq_id,seq,signature; 335 seq = new_read
318 signature_pos=seq.index(signature) 336 all_base_after_trim += len(seq)
319 if seq[signature_pos:] in adapterB: 337
320 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) 338 if len(seq)<=4:
321 all_trimmed+=1
322
323 if len(seq) <= 4:
324 seq = "N" * (cut2-cut1+1) 339 seq = "N" * (cut2-cut1+1)
325 #--------- trimmed_raw_BS_read ------------------ 340 #--------- trimmed_raw_BS_read ------------------
326 original_bs_reads[seq_id] = seq 341 original_bs_reads[read_id] = seq
327 342
328 #--------- FW_C2T ------------------ 343 #--------- FW_C2T ------------------
329 outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T"))) 344 outf_FCT.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
330 #--------- RC_G2A ------------------ 345 #--------- RC_G2A ------------------
331 outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A"))) 346 outf_RGA.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
332 347
333 n_list[f]=n 348 n_list[f] = n
334 349
335 outf_FCT.close() 350 outf_FCT.close()
336 outf_RCT.close() 351 outf_RGA.close()
337 352
338 fileinput.close() 353 fileinput.close()
339 354
340
341 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); 355 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
342 all_raw_reads+=n 356 all_raw_reads += n
343 357
344 #-------------------------------------------------------------------------------- 358 #--------------------------------------------------------------------------------
345 # Bowtie mapping 359 # Bowtie mapping
346 #-------------------------------------------------------------------------------- 360 #--------------------------------------------------------------------------------
347 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) 361 WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
348 WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) 362 WC2T_rf = tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
349 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) 363 CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
350 CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) 364 CC2T_rf = tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
351 365
352 run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), 366 run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
353 'input_file_1' : outfile_1FCT, 367 'input_file_1' : outfile_1FCT,
354 'input_file_2' : outfile_2RCT, 368 'input_file_2' : outfile_2RCT,
355 'output_file' : WC2T_fr}, 369 'output_file' : WC2T_fr},
356 370
357 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), 371 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
358 'input_file_1' : outfile_1FCT, 372 'input_file_1' : outfile_1FCT,
359 'input_file_2' : outfile_2RCT, 373 'input_file_2' : outfile_2RCT,
360 'output_file' : CC2T_fr}, 374 'output_file' : CG2A_fr},
361 375
362 aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), 376 aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
363 'input_file_1' : outfile_2FCT, 377 'input_file_1' : outfile_2RGA,
364 'input_file_2' : outfile_1RCT, 378 'input_file_2' : outfile_1RCT,
365 'output_file' : WC2T_rf}, 379 'output_file' : WC2T_rf},
366 380
367 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), 381 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
368 'input_file_1' : outfile_2FCT, 382 'input_file_1' : outfile_2RGA,
369 'input_file_2' : outfile_1RCT, 383 'input_file_2' : outfile_1RCT,
370 'output_file' : CC2T_rf}]) 384 'output_file' : CC2T_rf}])
371 385
372 386 delete_files(outfile_1FCT, outfile_2RGA, outfile_1RCT, outfile_2RCT)
373 delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT)
374 387
375 388
376 #-------------------------------------------------------------------------------- 389 #--------------------------------------------------------------------------------
377 # Post processing 390 # Post processing
378 #-------------------------------------------------------------------------------- 391 #--------------------------------------------------------------------------------
379 392
380
381 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) 393 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
382 FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf) 394 FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf)
383 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr) 395 RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr)
384 RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf) 396 RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf)
385 397
386 delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf) 398 delete_files(WC2T_fr, WC2T_rf, CG2A_fr, CC2T_rf)
387 399
388 #---------------------------------------------------------------- 400 #----------------------------------------------------------------
389 # get uniq-hit reads 401 # get uniq-hit reads
390 #---------------------------------------------------------------- 402 #----------------------------------------------------------------
391 Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys()) 403 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys())
392 404
393 Unique_FW_fr_C2T=set() # + 405 Unique_FW_fr_C2T = set() # +
394 Unique_FW_rf_C2T=set() # + 406 Unique_FW_rf_C2T = set() # +
395 Unique_RC_fr_C2T=set() # - 407 Unique_RC_fr_G2A = set() # -
396 Unique_RC_rf_C2T=set() # - 408 Unique_RC_rf_C2T = set() # -
397 Multiple_hits=set() 409 Multiple_hits = set()
398 410
399 411
400 for x in Union_set: 412 for x in Union_set:
401 _list=[] 413 _list = []
402 for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]: 414 for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_G2A_fr_U, RC_C2T_rf_U]:
403 mis_lst=d.get(x,[99]) 415 mis_lst = d.get(x,[99])
404 mis=int(mis_lst[0]) 416 mis = int(mis_lst[0])
405 _list.append(mis) 417 _list.append(mis)
406 for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]: 418 for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_G2A_fr_R, RC_C2T_rf_R]:
407 mis=d.get(x,99) 419 mis = d.get(x,99)
408 _list.append(mis) 420 _list.append(mis)
409 mini=min(_list) 421 mini = min(_list)
410 if _list.count(mini)==1: 422 if _list.count(mini)==1:
411 mini_index=_list.index(mini) 423 mini_index = _list.index(mini)
412 if mini_index==0: 424 if mini_index==0:
413 Unique_FW_fr_C2T.add(x) 425 Unique_FW_fr_C2T.add(x)
414 elif mini_index==1: 426 elif mini_index==1:
415 Unique_FW_rf_C2T.add(x) 427 Unique_FW_rf_C2T.add(x)
416 elif mini_index==2: 428 elif mini_index==2:
417 Unique_RC_fr_C2T.add(x) 429 Unique_RC_fr_G2A.add(x)
418 elif mini_index==3: 430 elif mini_index==3:
419 Unique_RC_rf_C2T.add(x) 431 Unique_RC_rf_C2T.add(x)
420 else : 432 else :
421 Multiple_hits.add(x) 433 Multiple_hits.add(x)
422 else : 434 else :
423 Multiple_hits.add(x) 435 Multiple_hits.add(x)
424 436
425 # write reads rejected by Multiple Hits to file 437 # write reads rejected by Multiple Hits to file
426 if show_multiple_hit : 438 if show_multiple_hit :
427 outf_MH=open("Multiple_hit.fa",'w') 439 outf_MH = open("Multiple_hit.fa",'w')
428 for i in Multiple_hits : 440 for i in Multiple_hits :
429 outf_MH.write(">%s\n" % i) 441 outf_MH.write(">%s\n" % i)
430 outf_MH.write("%s\n" % original_bs_reads[i]) 442 outf_MH.write("%s\n" % original_bs_reads[i])
431 outf_MH.close() 443 outf_MH.close()
432 444
433 del Union_set 445 del Union_set
434 del FW_C2T_fr_R 446 del FW_C2T_fr_R
435 del FW_C2T_rf_R 447 del FW_C2T_rf_R
436 del RC_C2T_fr_R 448 del RC_G2A_fr_R
437 del RC_C2T_rf_R 449 del RC_C2T_rf_R
438 450
439 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] 451 FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
440 FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T] 452 FW_C2T_rf_uniq_lst = [[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T]
441 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T] 453 RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A]
442 RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T] 454 RC_C2T_rf_uniq_lst = [[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T]
443 455
444 FW_C2T_fr_uniq_lst.sort() 456 FW_C2T_fr_uniq_lst.sort()
445 FW_C2T_rf_uniq_lst.sort() 457 FW_C2T_rf_uniq_lst.sort()
446 RC_C2T_fr_uniq_lst.sort() 458 RC_C2T_fr_uniq_lst.sort()
447 RC_C2T_rf_uniq_lst.sort() 459 RC_C2T_rf_uniq_lst.sort()
448 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst] 460 FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst]
449 FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst] 461 FW_C2T_rf_uniq_lst = [x[1] for x in FW_C2T_rf_uniq_lst]
450 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst] 462 RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst]
451 RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst] 463 RC_C2T_rf_uniq_lst = [x[1] for x in RC_C2T_rf_uniq_lst]
452 #---------------------------------------------------------------- 464 #----------------------------------------------------------------
453 465
454 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T) 466 numbers_premapped_lst[0] += len(Unique_FW_fr_C2T)
455 numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T) 467 numbers_premapped_lst[1] += len(Unique_FW_rf_C2T)
456 numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T) 468 numbers_premapped_lst[2] += len(Unique_RC_fr_G2A)
457 numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T) 469 numbers_premapped_lst[3] += len(Unique_RC_rf_C2T)
458 470
459 del Unique_FW_fr_C2T 471 del Unique_FW_fr_C2T
460 del Unique_FW_rf_C2T 472 del Unique_FW_rf_C2T
461 del Unique_RC_fr_C2T 473 del Unique_RC_fr_G2A
462 del Unique_RC_rf_C2T 474 del Unique_RC_rf_C2T
463 475
464 #---------------------------------------------------------------- 476 #----------------------------------------------------------------
465 477
466 nn = 0 478 nn = 0
467 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), 479 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U),
468 (FW_C2T_rf_uniq_lst,FW_C2T_rf_U), 480 (FW_C2T_rf_uniq_lst,FW_C2T_rf_U),
469 (RC_C2T_fr_uniq_lst,RC_C2T_fr_U), 481 (RC_C2T_fr_uniq_lst,RC_G2A_fr_U),
470 (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]: 482 (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]:
471 nn += 1 483 nn += 1
472 484
473 mapped_chr0 = "" 485 mapped_chr0 = ""
474 for header in ali_unique_lst: 486 for header in ali_unique_lst:
475 487
476 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] 488 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
477 489
478 #------------------------------------- 490 #-------------------------------------
479 if mapped_chr != mapped_chr0: 491 if mapped_chr!=mapped_chr0:
480 my_gseq=deserialize(db_d(mapped_chr)) 492 my_gseq = deserialize(db_d(mapped_chr))
481 chr_length=len(my_gseq) 493 chr_length = len(my_gseq)
482 mapped_chr0=mapped_chr 494 mapped_chr0 = mapped_chr
483 #------------------------------------- 495 #-------------------------------------
484 if nn == 1 or nn == 3: 496 if nn==1 or nn==3 :
485 original_BS_1 = original_bs_reads_1[header] 497 original_BS_1 = original_bs_reads_1[header]
486 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) 498 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
487 else: 499 else :
488 original_BS_1 = original_bs_reads_2[header] 500 original_BS_1 = original_bs_reads_2[header]
489 original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) 501 original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
490 502
491 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) 503 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
492 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) 504 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
493 505
494 all_mapped += 1 506 all_mapped += 1
495 507
496 if nn == 1: # FW-RC mapped to + strand: 508 if nn==1 : # FW-RC mapped to + strand:
497
498 FR = "+FR" 509 FR = "+FR"
499
500 # mapped_location_1 += 1
501 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
502 # origin_genome_1 = origin_genome_long_1[2:-2]
503 mapped_strand_1 = "+" 510 mapped_strand_1 = "+"
504
505 # mapped_location_2 += 1
506 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
507 # origin_genome_2 = origin_genome_long_2[2:-2]
508 mapped_strand_2 = "+" 511 mapped_strand_2 = "+"
509 512 elif nn==2 : # RC-FW mapped to + strand:
510 elif nn==2: # RC-FW mapped to + strand:
511
512 # original_BS_1 = original_bs_reads_2[header]
513 # original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
514 FR = "+RF" 513 FR = "+RF"
515 # mapped_location_1 += 1
516 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
517 # origin_genome_1 = origin_genome_long_1[2:-2]
518 mapped_strand_1 = "+" 514 mapped_strand_1 = "+"
519
520 # mapped_location_2 += 1
521 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
522 # origin_genome_2 = origin_genome_long_2[2:-2]
523 mapped_strand_2 = "+" 515 mapped_strand_2 = "+"
524 516 elif nn==3 : # FW-RC mapped to - strand:
525
526 elif nn==3: # FW-RC mapped to - strand:
527 # original_BS_1=original_bs_reads_1[header]
528 # original_BS_2=reverse_compl_seq(original_bs_reads_2[header])
529
530 FR = "-FR" 517 FR = "-FR"
531 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 518 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
532 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
533 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
534 # origin_genome_1 = origin_genome_long_1[2:-2]
535 mapped_strand_1 = "-" 519 mapped_strand_1 = "-"
536
537 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 520 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
538 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1 ])
539 # origin_genome_2 = origin_genome_long_2[2:-2]
540 mapped_strand_2 = "-" 521 mapped_strand_2 = "-"
541 522 elif nn==4 : # RC-FW mapped to - strand:
542 elif nn==4: # RC-FW mapped to - strand:
543 # original_BS_1=original_bs_reads_2[header]
544 # original_BS_2=reverse_compl_seq(original_bs_reads_1[header])
545
546 FR = "-RF" 523 FR = "-RF"
547 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 524 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
548 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
549 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
550 # origin_genome_1 = origin_genome_long_1[2:-2]
551 mapped_strand_1 = "-" 525 mapped_strand_1 = "-"
552
553 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 526 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
554 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1])
555 # origin_genome_2 = origin_genome_long_2[2:-2]
556 mapped_strand_2 = "-" 527 mapped_strand_2 = "-"
557
558
559
560 528
561 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) 529 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
562 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) 530 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
563 531
564 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) 532 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
565 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) 533 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
566 534
567
568 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides 535 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
569 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides 536 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
570 537
571 538 N_mismatch = max(N_mismatch_1, N_mismatch_2)
572 if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) : 539 mm_no=float(max_mismatch_no)
540 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1)
541 and N_mismatch_2<=mm_no*len(original_BS_2)):
542
573 all_mapped_passed += 1 543 all_mapped_passed += 1
574 numbers_mapped_lst[nn-1] += 1 544 numbers_mapped_lst[nn-1] += 1
575 #---- unmapped ------------------------- 545 #---- unmapped -------------------------
576 del original_bs_reads_1[header] 546 del original_bs_reads_1[header]
577 del original_bs_reads_2[header] 547 del original_bs_reads_2[header]
578 #--------------------------------------- 548 #---------------------------------------
579 549
580 # output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:] 550 methy_1 = methy_seq(r_aln_1, g_aln_1 + next_1)
581 # output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:] 551 methy_2 = methy_seq(r_aln_2, g_aln_2 + next_2)
582 552
553 mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst)
554 mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)
555
556 #---XS FILTER----------------
557 XS_1 = 0
558 nCH_1 = methy_1.count('y') + methy_1.count('z')
559 nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
560 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
561 XS_1 = 1
562 XS_2 = 0
563 nCH_2 = methy_2.count('y') + methy_2.count('z')
564 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
565 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
566 XS_2 = 1
567
568 if mapped_strand_1=='+' :
569 flag_1 = 67 # 1000011 : 1st read, + strand
570 flag_2 = 131 #10000011 : 2nd read, + strand
571 else :
572 flag_1 = 115 # 1110011 : 1st read, - strand
573 flag_2 = 179 # 10110011 : 2nd read, - strand
574
575 outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
576 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
577 all_base_mapped += len(original_BS_1)
578
579 outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
580 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
581 all_base_mapped += len(original_BS_2)
582
583
584 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
585 #----------------------------------------------------------------
586 # output unmapped pairs
587 #----------------------------------------------------------------
588
589 unmapped_lst = original_bs_reads_1.keys()
590 unmapped_lst.sort()
591
592 # for u in unmapped_lst:
593 # outf_u1.write("%s\n"%original_bs_reads_1[u])
594 # outf_u2.write("%s\n"%original_bs_reads_2[u])
595
596 all_unmapped += len(unmapped_lst)
597
598
599 # Directional library
600 if asktag=="N":
601
602 #----------------------------------------------------------------
603 outfile_1FCT = tmp_d('Trimed_FCT_1.fa'+random_id)
604 outfile_2RGA = tmp_d('Trimed_RGA_2.fa'+random_id)
605
606 try :
607 if read_file_1.endswith(".gz") : # support input file ending with ".gz"
608 read_inf = gzip.open(tmp_d(read_file_1), "rb")
609 else :
610 read_inf = open(tmp_d(read_file_1),"r")
611 except IOError :
612 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
613 exit(-1)
614
615 oneline = read_inf.readline()
616 l = oneline.split()
617 input_format = ""
618
619 if oneline[0]=="@" :
620 input_format = "fastq"
621 elif len(l)==1 and oneline[0]!=">" :
622 input_format = "seq"
623 elif len(l)==11 :
624 input_format = "qseq"
625 elif oneline[0]==">" :
626 input_format = "fasta"
627
628 read_inf.close()
629
630 print "Detected data format: %s" % input_format
631
632 #----------------------------------------------------------------
633 read_file_list = [read_file_1,read_file_2]
634 outfile_FCT_list = [outfile_1FCT,outfile_2RGA]
635 n_list = [0,0]
636
637 for f in range(2):
638 read_file = read_file_list[f]
639 outf_FCT = open(outfile_FCT_list[f],'w')
640 original_bs_reads = original_bs_reads_lst[f]
641 n = n_list[f]
642
643 read_id = ""
644 seq = ""
645 seq_ready = "N"
646 line_no = 0
647 for line in fileinput.input(tmp_d(read_file)):
648 l = line.split()
649 line_no += 1
650 if input_format=="seq":
651 n += 1
652 read_id = str(n)
653 read_id = read_id.zfill(12)
654 if f == 1 :
655 read_id_lst_1[read_id] = read_id
656 else :
657 read_id_lst_2[read_id] = read_id
658 seq = l[0]
659 seq_ready = "Y"
660 elif input_format=="fastq":
661 l_fastq = math.fmod(line_no, 4)
662 if l_fastq==1 :
663 all_raw_reads += 1
664 read_id = str(line_no/4+1).zfill(12)
665 if f==1 :
666 read_id_lst_1[read_id] = l[0][1:]
667 else :
668 read_id_lst_2[read_id] = l[0][1:]
669 seq_ready = "N"
670 elif l_fastq==2 :
671 seq = l[0]
672 seq_ready = "Y"
673 else :
674 seq = ""
675 seq_ready = "N"
676 elif input_format=="qseq":
677 all_raw_reads += 1
678 read_id = str(line_no)
679 read_id = read_id.zfill(12)
680 if f == 1 :
681 read_id_lst_1[read_id] = l[0][1:]
682 else :
683 read_id_lst_2[read_id] = l[0][1:]
684 seq = l[8]
685 seq_ready = "Y"
686 elif input_format=="fasta":
687 l_fasta = math.fmod(line_no,2)
688 if l_fasta==1:
689 seq_ready = "N"
690 all_raw_reads += 1
691 read_id = str(line_no/2+1).zfill(12)
692 if f == 1 :
693 read_id_lst_1[read_id] = l[0][1:]
694 else :
695 read_id_lst_2[read_id] = l[0][1:]
696 seq = ""
697 elif l_fasta==0 :
698 seq = l[0]
699 seq_ready = "Y"
700 #----------------------------------------------------------------
701 if seq_ready=="Y":
702 seq = seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52
703 seq = seq.upper()
704 seq = seq.replace(".","N")
705
706 #--striping BS adapter from 3' read --------------------------------------------------------------
707 all_base_before_trim += len(seq)
708 if f==0 and adapterA!="" :
709 new_read = RemoveAdapter(seq, adapterA, adapter_mismatch)
710 if len(new_read) < len(seq) :
711 all_trimmed += 1
712 seq = new_read
713 if f==1 and adapterB!="" :
714 new_read = RemoveAdapter(seq, adapterB, adapter_mismatch)
715 if len(new_read)<len(seq) :
716 all_trimmed += 1
717 seq = new_read
718 all_base_after_trim += len(seq)
719
720 if len(seq)<=4:
721 seq = "N" * (cut2-cut1+1)
722 #--------- trimmed_raw_BS_read ------------------
723 original_bs_reads[read_id] = seq
724
725 #--------- FW_C2T ------------------
726 if f==0:
727 outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("C","T")))
728 elif f==1:
729 outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("G","A")))
730
731 n_list[f] = n
732 outf_FCT.close()
733 fileinput.close()
734
735 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
736 all_raw_reads += n
737
738 #--------------------------------------------------------------------------------
739 # Bowtie mapping
740 #--------------------------------------------------------------------------------
741 WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
742 CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
743
744 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
745 'input_file_1' : outfile_1FCT,
746 'input_file_2' : outfile_2RGA,
747 'output_file' : WC2T_fr},
748
749 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
750 'input_file_1' : outfile_1FCT,
751 'input_file_2' : outfile_2RGA,
752 'output_file' : CG2A_fr} ])
753
754
755 delete_files(outfile_1FCT, outfile_2RGA)
756
757 #--------------------------------------------------------------------------------
758 # Post processing
759 #--------------------------------------------------------------------------------
760
761 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
762 RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr)
763
764 #----------------------------------------------------------------
765 # get uniq-hit reads
766 #----------------------------------------------------------------
767 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys())
768
769 Unique_FW_fr_C2T = set() # +
770 Unique_RC_fr_G2A = set() # -
771 Multiple_hits=set()
772
773
774 for x in Union_set:
775 _list = []
776 for d in [FW_C2T_fr_U, RC_G2A_fr_U]:
777 mis_lst = d.get(x,[99])
778 mis = int(mis_lst[0])
779 _list.append(mis)
780 for d in [FW_C2T_fr_R, RC_G2A_fr_R]:
781 mis = d.get(x,99)
782 _list.append(mis)
783 mini = min(_list)
784 if _list.count(mini) == 1:
785 mini_index = _list.index(mini)
786 if mini_index == 0:
787 Unique_FW_fr_C2T.add(x)
788 elif mini_index == 1:
789 Unique_RC_fr_G2A.add(x)
790 else :
791 Multiple_hits.add(x)
792 else :
793 Multiple_hits.add(x)
794
795 # write reads rejected by Multiple Hits to file
796 if show_multiple_hit :
797 outf_MH = open("Multiple_hit.fa",'w')
798 for i in Multiple_hits :
799 outf_MH.write(">%s\n" % i)
800 outf_MH.write("%s\n" % original_bs_reads[i])
801 outf_MH.close()
802
803 FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
804 RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A]
805
806 FW_C2T_fr_uniq_lst.sort()
807 RC_C2T_fr_uniq_lst.sort()
808 FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst]
809 RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst]
810 #----------------------------------------------------------------
811
812 numbers_premapped_lst[0] += len(Unique_FW_fr_C2T)
813 numbers_premapped_lst[1] += len(Unique_RC_fr_G2A)
814
815
816 #----------------------------------------------------------------
817
818 nn = 0
819 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_G2A_fr_U)]:
820 nn += 1
821 mapped_chr0 = ""
822 for header in ali_unique_lst:
823 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
824
825 #-------------------------------------
826 if mapped_chr != mapped_chr0:
827 my_gseq = deserialize(db_d(mapped_chr))
828 chr_length = len(my_gseq)
829 mapped_chr0 = mapped_chr
830 #-------------------------------------
831
832 original_BS_1 = original_bs_reads_1[header]
833 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
834 #original_BS_2 = original_bs_reads_2[header]
835
836 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
837 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
838
839 all_mapped += 1
840
841 if nn == 1: # FW-RC mapped to + strand:
842 FR = "+FR"
843 mapped_strand_1 = "+"
844 mapped_strand_2 = "+"
845 elif nn == 2: # FW-RC mapped to - strand:
846 FR="-FR"
847 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
848 mapped_strand_1 = "-"
849 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
850 mapped_strand_2 = "-"
851
852 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
853 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
854
855 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
856 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
857
858 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
859 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
860
861 # if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no):
862 # if N_mismatch <= int(max_mismatch_no) :
863 N_mismatch = max(N_mismatch_1, N_mismatch_2)
864 mm_no=float(max_mismatch_no)
865 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1)
866 and N_mismatch_2<=mm_no*len(original_BS_2)):
867
868 numbers_mapped_lst[nn-1] += 1
869 all_mapped_passed += 1
870
871 #---- unmapped -------------------------
872 del original_bs_reads_1[header]
873 del original_bs_reads_2[header]
874
875 #---------------------------------------
583 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) 876 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
584 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) 877 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
585 878 mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst)
586 mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst) 879 mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst)
587 mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)
588
589 880
590 # 881 #
591 #---XS FILTER---------------- 882 #---XS FILTER----------------
592 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 883 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
593 XS_1 = 0 884 XS_1 = 0
600 nCH_2 = methy_2.count('y') + methy_2.count('z') 891 nCH_2 = methy_2.count('y') + methy_2.count('z')
601 nmCH_2 = methy_2.count('Y') + methy_2.count('Z') 892 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
602 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : 893 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
603 XS_2 = 1 894 XS_2 = 1
604 895
605 896 if mapped_strand_1 == '+' :
606 outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, 897 flag_1 = 67 # 1000011 : 1st read, + strand
607 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) 898 flag_2 = 131 # 10000011 : 2nd read, + strand
608 outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, 899 else :
609 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) 900 flag_1 = 115 # 1110011 : 1st read, - strand
610 901 flag_2 = 179 # 10110011 : 2nd read, - strand
611 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) 902
612 #---------------------------------------------------------------- 903 outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
613 # output unmapped pairs
614 #----------------------------------------------------------------
615
616 unmapped_lst=original_bs_reads_1.keys()
617 unmapped_lst.sort()
618
619 # for u in unmapped_lst:
620 # outf_u1.write("%s\n"%original_bs_reads_1[u])
621 # outf_u2.write("%s\n"%original_bs_reads_2[u])
622
623 all_unmapped += len(unmapped_lst)
624
625
626 if asktag=="N":
627
628 #----------------------------------------------------------------
629 outfile_1FCT= tmp_d('Trimed_FCT_1.fa'+random_id)
630 outfile_2FCT= tmp_d('Trimed_FCT_2.fa'+random_id)
631
632 try :
633 read_inf=open(tmp_d(read_file_1),"r")
634 except IOError :
635 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
636 exit(-1)
637
638 oneline=read_inf.readline()
639 l=oneline.split()
640 input_format=""
641
642 #if len(l)==5: # old solexa format
643 # input_format="old Solexa Seq file"
644
645 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
646 input_format="FastQ"
647 n_fastq=0
648 elif len(l)==1 and oneline[0]!=">": # pure sequences
649 input_format="_list of sequences"
650 elif len(l)==11: # Illumina GAII qseq file
651 input_format="Illumina GAII qseq file"
652 elif oneline[0]==">": # fasta
653 input_format="fasta"
654 n_fasta=0
655
656 read_inf.close()
657
658 print "Detected data format: %s" % input_format
659
660 #----------------------------------------------------------------
661 read_file_list=[read_file_1,read_file_2]
662 outfile_FCT_list=[outfile_1FCT,outfile_2FCT]
663 n_list=[0,0]
664
665 for f in range(2):
666 read_file=read_file_list[f]
667 outf_FCT=open(outfile_FCT_list[f],'w')
668 original_bs_reads = original_bs_reads_lst[f]
669 n=n_list[f]
670
671 seq_id=""
672 seq=""
673 seq_ready="N"
674 for line in fileinput.input(tmp_d(read_file)):
675 l=line.split()
676 if input_format=="old Solexa Seq file":
677 n+=1
678 seq_id=str(n)
679 seq_id=seq_id.zfill(12)
680 seq=l[4]
681 seq_ready="Y"
682 elif input_format=="_list of sequences":
683 n+=1
684 seq_id=str(n)
685 seq_id=seq_id.zfill(12)
686 seq=l[0]
687 seq_ready="Y"
688 elif input_format=="FastQ":
689 m_fastq=math.fmod(n_fastq,4)
690 n_fastq+=1
691 seq_ready="N"
692 if m_fastq==0:
693 n+=1
694 seq_id=str(n)
695 seq_id=seq_id.zfill(12)
696 seq=""
697 elif m_fastq==1:
698 seq=l[0]
699 seq_ready="Y"
700 else:
701 seq=""
702 elif input_format=="Illumina GAII qseq file":
703 n+=1
704 seq_id=str(n)
705 seq_id=seq_id.zfill(12)
706 seq=l[8]
707 seq_ready="Y"
708 elif input_format=="fasta":
709 m_fasta=math.fmod(n_fasta,2)
710 n_fasta+=1
711 seq_ready="N"
712 if m_fasta==0:
713 n+=1
714 seq_id=l[0][1:]
715 seq_id=seq_id.zfill(17)
716 seq=""
717 elif m_fasta==1:
718 seq=l[0]
719 seq_ready="Y"
720 else:
721 seq=""
722 #----------------------------------------------------------------
723 if seq_ready=="Y":
724 seq=seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52
725 seq=seq.upper()
726 seq=seq.replace(".","N")
727
728 #--striping BS adapter from 3' read --------------------------------------------------------------
729 if (adapterA !="") and (adapterB !=""):
730 signature=adapterA[:6]
731 if signature in seq:
732 signature_pos=seq.index(signature)
733 if seq[signature_pos:] in adapterA:
734 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
735 all_trimmed+=1
736 else:
737 signature=adapterB[:6]
738 if signature in seq:
739 #print seq_id,seq,signature;
740 signature_pos=seq.index(signature)
741 if seq[signature_pos:] in adapterB:
742 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
743 all_trimmed+=1
744
745 if len(seq) <= 4:
746 seq = "N" * (cut2-cut1+1)
747 #--------- trimmed_raw_BS_read ------------------
748 original_bs_reads[seq_id] = seq
749
750 #--------- FW_C2T ------------------
751 if f==0:
752 outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T")))
753 elif f==1:
754 outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T")))
755
756
757 n_list[f]=n
758 outf_FCT.close()
759
760 fileinput.close()
761
762
763 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
764 all_raw_reads+=n
765
766 #--------------------------------------------------------------------------------
767 # Bowtie mapping
768 #--------------------------------------------------------------------------------
769 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
770 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
771
772 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
773 'input_file_1' : outfile_1FCT,
774 'input_file_2' : outfile_2FCT,
775 'output_file' : WC2T_fr},
776
777 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
778 'input_file_1' : outfile_1FCT,
779 'input_file_2' : outfile_2FCT,
780 'output_file' : CC2T_fr} ])
781
782
783 delete_files(outfile_1FCT, outfile_2FCT)
784
785 #--------------------------------------------------------------------------------
786 # Post processing
787 #--------------------------------------------------------------------------------
788
789 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
790 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
791
792 #----------------------------------------------------------------
793 # get uniq-hit reads
794 #----------------------------------------------------------------
795 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys())
796
797 Unique_FW_fr_C2T = set() # +
798 Unique_RC_fr_C2T = set() # -
799 Multiple_hits=set()
800
801
802 for x in Union_set:
803 _list = []
804 for d in [FW_C2T_fr_U, RC_C2T_fr_U]:
805 mis_lst = d.get(x,[99])
806 mis = int(mis_lst[0])
807 _list.append(mis)
808 for d in [FW_C2T_fr_R, RC_C2T_fr_R]:
809 mis = d.get(x,99)
810 _list.append(mis)
811 mini = min(_list)
812 if _list.count(mini) == 1:
813 mini_index = _list.index(mini)
814 if mini_index == 0:
815 Unique_FW_fr_C2T.add(x)
816 elif mini_index == 1:
817 Unique_RC_fr_C2T.add(x)
818 else :
819 Multiple_hits.add(x)
820 else :
821 Multiple_hits.add(x)
822
823 # write reads rejected by Multiple Hits to file
824 if show_multiple_hit :
825 outf_MH=open("Multiple_hit.fa",'w')
826 for i in Multiple_hits :
827 outf_MH.write(">%s\n" % i)
828 outf_MH.write("%s\n" % original_bs_reads[i])
829 outf_MH.close()
830
831 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
832 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
833
834 FW_C2T_fr_uniq_lst.sort()
835 RC_C2T_fr_uniq_lst.sort()
836 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
837 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
838 #----------------------------------------------------------------
839
840 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
841 numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T)
842
843
844 #----------------------------------------------------------------
845
846 nn = 0
847 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_C2T_fr_U)]:
848 nn += 1
849 mapped_chr0 = ""
850 for header in ali_unique_lst:
851 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
852
853
854 #-------------------------------------
855 if mapped_chr != mapped_chr0:
856 my_gseq = deserialize(db_d(mapped_chr))
857 chr_length = len(my_gseq)
858 mapped_chr0 = mapped_chr
859 #-------------------------------------
860
861 original_BS_1 = original_bs_reads_1[header]
862 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
863
864 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
865 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
866
867 all_mapped += 1
868
869 if nn == 1: # FW-RC mapped to + strand:
870 FR = "+FR"
871 # mapped_location_1 += 1
872 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
873 # origin_genome_1 = origin_genome_long_1[2:-2]
874 mapped_strand_1 = "+"
875
876 # mapped_location_2 += 1
877 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
878 # origin_genome_2 = origin_genome_long_2[2:-2]
879 mapped_strand_2 = "+"
880
881 elif nn == 2: # FW-RC mapped to - strand:
882
883 FR="-FR"
884 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
885 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
886 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
887 # origin_genome_1 = origin_genome_long_1[2:-2]
888 mapped_strand_1 = "-"
889
890 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
891 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1])
892 # origin_genome_2 = origin_genome_long_2[2:-2]
893 mapped_strand_2 = "-"
894
895
896
897 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
898 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
899
900
901 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
902 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
903
904 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
905 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
906
907 if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no):
908
909 numbers_mapped_lst[nn-1] += 1
910 all_mapped_passed += 1
911
912 #---- unmapped -------------------------
913 del original_bs_reads_1[header]
914 del original_bs_reads_2[header]
915
916 #---------------------------------------
917 # output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:]
918 # output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:]
919 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
920 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
921 mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst)
922 mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst)
923
924 #
925 #---XS FILTER----------------
926 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
927 XS_1 = 0
928 nCH_1 = methy_1.count('y') + methy_1.count('z')
929 nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
930 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
931 XS_1 = 1
932 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
933 XS_2 = 0
934 nCH_2 = methy_2.count('y') + methy_2.count('z')
935 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
936 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
937 XS_2 = 1
938
939
940 outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
941 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) 904 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
942 outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, 905 all_base_mapped += len(original_BS_1)
906
907 outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
943 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) 908 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
909 all_base_mapped += len(original_BS_2)
944 910
945 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) 911 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
946 #---------------------------------------------------------------- 912 #----------------------------------------------------------------
947 # output unmapped pairs 913 # output unmapped pairs
948 #---------------------------------------------------------------- 914 #----------------------------------------------------------------
954 # outf_u1.write("%s\n"%(original_bs_reads_1[u])) 920 # outf_u1.write("%s\n"%(original_bs_reads_1[u]))
955 # outf_u2.write("%s\n"%(original_bs_reads_2[u]) ) 921 # outf_u2.write("%s\n"%(original_bs_reads_2[u]) )
956 922
957 all_unmapped+=len(unmapped_lst) 923 all_unmapped+=len(unmapped_lst)
958 924
959
960 #================================================================================================== 925 #==================================================================================================
961 # outf.close()
962 #
963 # outf_u1.close()
964 # outf_u2.close()
965 delete_files(tmp_path) 926 delete_files(tmp_path)
966 927
967 logm("-------------------------------- " ) 928 logm("-------------------------------- " )
968 logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) ) 929 logm("Number of raw BS-read pairs: %d " %(all_raw_reads/2) )
969 logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n") 930 if adapterA != "" or adapterB != "" :
931 logm("Number of reads having adapter removed: %d \n"% all_trimmed)
932 logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) )
970 933
971 if all_raw_reads >0: 934 if all_raw_reads >0:
972 935
973 logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n") 936 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
937 logm("Number of unique-hits reads (before post-filtering): %d" % all_mapped + "\n")
974 if asktag=="Y": 938 if asktag=="Y":
975 logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) 939 logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
976 logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) 940 logm("-- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
977 logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) 941 logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
978 logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) 942 logm("-- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
979 elif asktag=="N": 943 elif asktag=="N":
980 logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) 944 logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
981 logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) 945 logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
982 946 logm("--- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
983
984 logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
985 logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
986 if asktag=="Y": 947 if asktag=="Y":
987 logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) 948 logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
988 logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) ) 949 logm("----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) )
989 logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) ) 950 logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) )
990 logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) ) 951 logm("----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) )
991 elif asktag=="N": 952 elif asktag=="N":
992 logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) 953 logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
993 logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) ) 954 logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) )
994 955
995 logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) 956 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)*2/all_raw_reads) )
996 logm("O Unmapped read pairs: %d"% all_unmapped+"\n") 957 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
997 958 logm("Unmapped read pairs: %d"% all_unmapped+"\n")
998 959
999 n_CG=mC_lst[0]+uC_lst[0] 960 n_CG=mC_lst[0]+uC_lst[0]
1000 n_CHG=mC_lst[1]+uC_lst[1] 961 n_CHG=mC_lst[1]+uC_lst[1]
1001 n_CHH=mC_lst[2]+uC_lst[2] 962 n_CHH=mC_lst[2]+uC_lst[2]
1002 963