Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_align/bs_pair_end.py @ 1:8b26adf64adc draft default tip
V2.0.5
author | weilong-guo |
---|---|
date | Tue, 05 Nov 2013 01:55:39 -0500 |
parents | e6df770c0e58 |
children |
comparison
equal
deleted
inserted
replaced
0:e6df770c0e58 | 1:8b26adf64adc |
---|---|
102 no_small_lines, | 102 no_small_lines, |
103 max_mismatch_no, | 103 max_mismatch_no, |
104 aligner_command, | 104 aligner_command, |
105 db_path, | 105 db_path, |
106 tmp_path, | 106 tmp_path, |
107 outfile, XS_pct, XS_count, show_multiple_hit=False): | 107 outfile, XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False): |
108 | 108 |
109 | 109 |
110 #---------------------------------------------------------------- | 110 #---------------------------------------------------------------- |
111 adapter="" | 111 adapter = "" |
112 adapterA="" | 112 adapterA = "" |
113 adapterB="" | 113 adapterB = "" |
114 if adapter_file !="": | 114 if adapter_file !="": |
115 try : | 115 try : |
116 adapter_inf=open(adapter_file,"r") | 116 adapter_inf=open(adapter_file,"r") |
117 if asktag=="N": #<--- directional library | 117 if asktag=="N": #<--- directional library |
118 adapter=adapter_inf.readline() | 118 adapter=adapter_inf.readline() |
132 #---------------------------------------------------------------- | 132 #---------------------------------------------------------------- |
133 | 133 |
134 logm("End 1 filename: %s"% main_read_file_1 ) | 134 logm("End 1 filename: %s"% main_read_file_1 ) |
135 logm("End 2 filename: %s"% main_read_file_2 ) | 135 logm("End 2 filename: %s"% main_read_file_2 ) |
136 logm("The first base (for mapping): %d"% cut1 ) | 136 logm("The first base (for mapping): %d"% cut1 ) |
137 logm("The last base (for mapping): %d"% cut2 ) | 137 logm("The last base (for mapping): %d"% cut2 ) |
138 | 138 |
139 logm("-------------------------------- " ) | |
140 logm("Un-directional library: %s" % asktag ) | |
141 logm("Path for short reads aligner: %s"% aligner_command + '\n') | 139 logm("Path for short reads aligner: %s"% aligner_command + '\n') |
142 logm("Reference genome library path: %s"% db_path ) | 140 logm("Reference genome library path: %s"% db_path ) |
141 | |
142 if asktag == "Y" : | |
143 logm("Un-directional library" ) | |
144 else : | |
145 logm("Directional library") | |
143 logm("Number of mismatches allowed: %s"% max_mismatch_no ) | 146 logm("Number of mismatches allowed: %s"% max_mismatch_no ) |
144 | 147 |
145 if adapter_file !="": | 148 if adapter_file !="": |
146 if asktag=="Y": | 149 if asktag=="Y": |
147 logm("Adapters to be removed from 3' of the reads:" ) | 150 logm("Adapters to be removed from 3' of the reads:" ) |
180 #---- Stats ------------------------------------------------------------ | 183 #---- Stats ------------------------------------------------------------ |
181 all_raw_reads=0 | 184 all_raw_reads=0 |
182 all_trimmed=0 | 185 all_trimmed=0 |
183 all_mapped=0 | 186 all_mapped=0 |
184 all_mapped_passed=0 | 187 all_mapped_passed=0 |
188 all_base_before_trim=0 | |
189 all_base_after_trim=0 | |
190 all_base_mapped=0 | |
185 | 191 |
186 all_unmapped=0 | 192 all_unmapped=0 |
187 | 193 |
188 numbers_premapped_lst=[0,0,0,0] | 194 numbers_premapped_lst=[0,0,0,0] |
189 numbers_mapped_lst=[0,0,0,0] | 195 numbers_mapped_lst=[0,0,0,0] |
190 | 196 |
191 mC_lst=[0,0,0] | 197 mC_lst=[0,0,0] |
192 uC_lst=[0,0,0] | 198 uC_lst=[0,0,0] |
193 | 199 |
194 no_my_files=0 | 200 no_my_files=0 |
201 read_id_lst_1 = dict() | |
202 read_id_lst_2 = dict() | |
195 | 203 |
196 #---------------------------------------------------------------- | 204 #---------------------------------------------------------------- |
197 print "== Start mapping ==" | 205 print "== Start mapping ==" |
198 | 206 |
199 for read_file_1, read_file_2 in my_files: | 207 for read_file_1, read_file_2 in my_files: |
210 if asktag == "Y" : | 218 if asktag == "Y" : |
211 | 219 |
212 #---------------------------------------------------------------- | 220 #---------------------------------------------------------------- |
213 outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id) | 221 outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id) |
214 outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id) | 222 outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id) |
215 outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id) | 223 outfile_2RGA = tmp_d('Trimmed_FCT_2.fa'+random_id) |
216 outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id) | 224 outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id) |
217 | 225 |
218 try : | 226 try : |
219 read_inf = open(tmp_d(read_file_1),"r") | 227 if read_file_1.endswith(".gz") : # support input file ending with ".gz" |
228 read_inf = gzip.open(tmp_d(read_file_1), "rb") | |
229 else : | |
230 read_inf = open(tmp_d(read_file_1),"r") | |
220 except IOError : | 231 except IOError : |
221 print "[Error] Cannot open file : %s", tmp_d(read_file_1) | 232 print "[Error] Cannot open file : %s", tmp_d(read_file_1) |
222 exit(-1) | 233 exit(-1) |
223 oneline = read_inf.readline() | 234 oneline = read_inf.readline() |
224 l = oneline.split() | 235 l = oneline.split() |
225 input_format = "" | 236 input_format = "" |
226 | 237 |
227 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) | 238 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) |
228 input_format="fastq" | 239 input_format="fastq" |
229 n_fastq=0 | |
230 elif len(l)==1 and oneline[0]!=">": # pure sequences | 240 elif len(l)==1 and oneline[0]!=">": # pure sequences |
231 input_format="seq" | 241 input_format="seq" |
232 elif len(l)==11: # Illumina GAII qseq file | 242 elif len(l)==11: # Illumina GAII qseq file |
233 input_format="qseq" | 243 input_format="qseq" |
234 elif oneline[0]==">": # fasta | 244 elif oneline[0]==">": # fasta |
235 input_format="fasta" | 245 input_format="fasta" |
236 n_fasta=0 | |
237 read_inf.close() | 246 read_inf.close() |
238 | 247 |
239 print "Detected data format: %s" % input_format | 248 print "Detected data format: %s" % input_format |
240 | 249 |
241 #---------------------------------------------------------------- | 250 #---------------------------------------------------------------- |
242 read_file_list = [read_file_1, read_file_2] | 251 read_file_list = [read_file_1, read_file_2] |
243 outfile_FCT_list = [outfile_1FCT, outfile_2FCT] | 252 outfile_FCT_list = [outfile_1FCT, outfile_2RGA] |
244 outfile_RCT_list = [outfile_1RCT, outfile_2RCT] | 253 outfile_RCT_list = [outfile_1RCT, outfile_2RCT] |
245 n_list = [0, 0] | 254 n_list = [0, 0] |
246 | |
247 | |
248 for f in range(2): | 255 for f in range(2): |
249 read_file = read_file_list[f] | 256 read_file = read_file_list[f] |
250 outf_FCT = open(outfile_FCT_list[f], 'w') | 257 outf_FCT = open(outfile_FCT_list[f], 'w') |
251 outf_RCT = open(outfile_RCT_list[f], 'w') | 258 outf_RGA = open(outfile_RCT_list[f], 'w') |
252 original_bs_reads = original_bs_reads_lst[f] | 259 original_bs_reads = original_bs_reads_lst[f] |
253 n = n_list[f] | 260 n = n_list[f] |
254 | 261 read_id = "" |
255 seq_id = "" | |
256 seq = "" | 262 seq = "" |
257 seq_ready = "N" | 263 seq_ready = "N" |
264 line_no = 0 | |
258 for line in fileinput.input(tmp_d(read_file)): | 265 for line in fileinput.input(tmp_d(read_file)): |
259 l=line.split() | 266 l = line.split() |
267 line_no += 1 | |
260 if input_format=="seq": | 268 if input_format=="seq": |
261 n+=1 | 269 n += 1 |
262 seq_id=str(n) | 270 read_id = str(n) |
263 seq_id=seq_id.zfill(12) | 271 read_id = read_id.zfill(12) |
264 seq=l[0] | 272 if f == 1 : |
265 seq_ready="Y" | 273 read_id_lst_1[read_id] = read_id |
274 else : | |
275 read_id_lst_2[read_id] = read_id | |
276 seq = l[0] | |
277 seq_ready = "Y" | |
266 elif input_format=="fastq": | 278 elif input_format=="fastq": |
267 m_fastq=math.fmod(n_fastq,4) | 279 l_fastq = math.fmod(line_no, 4) |
268 n_fastq+=1 | 280 if l_fastq==1 : |
269 seq_ready="N" | 281 all_raw_reads += 1 |
270 if m_fastq==0: | 282 read_id = str(line_no/4+1).zfill(12) |
271 n+=1 | 283 if f==1 : |
272 seq_id=str(n) | 284 read_id_lst_1[read_id] = l[0][1:] |
273 seq_id=seq_id.zfill(12) | 285 else : |
274 seq="" | 286 read_id_lst_2[read_id] = l[0][1:] |
275 elif m_fastq==1: | 287 seq_ready="N" |
276 seq=l[0] | 288 elif l_fastq==2 : |
277 seq_ready="Y" | 289 seq = l[0] |
278 else: | 290 seq_ready = "Y" |
279 seq="" | 291 else : |
292 seq = "" | |
293 seq_ready = "N" | |
280 elif input_format=="qseq": | 294 elif input_format=="qseq": |
281 n+=1 | 295 all_raw_reads += 1 |
282 seq_id=str(n) | 296 read_id = str(line_no) |
283 seq_id=seq_id.zfill(12) | 297 read_id = read_id.zfill(12) |
284 seq=l[8] | 298 if f == 1 : |
285 seq_ready="Y" | 299 read_id_lst_1[read_id] = l[0][1:] |
300 else : | |
301 read_id_lst_2[read_id] = l[0][1:] | |
302 seq = l[8] | |
303 seq_ready = "Y" | |
286 elif input_format=="fasta": | 304 elif input_format=="fasta": |
287 m_fasta=math.fmod(n_fasta,2) | 305 l_fasta = math.fmod(line_no,2) |
288 n_fasta+=1 | 306 seq_ready = "N" |
289 seq_ready="N" | 307 if l_fasta==1: |
290 if m_fasta==0: | 308 all_raw_reads += 1 |
291 n+=1 | 309 read_id = str(line_no/2+1).zfill(12) |
292 seq_id=l[0][1:] | 310 if f==1 : |
293 seq_id=seq_id.zfill(17) | 311 read_id_lst_1[read_id] = l[0][1:] |
294 seq="" | 312 else : |
295 elif m_fasta==1: | 313 read_id_lst_2[read_id] = l[0][1:] |
296 seq=l[0] | 314 seq = "" |
297 seq_ready="Y" | 315 elif l_fasta==0 : |
298 else: | 316 seq = l[0] |
299 seq="" | 317 seq_ready = "Y" |
300 #---------------------------------------------------------------- | 318 #---------------------------------------------------------------- |
301 if seq_ready=="Y": | 319 if seq_ready=="Y": |
302 seq=seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52 | 320 seq = seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52 |
303 seq=seq.upper() | 321 seq = seq.upper() |
304 seq=seq.replace(".","N") | 322 seq = seq.replace(".","N") |
305 | 323 |
306 #--striping BS adapter from 3' read -------------------------------------------------------------- | 324 #--striping BS adapter from 3' read -------------------------------------------------------------- |
307 if (adapterA !="") and (adapterB !=""): | 325 all_base_before_trim += len(seq) |
308 signature=adapterA[:6] | 326 if f==0 and adapterA!="" : |
309 if signature in seq: | 327 new_read = RemoveAdapter(seq, adapterA, adapter_mismatch) |
310 signature_pos=seq.index(signature) | 328 if len(new_read)<len(seq) : |
311 if seq[signature_pos:] in adapterA: | 329 all_trimmed += 1 |
312 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) | 330 seq = new_read |
313 all_trimmed+=1 | 331 if f==1 and adapterB!="" : |
314 else: | 332 new_read = RemoveAdapter(seq, adapterB, adapter_mismatch) |
315 signature=adapterB[:6] | 333 if len(new_read)<len(seq) : |
316 if signature in seq: | 334 all_trimmed += 1 |
317 #print seq_id,seq,signature; | 335 seq = new_read |
318 signature_pos=seq.index(signature) | 336 all_base_after_trim += len(seq) |
319 if seq[signature_pos:] in adapterB: | 337 |
320 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) | 338 if len(seq)<=4: |
321 all_trimmed+=1 | |
322 | |
323 if len(seq) <= 4: | |
324 seq = "N" * (cut2-cut1+1) | 339 seq = "N" * (cut2-cut1+1) |
325 #--------- trimmed_raw_BS_read ------------------ | 340 #--------- trimmed_raw_BS_read ------------------ |
326 original_bs_reads[seq_id] = seq | 341 original_bs_reads[read_id] = seq |
327 | 342 |
328 #--------- FW_C2T ------------------ | 343 #--------- FW_C2T ------------------ |
329 outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T"))) | 344 outf_FCT.write('>%s\n%s\n' % (read_id, seq.replace("C","T"))) |
330 #--------- RC_G2A ------------------ | 345 #--------- RC_G2A ------------------ |
331 outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A"))) | 346 outf_RGA.write('>%s\n%s\n' % (read_id, seq.replace("G","A"))) |
332 | 347 |
333 n_list[f]=n | 348 n_list[f] = n |
334 | 349 |
335 outf_FCT.close() | 350 outf_FCT.close() |
336 outf_RCT.close() | 351 outf_RGA.close() |
337 | 352 |
338 fileinput.close() | 353 fileinput.close() |
339 | 354 |
340 | |
341 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); | 355 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); |
342 all_raw_reads+=n | 356 all_raw_reads += n |
343 | 357 |
344 #-------------------------------------------------------------------------------- | 358 #-------------------------------------------------------------------------------- |
345 # Bowtie mapping | 359 # Bowtie mapping |
346 #-------------------------------------------------------------------------------- | 360 #-------------------------------------------------------------------------------- |
347 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | 361 WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) |
348 WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) | 362 WC2T_rf = tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) |
349 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | 363 CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) |
350 CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) | 364 CC2T_rf = tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) |
351 | 365 |
352 run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | 366 run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), |
353 'input_file_1' : outfile_1FCT, | 367 'input_file_1' : outfile_1FCT, |
354 'input_file_2' : outfile_2RCT, | 368 'input_file_2' : outfile_2RCT, |
355 'output_file' : WC2T_fr}, | 369 'output_file' : WC2T_fr}, |
356 | 370 |
357 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | 371 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), |
358 'input_file_1' : outfile_1FCT, | 372 'input_file_1' : outfile_1FCT, |
359 'input_file_2' : outfile_2RCT, | 373 'input_file_2' : outfile_2RCT, |
360 'output_file' : CC2T_fr}, | 374 'output_file' : CG2A_fr}, |
361 | 375 |
362 aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | 376 aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), |
363 'input_file_1' : outfile_2FCT, | 377 'input_file_1' : outfile_2RGA, |
364 'input_file_2' : outfile_1RCT, | 378 'input_file_2' : outfile_1RCT, |
365 'output_file' : WC2T_rf}, | 379 'output_file' : WC2T_rf}, |
366 | 380 |
367 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | 381 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), |
368 'input_file_1' : outfile_2FCT, | 382 'input_file_1' : outfile_2RGA, |
369 'input_file_2' : outfile_1RCT, | 383 'input_file_2' : outfile_1RCT, |
370 'output_file' : CC2T_rf}]) | 384 'output_file' : CC2T_rf}]) |
371 | 385 |
372 | 386 delete_files(outfile_1FCT, outfile_2RGA, outfile_1RCT, outfile_2RCT) |
373 delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT) | |
374 | 387 |
375 | 388 |
376 #-------------------------------------------------------------------------------- | 389 #-------------------------------------------------------------------------------- |
377 # Post processing | 390 # Post processing |
378 #-------------------------------------------------------------------------------- | 391 #-------------------------------------------------------------------------------- |
379 | 392 |
380 | |
381 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) | 393 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) |
382 FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf) | 394 FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf) |
383 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr) | 395 RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr) |
384 RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf) | 396 RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf) |
385 | 397 |
386 delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf) | 398 delete_files(WC2T_fr, WC2T_rf, CG2A_fr, CC2T_rf) |
387 | 399 |
388 #---------------------------------------------------------------- | 400 #---------------------------------------------------------------- |
389 # get uniq-hit reads | 401 # get uniq-hit reads |
390 #---------------------------------------------------------------- | 402 #---------------------------------------------------------------- |
391 Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys()) | 403 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys()) |
392 | 404 |
393 Unique_FW_fr_C2T=set() # + | 405 Unique_FW_fr_C2T = set() # + |
394 Unique_FW_rf_C2T=set() # + | 406 Unique_FW_rf_C2T = set() # + |
395 Unique_RC_fr_C2T=set() # - | 407 Unique_RC_fr_G2A = set() # - |
396 Unique_RC_rf_C2T=set() # - | 408 Unique_RC_rf_C2T = set() # - |
397 Multiple_hits=set() | 409 Multiple_hits = set() |
398 | 410 |
399 | 411 |
400 for x in Union_set: | 412 for x in Union_set: |
401 _list=[] | 413 _list = [] |
402 for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]: | 414 for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_G2A_fr_U, RC_C2T_rf_U]: |
403 mis_lst=d.get(x,[99]) | 415 mis_lst = d.get(x,[99]) |
404 mis=int(mis_lst[0]) | 416 mis = int(mis_lst[0]) |
405 _list.append(mis) | 417 _list.append(mis) |
406 for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]: | 418 for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_G2A_fr_R, RC_C2T_rf_R]: |
407 mis=d.get(x,99) | 419 mis = d.get(x,99) |
408 _list.append(mis) | 420 _list.append(mis) |
409 mini=min(_list) | 421 mini = min(_list) |
410 if _list.count(mini)==1: | 422 if _list.count(mini)==1: |
411 mini_index=_list.index(mini) | 423 mini_index = _list.index(mini) |
412 if mini_index==0: | 424 if mini_index==0: |
413 Unique_FW_fr_C2T.add(x) | 425 Unique_FW_fr_C2T.add(x) |
414 elif mini_index==1: | 426 elif mini_index==1: |
415 Unique_FW_rf_C2T.add(x) | 427 Unique_FW_rf_C2T.add(x) |
416 elif mini_index==2: | 428 elif mini_index==2: |
417 Unique_RC_fr_C2T.add(x) | 429 Unique_RC_fr_G2A.add(x) |
418 elif mini_index==3: | 430 elif mini_index==3: |
419 Unique_RC_rf_C2T.add(x) | 431 Unique_RC_rf_C2T.add(x) |
420 else : | 432 else : |
421 Multiple_hits.add(x) | 433 Multiple_hits.add(x) |
422 else : | 434 else : |
423 Multiple_hits.add(x) | 435 Multiple_hits.add(x) |
424 | 436 |
425 # write reads rejected by Multiple Hits to file | 437 # write reads rejected by Multiple Hits to file |
426 if show_multiple_hit : | 438 if show_multiple_hit : |
427 outf_MH=open("Multiple_hit.fa",'w') | 439 outf_MH = open("Multiple_hit.fa",'w') |
428 for i in Multiple_hits : | 440 for i in Multiple_hits : |
429 outf_MH.write(">%s\n" % i) | 441 outf_MH.write(">%s\n" % i) |
430 outf_MH.write("%s\n" % original_bs_reads[i]) | 442 outf_MH.write("%s\n" % original_bs_reads[i]) |
431 outf_MH.close() | 443 outf_MH.close() |
432 | 444 |
433 del Union_set | 445 del Union_set |
434 del FW_C2T_fr_R | 446 del FW_C2T_fr_R |
435 del FW_C2T_rf_R | 447 del FW_C2T_rf_R |
436 del RC_C2T_fr_R | 448 del RC_G2A_fr_R |
437 del RC_C2T_rf_R | 449 del RC_C2T_rf_R |
438 | 450 |
439 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] | 451 FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] |
440 FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T] | 452 FW_C2T_rf_uniq_lst = [[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T] |
441 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T] | 453 RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A] |
442 RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T] | 454 RC_C2T_rf_uniq_lst = [[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T] |
443 | 455 |
444 FW_C2T_fr_uniq_lst.sort() | 456 FW_C2T_fr_uniq_lst.sort() |
445 FW_C2T_rf_uniq_lst.sort() | 457 FW_C2T_rf_uniq_lst.sort() |
446 RC_C2T_fr_uniq_lst.sort() | 458 RC_C2T_fr_uniq_lst.sort() |
447 RC_C2T_rf_uniq_lst.sort() | 459 RC_C2T_rf_uniq_lst.sort() |
448 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst] | 460 FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst] |
449 FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst] | 461 FW_C2T_rf_uniq_lst = [x[1] for x in FW_C2T_rf_uniq_lst] |
450 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst] | 462 RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst] |
451 RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst] | 463 RC_C2T_rf_uniq_lst = [x[1] for x in RC_C2T_rf_uniq_lst] |
452 #---------------------------------------------------------------- | 464 #---------------------------------------------------------------- |
453 | 465 |
454 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T) | 466 numbers_premapped_lst[0] += len(Unique_FW_fr_C2T) |
455 numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T) | 467 numbers_premapped_lst[1] += len(Unique_FW_rf_C2T) |
456 numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T) | 468 numbers_premapped_lst[2] += len(Unique_RC_fr_G2A) |
457 numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T) | 469 numbers_premapped_lst[3] += len(Unique_RC_rf_C2T) |
458 | 470 |
459 del Unique_FW_fr_C2T | 471 del Unique_FW_fr_C2T |
460 del Unique_FW_rf_C2T | 472 del Unique_FW_rf_C2T |
461 del Unique_RC_fr_C2T | 473 del Unique_RC_fr_G2A |
462 del Unique_RC_rf_C2T | 474 del Unique_RC_rf_C2T |
463 | 475 |
464 #---------------------------------------------------------------- | 476 #---------------------------------------------------------------- |
465 | 477 |
466 nn = 0 | 478 nn = 0 |
467 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), | 479 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), |
468 (FW_C2T_rf_uniq_lst,FW_C2T_rf_U), | 480 (FW_C2T_rf_uniq_lst,FW_C2T_rf_U), |
469 (RC_C2T_fr_uniq_lst,RC_C2T_fr_U), | 481 (RC_C2T_fr_uniq_lst,RC_G2A_fr_U), |
470 (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]: | 482 (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]: |
471 nn += 1 | 483 nn += 1 |
472 | 484 |
473 mapped_chr0 = "" | 485 mapped_chr0 = "" |
474 for header in ali_unique_lst: | 486 for header in ali_unique_lst: |
475 | 487 |
476 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] | 488 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] |
477 | 489 |
478 #------------------------------------- | 490 #------------------------------------- |
479 if mapped_chr != mapped_chr0: | 491 if mapped_chr!=mapped_chr0: |
480 my_gseq=deserialize(db_d(mapped_chr)) | 492 my_gseq = deserialize(db_d(mapped_chr)) |
481 chr_length=len(my_gseq) | 493 chr_length = len(my_gseq) |
482 mapped_chr0=mapped_chr | 494 mapped_chr0 = mapped_chr |
483 #------------------------------------- | 495 #------------------------------------- |
484 if nn == 1 or nn == 3: | 496 if nn==1 or nn==3 : |
485 original_BS_1 = original_bs_reads_1[header] | 497 original_BS_1 = original_bs_reads_1[header] |
486 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) | 498 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) |
487 else: | 499 else : |
488 original_BS_1 = original_bs_reads_2[header] | 500 original_BS_1 = original_bs_reads_2[header] |
489 original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) | 501 original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) |
490 | 502 |
491 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) | 503 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) |
492 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) | 504 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) |
493 | 505 |
494 all_mapped += 1 | 506 all_mapped += 1 |
495 | 507 |
496 if nn == 1: # FW-RC mapped to + strand: | 508 if nn==1 : # FW-RC mapped to + strand: |
497 | |
498 FR = "+FR" | 509 FR = "+FR" |
499 | |
500 # mapped_location_1 += 1 | |
501 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
502 # origin_genome_1 = origin_genome_long_1[2:-2] | |
503 mapped_strand_1 = "+" | 510 mapped_strand_1 = "+" |
504 | |
505 # mapped_location_2 += 1 | |
506 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] | |
507 # origin_genome_2 = origin_genome_long_2[2:-2] | |
508 mapped_strand_2 = "+" | 511 mapped_strand_2 = "+" |
509 | 512 elif nn==2 : # RC-FW mapped to + strand: |
510 elif nn==2: # RC-FW mapped to + strand: | |
511 | |
512 # original_BS_1 = original_bs_reads_2[header] | |
513 # original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) | |
514 FR = "+RF" | 513 FR = "+RF" |
515 # mapped_location_1 += 1 | |
516 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
517 # origin_genome_1 = origin_genome_long_1[2:-2] | |
518 mapped_strand_1 = "+" | 514 mapped_strand_1 = "+" |
519 | |
520 # mapped_location_2 += 1 | |
521 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] | |
522 # origin_genome_2 = origin_genome_long_2[2:-2] | |
523 mapped_strand_2 = "+" | 515 mapped_strand_2 = "+" |
524 | 516 elif nn==3 : # FW-RC mapped to - strand: |
525 | |
526 elif nn==3: # FW-RC mapped to - strand: | |
527 # original_BS_1=original_bs_reads_1[header] | |
528 # original_BS_2=reverse_compl_seq(original_bs_reads_2[header]) | |
529 | |
530 FR = "-FR" | 517 FR = "-FR" |
531 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 | 518 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 |
532 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
533 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) | |
534 # origin_genome_1 = origin_genome_long_1[2:-2] | |
535 mapped_strand_1 = "-" | 519 mapped_strand_1 = "-" |
536 | |
537 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 | 520 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 |
538 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1 ]) | |
539 # origin_genome_2 = origin_genome_long_2[2:-2] | |
540 mapped_strand_2 = "-" | 521 mapped_strand_2 = "-" |
541 | 522 elif nn==4 : # RC-FW mapped to - strand: |
542 elif nn==4: # RC-FW mapped to - strand: | |
543 # original_BS_1=original_bs_reads_2[header] | |
544 # original_BS_2=reverse_compl_seq(original_bs_reads_1[header]) | |
545 | |
546 FR = "-RF" | 523 FR = "-RF" |
547 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 | 524 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 |
548 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
549 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) | |
550 # origin_genome_1 = origin_genome_long_1[2:-2] | |
551 mapped_strand_1 = "-" | 525 mapped_strand_1 = "-" |
552 | |
553 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 | 526 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 |
554 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]) | |
555 # origin_genome_2 = origin_genome_long_2[2:-2] | |
556 mapped_strand_2 = "-" | 527 mapped_strand_2 = "-" |
557 | |
558 | |
559 | |
560 | 528 |
561 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) | 529 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) |
562 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) | 530 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) |
563 | 531 |
564 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) | 532 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) |
565 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) | 533 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) |
566 | 534 |
567 | |
568 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides | 535 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides |
569 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides | 536 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides |
570 | 537 |
571 | 538 N_mismatch = max(N_mismatch_1, N_mismatch_2) |
572 if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) : | 539 mm_no=float(max_mismatch_no) |
540 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1) | |
541 and N_mismatch_2<=mm_no*len(original_BS_2)): | |
542 | |
573 all_mapped_passed += 1 | 543 all_mapped_passed += 1 |
574 numbers_mapped_lst[nn-1] += 1 | 544 numbers_mapped_lst[nn-1] += 1 |
575 #---- unmapped ------------------------- | 545 #---- unmapped ------------------------- |
576 del original_bs_reads_1[header] | 546 del original_bs_reads_1[header] |
577 del original_bs_reads_2[header] | 547 del original_bs_reads_2[header] |
578 #--------------------------------------- | 548 #--------------------------------------- |
579 | 549 |
580 # output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:] | 550 methy_1 = methy_seq(r_aln_1, g_aln_1 + next_1) |
581 # output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:] | 551 methy_2 = methy_seq(r_aln_2, g_aln_2 + next_2) |
582 | 552 |
553 mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst) | |
554 mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst) | |
555 | |
556 #---XS FILTER---------------- | |
557 XS_1 = 0 | |
558 nCH_1 = methy_1.count('y') + methy_1.count('z') | |
559 nmCH_1 = methy_1.count('Y') + methy_1.count('Z') | |
560 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) : | |
561 XS_1 = 1 | |
562 XS_2 = 0 | |
563 nCH_2 = methy_2.count('y') + methy_2.count('z') | |
564 nmCH_2 = methy_2.count('Y') + methy_2.count('Z') | |
565 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : | |
566 XS_2 = 1 | |
567 | |
568 if mapped_strand_1=='+' : | |
569 flag_1 = 67 # 1000011 : 1st read, + strand | |
570 flag_2 = 131 #10000011 : 2nd read, + strand | |
571 else : | |
572 flag_1 = 115 # 1110011 : 1st read, - strand | |
573 flag_2 = 179 # 10110011 : 2nd read, - strand | |
574 | |
575 outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, | |
576 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) | |
577 all_base_mapped += len(original_BS_1) | |
578 | |
579 outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, | |
580 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) | |
581 all_base_mapped += len(original_BS_2) | |
582 | |
583 | |
584 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) | |
585 #---------------------------------------------------------------- | |
586 # output unmapped pairs | |
587 #---------------------------------------------------------------- | |
588 | |
589 unmapped_lst = original_bs_reads_1.keys() | |
590 unmapped_lst.sort() | |
591 | |
592 # for u in unmapped_lst: | |
593 # outf_u1.write("%s\n"%original_bs_reads_1[u]) | |
594 # outf_u2.write("%s\n"%original_bs_reads_2[u]) | |
595 | |
596 all_unmapped += len(unmapped_lst) | |
597 | |
598 | |
599 # Directional library | |
600 if asktag=="N": | |
601 | |
602 #---------------------------------------------------------------- | |
603 outfile_1FCT = tmp_d('Trimed_FCT_1.fa'+random_id) | |
604 outfile_2RGA = tmp_d('Trimed_RGA_2.fa'+random_id) | |
605 | |
606 try : | |
607 if read_file_1.endswith(".gz") : # support input file ending with ".gz" | |
608 read_inf = gzip.open(tmp_d(read_file_1), "rb") | |
609 else : | |
610 read_inf = open(tmp_d(read_file_1),"r") | |
611 except IOError : | |
612 print "[Error] Cannot open file : %s", tmp_d(read_file_1) | |
613 exit(-1) | |
614 | |
615 oneline = read_inf.readline() | |
616 l = oneline.split() | |
617 input_format = "" | |
618 | |
619 if oneline[0]=="@" : | |
620 input_format = "fastq" | |
621 elif len(l)==1 and oneline[0]!=">" : | |
622 input_format = "seq" | |
623 elif len(l)==11 : | |
624 input_format = "qseq" | |
625 elif oneline[0]==">" : | |
626 input_format = "fasta" | |
627 | |
628 read_inf.close() | |
629 | |
630 print "Detected data format: %s" % input_format | |
631 | |
632 #---------------------------------------------------------------- | |
633 read_file_list = [read_file_1,read_file_2] | |
634 outfile_FCT_list = [outfile_1FCT,outfile_2RGA] | |
635 n_list = [0,0] | |
636 | |
637 for f in range(2): | |
638 read_file = read_file_list[f] | |
639 outf_FCT = open(outfile_FCT_list[f],'w') | |
640 original_bs_reads = original_bs_reads_lst[f] | |
641 n = n_list[f] | |
642 | |
643 read_id = "" | |
644 seq = "" | |
645 seq_ready = "N" | |
646 line_no = 0 | |
647 for line in fileinput.input(tmp_d(read_file)): | |
648 l = line.split() | |
649 line_no += 1 | |
650 if input_format=="seq": | |
651 n += 1 | |
652 read_id = str(n) | |
653 read_id = read_id.zfill(12) | |
654 if f == 1 : | |
655 read_id_lst_1[read_id] = read_id | |
656 else : | |
657 read_id_lst_2[read_id] = read_id | |
658 seq = l[0] | |
659 seq_ready = "Y" | |
660 elif input_format=="fastq": | |
661 l_fastq = math.fmod(line_no, 4) | |
662 if l_fastq==1 : | |
663 all_raw_reads += 1 | |
664 read_id = str(line_no/4+1).zfill(12) | |
665 if f==1 : | |
666 read_id_lst_1[read_id] = l[0][1:] | |
667 else : | |
668 read_id_lst_2[read_id] = l[0][1:] | |
669 seq_ready = "N" | |
670 elif l_fastq==2 : | |
671 seq = l[0] | |
672 seq_ready = "Y" | |
673 else : | |
674 seq = "" | |
675 seq_ready = "N" | |
676 elif input_format=="qseq": | |
677 all_raw_reads += 1 | |
678 read_id = str(line_no) | |
679 read_id = read_id.zfill(12) | |
680 if f == 1 : | |
681 read_id_lst_1[read_id] = l[0][1:] | |
682 else : | |
683 read_id_lst_2[read_id] = l[0][1:] | |
684 seq = l[8] | |
685 seq_ready = "Y" | |
686 elif input_format=="fasta": | |
687 l_fasta = math.fmod(line_no,2) | |
688 if l_fasta==1: | |
689 seq_ready = "N" | |
690 all_raw_reads += 1 | |
691 read_id = str(line_no/2+1).zfill(12) | |
692 if f == 1 : | |
693 read_id_lst_1[read_id] = l[0][1:] | |
694 else : | |
695 read_id_lst_2[read_id] = l[0][1:] | |
696 seq = "" | |
697 elif l_fasta==0 : | |
698 seq = l[0] | |
699 seq_ready = "Y" | |
700 #---------------------------------------------------------------- | |
701 if seq_ready=="Y": | |
702 seq = seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52 | |
703 seq = seq.upper() | |
704 seq = seq.replace(".","N") | |
705 | |
706 #--striping BS adapter from 3' read -------------------------------------------------------------- | |
707 all_base_before_trim += len(seq) | |
708 if f==0 and adapterA!="" : | |
709 new_read = RemoveAdapter(seq, adapterA, adapter_mismatch) | |
710 if len(new_read) < len(seq) : | |
711 all_trimmed += 1 | |
712 seq = new_read | |
713 if f==1 and adapterB!="" : | |
714 new_read = RemoveAdapter(seq, adapterB, adapter_mismatch) | |
715 if len(new_read)<len(seq) : | |
716 all_trimmed += 1 | |
717 seq = new_read | |
718 all_base_after_trim += len(seq) | |
719 | |
720 if len(seq)<=4: | |
721 seq = "N" * (cut2-cut1+1) | |
722 #--------- trimmed_raw_BS_read ------------------ | |
723 original_bs_reads[read_id] = seq | |
724 | |
725 #--------- FW_C2T ------------------ | |
726 if f==0: | |
727 outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("C","T"))) | |
728 elif f==1: | |
729 outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("G","A"))) | |
730 | |
731 n_list[f] = n | |
732 outf_FCT.close() | |
733 fileinput.close() | |
734 | |
735 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); | |
736 all_raw_reads += n | |
737 | |
738 #-------------------------------------------------------------------------------- | |
739 # Bowtie mapping | |
740 #-------------------------------------------------------------------------------- | |
741 WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | |
742 CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | |
743 | |
744 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | |
745 'input_file_1' : outfile_1FCT, | |
746 'input_file_2' : outfile_2RGA, | |
747 'output_file' : WC2T_fr}, | |
748 | |
749 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | |
750 'input_file_1' : outfile_1FCT, | |
751 'input_file_2' : outfile_2RGA, | |
752 'output_file' : CG2A_fr} ]) | |
753 | |
754 | |
755 delete_files(outfile_1FCT, outfile_2RGA) | |
756 | |
757 #-------------------------------------------------------------------------------- | |
758 # Post processing | |
759 #-------------------------------------------------------------------------------- | |
760 | |
761 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) | |
762 RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr) | |
763 | |
764 #---------------------------------------------------------------- | |
765 # get uniq-hit reads | |
766 #---------------------------------------------------------------- | |
767 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys()) | |
768 | |
769 Unique_FW_fr_C2T = set() # + | |
770 Unique_RC_fr_G2A = set() # - | |
771 Multiple_hits=set() | |
772 | |
773 | |
774 for x in Union_set: | |
775 _list = [] | |
776 for d in [FW_C2T_fr_U, RC_G2A_fr_U]: | |
777 mis_lst = d.get(x,[99]) | |
778 mis = int(mis_lst[0]) | |
779 _list.append(mis) | |
780 for d in [FW_C2T_fr_R, RC_G2A_fr_R]: | |
781 mis = d.get(x,99) | |
782 _list.append(mis) | |
783 mini = min(_list) | |
784 if _list.count(mini) == 1: | |
785 mini_index = _list.index(mini) | |
786 if mini_index == 0: | |
787 Unique_FW_fr_C2T.add(x) | |
788 elif mini_index == 1: | |
789 Unique_RC_fr_G2A.add(x) | |
790 else : | |
791 Multiple_hits.add(x) | |
792 else : | |
793 Multiple_hits.add(x) | |
794 | |
795 # write reads rejected by Multiple Hits to file | |
796 if show_multiple_hit : | |
797 outf_MH = open("Multiple_hit.fa",'w') | |
798 for i in Multiple_hits : | |
799 outf_MH.write(">%s\n" % i) | |
800 outf_MH.write("%s\n" % original_bs_reads[i]) | |
801 outf_MH.close() | |
802 | |
803 FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] | |
804 RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A] | |
805 | |
806 FW_C2T_fr_uniq_lst.sort() | |
807 RC_C2T_fr_uniq_lst.sort() | |
808 FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst] | |
809 RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst] | |
810 #---------------------------------------------------------------- | |
811 | |
812 numbers_premapped_lst[0] += len(Unique_FW_fr_C2T) | |
813 numbers_premapped_lst[1] += len(Unique_RC_fr_G2A) | |
814 | |
815 | |
816 #---------------------------------------------------------------- | |
817 | |
818 nn = 0 | |
819 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_G2A_fr_U)]: | |
820 nn += 1 | |
821 mapped_chr0 = "" | |
822 for header in ali_unique_lst: | |
823 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] | |
824 | |
825 #------------------------------------- | |
826 if mapped_chr != mapped_chr0: | |
827 my_gseq = deserialize(db_d(mapped_chr)) | |
828 chr_length = len(my_gseq) | |
829 mapped_chr0 = mapped_chr | |
830 #------------------------------------- | |
831 | |
832 original_BS_1 = original_bs_reads_1[header] | |
833 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) | |
834 #original_BS_2 = original_bs_reads_2[header] | |
835 | |
836 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) | |
837 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) | |
838 | |
839 all_mapped += 1 | |
840 | |
841 if nn == 1: # FW-RC mapped to + strand: | |
842 FR = "+FR" | |
843 mapped_strand_1 = "+" | |
844 mapped_strand_2 = "+" | |
845 elif nn == 2: # FW-RC mapped to - strand: | |
846 FR="-FR" | |
847 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 | |
848 mapped_strand_1 = "-" | |
849 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 | |
850 mapped_strand_2 = "-" | |
851 | |
852 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) | |
853 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) | |
854 | |
855 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) | |
856 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) | |
857 | |
858 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides | |
859 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides | |
860 | |
861 # if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no): | |
862 # if N_mismatch <= int(max_mismatch_no) : | |
863 N_mismatch = max(N_mismatch_1, N_mismatch_2) | |
864 mm_no=float(max_mismatch_no) | |
865 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1) | |
866 and N_mismatch_2<=mm_no*len(original_BS_2)): | |
867 | |
868 numbers_mapped_lst[nn-1] += 1 | |
869 all_mapped_passed += 1 | |
870 | |
871 #---- unmapped ------------------------- | |
872 del original_bs_reads_1[header] | |
873 del original_bs_reads_2[header] | |
874 | |
875 #--------------------------------------- | |
583 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) | 876 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) |
584 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) | 877 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) |
585 | 878 mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst) |
586 mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst) | 879 mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst) |
587 mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst) | |
588 | |
589 | 880 |
590 # | 881 # |
591 #---XS FILTER---------------- | 882 #---XS FILTER---------------- |
592 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 | 883 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 |
593 XS_1 = 0 | 884 XS_1 = 0 |
600 nCH_2 = methy_2.count('y') + methy_2.count('z') | 891 nCH_2 = methy_2.count('y') + methy_2.count('z') |
601 nmCH_2 = methy_2.count('Y') + methy_2.count('Z') | 892 nmCH_2 = methy_2.count('Y') + methy_2.count('Z') |
602 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : | 893 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : |
603 XS_2 = 1 | 894 XS_2 = 1 |
604 | 895 |
605 | 896 if mapped_strand_1 == '+' : |
606 outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, | 897 flag_1 = 67 # 1000011 : 1st read, + strand |
607 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) | 898 flag_2 = 131 # 10000011 : 2nd read, + strand |
608 outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, | 899 else : |
609 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) | 900 flag_1 = 115 # 1110011 : 1st read, - strand |
610 | 901 flag_2 = 179 # 10110011 : 2nd read, - strand |
611 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) | 902 |
612 #---------------------------------------------------------------- | 903 outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, |
613 # output unmapped pairs | |
614 #---------------------------------------------------------------- | |
615 | |
616 unmapped_lst=original_bs_reads_1.keys() | |
617 unmapped_lst.sort() | |
618 | |
619 # for u in unmapped_lst: | |
620 # outf_u1.write("%s\n"%original_bs_reads_1[u]) | |
621 # outf_u2.write("%s\n"%original_bs_reads_2[u]) | |
622 | |
623 all_unmapped += len(unmapped_lst) | |
624 | |
625 | |
626 if asktag=="N": | |
627 | |
628 #---------------------------------------------------------------- | |
629 outfile_1FCT= tmp_d('Trimed_FCT_1.fa'+random_id) | |
630 outfile_2FCT= tmp_d('Trimed_FCT_2.fa'+random_id) | |
631 | |
632 try : | |
633 read_inf=open(tmp_d(read_file_1),"r") | |
634 except IOError : | |
635 print "[Error] Cannot open file : %s", tmp_d(read_file_1) | |
636 exit(-1) | |
637 | |
638 oneline=read_inf.readline() | |
639 l=oneline.split() | |
640 input_format="" | |
641 | |
642 #if len(l)==5: # old solexa format | |
643 # input_format="old Solexa Seq file" | |
644 | |
645 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) | |
646 input_format="FastQ" | |
647 n_fastq=0 | |
648 elif len(l)==1 and oneline[0]!=">": # pure sequences | |
649 input_format="_list of sequences" | |
650 elif len(l)==11: # Illumina GAII qseq file | |
651 input_format="Illumina GAII qseq file" | |
652 elif oneline[0]==">": # fasta | |
653 input_format="fasta" | |
654 n_fasta=0 | |
655 | |
656 read_inf.close() | |
657 | |
658 print "Detected data format: %s" % input_format | |
659 | |
660 #---------------------------------------------------------------- | |
661 read_file_list=[read_file_1,read_file_2] | |
662 outfile_FCT_list=[outfile_1FCT,outfile_2FCT] | |
663 n_list=[0,0] | |
664 | |
665 for f in range(2): | |
666 read_file=read_file_list[f] | |
667 outf_FCT=open(outfile_FCT_list[f],'w') | |
668 original_bs_reads = original_bs_reads_lst[f] | |
669 n=n_list[f] | |
670 | |
671 seq_id="" | |
672 seq="" | |
673 seq_ready="N" | |
674 for line in fileinput.input(tmp_d(read_file)): | |
675 l=line.split() | |
676 if input_format=="old Solexa Seq file": | |
677 n+=1 | |
678 seq_id=str(n) | |
679 seq_id=seq_id.zfill(12) | |
680 seq=l[4] | |
681 seq_ready="Y" | |
682 elif input_format=="_list of sequences": | |
683 n+=1 | |
684 seq_id=str(n) | |
685 seq_id=seq_id.zfill(12) | |
686 seq=l[0] | |
687 seq_ready="Y" | |
688 elif input_format=="FastQ": | |
689 m_fastq=math.fmod(n_fastq,4) | |
690 n_fastq+=1 | |
691 seq_ready="N" | |
692 if m_fastq==0: | |
693 n+=1 | |
694 seq_id=str(n) | |
695 seq_id=seq_id.zfill(12) | |
696 seq="" | |
697 elif m_fastq==1: | |
698 seq=l[0] | |
699 seq_ready="Y" | |
700 else: | |
701 seq="" | |
702 elif input_format=="Illumina GAII qseq file": | |
703 n+=1 | |
704 seq_id=str(n) | |
705 seq_id=seq_id.zfill(12) | |
706 seq=l[8] | |
707 seq_ready="Y" | |
708 elif input_format=="fasta": | |
709 m_fasta=math.fmod(n_fasta,2) | |
710 n_fasta+=1 | |
711 seq_ready="N" | |
712 if m_fasta==0: | |
713 n+=1 | |
714 seq_id=l[0][1:] | |
715 seq_id=seq_id.zfill(17) | |
716 seq="" | |
717 elif m_fasta==1: | |
718 seq=l[0] | |
719 seq_ready="Y" | |
720 else: | |
721 seq="" | |
722 #---------------------------------------------------------------- | |
723 if seq_ready=="Y": | |
724 seq=seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52 | |
725 seq=seq.upper() | |
726 seq=seq.replace(".","N") | |
727 | |
728 #--striping BS adapter from 3' read -------------------------------------------------------------- | |
729 if (adapterA !="") and (adapterB !=""): | |
730 signature=adapterA[:6] | |
731 if signature in seq: | |
732 signature_pos=seq.index(signature) | |
733 if seq[signature_pos:] in adapterA: | |
734 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) | |
735 all_trimmed+=1 | |
736 else: | |
737 signature=adapterB[:6] | |
738 if signature in seq: | |
739 #print seq_id,seq,signature; | |
740 signature_pos=seq.index(signature) | |
741 if seq[signature_pos:] in adapterB: | |
742 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) | |
743 all_trimmed+=1 | |
744 | |
745 if len(seq) <= 4: | |
746 seq = "N" * (cut2-cut1+1) | |
747 #--------- trimmed_raw_BS_read ------------------ | |
748 original_bs_reads[seq_id] = seq | |
749 | |
750 #--------- FW_C2T ------------------ | |
751 if f==0: | |
752 outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T"))) | |
753 elif f==1: | |
754 outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T"))) | |
755 | |
756 | |
757 n_list[f]=n | |
758 outf_FCT.close() | |
759 | |
760 fileinput.close() | |
761 | |
762 | |
763 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); | |
764 all_raw_reads+=n | |
765 | |
766 #-------------------------------------------------------------------------------- | |
767 # Bowtie mapping | |
768 #-------------------------------------------------------------------------------- | |
769 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | |
770 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) | |
771 | |
772 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | |
773 'input_file_1' : outfile_1FCT, | |
774 'input_file_2' : outfile_2FCT, | |
775 'output_file' : WC2T_fr}, | |
776 | |
777 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | |
778 'input_file_1' : outfile_1FCT, | |
779 'input_file_2' : outfile_2FCT, | |
780 'output_file' : CC2T_fr} ]) | |
781 | |
782 | |
783 delete_files(outfile_1FCT, outfile_2FCT) | |
784 | |
785 #-------------------------------------------------------------------------------- | |
786 # Post processing | |
787 #-------------------------------------------------------------------------------- | |
788 | |
789 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) | |
790 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr) | |
791 | |
792 #---------------------------------------------------------------- | |
793 # get uniq-hit reads | |
794 #---------------------------------------------------------------- | |
795 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | |
796 | |
797 Unique_FW_fr_C2T = set() # + | |
798 Unique_RC_fr_C2T = set() # - | |
799 Multiple_hits=set() | |
800 | |
801 | |
802 for x in Union_set: | |
803 _list = [] | |
804 for d in [FW_C2T_fr_U, RC_C2T_fr_U]: | |
805 mis_lst = d.get(x,[99]) | |
806 mis = int(mis_lst[0]) | |
807 _list.append(mis) | |
808 for d in [FW_C2T_fr_R, RC_C2T_fr_R]: | |
809 mis = d.get(x,99) | |
810 _list.append(mis) | |
811 mini = min(_list) | |
812 if _list.count(mini) == 1: | |
813 mini_index = _list.index(mini) | |
814 if mini_index == 0: | |
815 Unique_FW_fr_C2T.add(x) | |
816 elif mini_index == 1: | |
817 Unique_RC_fr_C2T.add(x) | |
818 else : | |
819 Multiple_hits.add(x) | |
820 else : | |
821 Multiple_hits.add(x) | |
822 | |
823 # write reads rejected by Multiple Hits to file | |
824 if show_multiple_hit : | |
825 outf_MH=open("Multiple_hit.fa",'w') | |
826 for i in Multiple_hits : | |
827 outf_MH.write(">%s\n" % i) | |
828 outf_MH.write("%s\n" % original_bs_reads[i]) | |
829 outf_MH.close() | |
830 | |
831 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] | |
832 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T] | |
833 | |
834 FW_C2T_fr_uniq_lst.sort() | |
835 RC_C2T_fr_uniq_lst.sort() | |
836 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst] | |
837 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst] | |
838 #---------------------------------------------------------------- | |
839 | |
840 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T) | |
841 numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T) | |
842 | |
843 | |
844 #---------------------------------------------------------------- | |
845 | |
846 nn = 0 | |
847 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_C2T_fr_U)]: | |
848 nn += 1 | |
849 mapped_chr0 = "" | |
850 for header in ali_unique_lst: | |
851 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] | |
852 | |
853 | |
854 #------------------------------------- | |
855 if mapped_chr != mapped_chr0: | |
856 my_gseq = deserialize(db_d(mapped_chr)) | |
857 chr_length = len(my_gseq) | |
858 mapped_chr0 = mapped_chr | |
859 #------------------------------------- | |
860 | |
861 original_BS_1 = original_bs_reads_1[header] | |
862 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) | |
863 | |
864 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) | |
865 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) | |
866 | |
867 all_mapped += 1 | |
868 | |
869 if nn == 1: # FW-RC mapped to + strand: | |
870 FR = "+FR" | |
871 # mapped_location_1 += 1 | |
872 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
873 # origin_genome_1 = origin_genome_long_1[2:-2] | |
874 mapped_strand_1 = "+" | |
875 | |
876 # mapped_location_2 += 1 | |
877 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] | |
878 # origin_genome_2 = origin_genome_long_2[2:-2] | |
879 mapped_strand_2 = "+" | |
880 | |
881 elif nn == 2: # FW-RC mapped to - strand: | |
882 | |
883 FR="-FR" | |
884 mapped_location_1 = chr_length - mapped_location_1 - g_len_1 | |
885 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] | |
886 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) | |
887 # origin_genome_1 = origin_genome_long_1[2:-2] | |
888 mapped_strand_1 = "-" | |
889 | |
890 mapped_location_2 = chr_length - mapped_location_2 - g_len_2 | |
891 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]) | |
892 # origin_genome_2 = origin_genome_long_2[2:-2] | |
893 mapped_strand_2 = "-" | |
894 | |
895 | |
896 | |
897 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) | |
898 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) | |
899 | |
900 | |
901 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) | |
902 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) | |
903 | |
904 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides | |
905 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides | |
906 | |
907 if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no): | |
908 | |
909 numbers_mapped_lst[nn-1] += 1 | |
910 all_mapped_passed += 1 | |
911 | |
912 #---- unmapped ------------------------- | |
913 del original_bs_reads_1[header] | |
914 del original_bs_reads_2[header] | |
915 | |
916 #--------------------------------------- | |
917 # output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:] | |
918 # output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:] | |
919 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) | |
920 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) | |
921 mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst) | |
922 mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst) | |
923 | |
924 # | |
925 #---XS FILTER---------------- | |
926 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 | |
927 XS_1 = 0 | |
928 nCH_1 = methy_1.count('y') + methy_1.count('z') | |
929 nmCH_1 = methy_1.count('Y') + methy_1.count('Z') | |
930 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) : | |
931 XS_1 = 1 | |
932 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 | |
933 XS_2 = 0 | |
934 nCH_2 = methy_2.count('y') + methy_2.count('z') | |
935 nmCH_2 = methy_2.count('Y') + methy_2.count('Z') | |
936 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : | |
937 XS_2 = 1 | |
938 | |
939 | |
940 outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, | |
941 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) | 904 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) |
942 outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, | 905 all_base_mapped += len(original_BS_1) |
906 | |
907 outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, | |
943 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) | 908 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) |
909 all_base_mapped += len(original_BS_2) | |
944 | 910 |
945 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) | 911 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) |
946 #---------------------------------------------------------------- | 912 #---------------------------------------------------------------- |
947 # output unmapped pairs | 913 # output unmapped pairs |
948 #---------------------------------------------------------------- | 914 #---------------------------------------------------------------- |
954 # outf_u1.write("%s\n"%(original_bs_reads_1[u])) | 920 # outf_u1.write("%s\n"%(original_bs_reads_1[u])) |
955 # outf_u2.write("%s\n"%(original_bs_reads_2[u]) ) | 921 # outf_u2.write("%s\n"%(original_bs_reads_2[u]) ) |
956 | 922 |
957 all_unmapped+=len(unmapped_lst) | 923 all_unmapped+=len(unmapped_lst) |
958 | 924 |
959 | |
960 #================================================================================================== | 925 #================================================================================================== |
961 # outf.close() | |
962 # | |
963 # outf_u1.close() | |
964 # outf_u2.close() | |
965 delete_files(tmp_path) | 926 delete_files(tmp_path) |
966 | 927 |
967 logm("-------------------------------- " ) | 928 logm("-------------------------------- " ) |
968 logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) ) | 929 logm("Number of raw BS-read pairs: %d " %(all_raw_reads/2) ) |
969 logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n") | 930 if adapterA != "" or adapterB != "" : |
931 logm("Number of reads having adapter removed: %d \n"% all_trimmed) | |
932 logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) ) | |
970 | 933 |
971 if all_raw_reads >0: | 934 if all_raw_reads >0: |
972 | 935 |
973 logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n") | 936 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) |
937 logm("Number of unique-hits reads (before post-filtering): %d" % all_mapped + "\n") | |
974 if asktag=="Y": | 938 if asktag=="Y": |
975 logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) | 939 logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) |
976 logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) | 940 logm("-- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) |
977 logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) | 941 logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) |
978 logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) | 942 logm("-- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) |
979 elif asktag=="N": | 943 elif asktag=="N": |
980 logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) | 944 logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) |
981 logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) | 945 logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) |
982 | 946 logm("--- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) ) |
983 | |
984 logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) | |
985 logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) ) | |
986 if asktag=="Y": | 947 if asktag=="Y": |
987 logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) | 948 logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) |
988 logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) ) | 949 logm("----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) ) |
989 logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) ) | 950 logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) ) |
990 logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) ) | 951 logm("----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) ) |
991 elif asktag=="N": | 952 elif asktag=="N": |
992 logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) | 953 logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) |
993 logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) ) | 954 logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) ) |
994 | 955 |
995 logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) | 956 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)*2/all_raw_reads) ) |
996 logm("O Unmapped read pairs: %d"% all_unmapped+"\n") | 957 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped ) |
997 | 958 logm("Unmapped read pairs: %d"% all_unmapped+"\n") |
998 | 959 |
999 n_CG=mC_lst[0]+uC_lst[0] | 960 n_CG=mC_lst[0]+uC_lst[0] |
1000 n_CHG=mC_lst[1]+uC_lst[1] | 961 n_CHG=mC_lst[1]+uC_lst[1] |
1001 n_CHH=mC_lst[2]+uC_lst[2] | 962 n_CHH=mC_lst[2]+uC_lst[2] |
1002 | 963 |