comparison BSseeker2/bs_align/bs_rrbs.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
comparison
equal deleted inserted replaced
0:e6df770c0e58 1:8b26adf64adc
40 show_multiple_hit=False): 40 show_multiple_hit=False):
41 #---------------------------------------------------------------- 41 #----------------------------------------------------------------
42 # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC" 42 # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC"
43 #cut_context = re.sub("-", "", cut_format) 43 #cut_context = re.sub("-", "", cut_format)
44 # Ex. cut_format="C-CGG,AT-CG,G-CWGC" 44 # Ex. cut_format="C-CGG,AT-CG,G-CWGC"
45 """ 45
46
47 :param main_read_file:
48 :param asktag:
49 :param adapter_file:
50 :param cut_s:
51 :param cut_e:
52 :param no_small_lines:
53 :param max_mismatch_no:
54 :param aligner_command:
55 :param db_path:
56 :param tmp_path:
57 :param outfile:
58 :param XS_pct:
59 :param XS_count:
60 :param adapter_mismatch:
61 :param cut_format:
62 """
63 cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC'] 46 cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC']
64 cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC'] 47 cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC']
65 cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G'] 48 cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G']
66 cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC'] 49 cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC']
67 cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5] 50 cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5]
104 adapter_inf.close() 87 adapter_inf.close()
105 except IOError: 88 except IOError:
106 print "[Error] Cannot find adapter file : %s !" % adapter_file 89 print "[Error] Cannot find adapter file : %s !" % adapter_file
107 exit(-1) 90 exit(-1)
108 91
109 logm("I Read filename: %s" % main_read_file) 92 logm("Read filename: %s" % main_read_file)
110 logm("I The last cycle (for mapping): %d" % cut_e ) 93 logm("The first base (for mapping): %d"% cut_s )
111 logm("I Bowtie path: %s" % aligner_command ) 94 logm("The last base (for mapping): %d"% cut_e )
112 logm("I Reference genome library path: %s" % db_path ) 95
113 logm("I Number of mismatches allowed: %s" % max_mismatch_no) 96 logm("Path for short reads aligner: %s"% aligner_command + '\n')
114 logm("I Adapter seq: %s" % whole_adapter_seq) 97 logm("Reference genome library path: %s"% db_path )
115 logm("----------------------------------------------") 98
99 if asktag == "Y" :
100 logm("Un-directional library" )
101 else :
102 logm("Directional library")
103
104 logm("Number of mismatches allowed: %s"% max_mismatch_no )
105
106 if adapter_file !="":
107 logm("Adapter seq: %s" % whole_adapter_seq)
108 logm("-------------------------------- " )
116 109
117 #---------------------------------------------------------------- 110 #----------------------------------------------------------------
118 all_raw_reads=0 111 all_raw_reads=0
119 all_tagged=0 112 all_tagged=0
120 all_tagged_trimmed=0 113 all_tagged_trimmed=0
114 all_base_before_trim=0
115 all_base_after_trim=0
116 all_base_mapped=0
121 all_mapped=0 117 all_mapped=0
122 all_mapped_passed=0 118 all_mapped_passed=0
123 n_cut_tag_lst={} 119 n_cut_tag_lst={}
124 #print cut3_tag_lst 120 #print cut3_tag_lst
125 for x in cut3_tag_lst: 121 for x in cut3_tag_lst:
133 num_mapped_FW_C2T = 0 129 num_mapped_FW_C2T = 0
134 num_mapped_RC_C2T = 0 130 num_mapped_RC_C2T = 0
135 num_mapped_FW_G2A = 0 131 num_mapped_FW_G2A = 0
136 num_mapped_RC_G2A = 0 132 num_mapped_RC_G2A = 0
137 133
134 # Count of nucleotides, which should be cut before the adapters
135 Extra_base_cut_5end_adapter = max([ abs(len(i)-len(j)) for i,j in zip(cut5_context, cut3_context)])
136
138 #=============================================== 137 #===============================================
139 # directional sequencing 138 # directional sequencing
140 #=============================================== 139 #===============================================
141 140
142 if asktag=="N" : 141 if asktag=="N" :
154 153
155 outf2=open(outfile2,'w') 154 outf2=open(outfile2,'w')
156 155
157 #--- Checking input format ------------------------------------------ 156 #--- Checking input format ------------------------------------------
158 try : 157 try :
159 read_inf=open(read_file,"r") 158 if read_file.endswith(".gz") : # support input file ending with ".gz"
159 read_inf = gzip.open(read_file, "rb")
160 else :
161 read_inf=open(read_file,"r")
160 except IOError: 162 except IOError:
161 print "[Error] Cannot open input file : %s" % read_file 163 print "[Error] Cannot open input file : %s" % read_file
162 exit(-1) 164 exit(-1)
163 165
164 oneline=read_inf.readline() 166 oneline = read_inf.readline()
165 l=oneline.split() 167 l = oneline.split()
166 n_fastq=0 168 input_format = ""
167 n_fasta=0 169 if oneline[0]=="@":
168 input_format="" 170 input_format = "fastq"
169 if oneline[0]=="@": # FastQ 171 elif len(l)==1 and oneline[0]!=">" :
170 input_format="fastq" 172 input_format = "seq"
171 elif len(l)==1 and oneline[0]!=">": # pure sequences 173 elif len(l)==11:
172 input_format="seq" 174 input_format = "qseq"
173 elif len(l)==11: # Illumina qseq 175 elif oneline[0]==">" :
174 input_format="qseq" 176 input_format = "fasta"
175 elif oneline[0]==">": # fasta
176 input_format="fasta"
177 read_inf.close() 177 read_inf.close()
178 178
179
179 #---------------------------------------------------------------- 180 #----------------------------------------------------------------
180 seq_id="" 181 read_id = ""
181 seq="" 182 seq = ""
182 seq_ready=0 183 seq_ready = 0
183 for line in fileinput.input(read_file): 184 line_no = 0
184 l=line.split() 185 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
185 186 l = line.split()
187 line_no += 1
186 if input_format=="seq": 188 if input_format=="seq":
187 all_raw_reads+=1 189 all_raw_reads += 1
188 seq_id=str(all_raw_reads) 190 read_id = str(all_raw_reads)
189 seq_id=seq_id.zfill(12) 191 read_id = read_id.zfill(12)
190 seq=l[0] 192 seq = l[0]
191 seq_ready="Y" 193 seq_ready = "Y"
192 elif input_format=="fastq": 194 elif input_format=="fastq":
193 m_fastq=math.fmod(n_fastq,4) 195 l_fastq = math.fmod(line_no, 4)
194 n_fastq+=1 196 if l_fastq == 1 :
195 seq_ready="N" 197 all_raw_reads += 1
196 if m_fastq==0: 198 read_id = l[0][1:]
197 all_raw_reads+=1 199 seq_ready = "N"
198 seq_id=str(all_raw_reads) 200 elif l_fastq == 2 :
199 seq_id=seq_id.zfill(12) 201 seq = l[0]
200 seq="" 202 seq_ready = "Y"
201 elif m_fastq==1: 203 else :
202 seq=l[0] 204 seq = ""
203 seq_ready="Y" 205 seq_ready = "N"
204 else:
205 seq=""
206 elif input_format=="qseq": 206 elif input_format=="qseq":
207 all_raw_reads+=1 207 all_raw_reads += 1
208 seq_id=str(all_raw_reads) 208 read_id = str(all_raw_reads)
209 seq_id=seq_id.zfill(12) 209 read_id = read_id.zfill(12)
210 seq=l[8] 210 seq = l[8]
211 seq_ready="Y" 211 seq_ready = "Y"
212 elif input_format=="fasta": 212 elif input_format=="fasta" :
213 m_fasta=math.fmod(n_fasta,2) 213 l_fasta = math.fmod(line_no,2)
214 n_fasta+=1 214 if l_fasta==1:
215 seq_ready="N" 215 all_raw_reads += 1
216 if m_fasta==0: 216 read_id = l[0][1:]
217 all_raw_reads+=1 217 seq = ""
218 seq_id=l[0][1:] 218 seq_ready = "N"
219 seq="" 219 elif l_fasta==0 :
220 elif m_fasta==1: 220 seq = l[0]
221 seq=l[0] 221 seq_ready = "Y"
222 seq_ready="Y" 222
223 else:
224 seq=""
225 #--------------------------------------------------------------- 223 #---------------------------------------------------------------
226 if seq_ready=="Y": 224 if seq_ready=="Y":
227 # Normalize the characters 225 # Normalize the characters
228 seq=seq.upper().replace(".","N") 226 seq=seq.upper().replace(".","N")
229 227
234 n_cut_tag_lst[i] += 1 232 n_cut_tag_lst[i] += 1
235 233
236 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default 234 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default
237 235
238 #-- Trimming adapter sequence --- 236 #-- Trimming adapter sequence ---
237
238 all_base_before_trim += len(seq)
239 if adapter_seq != "" : 239 if adapter_seq != "" :
240 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch) 240 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
241 if len(new_read) < len(seq) : 241 if len(new_read) < len(seq) :
242 all_tagged_trimmed += 1 242 all_tagged_trimmed += 1
243 seq = new_read 243 seq = new_read
244 all_base_after_trim += len(seq)
244 if len(seq) <= 4 : 245 if len(seq) <= 4 :
245 seq = "N" * (cut_e - cut_s) 246 seq = "N" * (cut_e - cut_s)
246 247
247 # all reads will be considered, regardless of tags 248 # all reads will be considered, regardless of tags
248 #--------- trimmed_raw_BS_read and qscore ------------------ 249 #--------- trimmed_raw_BS_read and qscore ------------------
249 original_bs_reads[seq_id] = seq 250 original_bs_reads[read_id] = seq
250 #--------- FW_C2T ------------------ 251 #--------- FW_C2T ------------------
251 outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T'))) 252 outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T')))
252 fileinput.close() 253 fileinput.close()
253 254
254 outf2.close() 255 outf2.close()
255 256
256 delete_files(read_file) 257 delete_files(read_file)
382 # print "[For debug]: +FW read still can not find fragment serial" 383 # print "[For debug]: +FW read still can not find fragment serial"
383 # Tip: sometimes "my_region_serial" is still 0 ... 384 # Tip: sometimes "my_region_serial" is still 0 ...
384 385
385 386
386 N_mismatch = N_MIS(r_aln, g_aln) 387 N_mismatch = N_MIS(r_aln, g_aln)
387 if N_mismatch <= int(max_mismatch_no) : 388 # if N_mismatch <= int(max_mismatch_no) :
389 mm_no=float(max_mismatch_no)
390 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
388 all_mapped_passed += 1 391 all_mapped_passed += 1
389 methy = methy_seq(r_aln, g_aln + next2bp) 392 methy = methy_seq(r_aln, g_aln + next2bp)
390 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) 393 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
391 #---XS FILTER---------------- 394 #---XS FILTER----------------
392 XS = 0 395 XS = 0
400 output_genome = output_genome, 403 output_genome = output_genome,
401 rrbs = True, 404 rrbs = True,
402 my_region_serial = my_region_serial, 405 my_region_serial = my_region_serial,
403 my_region_start = my_region_start, 406 my_region_start = my_region_start,
404 my_region_end = my_region_end) 407 my_region_end = my_region_end)
408 all_base_mapped += len(original_BS)
405 else : 409 else :
406 print "[For debug]: reads not in same lengths" 410 print "[For debug]: reads not in same lengths"
407 411
408 #print "start RC" 412 #print "start RC"
409 # ---- RC ---- 413 # ---- RC ----
451 # print "[For debug]: chr=", mapped_chr 455 # print "[For debug]: chr=", mapped_chr
452 # print "[For debug]: -FW read still cannot find fragment serial" 456 # print "[For debug]: -FW read still cannot find fragment serial"
453 457
454 458
455 N_mismatch = N_MIS(r_aln, g_aln) 459 N_mismatch = N_MIS(r_aln, g_aln)
456 if N_mismatch <= int(max_mismatch_no) : 460 # if N_mismatch <= int(max_mismatch_no) :
461 mm_no=float(max_mismatch_no)
462 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
457 all_mapped_passed += 1 463 all_mapped_passed += 1
458 methy = methy_seq(r_aln, g_aln + next2bp) 464 methy = methy_seq(r_aln, g_aln + next2bp)
459 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) 465 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
460 #---XS FILTER---------------- 466 #---XS FILTER----------------
461 XS = 0 467 XS = 0
469 output_genome = output_genome, 475 output_genome = output_genome,
470 rrbs = True, 476 rrbs = True,
471 my_region_serial = my_region_serial, 477 my_region_serial = my_region_serial,
472 my_region_start = my_region_start, 478 my_region_start = my_region_start,
473 my_region_end = my_region_end) 479 my_region_end = my_region_end)
480 all_base_mapped += len(original_BS)
474 else : 481 else :
475 print "[For debug]: reads not in same lengths" 482 print "[For debug]: reads not in same lengths"
476 483
477 484
478 # Finished both FW and RC 485 # Finished both FW and RC
505 outf2=open(outfile2,'w') 512 outf2=open(outfile2,'w')
506 outf3=open(outfile3,'w') 513 outf3=open(outfile3,'w')
507 514
508 #--- Checking input format ------------------------------------------ 515 #--- Checking input format ------------------------------------------
509 try : 516 try :
510 read_inf=open(read_file,"r") 517 if read_file.endswith(".gz") : # support input file ending with ".gz"
518 read_inf = gzip.open(read_file, "rb")
519 else :
520 read_inf=open(read_file,"r")
511 except IOError: 521 except IOError:
512 print "[Error] Cannot open input file : %s" % read_file 522 print "[Error] Cannot open input file : %s" % read_file
513 exit(-1) 523 exit(-1)
514 524
515 oneline=read_inf.readline() 525 oneline = read_inf.readline()
516 l=oneline.split() 526 l = oneline.split()
517 n_fastq=0 527 input_format = ""
518 n_fasta=0 528 if oneline[0]=="@":
519 input_format="" 529 input_format = "fastq"
520 if oneline[0]=="@": # FastQ 530 elif len(l)==1 and oneline[0]!=">" :
521 input_format="fastq" 531 input_format = "seq"
522 elif len(l)==1 and oneline[0]!=">": # pure sequences 532 elif len(l)==11:
523 input_format="seq" 533 input_format = "qseq"
524 elif len(l)==11: # Illumina qseq 534 elif oneline[0]==">" :
525 input_format="qseq" 535 input_format = "fasta"
526 elif oneline[0]==">": # fasta
527 input_format="fasta"
528 read_inf.close() 536 read_inf.close()
529 537
530 #---------------------------------------------------------------- 538 #----------------------------------------------------------------
531 seq_id = "" 539 read_id = ""
532 seq = "" 540 seq = ""
533 seq_ready=0 541 seq_ready = 0
534 for line in fileinput.input(read_file): 542 line_no = 0
535 l=line.split() 543 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
536 544 l = line.split()
537 if input_format == "seq": 545 line_no += 1
538 all_raw_reads+=1 546 if input_format=="seq":
539 seq_id=str(all_raw_reads) 547 all_raw_reads += 1
540 seq_id=seq_id.zfill(12) 548 read_id = str(all_raw_reads)
541 seq=l[0] 549 read_id = read_id.zfill(12)
542 seq_ready="Y" 550 seq = l[0]
551 seq_ready = "Y"
543 elif input_format=="fastq": 552 elif input_format=="fastq":
544 m_fastq=math.fmod(n_fastq,4) 553 l_fastq = math.fmod(line_no, 4)
545 n_fastq+=1 554 if l_fastq == 1 :
546 seq_ready="N" 555 all_raw_reads += 1
547 if m_fastq==0: 556 read_id = l[0][1:]
548 all_raw_reads+=1 557 seq_ready = "N"
549 seq_id=str(all_raw_reads) 558 elif l_fastq == 2 :
550 seq_id=seq_id.zfill(12) 559 seq = l[0]
551 seq="" 560 seq_ready = "Y"
552 elif m_fastq==1: 561 else :
553 seq=l[0] 562 seq = ""
554 seq_ready="Y" 563 seq_ready = "N"
555 else:
556 seq=""
557 elif input_format=="qseq": 564 elif input_format=="qseq":
558 all_raw_reads+=1 565 all_raw_reads += 1
559 seq_id=str(all_raw_reads) 566 read_id = str(all_raw_reads)
560 seq_id=seq_id.zfill(12) 567 read_id = read_id.zfill(12)
561 seq=l[8] 568 seq = l[8]
562 seq_ready="Y" 569 seq_ready = "Y"
563 elif input_format=="fasta": 570 elif input_format=="fasta" :
564 m_fasta=math.fmod(n_fasta,2) 571 l_fasta = math.fmod(line_no,2)
565 n_fasta+=1 572 if l_fasta==1:
566 seq_ready="N" 573 all_raw_reads += 1
567 if m_fasta==0: 574 read_id = l[0][1:]
568 all_raw_reads+=1 575 seq = ""
569 seq_id=l[0][1:] 576 seq_ready = "N"
570 seq="" 577 elif l_fasta==0 :
571 elif m_fasta==1: 578 seq = l[0]
572 seq=l[0] 579 seq_ready = "Y"
573 seq_ready="Y" 580
574 else: 581 #---------------------------------------------------------------
575 seq=""
576 #---------------------------------------------------------------
577 if seq_ready=="Y": 582 if seq_ready=="Y":
578 # Normalize the characters 583 # Normalize the characters
579 seq=seq.upper().replace(".","N") 584 seq = seq.upper().replace(".","N")
580 585
581 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ] 586 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
582 if len(read_tag) > 0 : 587 if len(read_tag) > 0 :
583 all_tagged += 1 588 all_tagged += 1
584 for i in read_tag : 589 for i in read_tag :
585 n_cut_tag_lst[i] += 1 590 n_cut_tag_lst[i] += 1
586 591
587 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default 592 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default
588 593
589 #-- Trimming adapter sequence --- 594 #-- Trimming adapter sequence ---
595 all_base_before_trim += len(seq)
590 if adapter_seq != "" : 596 if adapter_seq != "" :
591 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch) 597 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
592 if len(new_read) < len(seq) : 598 if len(new_read) < len(seq) :
593 all_tagged_trimmed += 1 599 all_tagged_trimmed += 1
594 seq = new_read 600 seq = new_read
601 all_base_after_trim += len(seq)
595 if len(seq) <= 4 : 602 if len(seq) <= 4 :
596 seq = "N" * (cut_e - cut_s) 603 seq = "N" * (cut_e - cut_s)
597 604
598 # all reads will be considered, regardless of tags 605 # all reads will be considered, regardless of tags
599 #--------- trimmed_raw_BS_read and qscore ------------------ 606 #--------- trimmed_raw_BS_read and qscore ------------------
600 original_bs_reads[seq_id] = seq 607 original_bs_reads[read_id] = seq
601 #--------- FW_C2T ------------------ 608 #--------- FW_C2T ------------------
602 outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T'))) 609 outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T')))
603 #--------- RC_G2A ------------------ 610 #--------- RC_G2A ------------------
604 outf3.write('>%s\n%s\n' % (seq_id, seq.replace("G","A"))) 611 outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
605 fileinput.close() 612 fileinput.close()
606 613
607 outf2.close() 614 outf2.close()
608 615
609 delete_files(read_file) 616 delete_files(read_file)
610 logm("Processing input is done") 617 logm("Processing input is done")
611 #-------------------------------------------------------------------------------- 618 #--------------------------------------------------------------------------------
612 619
613 # mapping 620 # mapping
614 #-------------------------------------------------------------------------------- 621 #--------------------------------------------------------------------------------
615 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) 622 WC2T = tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
616 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) 623 CC2T = tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
617 WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id) 624 WG2A = tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
618 CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id) 625 CG2A = tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
619 626
620 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), 627 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
621 'input_file' : outfile2, 628 'input_file' : outfile2,
622 'output_file' : WC2T}, 629 'output_file' : WC2T},
623 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), 630 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
636 643
637 #-------------------------------------------------------------------------------- 644 #--------------------------------------------------------------------------------
638 # Post processing 645 # Post processing
639 #-------------------------------------------------------------------------------- 646 #--------------------------------------------------------------------------------
640 647
641 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T) 648 FW_C2T_U,FW_C2T_R = extract_mapping(WC2T)
642 RC_G2A_U,RC_G2A_R=extract_mapping(CG2A) 649 RC_G2A_U,RC_G2A_R = extract_mapping(CG2A)
643 650
644 FW_G2A_U,FW_G2A_R=extract_mapping(WG2A) 651 FW_G2A_U,FW_G2A_R = extract_mapping(WG2A)
645 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T) 652 RC_C2T_U,RC_C2T_R = extract_mapping(CC2T)
646 653
647 logm("Extracting alignments is done") 654 logm("Extracting alignments is done")
648 655
649 #---------------------------------------------------------------- 656 #----------------------------------------------------------------
650 # get unique-hit reads 657 # get unique-hit reads
651 #---------------------------------------------------------------- 658 #----------------------------------------------------------------
652 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys()) 659 Union_set = set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
653 660
654 Unique_FW_C2T=set() # + 661 Unique_FW_C2T = set() # +
655 Unique_RC_G2A=set() # + 662 Unique_RC_G2A = set() # +
656 Unique_FW_G2A=set() # - 663 Unique_FW_G2A = set() # -
657 Unique_RC_C2T=set() # - 664 Unique_RC_C2T = set() # -
658 Multiple_hits=set() 665 Multiple_hits = set()
659 666
660 for x in Union_set: 667 for x in Union_set :
661 _list=[] 668 _list = []
662 for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]: 669 for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
663 mis_lst=dx.get(x,[99]) 670 mis_lst = dx.get(x,[99])
664 mis=int(mis_lst[0]) 671 mis = int(mis_lst[0])
665 _list.append(mis) 672 _list.append(mis)
666 for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]: 673 for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
667 mis=dx.get(x,99) 674 mis = dx.get(x,99)
668 _list.append(mis) 675 _list.append(mis)
669 mini=min(_list) 676 mini = min(_list)
670 if _list.count(mini) == 1: 677 if _list.count(mini) == 1:
671 mini_index=_list.index(mini) 678 mini_index = _list.index(mini)
672 if mini_index == 0: 679 if mini_index == 0:
673 Unique_FW_C2T.add(x) 680 Unique_FW_C2T.add(x)
674 elif mini_index == 1: 681 elif mini_index == 1:
675 Unique_RC_G2A.add(x) 682 Unique_RC_G2A.add(x)
676 elif mini_index == 2: 683 elif mini_index == 2:
681 Multiple_hits.add(x) 688 Multiple_hits.add(x)
682 else : 689 else :
683 Multiple_hits.add(x) 690 Multiple_hits.add(x)
684 # write reads rejected by Multiple Hits to file 691 # write reads rejected by Multiple Hits to file
685 if show_multiple_hit : 692 if show_multiple_hit :
686 outf_MH=open("Multiple_hit.fa",'w') 693 outf_MH = open("Multiple_hit.fa",'w')
687 for i in Multiple_hits : 694 for i in Multiple_hits :
688 outf_MH.write(">%s\n" % i) 695 outf_MH.write(">%s\n" % i)
689 outf_MH.write("%s\n" % original_bs_reads[i]) 696 outf_MH.write("%s\n" % original_bs_reads[i])
690 outf_MH.close() 697 outf_MH.close()
691 698
693 del FW_C2T_R 700 del FW_C2T_R
694 del FW_G2A_R 701 del FW_G2A_R
695 del RC_C2T_R 702 del RC_C2T_R
696 del RC_G2A_R 703 del RC_G2A_R
697 704
698 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] 705 FW_C2T_uniq_lst = [[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
699 FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A] 706 FW_G2A_uniq_lst = [[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
700 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] 707 RC_C2T_uniq_lst = [[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
701 RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A] 708 RC_G2A_uniq_lst = [[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
702 FW_C2T_uniq_lst.sort() 709 FW_C2T_uniq_lst.sort()
703 RC_C2T_uniq_lst.sort() 710 RC_C2T_uniq_lst.sort()
704 FW_G2A_uniq_lst.sort() 711 FW_G2A_uniq_lst.sort()
705 RC_G2A_uniq_lst.sort() 712 RC_G2A_uniq_lst.sort()
706 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] 713 FW_C2T_uniq_lst = [x[1] for x in FW_C2T_uniq_lst]
707 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] 714 RC_C2T_uniq_lst = [x[1] for x in RC_C2T_uniq_lst]
708 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] 715 FW_G2A_uniq_lst = [x[1] for x in FW_G2A_uniq_lst]
709 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] 716 RC_G2A_uniq_lst = [x[1] for x in RC_G2A_uniq_lst]
710 717
711 del Unique_FW_C2T 718 del Unique_FW_C2T
712 del Unique_FW_G2A 719 del Unique_FW_G2A
713 del Unique_RC_C2T 720 del Unique_RC_C2T
714 del Unique_RC_G2A 721 del Unique_RC_G2A
715 722
716 723
717 #---------------------------------------------------------------- 724 #----------------------------------------------------------------
718 # Post-filtering reads 725 # Post-filtering reads
719 # ---- FW_C2T ---- undirectional 726 # ---- FW_C2T ---- un-directional
720 FW_regions = dict() 727 FW_regions = dict()
721 gseq = dict() 728 gseq = dict()
722 chr_length = dict() 729 chr_length = dict()
723 for header in FW_C2T_uniq_lst : 730 for header in FW_C2T_uniq_lst :
724 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header] 731 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
756 try_pos, FR) 763 try_pos, FR)
757 try_pos -= 1 764 try_pos -= 1
758 try_count += 1 765 try_count += 1
759 766
760 N_mismatch = N_MIS(r_aln, g_aln) 767 N_mismatch = N_MIS(r_aln, g_aln)
761 if N_mismatch <= int(max_mismatch_no) : 768 # if N_mismatch <= int(max_mismatch_no) :
769 mm_no=float(max_mismatch_no)
770 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
762 all_mapped_passed += 1 771 all_mapped_passed += 1
763 methy = methy_seq(r_aln, g_aln + next2bp) 772 methy = methy_seq(r_aln, g_aln + next2bp)
764 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) 773 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
765 #---XS FILTER---------------- 774 #---XS FILTER----------------
766 XS = 0 775 XS = 0
774 output_genome = output_genome, 783 output_genome = output_genome,
775 rrbs = True, 784 rrbs = True,
776 my_region_serial = my_region_serial, 785 my_region_serial = my_region_serial,
777 my_region_start = my_region_start, 786 my_region_start = my_region_start,
778 my_region_end = my_region_end) 787 my_region_end = my_region_end)
788 all_base_mapped += len(original_BS)
779 else : 789 else :
780 print "[For debug]: reads not in same lengths" 790 print "[For debug]: reads not in same lengths"
781 791
782 792
783 # ---- RC_C2T ---- undirectional 793 # ---- RC_C2T ---- un-directional
784 RC_regions = dict() 794 RC_regions = dict()
785 for header in RC_C2T_uniq_lst : 795 for header in RC_C2T_uniq_lst :
786 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header] 796 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
787 original_BS = original_bs_reads[header] 797 original_BS = original_bs_reads[header]
788 if mapped_chr not in RC_regions : 798 if mapped_chr not in RC_regions :
819 try_pos, FR) 829 try_pos, FR)
820 try_pos += 1 830 try_pos += 1
821 try_count += 1 831 try_count += 1
822 832
823 N_mismatch = N_MIS(r_aln, g_aln) 833 N_mismatch = N_MIS(r_aln, g_aln)
824 if N_mismatch <= int(max_mismatch_no) : 834 # if N_mismatch <= int(max_mismatch_no) :
835 mm_no=float(max_mismatch_no)
836 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
825 all_mapped_passed += 1 837 all_mapped_passed += 1
826 methy = methy_seq(r_aln, g_aln + next2bp) 838 methy = methy_seq(r_aln, g_aln + next2bp)
827 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) 839 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
828 #---XS FILTER---------------- 840 #---XS FILTER----------------
829 XS = 0 841 XS = 0
837 output_genome = output_genome, 849 output_genome = output_genome,
838 rrbs = True, 850 rrbs = True,
839 my_region_serial = my_region_serial, 851 my_region_serial = my_region_serial,
840 my_region_start = my_region_start, 852 my_region_start = my_region_start,
841 my_region_end = my_region_end) 853 my_region_end = my_region_end)
854 all_base_mapped += len(original_BS)
842 855
843 else : 856 else :
844 print "[For debug]: reads not in same lengths" 857 print "[For debug]: reads not in same lengths"
845 858
846 859
847 # ---- FW_G2A ---- undirectional 860 # ---- FW_G2A ---- un-directional
848 FW_regions = dict() 861 FW_regions = dict()
849 gseq = dict() 862 gseq = dict()
850 chr_length = dict() 863 chr_length = dict()
851 for header in FW_G2A_uniq_lst : 864 for header in FW_G2A_uniq_lst :
852 _, mapped_chr, mapped_location, cigar = FW_G2A_U[header] 865 _, mapped_chr, mapped_location, cigar = FW_G2A_U[header]
889 #if my_region_serial == 0 : 902 #if my_region_serial == 0 :
890 # print "[For debug]: chr=", mapped_chr 903 # print "[For debug]: chr=", mapped_chr
891 # print "[For debug]: FW_G2A read still can not find fragment serial" 904 # print "[For debug]: FW_G2A read still can not find fragment serial"
892 # Tip: sometimes "my_region_serial" is still 0 ... 905 # Tip: sometimes "my_region_serial" is still 0 ...
893 906
894
895 N_mismatch = N_MIS(r_aln, g_aln) 907 N_mismatch = N_MIS(r_aln, g_aln)
896 if N_mismatch <= int(max_mismatch_no) : 908 # if N_mismatch <= int(max_mismatch_no) :
909 max_mismatch_no=float(max_mismatch_no)
910 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
897 all_mapped_passed += 1 911 all_mapped_passed += 1
898 methy = methy_seq(r_aln, g_aln + next2bp) 912 methy = methy_seq(r_aln, g_aln + next2bp)
899 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) 913 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
900 #---XS FILTER---------------- 914 #---XS FILTER----------------
901 XS = 0 915 XS = 0
909 output_genome = output_genome, 923 output_genome = output_genome,
910 rrbs = True, 924 rrbs = True,
911 my_region_serial = my_region_serial, 925 my_region_serial = my_region_serial,
912 my_region_start = my_region_start, 926 my_region_start = my_region_start,
913 my_region_end = my_region_end) 927 my_region_end = my_region_end)
928 all_base_mapped += len(original_BS)
914 else : 929 else :
915 print "[For debug]: reads not in same lengths" 930 print "[For debug]: reads not in same lengths"
916 931
917 932
918 # ---- RC_G2A ---- undirectional 933 # ---- RC_G2A ---- un-directional
919 RC_regions = dict() 934 RC_regions = dict()
920 for header in RC_G2A_uniq_lst : 935 for header in RC_G2A_uniq_lst :
921 _, mapped_chr, mapped_location, cigar = RC_G2A_U[header] 936 _, mapped_chr, mapped_location, cigar = RC_G2A_U[header]
922 original_BS = original_bs_reads[header] 937 original_BS = original_bs_reads[header]
923 if mapped_chr not in RC_regions : 938 if mapped_chr not in RC_regions :
959 974
960 #if my_region_serial == 0 : 975 #if my_region_serial == 0 :
961 # print "[For debug]: chr=", mapped_chr 976 # print "[For debug]: chr=", mapped_chr
962 # print "[For debug]: RC_C2A read still cannot find fragment serial" 977 # print "[For debug]: RC_C2A read still cannot find fragment serial"
963 978
964
965 N_mismatch = N_MIS(r_aln, g_aln) 979 N_mismatch = N_MIS(r_aln, g_aln)
966 if N_mismatch <= int(max_mismatch_no) : 980 # if N_mismatch <= int(max_mismatch_no) :
981 mm_no=float(max_mismatch_no)
982 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
967 all_mapped_passed += 1 983 all_mapped_passed += 1
968 methy = methy_seq(r_aln, g_aln + next2bp) 984 methy = methy_seq(r_aln, g_aln + next2bp)
969 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) 985 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
970 #---XS FILTER---------------- 986 #---XS FILTER----------------
971 XS = 0 987 XS = 0
979 output_genome = output_genome, 995 output_genome = output_genome,
980 rrbs = True, 996 rrbs = True,
981 my_region_serial = my_region_serial, 997 my_region_serial = my_region_serial,
982 my_region_start = my_region_start, 998 my_region_start = my_region_start,
983 my_region_end = my_region_end) 999 my_region_end = my_region_end)
1000 all_base_mapped += len(original_BS)
984 else : 1001 else :
985 print "[For debug]: reads not in same lengths" 1002 print "[For debug]: reads not in same lengths"
986
987
988 1003
989 # Finished both FW and RC 1004 # Finished both FW and RC
990 logm("Done: %s (%d) \n" % (read_file, no_my_files)) 1005 logm("Done: %s (%d) \n" % (read_file, no_my_files))
991 print "--> %s (%d) "%(read_file, no_my_files) 1006 print "--> %s (%d) "%(read_file, no_my_files)
992 del original_bs_reads 1007 del original_bs_reads
993 delete_files(WC2T, CC2T, WG2A, CG2A) 1008 delete_files(WC2T, CC2T, WG2A, CG2A)
994
995
996
997 # End of un-directional library 1009 # End of un-directional library
998 1010
999 delete_files(tmp_path) 1011 delete_files(tmp_path)
1000 1012
1001 1013
1002 logm("O Number of raw reads: %d "% all_raw_reads) 1014 logm("Number of raw reads: %d "% all_raw_reads)
1003 if all_raw_reads >0: 1015 if all_raw_reads>0:
1004 logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads)) 1016 logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
1005 for kk in range(len(n_cut_tag_lst)): 1017 for kk in range(len(n_cut_tag_lst)):
1006 logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads)) 1018 logm("Number of raw reads with tag %s: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
1007 logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed) 1019 if adapter_seq!="" :
1008 logm("O Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) 1020 logm("Number of reads having adapter removed: %d "%all_tagged_trimmed)
1009 logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped) 1021 logm("Number of bases in total: %d "%all_base_before_trim)
1010 1022 if adapter_seq!="" :
1011 logm("O ------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no)) 1023 logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim ) )
1012 logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads)) 1024 logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
1013 1025 logm("Number of unique-hits reads (before post-filtering): %d"%all_mapped)
1014 if asktag=="Y": # undiretional 1026
1027 logm("------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
1028 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))
1029 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
1030
1031 if asktag=="Y": # un-diretional
1015 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) ) 1032 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
1016 logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) ) 1033 logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) )
1017 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) ) 1034 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
1018 logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) ) 1035 logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) )
1019 # the variable name 'num_mapped_RC_G2A' seems not consistent with illustration 1036 # the variable name 'num_mapped_RC_G2A' seems not consistent with illustration
1020 # according to literal meaning 1037 # according to literal meaning
1021 elif asktag=="N": # directional 1038 elif asktag=="N": # directional
1022 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) ) 1039 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
1023 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) ) 1040 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
1024 1041 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
1025 n_CG=mC_lst[0]+uC_lst[0] 1042
1026 n_CHG=mC_lst[1]+uC_lst[1] 1043 n_CG = mC_lst[0] + uC_lst[0]
1027 n_CHH=mC_lst[2]+uC_lst[2] 1044 n_CHG = mC_lst[1] + uC_lst[1]
1045 n_CHH = mC_lst[2] + uC_lst[2]
1028 1046
1029 logm("----------------------------------------------") 1047 logm("----------------------------------------------")
1030 logm("M Methylated C in mapped reads ") 1048 logm("Methylated C in mapped reads ")
1031 logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0)) 1049 logm(" mCG %1.3f%%"%( (100 * float(mC_lst[0]) / n_CG) if n_CG!=0 else 0))
1032 logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0)) 1050 logm(" mCHG %1.3f%%"%( (100 * float(mC_lst[1]) / n_CHG) if n_CHG!=0 else 0))
1033 logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0)) 1051 logm(" mCHH %1.3f%%"%( (100 * float(mC_lst[2]) / n_CHH) if n_CHH!=0 else 0))
1034 logm("----------------------------------------------") 1052 logm("----------------------------------------------")
1035 logm("------------------- END ----------------------") 1053 logm("------------------- END ----------------------")
1036 1054
1037 elapsed(main_read_file) 1055 elapsed(main_read_file)
1038 1056