Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_align/bs_rrbs.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e6df770c0e58 |
---|---|
1 import fileinput, random, math, os.path | |
2 from bs_index.rrbs_build import FWD_MAPPABLE_REGIONS, REV_MAPPABLE_REGIONS | |
3 from bs_utils.utils import * | |
4 | |
5 from bs_align.bs_single_end import extract_mapping | |
6 from bs_align_utils import * | |
7 | |
def my_mappable_region(chr_regions, mapped_location, FR): # start_position (first C), end_position (last G), serial, sequence
    """Locate the RRBS mappable fragment anchored at mapped_location.

    chr_regions maps a stringified genomic position to [other_end, serial].
    Which end of the fragment the key represents depends on the read
    orientation tag FR:
      "+FW"/"-RC" : key is the fragment start (first C of the cut site)
      "-FW"/"+RC" : key is the fragment end (last G of the cut site)

    Returns (serial, start, end); (0, -1, -1) when the position is not a
    known fragment boundary or FR is not one of the four tags above.
    """
    key = str(mapped_location)
    if key in chr_regions:
        other_end, serial = chr_regions[key][0], chr_regions[key][1]
        if FR == "+FW" or FR == "-RC":
            # read runs 5'->3' from the key position
            return serial, int(key), other_end
        if FR == "-FW" or FR == "+RC":
            # read ends at the key position; stored value is the start
            return serial, other_end, int(key)
    # no fragment record (or unrecognized orientation tag)
    return 0, -1, -1
34 | |
35 | |
36 #---------------------------------------------------------------- | |
37 | |
38 def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines, max_mismatch_no, | |
39 aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, adapter_mismatch, cut_format="C-CGG", | |
40 show_multiple_hit=False): | |
41 #---------------------------------------------------------------- | |
42 # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC" | |
43 #cut_context = re.sub("-", "", cut_format) | |
44 # Ex. cut_format="C-CGG,AT-CG,G-CWGC" | |
45 """ | |
46 | |
47 :param main_read_file: | |
48 :param asktag: | |
49 :param adapter_file: | |
50 :param cut_s: | |
51 :param cut_e: | |
52 :param no_small_lines: | |
53 :param max_mismatch_no: | |
54 :param aligner_command: | |
55 :param db_path: | |
56 :param tmp_path: | |
57 :param outfile: | |
58 :param XS_pct: | |
59 :param XS_count: | |
60 :param adapter_mismatch: | |
61 :param cut_format: | |
62 """ | |
63 cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC'] | |
64 cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC'] | |
65 cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G'] | |
66 cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC'] | |
67 cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5] | |
68 min_cut5_len = min([len(i) for i in cut5_context]) | |
69 #print cut_format_lst | |
70 #print cut_format | |
71 #print cut5_context | |
72 | |
73 cut_tag_lst = Enumerate_C_to_CT(cut_format_lst) # ['G-TTGC', 'AT-TG', 'G-CAGT', 'T-CGG', 'G-TAGC', 'C-TGG', 'G-CAGC', 'G-CTGC', 'AT-CG', 'T-TGG', 'G-TTGT', 'G-TAGT', 'C-CGG', 'G-CTGT'] | |
74 cut5_tag_lst = [re.match(r'(.*)\-(.*)', i).group(1) for i in cut_tag_lst] | |
75 cut3_tag_lst = [re.match(r'(.*)\-(.*)', i).group(2) for i in cut_tag_lst] | |
76 check_pattern = [ i[-2:]+"_"+j for i,j in zip(cut5_tag_lst, cut3_tag_lst) ] | |
77 | |
78 #print "=======" | |
79 #print cut_tag_lst | |
80 #print cut3_tag_lst | |
81 #print cut5_tag_lst | |
82 #print check_pattern | |
83 | |
84 # set region[gx,gy] for checking_genome_context | |
85 gx = [ 0 if j>2 else 2-j for j in [len(i) for i in cut5_tag_lst] ] # [XC-CGG] | |
86 gy = [ 3+len(i) for i in cut3_tag_lst ] | |
87 | |
88 | |
89 #---------------------------------------------------------------- | |
90 | |
91 # helper method to join fname with tmp_path | |
92 tmp_d = lambda fname: os.path.join(tmp_path, fname) | |
93 db_d = lambda fname: os.path.join(db_path, fname) | |
94 | |
95 MAX_TRY = 500 # For finding the serial_no | |
96 whole_adapter_seq = "" | |
97 #---------------------------------------------------------------- | |
98 adapter_seq="" | |
99 if adapter_file: | |
100 try : | |
101 adapter_inf = open(adapter_file,"r") | |
102 whole_adapter_seq = adapter_inf.readline().strip() | |
103 adapter_seq = whole_adapter_seq[0:10] # only use first 10bp of adapter | |
104 adapter_inf.close() | |
105 except IOError: | |
106 print "[Error] Cannot find adapter file : %s !" % adapter_file | |
107 exit(-1) | |
108 | |
109 logm("I Read filename: %s" % main_read_file) | |
110 logm("I The last cycle (for mapping): %d" % cut_e ) | |
111 logm("I Bowtie path: %s" % aligner_command ) | |
112 logm("I Reference genome library path: %s" % db_path ) | |
113 logm("I Number of mismatches allowed: %s" % max_mismatch_no) | |
114 logm("I Adapter seq: %s" % whole_adapter_seq) | |
115 logm("----------------------------------------------") | |
116 | |
117 #---------------------------------------------------------------- | |
118 all_raw_reads=0 | |
119 all_tagged=0 | |
120 all_tagged_trimmed=0 | |
121 all_mapped=0 | |
122 all_mapped_passed=0 | |
123 n_cut_tag_lst={} | |
124 #print cut3_tag_lst | |
125 for x in cut3_tag_lst: | |
126 n_cut_tag_lst[x]=0 | |
127 | |
128 mC_lst=[0,0,0] | |
129 uC_lst=[0,0,0] | |
130 | |
131 no_my_files=0 | |
132 | |
133 num_mapped_FW_C2T = 0 | |
134 num_mapped_RC_C2T = 0 | |
135 num_mapped_FW_G2A = 0 | |
136 num_mapped_RC_G2A = 0 | |
137 | |
138 #=============================================== | |
139 # directional sequencing | |
140 #=============================================== | |
141 | |
142 if asktag=="N" : | |
143 #---------------------------------------------------------------- | |
144 logm("== Start mapping ==") | |
145 | |
146 input_fname = os.path.split(main_read_file)[1] | |
147 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines): | |
148 | |
149 logm("Processing read file: %s" % read_file) | |
150 original_bs_reads = {} | |
151 no_my_files+=1 | |
152 random_id = ".tmp-"+str(random.randint(1000000,9999999)) | |
153 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) | |
154 | |
155 outf2=open(outfile2,'w') | |
156 | |
157 #--- Checking input format ------------------------------------------ | |
158 try : | |
159 read_inf=open(read_file,"r") | |
160 except IOError: | |
161 print "[Error] Cannot open input file : %s" % read_file | |
162 exit(-1) | |
163 | |
164 oneline=read_inf.readline() | |
165 l=oneline.split() | |
166 n_fastq=0 | |
167 n_fasta=0 | |
168 input_format="" | |
169 if oneline[0]=="@": # FastQ | |
170 input_format="fastq" | |
171 elif len(l)==1 and oneline[0]!=">": # pure sequences | |
172 input_format="seq" | |
173 elif len(l)==11: # Illumina qseq | |
174 input_format="qseq" | |
175 elif oneline[0]==">": # fasta | |
176 input_format="fasta" | |
177 read_inf.close() | |
178 | |
179 #---------------------------------------------------------------- | |
180 seq_id="" | |
181 seq="" | |
182 seq_ready=0 | |
183 for line in fileinput.input(read_file): | |
184 l=line.split() | |
185 | |
186 if input_format=="seq": | |
187 all_raw_reads+=1 | |
188 seq_id=str(all_raw_reads) | |
189 seq_id=seq_id.zfill(12) | |
190 seq=l[0] | |
191 seq_ready="Y" | |
192 elif input_format=="fastq": | |
193 m_fastq=math.fmod(n_fastq,4) | |
194 n_fastq+=1 | |
195 seq_ready="N" | |
196 if m_fastq==0: | |
197 all_raw_reads+=1 | |
198 seq_id=str(all_raw_reads) | |
199 seq_id=seq_id.zfill(12) | |
200 seq="" | |
201 elif m_fastq==1: | |
202 seq=l[0] | |
203 seq_ready="Y" | |
204 else: | |
205 seq="" | |
206 elif input_format=="qseq": | |
207 all_raw_reads+=1 | |
208 seq_id=str(all_raw_reads) | |
209 seq_id=seq_id.zfill(12) | |
210 seq=l[8] | |
211 seq_ready="Y" | |
212 elif input_format=="fasta": | |
213 m_fasta=math.fmod(n_fasta,2) | |
214 n_fasta+=1 | |
215 seq_ready="N" | |
216 if m_fasta==0: | |
217 all_raw_reads+=1 | |
218 seq_id=l[0][1:] | |
219 seq="" | |
220 elif m_fasta==1: | |
221 seq=l[0] | |
222 seq_ready="Y" | |
223 else: | |
224 seq="" | |
225 #--------------------------------------------------------------- | |
226 if seq_ready=="Y": | |
227 # Normalize the characters | |
228 seq=seq.upper().replace(".","N") | |
229 | |
230 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ] | |
231 if len(read_tag) > 0 : | |
232 all_tagged += 1 | |
233 for i in read_tag : | |
234 n_cut_tag_lst[i] += 1 | |
235 | |
236 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default | |
237 | |
238 #-- Trimming adapter sequence --- | |
239 if adapter_seq != "" : | |
240 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch) | |
241 if len(new_read) < len(seq) : | |
242 all_tagged_trimmed += 1 | |
243 seq = new_read | |
244 if len(seq) <= 4 : | |
245 seq = "N" * (cut_e - cut_s) | |
246 | |
247 # all reads will be considered, regardless of tags | |
248 #--------- trimmed_raw_BS_read and qscore ------------------ | |
249 original_bs_reads[seq_id] = seq | |
250 #--------- FW_C2T ------------------ | |
251 outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T'))) | |
252 fileinput.close() | |
253 | |
254 outf2.close() | |
255 | |
256 delete_files(read_file) | |
257 logm("Processing input is done") | |
258 #-------------------------------------------------------------------------------- | |
259 | |
260 # mapping | |
261 #-------------------------------------------------------------------------------- | |
262 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) | |
263 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) | |
264 | |
265 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | |
266 'input_file' : outfile2, | |
267 'output_file' : WC2T}, | |
268 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | |
269 'input_file' : outfile2, | |
270 'output_file' : CC2T} ]) | |
271 | |
272 logm("Aligning reads is done") | |
273 | |
274 delete_files(outfile2) | |
275 | |
276 #-------------------------------------------------------------------------------- | |
277 # Post processing | |
278 #-------------------------------------------------------------------------------- | |
279 | |
280 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T) | |
281 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T) | |
282 logm("Extracting alignments is done") | |
283 | |
284 #---------------------------------------------------------------- | |
285 # get uniq-hit reads | |
286 #---------------------------------------------------------------- | |
287 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys()) | |
288 | |
289 Unique_FW_C2T=set() # + | |
290 Unique_RC_C2T=set() # - | |
291 Multiple_hits=set() | |
292 | |
293 | |
294 for x in Union_set: | |
295 _list=[] | |
296 for dx in [FW_C2T_U, RC_C2T_U]: | |
297 mis_lst=dx.get(x,[99]) | |
298 mis=int(mis_lst[0]) | |
299 _list.append(mis) | |
300 for dx in [FW_C2T_R, RC_C2T_R]: | |
301 mis=dx.get(x,99) | |
302 _list.append(mis) | |
303 mini=min(_list) | |
304 if _list.count(mini)==1: | |
305 mini_index=_list.index(mini) | |
306 if mini_index==0: | |
307 Unique_FW_C2T.add(x) | |
308 elif mini_index==1: | |
309 Unique_RC_C2T.add(x) | |
310 else : | |
311 Multiple_hits.add(x) | |
312 else : | |
313 Multiple_hits.add(x) | |
314 # write reads rejected by Multiple Hits to file | |
315 if show_multiple_hit : | |
316 outf_MH=open("Multiple_hit.fa",'w') | |
317 for i in Multiple_hits : | |
318 outf_MH.write(">%s\n" % i) | |
319 outf_MH.write("%s\n" % original_bs_reads[i]) | |
320 outf_MH.close() | |
321 | |
322 del Union_set | |
323 del FW_C2T_R | |
324 del RC_C2T_R | |
325 | |
326 FW_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] | |
327 RC_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] | |
328 FW_uniq_lst.sort() | |
329 RC_uniq_lst.sort() | |
330 FW_uniq_lst=[x[1] for x in FW_uniq_lst] | |
331 RC_uniq_lst=[x[1] for x in RC_uniq_lst] | |
332 | |
333 del Unique_FW_C2T | |
334 del Unique_RC_C2T | |
335 | |
336 #---------------------------------------------------------------- | |
337 # Post-filtering reads | |
338 | |
339 # ---- FW ---- | |
340 FW_regions = dict() | |
341 gseq = dict() | |
342 chr_length = dict() | |
343 for header in FW_uniq_lst : | |
344 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header] | |
345 original_BS = original_bs_reads[header] | |
346 if mapped_chr not in FW_regions : | |
347 FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr))) | |
348 if mapped_chr not in gseq : | |
349 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) | |
350 chr_length[mapped_chr] = len(gseq[mapped_chr]) | |
351 | |
352 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) | |
353 all_mapped+=1 | |
354 FR = "+FW" | |
355 mapped_strand = "+" | |
356 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], | |
357 mapped_location, | |
358 mapped_location + g_len, | |
359 mapped_strand) | |
360 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] | |
361 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | |
362 | |
363 if len(r_aln) == len(g_aln) : | |
364 my_region_serial, my_region_start, my_region_end = [-1, 0, 0] | |
365 if True in checking_genome_context : | |
366 try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0] | |
367 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], | |
368 try_pos, FR) | |
369 if my_region_serial == 0 : # still be 0 | |
370 # for some cases, read has no tags; searching the upstream sequence for tags | |
371 # print "[For debug]: FW read has no tags" | |
372 try_count = 0 | |
373 try_pos = mapped_location - min_cut5_len + 1 | |
374 while my_region_serial == 0 and try_count < MAX_TRY : | |
375 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], | |
376 try_pos, FR) | |
377 try_pos -= 1 | |
378 try_count += 1 | |
379 | |
380 #if my_region_serial == 0 : | |
381 # print "[For debug]: chr=", mapped_chr | |
382 # print "[For debug]: +FW read still can not find fragment serial" | |
383 # Tip: sometimes "my_region_serial" is still 0 ... | |
384 | |
385 | |
386 N_mismatch = N_MIS(r_aln, g_aln) | |
387 if N_mismatch <= int(max_mismatch_no) : | |
388 all_mapped_passed += 1 | |
389 methy = methy_seq(r_aln, g_aln + next2bp) | |
390 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | |
391 #---XS FILTER---------------- | |
392 XS = 0 | |
393 nCH = methy.count('y') + methy.count('z') | |
394 nmCH = methy.count('Y') + methy.count('Z') | |
395 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | |
396 XS = 1 | |
397 num_mapped_FW_C2T += 1 | |
398 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, | |
399 mapped_location, cigar, original_BS, methy, XS, | |
400 output_genome = output_genome, | |
401 rrbs = True, | |
402 my_region_serial = my_region_serial, | |
403 my_region_start = my_region_start, | |
404 my_region_end = my_region_end) | |
405 else : | |
406 print "[For debug]: reads not in same lengths" | |
407 | |
408 #print "start RC" | |
409 # ---- RC ---- | |
410 RC_regions = dict() | |
411 for header in RC_uniq_lst : | |
412 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header] | |
413 original_BS = original_bs_reads[header] | |
414 if mapped_chr not in RC_regions : | |
415 RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr))) | |
416 if mapped_chr not in gseq : | |
417 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) | |
418 chr_length[mapped_chr] = len(gseq[mapped_chr]) | |
419 | |
420 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) | |
421 mapped_location = chr_length[mapped_chr] - mapped_location - g_len | |
422 all_mapped+=1 | |
423 FR = "-FW" | |
424 mapped_strand = "-" | |
425 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], | |
426 mapped_location, | |
427 mapped_location + g_len, | |
428 mapped_strand) | |
429 #checking_genome_context = (output_genome[gx:gy] == check_pattern) | |
430 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] | |
431 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | |
432 | |
433 if len(r_aln) == len(g_aln) : # and checking_genome_context: | |
434 my_region_serial, my_region_start, my_region_end = [-1, 0, 0] | |
435 if True in checking_genome_context : | |
436 try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0] | |
437 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], | |
438 try_pos , FR) | |
439 if my_region_serial == 0 : # still be 0 | |
440 # for some cases, read has no tags; searching the upstream sequence for tags | |
441 #print "[For debug]: RC Read has no tags" | |
442 try_count = 0 | |
443 try_pos = mapped_location + g_len + min_cut5_len - 2 | |
444 while my_region_serial == 0 and try_count < MAX_TRY : | |
445 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], | |
446 try_pos, FR) | |
447 try_pos += 1 | |
448 try_count += 1 | |
449 | |
450 #if my_region_serial == 0 : | |
451 # print "[For debug]: chr=", mapped_chr | |
452 # print "[For debug]: -FW read still cannot find fragment serial" | |
453 | |
454 | |
455 N_mismatch = N_MIS(r_aln, g_aln) | |
456 if N_mismatch <= int(max_mismatch_no) : | |
457 all_mapped_passed += 1 | |
458 methy = methy_seq(r_aln, g_aln + next2bp) | |
459 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | |
460 #---XS FILTER---------------- | |
461 XS = 0 | |
462 nCH = methy.count('y') + methy.count('z') | |
463 nmCH = methy.count('Y') + methy.count('Z') | |
464 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | |
465 XS = 1 | |
466 num_mapped_RC_C2T += 1 | |
467 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, | |
468 mapped_location, cigar, original_BS, methy, XS, | |
469 output_genome = output_genome, | |
470 rrbs = True, | |
471 my_region_serial = my_region_serial, | |
472 my_region_start = my_region_start, | |
473 my_region_end = my_region_end) | |
474 else : | |
475 print "[For debug]: reads not in same lengths" | |
476 | |
477 | |
478 # Finished both FW and RC | |
479 logm("Done: %s (%d) \n" % (read_file, no_my_files)) | |
480 print "--> %s (%d) "%(read_file, no_my_files) | |
481 del original_bs_reads | |
482 delete_files(WC2T, CC2T) | |
483 | |
484 # End of directional library | |
485 | |
486 | |
487 # ==================================================== | |
488 # un-directional library | |
489 # ==================================================== | |
490 | |
491 elif asktag=="Y" : | |
492 #---------------------------------------------------------------- | |
493 logm("== Start mapping ==") | |
494 | |
495 input_fname = os.path.split(main_read_file)[1] | |
496 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines): | |
497 | |
498 logm("Processing read file: %s" % read_file) | |
499 original_bs_reads = {} | |
500 no_my_files+=1 | |
501 random_id = ".tmp-"+str(random.randint(1000000,9999999)) | |
502 outfile2=tmp_d('Trimmed_C2T.fa'+random_id) | |
503 outfile3=tmp_d('Trimmed_G2A.fa'+random_id) | |
504 | |
505 outf2=open(outfile2,'w') | |
506 outf3=open(outfile3,'w') | |
507 | |
508 #--- Checking input format ------------------------------------------ | |
509 try : | |
510 read_inf=open(read_file,"r") | |
511 except IOError: | |
512 print "[Error] Cannot open input file : %s" % read_file | |
513 exit(-1) | |
514 | |
515 oneline=read_inf.readline() | |
516 l=oneline.split() | |
517 n_fastq=0 | |
518 n_fasta=0 | |
519 input_format="" | |
520 if oneline[0]=="@": # FastQ | |
521 input_format="fastq" | |
522 elif len(l)==1 and oneline[0]!=">": # pure sequences | |
523 input_format="seq" | |
524 elif len(l)==11: # Illumina qseq | |
525 input_format="qseq" | |
526 elif oneline[0]==">": # fasta | |
527 input_format="fasta" | |
528 read_inf.close() | |
529 | |
530 #---------------------------------------------------------------- | |
531 seq_id = "" | |
532 seq = "" | |
533 seq_ready=0 | |
534 for line in fileinput.input(read_file): | |
535 l=line.split() | |
536 | |
537 if input_format == "seq": | |
538 all_raw_reads+=1 | |
539 seq_id=str(all_raw_reads) | |
540 seq_id=seq_id.zfill(12) | |
541 seq=l[0] | |
542 seq_ready="Y" | |
543 elif input_format=="fastq": | |
544 m_fastq=math.fmod(n_fastq,4) | |
545 n_fastq+=1 | |
546 seq_ready="N" | |
547 if m_fastq==0: | |
548 all_raw_reads+=1 | |
549 seq_id=str(all_raw_reads) | |
550 seq_id=seq_id.zfill(12) | |
551 seq="" | |
552 elif m_fastq==1: | |
553 seq=l[0] | |
554 seq_ready="Y" | |
555 else: | |
556 seq="" | |
557 elif input_format=="qseq": | |
558 all_raw_reads+=1 | |
559 seq_id=str(all_raw_reads) | |
560 seq_id=seq_id.zfill(12) | |
561 seq=l[8] | |
562 seq_ready="Y" | |
563 elif input_format=="fasta": | |
564 m_fasta=math.fmod(n_fasta,2) | |
565 n_fasta+=1 | |
566 seq_ready="N" | |
567 if m_fasta==0: | |
568 all_raw_reads+=1 | |
569 seq_id=l[0][1:] | |
570 seq="" | |
571 elif m_fasta==1: | |
572 seq=l[0] | |
573 seq_ready="Y" | |
574 else: | |
575 seq="" | |
576 #--------------------------------------------------------------- | |
577 if seq_ready=="Y": | |
578 # Normalize the characters | |
579 seq=seq.upper().replace(".","N") | |
580 | |
581 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ] | |
582 if len(read_tag) > 0 : | |
583 all_tagged += 1 | |
584 for i in read_tag : | |
585 n_cut_tag_lst[i] += 1 | |
586 | |
587 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default | |
588 | |
589 #-- Trimming adapter sequence --- | |
590 if adapter_seq != "" : | |
591 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch) | |
592 if len(new_read) < len(seq) : | |
593 all_tagged_trimmed += 1 | |
594 seq = new_read | |
595 if len(seq) <= 4 : | |
596 seq = "N" * (cut_e - cut_s) | |
597 | |
598 # all reads will be considered, regardless of tags | |
599 #--------- trimmed_raw_BS_read and qscore ------------------ | |
600 original_bs_reads[seq_id] = seq | |
601 #--------- FW_C2T ------------------ | |
602 outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T'))) | |
603 #--------- RC_G2A ------------------ | |
604 outf3.write('>%s\n%s\n' % (seq_id, seq.replace("G","A"))) | |
605 fileinput.close() | |
606 | |
607 outf2.close() | |
608 | |
609 delete_files(read_file) | |
610 logm("Processing input is done") | |
611 #-------------------------------------------------------------------------------- | |
612 | |
613 # mapping | |
614 #-------------------------------------------------------------------------------- | |
615 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) | |
616 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) | |
617 WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id) | |
618 CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id) | |
619 | |
620 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), | |
621 'input_file' : outfile2, | |
622 'output_file' : WC2T}, | |
623 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), | |
624 'input_file' : outfile2, | |
625 'output_file' : CC2T}, | |
626 aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'), | |
627 'input_file' : outfile3, | |
628 'output_file' : WG2A}, | |
629 aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'), | |
630 'input_file' : outfile3, | |
631 'output_file' : CG2A} ]) | |
632 | |
633 logm("Aligning reads is done") | |
634 | |
635 delete_files(outfile2) | |
636 | |
637 #-------------------------------------------------------------------------------- | |
638 # Post processing | |
639 #-------------------------------------------------------------------------------- | |
640 | |
641 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T) | |
642 RC_G2A_U,RC_G2A_R=extract_mapping(CG2A) | |
643 | |
644 FW_G2A_U,FW_G2A_R=extract_mapping(WG2A) | |
645 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T) | |
646 | |
647 logm("Extracting alignments is done") | |
648 | |
649 #---------------------------------------------------------------- | |
650 # get unique-hit reads | |
651 #---------------------------------------------------------------- | |
652 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys()) | |
653 | |
654 Unique_FW_C2T=set() # + | |
655 Unique_RC_G2A=set() # + | |
656 Unique_FW_G2A=set() # - | |
657 Unique_RC_C2T=set() # - | |
658 Multiple_hits=set() | |
659 | |
660 for x in Union_set: | |
661 _list=[] | |
662 for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]: | |
663 mis_lst=dx.get(x,[99]) | |
664 mis=int(mis_lst[0]) | |
665 _list.append(mis) | |
666 for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]: | |
667 mis=dx.get(x,99) | |
668 _list.append(mis) | |
669 mini=min(_list) | |
670 if _list.count(mini) == 1: | |
671 mini_index=_list.index(mini) | |
672 if mini_index == 0: | |
673 Unique_FW_C2T.add(x) | |
674 elif mini_index == 1: | |
675 Unique_RC_G2A.add(x) | |
676 elif mini_index == 2: | |
677 Unique_FW_G2A.add(x) | |
678 elif mini_index == 3: | |
679 Unique_RC_C2T.add(x) | |
680 else : | |
681 Multiple_hits.add(x) | |
682 else : | |
683 Multiple_hits.add(x) | |
684 # write reads rejected by Multiple Hits to file | |
685 if show_multiple_hit : | |
686 outf_MH=open("Multiple_hit.fa",'w') | |
687 for i in Multiple_hits : | |
688 outf_MH.write(">%s\n" % i) | |
689 outf_MH.write("%s\n" % original_bs_reads[i]) | |
690 outf_MH.close() | |
691 | |
692 del Union_set | |
693 del FW_C2T_R | |
694 del FW_G2A_R | |
695 del RC_C2T_R | |
696 del RC_G2A_R | |
697 | |
698 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] | |
699 FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A] | |
700 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] | |
701 RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A] | |
702 FW_C2T_uniq_lst.sort() | |
703 RC_C2T_uniq_lst.sort() | |
704 FW_G2A_uniq_lst.sort() | |
705 RC_G2A_uniq_lst.sort() | |
706 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] | |
707 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] | |
708 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] | |
709 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] | |
710 | |
711 del Unique_FW_C2T | |
712 del Unique_FW_G2A | |
713 del Unique_RC_C2T | |
714 del Unique_RC_G2A | |
715 | |
716 | |
717 #---------------------------------------------------------------- | |
718 # Post-filtering reads | |
719 # ---- FW_C2T ---- undirectional | |
720 FW_regions = dict() | |
721 gseq = dict() | |
722 chr_length = dict() | |
723 for header in FW_C2T_uniq_lst : | |
724 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header] | |
725 original_BS = original_bs_reads[header] | |
726 if mapped_chr not in FW_regions : | |
727 FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr))) | |
728 if mapped_chr not in gseq : | |
729 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) | |
730 chr_length[mapped_chr] = len(gseq[mapped_chr]) | |
731 | |
732 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) | |
733 all_mapped+=1 | |
734 FR = "+FW" | |
735 mapped_strand = "+" | |
736 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], | |
737 mapped_location, | |
738 mapped_location + g_len, | |
739 mapped_strand) | |
740 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] | |
741 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | |
742 | |
743 if len(r_aln) == len(g_aln) : | |
744 my_region_serial, my_region_start, my_region_end = [-1, 0, 0] | |
745 if True in checking_genome_context : | |
746 try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0] | |
747 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], | |
748 try_pos, FR) | |
749 if my_region_serial == 0 : # still be 0 | |
750 # for some cases, read has no tags; searching the upstream sequence for tags | |
751 # print "[For debug]: FW read has no tags" | |
752 try_count = 0 | |
753 try_pos = mapped_location - min_cut5_len + 1 | |
754 while my_region_serial == 0 and try_count < MAX_TRY : | |
755 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr], | |
756 try_pos, FR) | |
757 try_pos -= 1 | |
758 try_count += 1 | |
759 | |
760 N_mismatch = N_MIS(r_aln, g_aln) | |
761 if N_mismatch <= int(max_mismatch_no) : | |
762 all_mapped_passed += 1 | |
763 methy = methy_seq(r_aln, g_aln + next2bp) | |
764 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | |
765 #---XS FILTER---------------- | |
766 XS = 0 | |
767 nCH = methy.count('y') + methy.count('z') | |
768 nmCH = methy.count('Y') + methy.count('Z') | |
769 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | |
770 XS = 1 | |
771 num_mapped_FW_C2T += 1 | |
772 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, | |
773 mapped_location, cigar, original_BS, methy, XS, | |
774 output_genome = output_genome, | |
775 rrbs = True, | |
776 my_region_serial = my_region_serial, | |
777 my_region_start = my_region_start, | |
778 my_region_end = my_region_end) | |
779 else : | |
780 print "[For debug]: reads not in same lengths" | |
781 | |
782 | |
783 # ---- RC_C2T ---- undirectional | |
784 RC_regions = dict() | |
785 for header in RC_C2T_uniq_lst : | |
786 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header] | |
787 original_BS = original_bs_reads[header] | |
788 if mapped_chr not in RC_regions : | |
                # (tail of the RC_C2T loop; its "for header in ..." line is above this hunk)
                # Lazily load & cache the Crick-strand mappable-region table for this chromosome.
                RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
            if mapped_chr not in gseq :
                # Genome sequence and its length are cached per chromosome the same way.
                gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                chr_length[mapped_chr] = len(gseq[mapped_chr])

            r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
            # The hit was reported on the reverse-complemented index; convert the
            # coordinate back to Watson-strand coordinates.
            mapped_location = chr_length[mapped_chr] - mapped_location - g_len
            all_mapped+=1
            FR = "-FW"
            mapped_strand = "-"
            origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                         mapped_location,
                                                                         mapped_location + g_len,
                                                                         mapped_strand)
            # Does the flanking genomic sequence match any expected cut-site tag?
            # (gx, gy, check_pattern are built above this hunk -- TODO confirm)
            checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
            r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

            if len(r_aln) == len(g_aln) : # and checking_genome_context:
                my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                if True in checking_genome_context :
                    # Position derived from the first matching cut-site tag;
                    # presumably the last base of the fragment -- TODO confirm.
                    try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                    my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
                                                                                          try_pos , FR)
                if my_region_serial == 0 : # still be 0
                    # for some cases, read has no tags; searching the upstream sequence for tags
                    #print "[For debug]: RC Read has no tags"
                    try_count = 0
                    try_pos = mapped_location + g_len + min_cut5_len - 2
                    # Scan forward one base at a time until a region is found
                    # or MAX_TRY positions have been tried.
                    while my_region_serial == 0 and try_count < MAX_TRY :
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
                                                                                              try_pos, FR)
                        try_pos += 1
                        try_count += 1

                N_mismatch = N_MIS(r_aln, g_aln)
                if N_mismatch <= int(max_mismatch_no) :
                    all_mapped_passed += 1
                    methy = methy_seq(r_aln, g_aln + next2bp)
                    mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
                    #---XS FILTER----------------
                    # Flag reads whose non-CpG methylation fraction is high --
                    # presumably a proxy for incomplete bisulfite conversion.
                    XS = 0
                    nCH = methy.count('y') + methy.count('z')
                    nmCH = methy.count('Y') + methy.count('Z')
                    if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                        XS = 1
                    num_mapped_RC_C2T += 1
                    outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
                                  mapped_location, cigar, original_BS, methy, XS,
                                  output_genome = output_genome,
                                  rrbs = True,
                                  my_region_serial = my_region_serial,
                                  my_region_start = my_region_start,
                                  my_region_end = my_region_end)

            else :
                print "[For debug]: reads not in same lengths"
846 | |
        # ---- FW_G2A ---- undirectional
        # Reads whose G->A-converted form mapped to the forward-converted index;
        # reported on the "-" strand with tag "-RC".
        FW_regions = dict()
        gseq = dict()        # NOTE(review): resets the genome cache built by the previous section
        chr_length = dict()
        for header in FW_G2A_uniq_lst :
            _, mapped_chr, mapped_location, cigar = FW_G2A_U[header]
            original_BS = original_bs_reads[header]
            # Lazily load & cache per-chromosome mappable regions and genome sequence.
            if mapped_chr not in FW_regions :
                FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
            if mapped_chr not in gseq :
                gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                chr_length[mapped_chr] = len(gseq[mapped_chr])
            # The read was reverse-complemented before alignment, so the CIGAR
            # must be reversed to match the original read orientation.
            cigar = list(reversed(cigar))

            r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
            all_mapped+=1
            FR = "-RC"
            mapped_strand = "-"
            origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                         mapped_location,
                                                                         mapped_location + g_len,
                                                                         mapped_strand)
            original_BS = reverse_compl_seq(original_BS) # for RC reads
            # Does the flanking genomic sequence match any expected cut-site tag?
            checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
            r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

            if len(r_aln) == len(g_aln) :
                my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                if True in checking_genome_context :
                    # Position just upstream of the first matching tag -- TODO confirm.
                    try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                    my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
                                                                                          try_pos, FR)
                if my_region_serial == 0 : # still be 0
                    # for some cases, read has no tags; searching the upstream sequence for tags
                    #print "[For debug]: FW read has no tags"
                    try_count = 0
                    try_pos = mapped_location - min_cut5_len + 1
                    # Scan one base at a time until a region is found or MAX_TRY
                    # positions have been tried.
                    while my_region_serial == 0 and try_count < MAX_TRY :
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
                                                                                              try_pos, FR)
                        try_pos += 1
                        try_count += 1
                    #if my_region_serial == 0 :
                    #    print "[For debug]: chr=", mapped_chr
                    #    print "[For debug]: FW_G2A read still can not find fragment serial"
                    # Tip: sometimes "my_region_serial" is still 0 ...

                N_mismatch = N_MIS(r_aln, g_aln)
                if N_mismatch <= int(max_mismatch_no) :
                    all_mapped_passed += 1
                    methy = methy_seq(r_aln, g_aln + next2bp)
                    mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
                    #---XS FILTER----------------
                    # Flag reads whose non-CpG methylation fraction is high --
                    # presumably a proxy for incomplete bisulfite conversion.
                    XS = 0
                    nCH = methy.count('y') + methy.count('z')
                    nmCH = methy.count('Y') + methy.count('Z')
                    if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                        XS = 1
                    num_mapped_FW_G2A += 1
                    outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
                                  mapped_location, cigar, original_BS, methy, XS,
                                  output_genome = output_genome,
                                  rrbs = True,
                                  my_region_serial = my_region_serial,
                                  my_region_start = my_region_start,
                                  my_region_end = my_region_end)
            else :
                print "[For debug]: reads not in same lengths"
916 | |
917 | |
918 # ---- RC_G2A ---- undirectional | |
919 RC_regions = dict() | |
920 for header in RC_G2A_uniq_lst : | |
921 _, mapped_chr, mapped_location, cigar = RC_G2A_U[header] | |
922 original_BS = original_bs_reads[header] | |
923 if mapped_chr not in RC_regions : | |
924 RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr))) | |
925 if mapped_chr not in gseq : | |
926 gseq[mapped_chr] = deserialize(db_d(mapped_chr)) | |
927 chr_length[mapped_chr] = len(gseq[mapped_chr]) | |
928 cigar = list(reversed(cigar)) | |
929 | |
930 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) | |
931 mapped_location = chr_length[mapped_chr] - mapped_location - g_len | |
932 all_mapped+=1 | |
933 FR = "+RC" | |
934 mapped_strand = "+" | |
935 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], | |
936 mapped_location, | |
937 mapped_location + g_len, | |
938 mapped_strand) | |
939 original_BS = reverse_compl_seq(original_BS) # for RC reads | |
940 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ] | |
941 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) | |
942 | |
943 if len(r_aln) == len(g_aln) : # and checking_genome_context: | |
944 my_region_serial, my_region_start, my_region_end = [-1, 0, 0] | |
945 if True in checking_genome_context : | |
946 try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0] | |
947 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], | |
948 mapped_location + g_len + min_cut5_len -1, FR) | |
949 if my_region_serial == 0 : # still be 0 | |
950 # for some cases, read has no tags; searching the upstream sequence for tags | |
951 #print "[For debug]: RC Read has no tags" | |
952 try_count = 0 | |
953 try_pos = mapped_location + g_len + min_cut5_len - 2 | |
954 while try_count < MAX_TRY : | |
955 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr], | |
956 try_pos, FR) | |
957 try_pos += 1 | |
958 try_count += 1 | |
959 | |
960 #if my_region_serial == 0 : | |
961 # print "[For debug]: chr=", mapped_chr | |
962 # print "[For debug]: RC_C2A read still cannot find fragment serial" | |
963 | |
964 | |
965 N_mismatch = N_MIS(r_aln, g_aln) | |
966 if N_mismatch <= int(max_mismatch_no) : | |
967 all_mapped_passed += 1 | |
968 methy = methy_seq(r_aln, g_aln + next2bp) | |
969 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) | |
970 #---XS FILTER---------------- | |
971 XS = 0 | |
972 nCH = methy.count('y') + methy.count('z') | |
973 nmCH = methy.count('Y') + methy.count('Z') | |
974 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : | |
975 XS = 1 | |
976 num_mapped_RC_G2A += 1 | |
977 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, | |
978 mapped_location, cigar, original_BS, methy, XS, | |
979 output_genome = output_genome, | |
980 rrbs = True, | |
981 my_region_serial = my_region_serial, | |
982 my_region_start = my_region_start, | |
983 my_region_end = my_region_end) | |
984 else : | |
985 print "[For debug]: reads not in same lengths" | |
986 | |
987 | |
988 | |
        # Finished both FW and RC
        logm("Done: %s (%d) \n" % (read_file, no_my_files))
        print "--> %s (%d) "%(read_file, no_my_files)
        # Free the per-file read cache and remove the temporary converted-read files.
        del original_bs_reads
        delete_files(WC2T, CC2T, WG2A, CG2A)

    # End of un-directional library

    delete_files(tmp_path)

    # ---- final summary statistics ------------------------------------------
    logm("O Number of raw reads: %d "% all_raw_reads)
    if all_raw_reads >0:
        logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
        # n_cut_tag_lst is indexed by tag sequence via cut3_tag_lst; assumes the
        # two collections have the same length -- TODO confirm.
        for kk in range(len(n_cut_tag_lst)):
            logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
        logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed)
        logm("O Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
        logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped)

        logm("O ------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
        logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))

    # Per-strand breakdown of uniquely mapped reads.
    if asktag=="Y": # undirectional
        logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
        logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) )
        logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
        logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) )
        # the variable name 'num_mapped_RC_G2A' seems not consistent with illustration
        # according to literal meaning
    elif asktag=="N": # directional
        logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
        logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )

    # Totals per methylation context: [0]=CpG, [1]=CHG, [2]=CHH.
    n_CG=mC_lst[0]+uC_lst[0]
    n_CHG=mC_lst[1]+uC_lst[1]
    n_CHH=mC_lst[2]+uC_lst[2]

    logm("----------------------------------------------")
    logm("M Methylated C in mapped reads ")
    # Ternary guards avoid ZeroDivisionError when a context was never observed.
    logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
    logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
    logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
    logm("----------------------------------------------")
    logm("------------------- END ----------------------")

    elapsed(main_read_file)

    close_log()
1040 | |
1041 |