comparison BSseeker2/bs_align/bs_rrbs.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
comparison
equal deleted inserted replaced
-1:000000000000 0:e6df770c0e58
1 import fileinput, random, math, os.path
2 from bs_index.rrbs_build import FWD_MAPPABLE_REGIONS, REV_MAPPABLE_REGIONS
3 from bs_utils.utils import *
4
5 from bs_align.bs_single_end import extract_mapping
6 from bs_align_utils import *
7
8 def my_mappable_region(chr_regions, mapped_location, FR): # start_position (first C), end_position (last G), serial, sequence
9 #print len(chr_regions)
10 out_serial = 0
11 out_start = -1
12 out_end = -1
13 #print "mapped_location:", mapped_location
14 if FR == "+FW" or FR == "-RC":
15 my_location = str(mapped_location)
16 if my_location in chr_regions:
17 my_lst = chr_regions[my_location]
18 out_start = int(my_location)
19 out_end = my_lst[0]
20 out_serial = my_lst[1]
21 #else :
22 # print "[For debug]: +FW location %s cannot be found" % my_location
23 elif FR == "-FW" or FR == "+RC":
24 my_location = str(mapped_location)
25 if my_location in chr_regions:
26 my_lst = chr_regions[my_location]
27 out_end = int(my_location)
28 out_start = my_lst[0]
29 out_serial = my_lst[1]
30 #else :
31 # print "[For debug]: -FW location %s cannot be found" % my_location
32
33 return out_serial, out_start, out_end
34
35
36 #----------------------------------------------------------------
37
38 def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines, max_mismatch_no,
39 aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, adapter_mismatch, cut_format="C-CGG",
40 show_multiple_hit=False):
41 #----------------------------------------------------------------
42 # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC"
43 #cut_context = re.sub("-", "", cut_format)
44 # Ex. cut_format="C-CGG,AT-CG,G-CWGC"
45 """
46
47 :param main_read_file:
48 :param asktag:
49 :param adapter_file:
50 :param cut_s:
51 :param cut_e:
52 :param no_small_lines:
53 :param max_mismatch_no:
54 :param aligner_command:
55 :param db_path:
56 :param tmp_path:
57 :param outfile:
58 :param XS_pct:
59 :param XS_count:
60 :param adapter_mismatch:
61 :param cut_format:
62 """
63 cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC']
64 cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC']
65 cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G']
66 cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC']
67 cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5]
68 min_cut5_len = min([len(i) for i in cut5_context])
69 #print cut_format_lst
70 #print cut_format
71 #print cut5_context
72
73 cut_tag_lst = Enumerate_C_to_CT(cut_format_lst) # ['G-TTGC', 'AT-TG', 'G-CAGT', 'T-CGG', 'G-TAGC', 'C-TGG', 'G-CAGC', 'G-CTGC', 'AT-CG', 'T-TGG', 'G-TTGT', 'G-TAGT', 'C-CGG', 'G-CTGT']
74 cut5_tag_lst = [re.match(r'(.*)\-(.*)', i).group(1) for i in cut_tag_lst]
75 cut3_tag_lst = [re.match(r'(.*)\-(.*)', i).group(2) for i in cut_tag_lst]
76 check_pattern = [ i[-2:]+"_"+j for i,j in zip(cut5_tag_lst, cut3_tag_lst) ]
77
78 #print "======="
79 #print cut_tag_lst
80 #print cut3_tag_lst
81 #print cut5_tag_lst
82 #print check_pattern
83
84 # set region[gx,gy] for checking_genome_context
85 gx = [ 0 if j>2 else 2-j for j in [len(i) for i in cut5_tag_lst] ] # [XC-CGG]
86 gy = [ 3+len(i) for i in cut3_tag_lst ]
87
88
89 #----------------------------------------------------------------
90
91 # helper method to join fname with tmp_path
92 tmp_d = lambda fname: os.path.join(tmp_path, fname)
93 db_d = lambda fname: os.path.join(db_path, fname)
94
95 MAX_TRY = 500 # For finding the serial_no
96 whole_adapter_seq = ""
97 #----------------------------------------------------------------
98 adapter_seq=""
99 if adapter_file:
100 try :
101 adapter_inf = open(adapter_file,"r")
102 whole_adapter_seq = adapter_inf.readline().strip()
103 adapter_seq = whole_adapter_seq[0:10] # only use first 10bp of adapter
104 adapter_inf.close()
105 except IOError:
106 print "[Error] Cannot find adapter file : %s !" % adapter_file
107 exit(-1)
108
109 logm("I Read filename: %s" % main_read_file)
110 logm("I The last cycle (for mapping): %d" % cut_e )
111 logm("I Bowtie path: %s" % aligner_command )
112 logm("I Reference genome library path: %s" % db_path )
113 logm("I Number of mismatches allowed: %s" % max_mismatch_no)
114 logm("I Adapter seq: %s" % whole_adapter_seq)
115 logm("----------------------------------------------")
116
117 #----------------------------------------------------------------
118 all_raw_reads=0
119 all_tagged=0
120 all_tagged_trimmed=0
121 all_mapped=0
122 all_mapped_passed=0
123 n_cut_tag_lst={}
124 #print cut3_tag_lst
125 for x in cut3_tag_lst:
126 n_cut_tag_lst[x]=0
127
128 mC_lst=[0,0,0]
129 uC_lst=[0,0,0]
130
131 no_my_files=0
132
133 num_mapped_FW_C2T = 0
134 num_mapped_RC_C2T = 0
135 num_mapped_FW_G2A = 0
136 num_mapped_RC_G2A = 0
137
138 #===============================================
139 # directional sequencing
140 #===============================================
141
142 if asktag=="N" :
143 #----------------------------------------------------------------
144 logm("== Start mapping ==")
145
146 input_fname = os.path.split(main_read_file)[1]
147 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
148
149 logm("Processing read file: %s" % read_file)
150 original_bs_reads = {}
151 no_my_files+=1
152 random_id = ".tmp-"+str(random.randint(1000000,9999999))
153 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
154
155 outf2=open(outfile2,'w')
156
157 #--- Checking input format ------------------------------------------
158 try :
159 read_inf=open(read_file,"r")
160 except IOError:
161 print "[Error] Cannot open input file : %s" % read_file
162 exit(-1)
163
164 oneline=read_inf.readline()
165 l=oneline.split()
166 n_fastq=0
167 n_fasta=0
168 input_format=""
169 if oneline[0]=="@": # FastQ
170 input_format="fastq"
171 elif len(l)==1 and oneline[0]!=">": # pure sequences
172 input_format="seq"
173 elif len(l)==11: # Illumina qseq
174 input_format="qseq"
175 elif oneline[0]==">": # fasta
176 input_format="fasta"
177 read_inf.close()
178
179 #----------------------------------------------------------------
180 seq_id=""
181 seq=""
182 seq_ready=0
183 for line in fileinput.input(read_file):
184 l=line.split()
185
186 if input_format=="seq":
187 all_raw_reads+=1
188 seq_id=str(all_raw_reads)
189 seq_id=seq_id.zfill(12)
190 seq=l[0]
191 seq_ready="Y"
192 elif input_format=="fastq":
193 m_fastq=math.fmod(n_fastq,4)
194 n_fastq+=1
195 seq_ready="N"
196 if m_fastq==0:
197 all_raw_reads+=1
198 seq_id=str(all_raw_reads)
199 seq_id=seq_id.zfill(12)
200 seq=""
201 elif m_fastq==1:
202 seq=l[0]
203 seq_ready="Y"
204 else:
205 seq=""
206 elif input_format=="qseq":
207 all_raw_reads+=1
208 seq_id=str(all_raw_reads)
209 seq_id=seq_id.zfill(12)
210 seq=l[8]
211 seq_ready="Y"
212 elif input_format=="fasta":
213 m_fasta=math.fmod(n_fasta,2)
214 n_fasta+=1
215 seq_ready="N"
216 if m_fasta==0:
217 all_raw_reads+=1
218 seq_id=l[0][1:]
219 seq=""
220 elif m_fasta==1:
221 seq=l[0]
222 seq_ready="Y"
223 else:
224 seq=""
225 #---------------------------------------------------------------
226 if seq_ready=="Y":
227 # Normalize the characters
228 seq=seq.upper().replace(".","N")
229
230 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
231 if len(read_tag) > 0 :
232 all_tagged += 1
233 for i in read_tag :
234 n_cut_tag_lst[i] += 1
235
236 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default
237
238 #-- Trimming adapter sequence ---
239 if adapter_seq != "" :
240 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
241 if len(new_read) < len(seq) :
242 all_tagged_trimmed += 1
243 seq = new_read
244 if len(seq) <= 4 :
245 seq = "N" * (cut_e - cut_s)
246
247 # all reads will be considered, regardless of tags
248 #--------- trimmed_raw_BS_read and qscore ------------------
249 original_bs_reads[seq_id] = seq
250 #--------- FW_C2T ------------------
251 outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
252 fileinput.close()
253
254 outf2.close()
255
256 delete_files(read_file)
257 logm("Processing input is done")
258 #--------------------------------------------------------------------------------
259
260 # mapping
261 #--------------------------------------------------------------------------------
262 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
263 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
264
265 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
266 'input_file' : outfile2,
267 'output_file' : WC2T},
268 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
269 'input_file' : outfile2,
270 'output_file' : CC2T} ])
271
272 logm("Aligning reads is done")
273
274 delete_files(outfile2)
275
276 #--------------------------------------------------------------------------------
277 # Post processing
278 #--------------------------------------------------------------------------------
279
280 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
281 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
282 logm("Extracting alignments is done")
283
284 #----------------------------------------------------------------
285 # get uniq-hit reads
286 #----------------------------------------------------------------
287 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())
288
289 Unique_FW_C2T=set() # +
290 Unique_RC_C2T=set() # -
291 Multiple_hits=set()
292
293
294 for x in Union_set:
295 _list=[]
296 for dx in [FW_C2T_U, RC_C2T_U]:
297 mis_lst=dx.get(x,[99])
298 mis=int(mis_lst[0])
299 _list.append(mis)
300 for dx in [FW_C2T_R, RC_C2T_R]:
301 mis=dx.get(x,99)
302 _list.append(mis)
303 mini=min(_list)
304 if _list.count(mini)==1:
305 mini_index=_list.index(mini)
306 if mini_index==0:
307 Unique_FW_C2T.add(x)
308 elif mini_index==1:
309 Unique_RC_C2T.add(x)
310 else :
311 Multiple_hits.add(x)
312 else :
313 Multiple_hits.add(x)
314 # write reads rejected by Multiple Hits to file
315 if show_multiple_hit :
316 outf_MH=open("Multiple_hit.fa",'w')
317 for i in Multiple_hits :
318 outf_MH.write(">%s\n" % i)
319 outf_MH.write("%s\n" % original_bs_reads[i])
320 outf_MH.close()
321
322 del Union_set
323 del FW_C2T_R
324 del RC_C2T_R
325
326 FW_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
327 RC_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
328 FW_uniq_lst.sort()
329 RC_uniq_lst.sort()
330 FW_uniq_lst=[x[1] for x in FW_uniq_lst]
331 RC_uniq_lst=[x[1] for x in RC_uniq_lst]
332
333 del Unique_FW_C2T
334 del Unique_RC_C2T
335
336 #----------------------------------------------------------------
337 # Post-filtering reads
338
339 # ---- FW ----
340 FW_regions = dict()
341 gseq = dict()
342 chr_length = dict()
343 for header in FW_uniq_lst :
344 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
345 original_BS = original_bs_reads[header]
346 if mapped_chr not in FW_regions :
347 FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
348 if mapped_chr not in gseq :
349 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
350 chr_length[mapped_chr] = len(gseq[mapped_chr])
351
352 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
353 all_mapped+=1
354 FR = "+FW"
355 mapped_strand = "+"
356 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
357 mapped_location,
358 mapped_location + g_len,
359 mapped_strand)
360 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
361 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
362
363 if len(r_aln) == len(g_aln) :
364 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
365 if True in checking_genome_context :
366 try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
367 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
368 try_pos, FR)
369 if my_region_serial == 0 : # still be 0
370 # for some cases, read has no tags; searching the upstream sequence for tags
371 # print "[For debug]: FW read has no tags"
372 try_count = 0
373 try_pos = mapped_location - min_cut5_len + 1
374 while my_region_serial == 0 and try_count < MAX_TRY :
375 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
376 try_pos, FR)
377 try_pos -= 1
378 try_count += 1
379
380 #if my_region_serial == 0 :
381 # print "[For debug]: chr=", mapped_chr
382 # print "[For debug]: +FW read still can not find fragment serial"
383 # Tip: sometimes "my_region_serial" is still 0 ...
384
385
386 N_mismatch = N_MIS(r_aln, g_aln)
387 if N_mismatch <= int(max_mismatch_no) :
388 all_mapped_passed += 1
389 methy = methy_seq(r_aln, g_aln + next2bp)
390 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
391 #---XS FILTER----------------
392 XS = 0
393 nCH = methy.count('y') + methy.count('z')
394 nmCH = methy.count('Y') + methy.count('Z')
395 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
396 XS = 1
397 num_mapped_FW_C2T += 1
398 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
399 mapped_location, cigar, original_BS, methy, XS,
400 output_genome = output_genome,
401 rrbs = True,
402 my_region_serial = my_region_serial,
403 my_region_start = my_region_start,
404 my_region_end = my_region_end)
405 else :
406 print "[For debug]: reads not in same lengths"
407
408 #print "start RC"
409 # ---- RC ----
410 RC_regions = dict()
411 for header in RC_uniq_lst :
412 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
413 original_BS = original_bs_reads[header]
414 if mapped_chr not in RC_regions :
415 RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
416 if mapped_chr not in gseq :
417 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
418 chr_length[mapped_chr] = len(gseq[mapped_chr])
419
420 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
421 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
422 all_mapped+=1
423 FR = "-FW"
424 mapped_strand = "-"
425 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
426 mapped_location,
427 mapped_location + g_len,
428 mapped_strand)
429 #checking_genome_context = (output_genome[gx:gy] == check_pattern)
430 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
431 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
432
433 if len(r_aln) == len(g_aln) : # and checking_genome_context:
434 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
435 if True in checking_genome_context :
436 try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
437 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
438 try_pos , FR)
439 if my_region_serial == 0 : # still be 0
440 # for some cases, read has no tags; searching the upstream sequence for tags
441 #print "[For debug]: RC Read has no tags"
442 try_count = 0
443 try_pos = mapped_location + g_len + min_cut5_len - 2
444 while my_region_serial == 0 and try_count < MAX_TRY :
445 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
446 try_pos, FR)
447 try_pos += 1
448 try_count += 1
449
450 #if my_region_serial == 0 :
451 # print "[For debug]: chr=", mapped_chr
452 # print "[For debug]: -FW read still cannot find fragment serial"
453
454
455 N_mismatch = N_MIS(r_aln, g_aln)
456 if N_mismatch <= int(max_mismatch_no) :
457 all_mapped_passed += 1
458 methy = methy_seq(r_aln, g_aln + next2bp)
459 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
460 #---XS FILTER----------------
461 XS = 0
462 nCH = methy.count('y') + methy.count('z')
463 nmCH = methy.count('Y') + methy.count('Z')
464 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
465 XS = 1
466 num_mapped_RC_C2T += 1
467 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
468 mapped_location, cigar, original_BS, methy, XS,
469 output_genome = output_genome,
470 rrbs = True,
471 my_region_serial = my_region_serial,
472 my_region_start = my_region_start,
473 my_region_end = my_region_end)
474 else :
475 print "[For debug]: reads not in same lengths"
476
477
478 # Finished both FW and RC
479 logm("Done: %s (%d) \n" % (read_file, no_my_files))
480 print "--> %s (%d) "%(read_file, no_my_files)
481 del original_bs_reads
482 delete_files(WC2T, CC2T)
483
484 # End of directional library
485
486
487 # ====================================================
488 # un-directional library
489 # ====================================================
490
491 elif asktag=="Y" :
492 #----------------------------------------------------------------
493 logm("== Start mapping ==")
494
495 input_fname = os.path.split(main_read_file)[1]
496 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
497
498 logm("Processing read file: %s" % read_file)
499 original_bs_reads = {}
500 no_my_files+=1
501 random_id = ".tmp-"+str(random.randint(1000000,9999999))
502 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
503 outfile3=tmp_d('Trimmed_G2A.fa'+random_id)
504
505 outf2=open(outfile2,'w')
506 outf3=open(outfile3,'w')
507
508 #--- Checking input format ------------------------------------------
509 try :
510 read_inf=open(read_file,"r")
511 except IOError:
512 print "[Error] Cannot open input file : %s" % read_file
513 exit(-1)
514
515 oneline=read_inf.readline()
516 l=oneline.split()
517 n_fastq=0
518 n_fasta=0
519 input_format=""
520 if oneline[0]=="@": # FastQ
521 input_format="fastq"
522 elif len(l)==1 and oneline[0]!=">": # pure sequences
523 input_format="seq"
524 elif len(l)==11: # Illumina qseq
525 input_format="qseq"
526 elif oneline[0]==">": # fasta
527 input_format="fasta"
528 read_inf.close()
529
530 #----------------------------------------------------------------
531 seq_id = ""
532 seq = ""
533 seq_ready=0
534 for line in fileinput.input(read_file):
535 l=line.split()
536
537 if input_format == "seq":
538 all_raw_reads+=1
539 seq_id=str(all_raw_reads)
540 seq_id=seq_id.zfill(12)
541 seq=l[0]
542 seq_ready="Y"
543 elif input_format=="fastq":
544 m_fastq=math.fmod(n_fastq,4)
545 n_fastq+=1
546 seq_ready="N"
547 if m_fastq==0:
548 all_raw_reads+=1
549 seq_id=str(all_raw_reads)
550 seq_id=seq_id.zfill(12)
551 seq=""
552 elif m_fastq==1:
553 seq=l[0]
554 seq_ready="Y"
555 else:
556 seq=""
557 elif input_format=="qseq":
558 all_raw_reads+=1
559 seq_id=str(all_raw_reads)
560 seq_id=seq_id.zfill(12)
561 seq=l[8]
562 seq_ready="Y"
563 elif input_format=="fasta":
564 m_fasta=math.fmod(n_fasta,2)
565 n_fasta+=1
566 seq_ready="N"
567 if m_fasta==0:
568 all_raw_reads+=1
569 seq_id=l[0][1:]
570 seq=""
571 elif m_fasta==1:
572 seq=l[0]
573 seq_ready="Y"
574 else:
575 seq=""
576 #---------------------------------------------------------------
577 if seq_ready=="Y":
578 # Normalize the characters
579 seq=seq.upper().replace(".","N")
580
581 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
582 if len(read_tag) > 0 :
583 all_tagged += 1
584 for i in read_tag :
585 n_cut_tag_lst[i] += 1
586
587 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default
588
589 #-- Trimming adapter sequence ---
590 if adapter_seq != "" :
591 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
592 if len(new_read) < len(seq) :
593 all_tagged_trimmed += 1
594 seq = new_read
595 if len(seq) <= 4 :
596 seq = "N" * (cut_e - cut_s)
597
598 # all reads will be considered, regardless of tags
599 #--------- trimmed_raw_BS_read and qscore ------------------
600 original_bs_reads[seq_id] = seq
601 #--------- FW_C2T ------------------
602 outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
603 #--------- RC_G2A ------------------
604 outf3.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))
605 fileinput.close()
606
607 outf2.close()
608
609 delete_files(read_file)
610 logm("Processing input is done")
611 #--------------------------------------------------------------------------------
612
613 # mapping
614 #--------------------------------------------------------------------------------
615 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
616 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
617 WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
618 CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
619
620 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
621 'input_file' : outfile2,
622 'output_file' : WC2T},
623 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
624 'input_file' : outfile2,
625 'output_file' : CC2T},
626 aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
627 'input_file' : outfile3,
628 'output_file' : WG2A},
629 aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
630 'input_file' : outfile3,
631 'output_file' : CG2A} ])
632
633 logm("Aligning reads is done")
634
635 delete_files(outfile2)
636
637 #--------------------------------------------------------------------------------
638 # Post processing
639 #--------------------------------------------------------------------------------
640
641 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
642 RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)
643
644 FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
645 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
646
647 logm("Extracting alignments is done")
648
649 #----------------------------------------------------------------
650 # get unique-hit reads
651 #----------------------------------------------------------------
652 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
653
654 Unique_FW_C2T=set() # +
655 Unique_RC_G2A=set() # +
656 Unique_FW_G2A=set() # -
657 Unique_RC_C2T=set() # -
658 Multiple_hits=set()
659
660 for x in Union_set:
661 _list=[]
662 for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
663 mis_lst=dx.get(x,[99])
664 mis=int(mis_lst[0])
665 _list.append(mis)
666 for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
667 mis=dx.get(x,99)
668 _list.append(mis)
669 mini=min(_list)
670 if _list.count(mini) == 1:
671 mini_index=_list.index(mini)
672 if mini_index == 0:
673 Unique_FW_C2T.add(x)
674 elif mini_index == 1:
675 Unique_RC_G2A.add(x)
676 elif mini_index == 2:
677 Unique_FW_G2A.add(x)
678 elif mini_index == 3:
679 Unique_RC_C2T.add(x)
680 else :
681 Multiple_hits.add(x)
682 else :
683 Multiple_hits.add(x)
684 # write reads rejected by Multiple Hits to file
685 if show_multiple_hit :
686 outf_MH=open("Multiple_hit.fa",'w')
687 for i in Multiple_hits :
688 outf_MH.write(">%s\n" % i)
689 outf_MH.write("%s\n" % original_bs_reads[i])
690 outf_MH.close()
691
692 del Union_set
693 del FW_C2T_R
694 del FW_G2A_R
695 del RC_C2T_R
696 del RC_G2A_R
697
698 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
699 FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
700 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
701 RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
702 FW_C2T_uniq_lst.sort()
703 RC_C2T_uniq_lst.sort()
704 FW_G2A_uniq_lst.sort()
705 RC_G2A_uniq_lst.sort()
706 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
707 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
708 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
709 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
710
711 del Unique_FW_C2T
712 del Unique_FW_G2A
713 del Unique_RC_C2T
714 del Unique_RC_G2A
715
716
717 #----------------------------------------------------------------
718 # Post-filtering reads
719 # ---- FW_C2T ---- undirectional
720 FW_regions = dict()
721 gseq = dict()
722 chr_length = dict()
723 for header in FW_C2T_uniq_lst :
724 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
725 original_BS = original_bs_reads[header]
726 if mapped_chr not in FW_regions :
727 FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
728 if mapped_chr not in gseq :
729 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
730 chr_length[mapped_chr] = len(gseq[mapped_chr])
731
732 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
733 all_mapped+=1
734 FR = "+FW"
735 mapped_strand = "+"
736 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
737 mapped_location,
738 mapped_location + g_len,
739 mapped_strand)
740 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
741 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
742
743 if len(r_aln) == len(g_aln) :
744 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
745 if True in checking_genome_context :
746 try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
747 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
748 try_pos, FR)
749 if my_region_serial == 0 : # still be 0
750 # for some cases, read has no tags; searching the upstream sequence for tags
751 # print "[For debug]: FW read has no tags"
752 try_count = 0
753 try_pos = mapped_location - min_cut5_len + 1
754 while my_region_serial == 0 and try_count < MAX_TRY :
755 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
756 try_pos, FR)
757 try_pos -= 1
758 try_count += 1
759
760 N_mismatch = N_MIS(r_aln, g_aln)
761 if N_mismatch <= int(max_mismatch_no) :
762 all_mapped_passed += 1
763 methy = methy_seq(r_aln, g_aln + next2bp)
764 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
765 #---XS FILTER----------------
766 XS = 0
767 nCH = methy.count('y') + methy.count('z')
768 nmCH = methy.count('Y') + methy.count('Z')
769 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
770 XS = 1
771 num_mapped_FW_C2T += 1
772 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
773 mapped_location, cigar, original_BS, methy, XS,
774 output_genome = output_genome,
775 rrbs = True,
776 my_region_serial = my_region_serial,
777 my_region_start = my_region_start,
778 my_region_end = my_region_end)
779 else :
780 print "[For debug]: reads not in same lengths"
781
782
783 # ---- RC_C2T ---- undirectional
784 RC_regions = dict()
785 for header in RC_C2T_uniq_lst :
786 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
787 original_BS = original_bs_reads[header]
788 if mapped_chr not in RC_regions :
789 RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
790 if mapped_chr not in gseq :
791 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
792 chr_length[mapped_chr] = len(gseq[mapped_chr])
793
794 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
795 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
796 all_mapped+=1
797 FR = "-FW"
798 mapped_strand = "-"
799 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
800 mapped_location,
801 mapped_location + g_len,
802 mapped_strand)
803 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
804 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
805
806 if len(r_aln) == len(g_aln) : # and checking_genome_context:
807 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
808 if True in checking_genome_context :
809 try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
810 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
811 try_pos , FR)
812 if my_region_serial == 0 : # still be 0
813 # for some cases, read has no tags; searching the upstream sequence for tags
814 #print "[For debug]: RC Read has no tags"
815 try_count = 0
816 try_pos = mapped_location + g_len + min_cut5_len - 2
817 while my_region_serial == 0 and try_count < MAX_TRY :
818 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
819 try_pos, FR)
820 try_pos += 1
821 try_count += 1
822
823 N_mismatch = N_MIS(r_aln, g_aln)
824 if N_mismatch <= int(max_mismatch_no) :
825 all_mapped_passed += 1
826 methy = methy_seq(r_aln, g_aln + next2bp)
827 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
828 #---XS FILTER----------------
829 XS = 0
830 nCH = methy.count('y') + methy.count('z')
831 nmCH = methy.count('Y') + methy.count('Z')
832 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
833 XS = 1
834 num_mapped_RC_C2T += 1
835 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
836 mapped_location, cigar, original_BS, methy, XS,
837 output_genome = output_genome,
838 rrbs = True,
839 my_region_serial = my_region_serial,
840 my_region_start = my_region_start,
841 my_region_end = my_region_end)
842
843 else :
844 print "[For debug]: reads not in same lengths"
845
846
847 # ---- FW_G2A ---- undirectional
848 FW_regions = dict()
849 gseq = dict()
850 chr_length = dict()
851 for header in FW_G2A_uniq_lst :
852 _, mapped_chr, mapped_location, cigar = FW_G2A_U[header]
853 original_BS = original_bs_reads[header]
854 if mapped_chr not in FW_regions :
855 FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
856 if mapped_chr not in gseq :
857 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
858 chr_length[mapped_chr] = len(gseq[mapped_chr])
859 cigar = list(reversed(cigar))
860
861 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
862 all_mapped+=1
863 FR = "-RC"
864 mapped_strand = "-"
865 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
866 mapped_location,
867 mapped_location + g_len,
868 mapped_strand)
869 original_BS = reverse_compl_seq(original_BS) # for RC reads
870 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
871 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
872
873 if len(r_aln) == len(g_aln) :
874 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
875 if True in checking_genome_context :
876 try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
877 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
878 try_pos, FR)
879 if my_region_serial == 0 : # still be 0
880 # for some cases, read has no tags; searching the upstream sequence for tags
881 #print "[For debug]: FW read has no tags"
882 try_count = 0
883 try_pos = mapped_location - min_cut5_len + 1
884 while my_region_serial == 0 and try_count < MAX_TRY :
885 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
886 try_pos, FR)
887 try_pos += 1
888 try_count += 1
889 #if my_region_serial == 0 :
890 # print "[For debug]: chr=", mapped_chr
891 # print "[For debug]: FW_G2A read still can not find fragment serial"
892 # Tip: sometimes "my_region_serial" is still 0 ...
893
894
895 N_mismatch = N_MIS(r_aln, g_aln)
896 if N_mismatch <= int(max_mismatch_no) :
897 all_mapped_passed += 1
898 methy = methy_seq(r_aln, g_aln + next2bp)
899 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
900 #---XS FILTER----------------
901 XS = 0
902 nCH = methy.count('y') + methy.count('z')
903 nmCH = methy.count('Y') + methy.count('Z')
904 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
905 XS = 1
906 num_mapped_FW_G2A += 1
907 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
908 mapped_location, cigar, original_BS, methy, XS,
909 output_genome = output_genome,
910 rrbs = True,
911 my_region_serial = my_region_serial,
912 my_region_start = my_region_start,
913 my_region_end = my_region_end)
914 else :
915 print "[For debug]: reads not in same lengths"
916
917
918 # ---- RC_G2A ---- undirectional
919 RC_regions = dict()
920 for header in RC_G2A_uniq_lst :
921 _, mapped_chr, mapped_location, cigar = RC_G2A_U[header]
922 original_BS = original_bs_reads[header]
923 if mapped_chr not in RC_regions :
924 RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
925 if mapped_chr not in gseq :
926 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
927 chr_length[mapped_chr] = len(gseq[mapped_chr])
928 cigar = list(reversed(cigar))
929
930 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
931 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
932 all_mapped+=1
933 FR = "+RC"
934 mapped_strand = "+"
935 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
936 mapped_location,
937 mapped_location + g_len,
938 mapped_strand)
939 original_BS = reverse_compl_seq(original_BS) # for RC reads
940 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
941 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
942
943 if len(r_aln) == len(g_aln) : # and checking_genome_context:
944 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
945 if True in checking_genome_context :
946 try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
947 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
948 mapped_location + g_len + min_cut5_len -1, FR)
949 if my_region_serial == 0 : # still be 0
950 # for some cases, read has no tags; searching the upstream sequence for tags
951 #print "[For debug]: RC Read has no tags"
952 try_count = 0
953 try_pos = mapped_location + g_len + min_cut5_len - 2
954 while try_count < MAX_TRY :
955 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
956 try_pos, FR)
957 try_pos += 1
958 try_count += 1
959
960 #if my_region_serial == 0 :
961 # print "[For debug]: chr=", mapped_chr
962 # print "[For debug]: RC_C2A read still cannot find fragment serial"
963
964
965 N_mismatch = N_MIS(r_aln, g_aln)
966 if N_mismatch <= int(max_mismatch_no) :
967 all_mapped_passed += 1
968 methy = methy_seq(r_aln, g_aln + next2bp)
969 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
970 #---XS FILTER----------------
971 XS = 0
972 nCH = methy.count('y') + methy.count('z')
973 nmCH = methy.count('Y') + methy.count('Z')
974 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
975 XS = 1
976 num_mapped_RC_G2A += 1
977 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
978 mapped_location, cigar, original_BS, methy, XS,
979 output_genome = output_genome,
980 rrbs = True,
981 my_region_serial = my_region_serial,
982 my_region_start = my_region_start,
983 my_region_end = my_region_end)
984 else :
985 print "[For debug]: reads not in same lengths"
986
987
988
989 # Finished both FW and RC
990 logm("Done: %s (%d) \n" % (read_file, no_my_files))
991 print "--> %s (%d) "%(read_file, no_my_files)
992 del original_bs_reads
993 delete_files(WC2T, CC2T, WG2A, CG2A)
994
995
996
997 # End of un-directional library
998
999 delete_files(tmp_path)
1000
1001
1002 logm("O Number of raw reads: %d "% all_raw_reads)
1003 if all_raw_reads >0:
1004 logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
1005 for kk in range(len(n_cut_tag_lst)):
1006 logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
1007 logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed)
1008 logm("O Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
1009 logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped)
1010
1011 logm("O ------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
1012 logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))
1013
1014 if asktag=="Y": # undiretional
1015 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
1016 logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) )
1017 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
1018 logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) )
1019 # the variable name 'num_mapped_RC_G2A' seems not consistent with illustration
1020 # according to literal meaning
1021 elif asktag=="N": # directional
1022 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
1023 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
1024
1025 n_CG=mC_lst[0]+uC_lst[0]
1026 n_CHG=mC_lst[1]+uC_lst[1]
1027 n_CHH=mC_lst[2]+uC_lst[2]
1028
1029 logm("----------------------------------------------")
1030 logm("M Methylated C in mapped reads ")
1031 logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
1032 logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
1033 logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
1034 logm("----------------------------------------------")
1035 logm("------------------- END ----------------------")
1036
1037 elapsed(main_read_file)
1038
1039 close_log()
1040
1041