BSseeker2/bs_align/bs_single_end.py @ 0:e6df770c0e58 (draft)

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
children 8b26adf64adc

import fileinput, os, time, random, math
from bs_utils.utils import *
from bs_align_utils import *

#----------------------------------------------------------------
# Read the mapped results and return dictionaries of unique-hit and
# multiple-hit reads. The function assumes that at most 2 hits per read
# are reported in a single aligner output file.
def extract_mapping(ali_file):
    unique_hits = {}
    non_unique_hits = {}

    header0 = ""
    lst = []

    for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file):
        #------------------------------
        if header != header0:
            #---------- output -----------
            if len(lst) == 1:
                unique_hits[header0] = lst[0] # (no_mismatch, chr, location, cigar)
            elif len(lst) > 1:
                min_lst = min(lst, key = lambda x: x[0])
                max_lst = max(lst, key = lambda x: x[0])

                if min_lst[0] < max_lst[0]:
                    unique_hits[header0] = min_lst
                else:
                    non_unique_hits[header0] = min_lst[0]
                    #print "multiple hit", header, chr, location, no_mismatch, cigar # test
            header0 = header
            lst = [(no_mismatch, chr, location, cigar)]
        else: # header == header0, same header (read id)
            lst.append((no_mismatch, chr, location, cigar))

    # flush the hits of the last read after the loop ends
    if len(lst) == 1:
        unique_hits[header0] = lst[0] # (no_mismatch, chr, location, cigar)
    elif len(lst) > 1:
        min_lst = min(lst, key = lambda x: x[0])
        max_lst = max(lst, key = lambda x: x[0])

        if min_lst[0] < max_lst[0]:
            unique_hits[header0] = min_lst
        else:
            non_unique_hits[header0] = min_lst[0]

    return unique_hits, non_unique_hits

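# Illustrative example of extract_mapping (hit tuples shortened, CIGARs omitted;
# names and numbers are made up, not from real data): given aligner output with
#   readA -> hits (1, 'chr1', 100, ...), (3, 'chr1', 500, ...)
#   readB -> hits (2, 'chr2', 200, ...), (2, 'chr3', 300, ...)
# it returns
#   unique_hits     == {'readA': (1, 'chr1', 100, ...)}  # strictly best hit kept
#   non_unique_hits == {'readB': 2}                      # tied best hits rejected
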
def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines,
                  max_mismatch_no, aligner_command, db_path, tmp_path, outfile,
                  XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
    #----------------------------------------------------------------
    # adapter : strand-specific or not
    adapter=""
    adapter_fw=""
    adapter_rc=""
    if adapter_file !="":
        try :
            adapter_inf=open(adapter_file,"r")
        except IOError:
            print "[Error] Cannot open adapter file : %s" % adapter_file
            exit(-1)
        if asktag == "N": #<--- directional library
            adapter=adapter_inf.readline()
            adapter=adapter.rstrip("\n")
        elif asktag == "Y": #<--- un-directional library
            adapter_fw=adapter_inf.readline()
            adapter_rc=adapter_inf.readline()
            adapter_fw=adapter_fw.rstrip("\n")
            adapter_rc=adapter_rc.rstrip("\n")
        adapter_inf.close() # close once here, rather than again inside each branch
    #----------------------------------------------------------------

    #----------------------------------------------------------------
    logm("Read filename: %s" % main_read_file)
    logm("Un-directional library: %s" % asktag)
    logm("The first base (for mapping): %d" % cut1)
    logm("The last base (for mapping): %d" % cut2)
    logm("Max. lines per mapping: %d" % no_small_lines)
    logm("Aligner: %s" % aligner_command)
    logm("Reference genome library path: %s" % db_path)
    logm("Number of mismatches allowed: %s" % max_mismatch_no)
    if adapter_file !="":
        if asktag=="N":
            logm("Adapter to be removed from the 3' end of the reads: %s" % adapter)
        elif asktag=="Y":
            logm("Adapter to be removed from the 3' end of FW reads: %s" % adapter_fw)
            logm("Adapter to be removed from the 3' end of RC reads: %s" % adapter_rc)
    #----------------------------------------------------------------

    # helper methods to join a filename with tmp_path / db_path
    tmp_d = lambda fname: os.path.join(tmp_path, fname)
    db_d = lambda fname: os.path.join(db_path, fname)
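    # Illustrative only: with tmp_path="/scratch/run1",
    #   tmp_d("reads.fa-s-1") -> "/scratch/run1/reads.fa-s-1"
    # db_d behaves the same way under db_path.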

    #----------------------------------------------------------------
    # splitting the big read file

    input_fname = os.path.split(main_read_file)[1]

    # split_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines)
    # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path)
    #                   if splitted_file.startswith("%s-s-" % input_fname))

    #---- Stats ------------------------------------------------------------
    all_raw_reads=0
    all_trimmed=0
    all_mapped=0
    all_mapped_passed=0

    numbers_premapped_lst=[0,0,0,0]
    numbers_mapped_lst=[0,0,0,0]

    mC_lst=[0,0,0]
    uC_lst=[0,0,0]

    no_my_files=0

    #----------------------------------------------------------------
    logm("== Start mapping ==")

    for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
    # for read_file in my_files:
        original_bs_reads = {}
        no_my_files+=1
        random_id = ".tmp-"+str(random.randint(1000000,9999999))

        #-------------------------------------------------------------------
        # un-directional sequencing
        #-------------------------------------------------------------------
        if asktag=="Y":

            #----------------------------------------------------------------
            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
            outfile3=tmp_d('Trimmed_G2A.fa'+random_id)

            outf2=open(outfile2,'w')
            outf3=open(outfile3,'w')

            #----------------------------------------------------------------
            # detect format of input file
            try :
                read_inf=open(read_file,"r")
            except IOError :
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            oneline=read_inf.readline()
            l=oneline.split()
            input_format=""
            if oneline[0]=="@": # fastq
                input_format="fastq"
                n_fastq=0
            elif len(l)==1 and oneline[0]!=">": # pure sequences
                input_format="seq"
            elif len(l)==11: # qseq
                input_format="qseq"
            elif oneline[0]==">": # fasta
                input_format="fasta"
                n_fasta=0
            read_inf.close()
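
            # Format sniffing, by example (first lines illustrative, not real data):
            #   fastq : "@read_0001 ..."         -> starts with "@"
            #   fasta : ">read_0001"             -> starts with ">"
            #   qseq  : 11 tab-separated fields  -> sequence sits in field 9 (l[8])
            #   seq   : one bare sequence per line, e.g. "ACGTTGCA..."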

            #----------------------------------------------------------------
            # read sequence, remove adapter and convert
            read_id=""
            seq=""
            seq_ready="N"
            for line in fileinput.input(read_file):
                l=line.split()

                if input_format=="seq":
                    all_raw_reads+=1
                    read_id=str(all_raw_reads)
                    read_id=read_id.zfill(12)
                    seq=l[0]
                    seq_ready="Y"
                elif input_format=="fastq":
                    m_fastq=math.fmod(n_fastq,4)
                    n_fastq+=1
                    seq_ready="N"
                    if m_fastq==0:
                        all_raw_reads+=1
                        read_id=str(all_raw_reads)
                        read_id=read_id.zfill(12)
                        seq=""
                    elif m_fastq==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""
                elif input_format=="qseq":
                    all_raw_reads+=1
                    read_id=str(all_raw_reads)
                    read_id=read_id.zfill(12)
                    seq=l[8]
                    seq_ready="Y"
                elif input_format=="fasta":
                    m_fasta=math.fmod(n_fasta,2)
                    n_fasta+=1
                    seq_ready="N"
                    if m_fasta==0:
                        all_raw_reads+=1
                        #read_id=str(all_raw_reads)
                        read_id=l[0][1:]
                        seq=""
                    elif m_fasta==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""

                #----------------------------------------------------------------
                if seq_ready=="Y":
                    seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52
                    seq=seq.upper()
                    seq=seq.replace(".","N")

                    # stripping the BS adapter from the 3' end of the read
                    if (adapter_fw !="") and (adapter_rc !="") :
                        new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch)
                        new_read = Remove_5end_Adapter(new_read, adapter_rc)
                        if len(new_read) < len(seq) :
                            all_trimmed += 1
                        seq = new_read

                    # mask reads trimmed to (almost) nothing so they keep their
                    # slot but cannot be mapped
                    if len(seq)<=4:
                        seq = "N" * (cut2-cut1+1)

                    #--------- trimmed_raw_BS_read ------------------
                    original_bs_reads[read_id] = seq

                    #--------- FW_C2T ------------------
                    outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
                    #--------- RC_G2A ------------------
                    outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))

            fileinput.close()

            outf2.close()
            outf3.close()

            delete_files(read_file)
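
            # Three-letter conversion, by example: for seq = "ACGTC"
            #   C2T copy -> "ATGTT" (every C becomes T, matched against the C2T genome)
            #   G2A copy -> "ACATC" (every G becomes A, matched against the G2A genome)
            # Bisulfite treatment makes unmethylated C read as T, so collapsing
            # the alphabet this way lets an ordinary aligner place the reads.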

            #--------------------------------------------------------------------------------
            # Bowtie mapping
            #--------------------------------------------------------------------------------
            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
            WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
            CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)

            # print aligner_command % {'int_no_mismatches' : int_no_mismatches,
            #                          'reference_genome' : os.path.join(db_path,'W_C2T'),
            #                          'input_file' : outfile2,
            #                          'output_file' : WC2T}

            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : WC2T},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : CC2T},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
                                                 'input_file' : outfile3,
                                                 'output_file' : WG2A},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
                                                 'input_file' : outfile3,
                                                 'output_file' : CG2A} ])
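
            # aligner_command is a %-style template supplied by the caller;
            # substitution works like this (command and paths illustrative only):
            #   "bowtie ... %(reference_genome)s -f %(input_file)s %(output_file)s" \
            #       % {'reference_genome':'g/W_C2T', 'input_file':'r.fa', 'output_file':'w.map'}
            # The four jobs cover the four possible origins of a read in an
            # un-directional library: C2T reads vs. the Watson/Crick C2T indexes,
            # and G2A reads vs. the Watson/Crick G2A indexes.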

            delete_files(outfile2, outfile3)

            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------

            FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
            RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)

            FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
            RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)

            #----------------------------------------------------------------
            # get unique-hit reads
            #----------------------------------------------------------------
            Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T=set() # +
            Unique_RC_G2A=set() # +
            Unique_FW_G2A=set() # -
            Unique_RC_C2T=set() # -
            Multiple_hits=set()

            for x in Union_set:
                _list=[]
                for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
                    mis_lst=d.get(x,[99])
                    mis=int(mis_lst[0])
                    _list.append(mis)
                for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
                    mis=d.get(x,99)
                    _list.append(mis)
                mini=min(_list)
                if _list.count(mini) == 1:
                    mini_index=_list.index(mini)
                    if mini_index == 0:
                        Unique_FW_C2T.add(x)
                    elif mini_index == 1:
                        Unique_RC_G2A.add(x)
                    elif mini_index == 2:
                        Unique_FW_G2A.add(x)
                    elif mini_index == 3:
                        Unique_RC_C2T.add(x)
                    else :
                        # mini_index in 4..7: the best hit is itself a multiple
                        # hit within one of the four alignments
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)
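
            # By example (mismatch counts illustrative): _list holds eight
            # entries, the four unique-hit mismatch counts followed by the four
            # multiple-hit counts, with 99 meaning "absent". A read with
            # [2,99,99,99,99,99,99,99] is kept as a unique FW_C2T hit, while
            # [2,2,99,99,99,99,99,99] ties across alignments and the read is
            # sent to Multiple_hits.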
            # write reads rejected as Multiple Hits to file; append, because
            # this block runs once per split chunk of the input
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'a')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            del Union_set
            del FW_C2T_R
            del FW_G2A_R
            del RC_C2T_R
            del RC_G2A_R

            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
            FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
            RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
            FW_C2T_uniq_lst.sort()
            RC_C2T_uniq_lst.sort()
            FW_G2A_uniq_lst.sort()
            RC_G2A_uniq_lst.sort()
            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
            FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
            RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
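
            # The build-sort-strip above orders read ids by chromosome
            # (U[u][1] is the chromosome name), so the per-chromosome genome
            # sequences below are deserialized in long runs rather than in a
            # random back-and-forth order.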

            #----------------------------------------------------------------
            # count the pre-filtering hits per strand class while the four
            # Unique_* sets still exist
            numbers_premapped_lst[0] += len(Unique_FW_C2T)
            numbers_premapped_lst[1] += len(Unique_RC_G2A)
            numbers_premapped_lst[2] += len(Unique_FW_G2A)
            numbers_premapped_lst[3] += len(Unique_RC_C2T)

            del Unique_FW_C2T
            del Unique_FW_G2A
            del Unique_RC_C2T
            del Unique_RC_G2A

            #----------------------------------------------------------------

            nn=0
            gseq = dict()
            chr_length = dict()
            for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),
                                            (RC_G2A_uniq_lst,RC_G2A_U),
                                            (FW_G2A_uniq_lst,FW_G2A_U),
                                            (RC_C2T_uniq_lst,RC_C2T_U)]:
                nn += 1
                mapped_chr0 = ""

                for header in ali_unique_lst:

                    _, mapped_chr, mapped_location, cigar = ali_dic[header]

                    original_BS = original_bs_reads[header]
                    #-------------------------------------
                    if mapped_chr not in gseq:
                        gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                        chr_length[mapped_chr] = len(gseq[mapped_chr])
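                    # each chromosome is deserialized from the prepared
                    # reference under db_path at most once and cached in gseq;
                    # chr_length feeds the coordinate flips below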

                    if nn == 2 or nn == 3:
                        # these reads are used in reverse-complement orientation
                        # below, so flip the CIGAR to match
                        cigar = list(reversed(cigar))
                    r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)

                    all_mapped += 1

                    if nn == 1: # +FW mapped to + strand:
                        FR = "+FW"
                        mapped_strand="+"

                    elif nn == 2: # +RC mapped to + strand:
                        FR = "+RC" # RC read; its methylation reflects the Watson strand (+)
                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len
                        mapped_strand = "+"
                        original_BS = reverse_compl_seq(original_BS) # for RC reads

                    elif nn == 3: # -RC mapped to - strand:
                        mapped_strand = "-"
                        FR = "-RC" # RC read; its methylation reflects the Crick strand (-)
                        original_BS = reverse_compl_seq(original_BS) # for RC reads

                    elif nn == 4: # -FW mapped to - strand:
                        mapped_strand = "-"
                        FR = "-FW"
                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len

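                    # Coordinate flip, by example (numbers illustrative): hits
                    # on a C_* (Crick) index are positions in the reversed and
                    # complemented reference, so with chr_length=100,
                    # mapped_location=10 and g_len=20 the forward-strand start
                    # is 100 - 10 - 20 = 70.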
                    origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)

                    r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                    if len(r_aln)==len(g_aln):
                        N_mismatch = N_MIS(r_aln, g_aln)
                        if N_mismatch <= int(max_mismatch_no):
                            numbers_mapped_lst[nn-1] += 1
                            all_mapped_passed += 1
                            methy = methy_seq(r_aln, g_aln + next)
                            mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                            #---XS FILTER----------------
                            XS = 0
                            nCH = methy.count('y') + methy.count('z')
                            nmCH = methy.count('Y') + methy.count('Z')
                            if (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct :
                                XS = 1
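                            # XS flags reads that look incompletely bisulfite
                            # converted. Example (values illustrative): with
                            # XS_count=3 and XS_pct=0.5, a read with nmCH=5
                            # methylated and nCH=2 unmethylated non-CpG calls
                            # gives 5/7 = 0.71 > 0.5, so XS=1.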

                            outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)

            #----------------------------------------------------------------
            logm("--> %s (%d) " % (read_file, no_my_files))
            delete_files(WC2T, WG2A, CC2T, CG2A)


        #--------------------------------------------------------------------
        # directional sequencing
        #--------------------------------------------------------------------

        if asktag=="N":
            #----------------------------------------------------------------
            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
            outf2=open(outfile2,'w')

            n=0
            #----------------------------------------------------------------
            try :
                read_inf=open(read_file,"r")
            except IOError :
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            oneline=read_inf.readline()
            l=oneline.split()
            input_format=""
            if oneline[0]=="@": # FastQ
                input_format="fastq"
                n_fastq=0
            elif len(l)==1 and oneline[0]!=">": # pure sequences
                input_format="seq"
            elif len(l)==11: # Illumina GAII qseq file
                input_format="qseq"
            elif oneline[0]==">": # fasta
                input_format="fasta"
                n_fasta=0
            read_inf.close()
            #print "detected data format: %s"%(input_format)
            #----------------------------------------------------------------
            read_id=""
            seq=""
            seq_ready="N"
            for line in fileinput.input(read_file):
                l=line.split()
                if input_format=="seq":
                    all_raw_reads+=1
                    read_id=str(all_raw_reads)
                    read_id=read_id.zfill(12)
                    seq=l[0]
                    seq_ready="Y"
                elif input_format=="fastq":
                    m_fastq=math.fmod(n_fastq,4)
                    n_fastq+=1
                    seq_ready="N"
                    if m_fastq==0:
                        all_raw_reads+=1
                        read_id=str(all_raw_reads)
                        read_id=read_id.zfill(12)
                        seq=""
                    elif m_fastq==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""
                elif input_format=="qseq":
                    all_raw_reads+=1
                    read_id=str(all_raw_reads)
                    read_id=read_id.zfill(12)
                    seq=l[8]
                    seq_ready="Y"
                elif input_format=="fasta":
                    m_fasta=math.fmod(n_fasta,2)
                    n_fasta+=1
                    seq_ready="N"
                    if m_fasta==0:
                        all_raw_reads+=1
                        read_id=l[0][1:]
                        seq=""
                    elif m_fasta==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""

                #--------------------------------
                if seq_ready=="Y":
                    seq=seq[cut1-1:cut2] #<--- selecting 0..52 from 1..72 -e 52
                    seq=seq.upper()
                    seq=seq.replace(".","N")

                    #-- stripping the adapter from the 3' end of the read -------
                    if adapter != "":
                        new_read = RemoveAdapter(seq, adapter, adapter_mismatch)
                        if len(new_read) < len(seq) :
                            all_trimmed += 1
                        seq = new_read

                    if len(seq)<=4:
                        seq = "N" * (cut2-cut1+1)

                    #--------- trimmed_raw_BS_read ------------------
                    original_bs_reads[read_id] = seq

                    #--------- FW_C2T ------------------
                    outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))

            fileinput.close()

            outf2.close()
            delete_files(read_file)

            #--------------------------------------------------------------------------------
            # Bowtie mapping
            #--------------------------------------------------------------------------------
            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)

            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : WC2T},
                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : CC2T} ])

            delete_files(outfile2)

            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------

            FW_C2T_U, FW_C2T_R = extract_mapping(WC2T)
            RC_C2T_U, RC_C2T_R = extract_mapping(CC2T)

            #----------------------------------------------------------------
            # get unique-hit reads
            #----------------------------------------------------------------
            Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T = set() # +
            Unique_RC_C2T = set() # -
            Multiple_hits=set()

            for x in Union_set:
                _list=[]
                for d in [FW_C2T_U,RC_C2T_U]:
                    mis_lst=d.get(x,[99])
                    mis=int(mis_lst[0])
                    _list.append(mis)
                for d in [FW_C2T_R,RC_C2T_R]:
                    mis=d.get(x,99)
                    _list.append(mis)
                mini=min(_list)
                #print _list
                if _list.count(mini)==1:
                    mini_index=_list.index(mini)
                    if mini_index==0:
                        Unique_FW_C2T.add(x)
                    elif mini_index==1:
                        Unique_RC_C2T.add(x)
                    else:
                        # mini_index 2 or 3: the best hit is itself a multiple hit
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)
            # write reads rejected as Multiple Hits to file; append, because
            # this block runs once per split chunk of the input
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'a')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
            FW_C2T_uniq_lst.sort()
            RC_C2T_uniq_lst.sort()
            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]

            #----------------------------------------------------------------

            numbers_premapped_lst[0] += len(Unique_FW_C2T)
            numbers_premapped_lst[1] += len(Unique_RC_C2T)

            #----------------------------------------------------------------

            nn = 0
            gseq = dict()
            chr_length = dict()
            for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]:
                nn += 1
                mapped_chr0 = ""
                for header in ali_unique_lst:
                    _, mapped_chr, mapped_location, cigar = ali_dic[header]
                    original_BS = original_bs_reads[header]
                    #-------------------------------------
                    if mapped_chr not in gseq :
                        gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                        chr_length[mapped_chr] = len(gseq[mapped_chr])
                    #if mapped_chr != mapped_chr0:
                    #    my_gseq = deserialize(db_d(mapped_chr))
                    #    chr_length = len(my_gseq)
                    #    mapped_chr0 = mapped_chr
                    #-------------------------------------

                    r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)

                    all_mapped+=1
                    if nn == 1: # +FW mapped to + strand:
                        FR = "+FW"
                        mapped_strand = "+"
                    elif nn == 2: # -FW mapped to - strand:
                        mapped_strand = "-"
                        FR = "-FW"
                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len

                    origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
                    r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                    if len(r_aln) == len(g_aln):
                        N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft-clipped nucleotides
                        if N_mismatch <= int(max_mismatch_no):
                            numbers_mapped_lst[nn-1] += 1
                            all_mapped_passed += 1
                            methy = methy_seq(r_aln, g_aln+next)
                            mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                            #---XS FILTER----------------
                            XS = 0
                            nCH = methy.count('y') + methy.count('z')
                            nmCH = methy.count('Y') + methy.count('Z')
                            if (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct :
                                XS = 1

                            outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)

            #----------------------------------------------------------------
            logm("--> %s (%d) " % (read_file, no_my_files))
            delete_files(WC2T, CC2T)


    #----------------------------------------------------------------

    # outf.close()
    delete_files(tmp_path)

    logm("Number of raw reads: %d \n" % all_raw_reads)
    if all_raw_reads > 0:
        logm("Number of reads having adapter removed: %d \n" % all_trimmed)
        # NB: Multiple_hits holds the rejected reads of the last processed
        # chunk only, so this count is a lower bound for split inputs
        logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits))
        logm("Number of unique-hit reads for post-filtering: %d\n" % all_mapped)
        if asktag=="Y":
            logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)" % (numbers_premapped_lst[0]))
            logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)" % (numbers_premapped_lst[1]))
            logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)" % (numbers_premapped_lst[2]))
            logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)" % (numbers_premapped_lst[3]))
        elif asktag=="N":
            logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)" % (numbers_premapped_lst[0]))
            logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)" % (numbers_premapped_lst[1]))

        logm("Post-filtering %d uniquely aligned reads with mismatches <= %s" % (all_mapped_passed, max_mismatch_no))
        if asktag=="Y":
            logm(" ---- %7d FW reads mapped to Watson strand" % (numbers_mapped_lst[0]))
            logm(" ---- %7d RC reads mapped to Watson strand" % (numbers_mapped_lst[1]))
            logm(" ---- %7d FW reads mapped to Crick strand" % (numbers_mapped_lst[2]))
            logm(" ---- %7d RC reads mapped to Crick strand" % (numbers_mapped_lst[3]))
        elif asktag=="N":
            logm(" ---- %7d FW reads mapped to Watson strand" % (numbers_mapped_lst[0]))
            logm(" ---- %7d FW reads mapped to Crick strand" % (numbers_mapped_lst[1]))
        logm("Mappability = %1.4f%%" % (100*float(all_mapped_passed)/all_raw_reads))

        n_CG=mC_lst[0]+uC_lst[0]
        n_CHG=mC_lst[1]+uC_lst[1]
        n_CHH=mC_lst[2]+uC_lst[2]
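
        # per-context methylation level = mC / (mC + uC); e.g. with
        # mC_lst[0]=30 and uC_lst[0]=70, mCG = 100*30/100 = 30.000%
        # (numbers illustrative)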

        logm("----------------------------------------------")
        logm("Methylated C in mapped reads")

        logm(" mCG  %1.3f%%" % ((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
        logm(" mCHG %1.3f%%" % ((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
        logm(" mCHH %1.3f%%" % ((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))

    logm("------------------- END --------------------")
    elapsed("=== END %s ===" % main_read_file)