annotate BSseeker2/bs_align/bs_single_end.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1 import fileinput, os, time, random, math
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2 from bs_utils.utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3 from bs_align_utils import *
1
weilong-guo
parents: 0
diff changeset
4 import gzip
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
5
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
6 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
7 # Read from the mapped results, return lists of unique / multiple-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
8 # The function suppose at most 2 hits will be reported in single file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
9 def extract_mapping(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
10 unique_hits = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
11 non_unique_hits = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
12
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
13 header0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
14 lst = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
15
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
16 for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
17 #------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
18 if header != header0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
19 #---------- output -----------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
20 if len(lst) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
21 unique_hits[header0] = lst[0] # [no_mismatch, chr, location]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
22 elif len(lst) > 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
23 min_lst = min(lst, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
24 max_lst = max(lst, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
25
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
26 if min_lst[0] < max_lst[0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
27 unique_hits[header0] = min_lst
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
28 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
29 non_unique_hits[header0] = min_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
30 #print "multiple hit", header, chr, location, no_mismatch, cigar # test
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
31 header0 = header
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
32 lst = [(no_mismatch, chr, location, cigar)]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
33 else: # header == header0, same header (read id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
34 lst.append((no_mismatch, chr, location, cigar))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
35
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
36 if len(lst) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
37 unique_hits[header0] = lst[0] # [no_mismatch, chr, location]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
38 elif len(lst) > 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
39 min_lst = min(lst, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
40 max_lst = max(lst, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
41
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
42 if min_lst[0] < max_lst[0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
43 unique_hits[header0] = min_lst
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
44 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
45 non_unique_hits[header0] = min_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
46
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
47
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
48 return unique_hits, non_unique_hits
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
49
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
50
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
51 def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
52 max_mismatch_no, aligner_command, db_path, tmp_path, outfile,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
53 XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
54 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55 # adapter : strand-specific or not
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
56 adapter=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
57 adapter_fw=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
58 adapter_rc=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
59 if adapter_file !="":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
60 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
61 adapter_inf=open(adapter_file,"r")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
62 except IOError:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
63 print "[Error] Cannot open adapter file : %s" % adapter_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
64 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
65 if asktag == "N": #<--- directional library
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
66 adapter=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
67 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
68 adapter=adapter.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
69 elif asktag == "Y":#<--- un-directional library
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
70 adapter_fw=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
71 adapter_rc=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
72 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
73 adapter_fw=adapter_fw.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
74 adapter_rc=adapter_rc.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
75 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
76 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
77
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
78
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
79
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
80 #----------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
81 logm("Read filename: %s" % main_read_file)
weilong-guo
parents: 0
diff changeset
82 logm("The first base (for mapping): %d"% cut1 )
weilong-guo
parents: 0
diff changeset
83 logm("The last base (for mapping): %d"% cut2 )
weilong-guo
parents: 0
diff changeset
84
weilong-guo
parents: 0
diff changeset
85 logm("Path for short reads aligner: %s"% aligner_command + '\n')
weilong-guo
parents: 0
diff changeset
86 logm("Reference genome library path: %s"% db_path )
weilong-guo
parents: 0
diff changeset
87
weilong-guo
parents: 0
diff changeset
88 if asktag == "Y" :
weilong-guo
parents: 0
diff changeset
89 logm("Un-directional library" )
weilong-guo
parents: 0
diff changeset
90 else :
weilong-guo
parents: 0
diff changeset
91 logm("Directional library")
weilong-guo
parents: 0
diff changeset
92
weilong-guo
parents: 0
diff changeset
93 logm("Number of mismatches allowed: %s"% max_mismatch_no )
weilong-guo
parents: 0
diff changeset
94
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
95 if adapter_file !="":
1
weilong-guo
parents: 0
diff changeset
96 logm("Adapter seq: %s" % adapter_fw)
weilong-guo
parents: 0
diff changeset
97 logm("-------------------------------- " )
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
98 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
99
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
100 # helper method to join fname with tmp_path
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
101 tmp_d = lambda fname: os.path.join(tmp_path, fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
102
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
103 db_d = lambda fname: os.path.join(db_path, fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
104
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
105 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
106 # splitting the big read file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
107
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
108 input_fname = os.path.split(main_read_file)[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
109
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
110 # split_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
111 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
112 # if splitted_file.startswith("%s-s-" % input_fname))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
113
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
114 #---- Stats ------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
115 all_raw_reads=0
1
weilong-guo
parents: 0
diff changeset
116 all_trimmed=0
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
117 all_mapped=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
118 all_mapped_passed=0
1
weilong-guo
parents: 0
diff changeset
119 all_base_before_trim=0
weilong-guo
parents: 0
diff changeset
120 all_base_after_trim=0
weilong-guo
parents: 0
diff changeset
121 all_base_mapped=0
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
122
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
123 numbers_premapped_lst=[0,0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
124 numbers_mapped_lst=[0,0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
125
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
126 mC_lst=[0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
127 uC_lst=[0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
128
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
129 no_my_files=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
130
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
131 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
132 logm("== Start mapping ==")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
133
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
134 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
135 # for read_file in my_files:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
136 original_bs_reads = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
137 no_my_files+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
138 random_id = ".tmp-"+str(random.randint(1000000,9999999))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
139
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
140 #-------------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
141 # un-directional sequencing
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
142 #-------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
143 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
144
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
145 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
146 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
147 outfile3=tmp_d('Trimmed_G2A.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
148
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
149 outf2=open(outfile2,'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
150 outf3=open(outfile3,'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
151
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
152 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
153 # detect format of input file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
154 try :
1
weilong-guo
parents: 0
diff changeset
155 if read_file.endswith(".gz") : # support input file ending with ".gz"
weilong-guo
parents: 0
diff changeset
156 read_inf = gzip.open(read_file, "rb")
weilong-guo
parents: 0
diff changeset
157 else :
weilong-guo
parents: 0
diff changeset
158 read_inf=open(read_file,"r")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
159 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
160 print "[Error] Cannot open input file : %s" % read_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
161 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
162
1
weilong-guo
parents: 0
diff changeset
163 oneline = read_inf.readline()
weilong-guo
parents: 0
diff changeset
164 l = oneline.split()
weilong-guo
parents: 0
diff changeset
165 input_format = ""
weilong-guo
parents: 0
diff changeset
166 if oneline[0]=="@":
weilong-guo
parents: 0
diff changeset
167 input_format = "fastq"
weilong-guo
parents: 0
diff changeset
168 elif len(l)==1 and oneline[0]!=">":
weilong-guo
parents: 0
diff changeset
169 input_format = "seq"
weilong-guo
parents: 0
diff changeset
170 elif len(l)==11:
weilong-guo
parents: 0
diff changeset
171 input_format = "qseq"
weilong-guo
parents: 0
diff changeset
172 elif oneline[0]==">":
weilong-guo
parents: 0
diff changeset
173 input_format = "fasta"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
174 read_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
175
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
176 #----------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
177 # read sequence, remove adapter and convert
weilong-guo
parents: 0
diff changeset
178 read_id = ""
weilong-guo
parents: 0
diff changeset
179 seq = ""
weilong-guo
parents: 0
diff changeset
180 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
181 line_no = 0
weilong-guo
parents: 0
diff changeset
182 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): # allow input with .gz
weilong-guo
parents: 0
diff changeset
183 l = line.split()
weilong-guo
parents: 0
diff changeset
184 line_no += 1
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
185 if input_format=="seq":
1
weilong-guo
parents: 0
diff changeset
186 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
187 read_id = str(all_raw_reads)
weilong-guo
parents: 0
diff changeset
188 read_id = read_id.zfill(12)
weilong-guo
parents: 0
diff changeset
189 seq = l[0]
weilong-guo
parents: 0
diff changeset
190 seq_ready = "Y"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
191 elif input_format=="fastq":
1
weilong-guo
parents: 0
diff changeset
192 l_fastq = math.fmod(line_no, 4)
weilong-guo
parents: 0
diff changeset
193 if l_fastq == 1 :
weilong-guo
parents: 0
diff changeset
194 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
195 read_id = l[0][1:]
weilong-guo
parents: 0
diff changeset
196 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
197 elif l_fastq == 2 :
weilong-guo
parents: 0
diff changeset
198 seq = l[0]
weilong-guo
parents: 0
diff changeset
199 seq_ready = "Y"
weilong-guo
parents: 0
diff changeset
200 else :
weilong-guo
parents: 0
diff changeset
201 seq = ""
weilong-guo
parents: 0
diff changeset
202 seq_ready = "N"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
203 elif input_format=="qseq":
1
weilong-guo
parents: 0
diff changeset
204 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
205 read_id = str(all_raw_reads)
weilong-guo
parents: 0
diff changeset
206 read_id = read_id.zfill(12)
weilong-guo
parents: 0
diff changeset
207 seq = l[8]
weilong-guo
parents: 0
diff changeset
208 seq_ready = "Y"
weilong-guo
parents: 0
diff changeset
209 elif input_format=="fasta" :
weilong-guo
parents: 0
diff changeset
210 l_fasta = math.fmod(line_no,2)
weilong-guo
parents: 0
diff changeset
211 if l_fasta==1:
weilong-guo
parents: 0
diff changeset
212 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
213 read_id = l[0][1:]
weilong-guo
parents: 0
diff changeset
214 seq = ""
weilong-guo
parents: 0
diff changeset
215 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
216 elif l_fasta==0 :
weilong-guo
parents: 0
diff changeset
217 seq = l[0]
weilong-guo
parents: 0
diff changeset
218 seq_ready = "Y"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
219
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
220 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
221 if seq_ready=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
222 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
223 seq=seq.upper()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
224 seq=seq.replace(".","N")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
225
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
226 # striping BS adapter from 3' read
1
weilong-guo
parents: 0
diff changeset
227 all_base_before_trim += len(seq)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
228 if (adapter_fw !="") and (adapter_rc !="") :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
229 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
230 new_read = Remove_5end_Adapter(new_read, adapter_rc)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
231 if len(new_read) < len(seq) :
1
weilong-guo
parents: 0
diff changeset
232 all_trimmed += 1
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
233 seq = new_read
1
weilong-guo
parents: 0
diff changeset
234 all_base_after_trim += len(seq)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
235 if len(seq)<=4:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
236 seq=''.join(["N" for x in xrange(cut2-cut1+1)])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
237
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
238 #--------- trimmed_raw_BS_read ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
239 original_bs_reads[read_id] = seq
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
240
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
241 #--------- FW_C2T ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
242 outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
243 #--------- RC_G2A ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
244 outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
245
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
246 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
247
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
248 outf2.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
249 outf3.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
250
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
251 delete_files(read_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
252
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
253 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
254 # Bowtie mapping
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
255 #-------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
256 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
257 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
258 WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
259 CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
260
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
261 # print aligner_command % {'int_no_mismatches' : int_no_mismatches,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
262 # 'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
263 # 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
264 # 'output_file' : WC2T}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
265
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
266 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
267 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
268 'output_file' : WC2T},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
269
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
270 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
271 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
272 'output_file' : CC2T},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
273
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
274 aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
275 'input_file' : outfile3,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
276 'output_file' : WG2A},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
277
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
278 aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
279 'input_file' : outfile3,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
280 'output_file' : CG2A} ])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
281
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
282
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
283 delete_files(outfile2, outfile3)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
284
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
285
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
286 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
287 # Post processing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
288 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
289
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
290 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
291 RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
292
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
293 FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
294 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
295
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
296 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
297 # get unique-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
298 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
299 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
300
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
301 Unique_FW_C2T=set() # +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
302 Unique_RC_G2A=set() # +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
303 Unique_FW_G2A=set() # -
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
304 Unique_RC_C2T=set() # -
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
305 Multiple_hits=set()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
306
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
307
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
308 for x in Union_set:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
309 _list=[]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
310 for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
311 mis_lst=d.get(x,[99])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
312 mis=int(mis_lst[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
313 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
314 for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
315 mis=d.get(x,99)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
316 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
317 mini=min(_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
318 if _list.count(mini) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
319 mini_index=_list.index(mini)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
320 if mini_index == 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
321 Unique_FW_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
322 elif mini_index == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
323 Unique_RC_G2A.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
324 elif mini_index == 2:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
325 Unique_FW_G2A.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
326 elif mini_index == 3:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
327 Unique_RC_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
328 # if mini_index = 4,5,6,7, indicating multiple hits
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
329 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
330 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
331 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
332 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
333 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
334 if show_multiple_hit :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
335 outf_MH=open("Multiple_hit.fa",'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
336 for i in Multiple_hits :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
337 outf_MH.write(">%s\n" % i)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
338 outf_MH.write("%s\n" % original_bs_reads[i])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
339 outf_MH.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
340
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
341 del Union_set
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
342 del FW_C2T_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
343 del FW_G2A_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
344 del RC_C2T_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
345 del RC_G2A_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
346
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
347 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
348 FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
349 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
350 RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
351 FW_C2T_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
352 RC_C2T_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
353 FW_G2A_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
354 RC_G2A_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
355 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
356 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
357 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
358 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
1
weilong-guo
parents: 0
diff changeset
359 #----------------------------------------------------------------
weilong-guo
parents: 0
diff changeset
360
weilong-guo
parents: 0
diff changeset
361 numbers_premapped_lst[0] += len(Unique_FW_C2T)
weilong-guo
parents: 0
diff changeset
362 numbers_premapped_lst[1] += len(Unique_RC_G2A)
weilong-guo
parents: 0
diff changeset
363 numbers_premapped_lst[2] += len(Unique_FW_G2A)
weilong-guo
parents: 0
diff changeset
364 numbers_premapped_lst[3] += len(Unique_RC_C2T)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
365
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
366 del Unique_FW_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
367 del Unique_FW_G2A
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
368 del Unique_RC_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
369 del Unique_RC_G2A
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
370
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
371
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
372 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
373
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
374 nn=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
375 gseq = dict()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
376 chr_length = dict()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
377 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
378 (RC_G2A_uniq_lst,RC_G2A_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
379 (FW_G2A_uniq_lst,FW_G2A_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
380 (RC_C2T_uniq_lst,RC_C2T_U)]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
381 nn += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
382
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
383 for header in ali_unique_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
384
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
385 _, mapped_chr, mapped_location, cigar = ali_dic[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
386
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
387 original_BS = original_bs_reads[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
388 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
389 if mapped_chr not in gseq:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
390 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
391 chr_length[mapped_chr] = len(gseq[mapped_chr])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
392
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
393 if nn == 2 or nn == 3:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
394 cigar = list(reversed(cigar))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
395 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
396
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
397
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
398 all_mapped += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
399
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
400 if nn == 1: # +FW mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
401 FR = "+FW"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
402 mapped_strand="+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
403
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
404 elif nn == 2: # +RC mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
405 FR = "+RC" # RC reads from -RC reflecting the methylation status on Watson strand (+)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
406 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
407 mapped_strand = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
408 original_BS = reverse_compl_seq(original_BS) # for RC reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
409
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
410 elif nn == 3: # -RC mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
411 mapped_strand = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
412 FR = "-RC" # RC reads from +RC reflecting the methylation status on Crick strand (-)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
413 original_BS = reverse_compl_seq(original_BS) # for RC reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
414
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
415 elif nn == 4: # -FW mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
416 mapped_strand = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
417 FR = "-FW"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
418 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
419
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
420 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
421
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
422 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
423
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
424
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
425 if len(r_aln)==len(g_aln):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
426 N_mismatch = N_MIS(r_aln, g_aln)
1
weilong-guo
parents: 0
diff changeset
427 # if N_mismatch <= int(max_mismatch_no):
weilong-guo
parents: 0
diff changeset
428 mm_no=float(max_mismatch_no)
weilong-guo
parents: 0
diff changeset
429 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
430 numbers_mapped_lst[nn-1] += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
431 all_mapped_passed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
432 methy = methy_seq(r_aln, g_aln + next)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
433 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
434
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
435 #---XS FILTER----------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
436 XS = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
437 nCH = methy.count('y') + methy.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
438 nmCH = methy.count('Y') + methy.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
439 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
440 XS = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
441
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
442 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
1
weilong-guo
parents: 0
diff changeset
443 all_base_mapped += len(original_BS)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
444
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
445 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
446 logm("--> %s (%d) "%(read_file, no_my_files))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
447 delete_files(WC2T, WG2A, CC2T, CG2A)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
448
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
449
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
450
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
451 #--------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
452 # directional sequencing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
453 #--------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
454
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
455 if asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
456 #----------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
457 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
458 outf2=open(outfile2,'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
459 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
460 try :
1
weilong-guo
parents: 0
diff changeset
461 if read_file.endswith(".gz") : # support input file ending with ".gz"
weilong-guo
parents: 0
diff changeset
462 read_inf = gzip.open(read_file, "rb")
weilong-guo
parents: 0
diff changeset
463 else :
weilong-guo
parents: 0
diff changeset
464 read_inf=open(read_file,"r")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
465 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
466 print "[Error] Cannot open input file : %s" % read_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
467 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
468
1
weilong-guo
parents: 0
diff changeset
469 oneline = read_inf.readline()
weilong-guo
parents: 0
diff changeset
470 l = oneline.split()
weilong-guo
parents: 0
diff changeset
471 input_format = ""
weilong-guo
parents: 0
diff changeset
472 if oneline[0]=="@":
weilong-guo
parents: 0
diff changeset
473 input_format = "fastq"
weilong-guo
parents: 0
diff changeset
474 elif len(l)==1 and oneline[0]!=">":
weilong-guo
parents: 0
diff changeset
475 input_format = "seq"
weilong-guo
parents: 0
diff changeset
476 elif len(l)==11:
weilong-guo
parents: 0
diff changeset
477 input_format = "qseq"
weilong-guo
parents: 0
diff changeset
478 elif oneline[0]==">":
weilong-guo
parents: 0
diff changeset
479 input_format = "fasta"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
480 read_inf.close()
1
weilong-guo
parents: 0
diff changeset
481
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
482 #print "detected data format: %s"%(input_format)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
483 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
484 read_id=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
485 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
486 seq_ready="N"
1
weilong-guo
parents: 0
diff changeset
487 line_no = 0
weilong-guo
parents: 0
diff changeset
488 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
weilong-guo
parents: 0
diff changeset
489 l = line.split()
weilong-guo
parents: 0
diff changeset
490 line_no += 1
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
491 if input_format=="seq":
1
weilong-guo
parents: 0
diff changeset
492 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
493 read_id = str(all_raw_reads)
weilong-guo
parents: 0
diff changeset
494 read_id = read_id.zfill(12)
weilong-guo
parents: 0
diff changeset
495 seq = l[0]
weilong-guo
parents: 0
diff changeset
496 seq_ready = "Y"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
497 elif input_format=="fastq":
1
weilong-guo
parents: 0
diff changeset
498 l_fastq = math.fmod(line_no, 4)
weilong-guo
parents: 0
diff changeset
499 if l_fastq == 1 :
weilong-guo
parents: 0
diff changeset
500 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
501 read_id = l[0][1:]
weilong-guo
parents: 0
diff changeset
502 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
503 elif l_fastq == 2 :
weilong-guo
parents: 0
diff changeset
504 seq = l[0]
weilong-guo
parents: 0
diff changeset
505 seq_ready = "Y"
weilong-guo
parents: 0
diff changeset
506 else :
weilong-guo
parents: 0
diff changeset
507 seq = ""
weilong-guo
parents: 0
diff changeset
508 seq_ready = "N"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
509 elif input_format=="qseq":
1
weilong-guo
parents: 0
diff changeset
510 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
511 read_id = str(all_raw_reads)
weilong-guo
parents: 0
diff changeset
512 read_id = read_id.zfill(12)
weilong-guo
parents: 0
diff changeset
513 seq = l[8]
weilong-guo
parents: 0
diff changeset
514 seq_ready = "Y"
weilong-guo
parents: 0
diff changeset
515 elif input_format=="fasta" :
weilong-guo
parents: 0
diff changeset
516 l_fasta = math.fmod(line_no,2)
weilong-guo
parents: 0
diff changeset
517 if l_fasta==1:
weilong-guo
parents: 0
diff changeset
518 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
519 read_id = l[0][1:]
weilong-guo
parents: 0
diff changeset
520 seq = ""
weilong-guo
parents: 0
diff changeset
521 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
522 elif l_fasta==0 :
weilong-guo
parents: 0
diff changeset
523 seq = l[0]
weilong-guo
parents: 0
diff changeset
524 seq_ready = "Y"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
525 #--------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
526 if seq_ready=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
527 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
528 seq=seq.upper()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
529 seq=seq.replace(".","N")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
530
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
531 #--striping adapter from 3' read -------
1
weilong-guo
parents: 0
diff changeset
532 all_base_before_trim += len(seq)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
533 if adapter != "":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
534 new_read = RemoveAdapter(seq, adapter, adapter_mismatch)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
535 if len(new_read) < len(seq) :
1
weilong-guo
parents: 0
diff changeset
536 all_trimmed += 1
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
537 seq = new_read
1
weilong-guo
parents: 0
diff changeset
538 all_base_after_trim += len(seq)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
539 if len(seq)<=4:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
540 seq = "N" * (cut2-cut1+1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
541
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
542 #--------- trimmed_raw_BS_read ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
543 original_bs_reads[read_id] = seq
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
544
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
545
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
546 #--------- FW_C2T ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
547 outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
548
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
549 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
550
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
551 outf2.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
552 delete_files(read_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
553
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
554 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
555 # Bowtie mapping
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
556 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
557 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
558 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
559
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
560 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
561 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
562 'output_file' : WC2T},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
563 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
564 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
565 'output_file' : CC2T} ])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
566
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
567 delete_files(outfile2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
568
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
569 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
570 # Post processing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
571 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
572
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
573
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
574 FW_C2T_U, FW_C2T_R = extract_mapping(WC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
575 RC_C2T_U, RC_C2T_R = extract_mapping(CC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
576
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
577 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
578 # get uniq-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
579 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
580 Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
581
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
582 Unique_FW_C2T = set() # +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
583 Unique_RC_C2T = set() # -
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
584 Multiple_hits=set()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
585 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
586
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
587 for x in Union_set:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
588 _list=[]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
589 for d in [FW_C2T_U,RC_C2T_U]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
590 mis_lst=d.get(x,[99])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
591 mis=int(mis_lst[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
592 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
593 for d in [FW_C2T_R,RC_C2T_R]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
594 mis=d.get(x,99)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
595 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
596 mini=min(_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
597 #print _list
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
598 if _list.count(mini)==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
599 mini_index=_list.index(mini)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
600 if mini_index==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
601 Unique_FW_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
602 elif mini_index==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
603 Unique_RC_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
604 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
605 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
606 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
607 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
608 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
609 if show_multiple_hit :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
610 outf_MH=open("Multiple_hit.fa",'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
611 for i in Multiple_hits :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
612 outf_MH.write(">%s\n" % i)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
613 outf_MH.write("%s\n" % original_bs_reads[i])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
614 outf_MH.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
615
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
616
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
617 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
618 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
619 FW_C2T_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
620 RC_C2T_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
621 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
622 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
623
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
624
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
625 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
626
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
627 numbers_premapped_lst[0] += len(Unique_FW_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
628 numbers_premapped_lst[1] += len(Unique_RC_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
629
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
630 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
631
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
632 nn = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
633 gseq = dict()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
634 chr_length = dict()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
635 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
636 nn += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
637 for header in ali_unique_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
638 _, mapped_chr, mapped_location, cigar = ali_dic[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
639 original_BS = original_bs_reads[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
640 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
641 if mapped_chr not in gseq :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
642 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
643 chr_length[mapped_chr] = len(gseq[mapped_chr])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
644
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
645 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
646
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
647 all_mapped+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
648 if nn == 1: # +FW mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
649 FR = "+FW"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
650 mapped_strand = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
651 elif nn == 2: # -FW mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
652 mapped_strand = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
653 FR = "-FW"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
654 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
655
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
656
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
657 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
658 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
659
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
660 if len(r_aln) == len(g_aln):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
661 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides
1
weilong-guo
parents: 0
diff changeset
662 mm_no=float(max_mismatch_no)
weilong-guo
parents: 0
diff changeset
663 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
664 numbers_mapped_lst[nn-1] += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
665 all_mapped_passed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
666 methy = methy_seq(r_aln, g_aln+next)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
667 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
668
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
669 #---XS FILTER----------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
670 XS = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
671 nCH = methy.count('y') + methy.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
672 nmCH = methy.count('Y') + methy.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
673 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
674 XS = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
675
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
676 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
1
weilong-guo
parents: 0
diff changeset
677 all_base_mapped += len(original_BS)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
678
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
679 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
680 logm("--> %s (%d) "%(read_file,no_my_files))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
681 delete_files(WC2T, CC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
682
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
683
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
684 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
685
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
686 delete_files(tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
687
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
688 logm("Number of raw reads: %d \n"% all_raw_reads)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
689 if all_raw_reads >0:
1
weilong-guo
parents: 0
diff changeset
690 logm("Number of bases in total: %d "%all_base_before_trim)
weilong-guo
parents: 0
diff changeset
691 if adapter != "" :
weilong-guo
parents: 0
diff changeset
692 logm("Number of reads having adapter removed: %d \n" % all_trimmed )
weilong-guo
parents: 0
diff changeset
693 logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) )
weilong-guo
parents: 0
diff changeset
694 logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
weilong-guo
parents: 0
diff changeset
695 logm("Number of unique-hits reads (before post-filtering): %d\n" % all_mapped)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
696 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
697 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
698 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
699 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
700 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
701 elif asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
702 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
703 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
704
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
705 logm("Post-filtering %d uniquely aligned reads with mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
706 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
707 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
708 logm(" ---- %7d RC reads mapped to Watson strand"%(numbers_mapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
709 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[2]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
710 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
711 elif asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
712 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
713 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
714 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
1
weilong-guo
parents: 0
diff changeset
715 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
weilong-guo
parents: 0
diff changeset
716
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
717
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
718 n_CG=mC_lst[0]+uC_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
719 n_CHG=mC_lst[1]+uC_lst[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
720 n_CHH=mC_lst[2]+uC_lst[2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
721
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
722 logm("----------------------------------------------" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
723 logm("Methylated C in mapped reads ")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
724
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
725 logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
726 logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
727 logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
728
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
729 logm("------------------- END --------------------" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
730 elapsed("=== END %s ===" % main_read_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
731
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
732
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
733