annotate tools/metag_tools/shrimp_wrapper.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 TODO
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 1. decrease memory usage
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 2. multi-fasta fastq file, ex. 454
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 3. split reads into small chuncks?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 SHRiMP wrapper
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 Inputs:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 1. reference seq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 2. reads
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 Outputs:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 1. table of 8 columns:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 2. SHRiMP output
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 Parameters:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 -s Spaced Seed (default: 111111011111)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 -n Seed Matches per Window (default: 2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 -t Seed Hit Taboo Length (default: 4)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 -9 Seed Generation Taboo Length (default: 0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 -w Seed Window Length (default: 115.00%)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 -o Maximum Hits per Read (default: 100)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 -r Maximum Read Length (default: 1000)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 -d Kmer Std. Deviation Limit (default: -1 [None])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 -m S-W Match Value (default: 100)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 -i S-W Mismatch Value (default: -150)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 -g S-W Gap Open Penalty (Reference) (default: -400)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 -q S-W Gap Open Penalty (Query) (default: -400)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 -e S-W Gap Extend Penalty (Reference) (default: -70)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 -f S-W Gap Extend Penalty (Query) (default: -70)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 -h S-W Hit Threshold (default: 68.00%)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 Command:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 %rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold <query> <target> > <output> 2> <log>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 SHRiMP output:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 >7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 >7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 >7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 import os, sys, tempfile, os.path, re
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 assert sys.version_info[:2] >= (2.4)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 def stop_err( msg ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 sys.stderr.write( "%s\n" % msg )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 def reverse_complement(s):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":".", "-":"-"}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 reversed_s = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 for i in s:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 reversed_s.append(complement_dna[i])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 reversed_s.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 return "".join(reversed_s)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 invalid_editstring_char = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 all_score_file = score_files.split(',')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 if len(all_score_file) != hit_per_read: stop_err('One or more query files is missing. Please check your dataset.')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 temp_table_name = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 temp_table = open(temp_table_name, 'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 outfile = open(table_outfile,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 # reference seq: not a single fasta seq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 refseq = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 chrom_cov = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 seq = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 for i, line in enumerate(file(ref_file)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 line = line.rstrip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 if not line or line.startswith('#'): continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 if line.startswith('>'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 if seq:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 if refseq.has_key(title):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 refseq[title] = seq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 chrom_cov[title] = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 seq = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 title = line[1:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 seq += line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 if seq:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 if not refseq.has_key(title):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 refseq[title] = seq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 chrom_cov[title] = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 # find hits : one end and/or the other
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 hits = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 for i, line in enumerate(file(result_file)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 line = line.rstrip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 if not line or line.startswith('#'): continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 fields = line.split('\t')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 readname = fields[0][1:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 chrom = fields[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 strand = fields[2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 chrom_start = int(fields[3]) - 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 chrom_end = int(fields[4])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 read_start = fields[5]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 read_end = fields[6]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 read_len = fields[7]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 score = fields[8]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 editstring = fields[9]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 if hit_per_read == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 endindex = '1'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 readname, endindex = readname.split('/')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 if hits.has_key(readname):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 if hits[readname].has_key(endindex):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 hits[readname][endindex].append([strand, editstring, chrom_start, chrom_end, read_start, chrom])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 hits[readname] = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 # find score : one end and the other end
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 hits_score = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 readname = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 score = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 for num_score_file in range(len(all_score_file)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 score_file = all_score_file[num_score_file]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 for i, line in enumerate(file(score_file)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 line = line.rstrip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 if not line or line.startswith('#'): continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 if line.startswith('>'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 if score:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 if hits.has_key(readname):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 if len(hits[readname]) == hit_per_read:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 if hits_score.has_key(readname):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 if hits_score[readname].has_key(endindex):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 hits_score[readname][endindex] = score
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157 hits_score[readname] = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 hits_score[readname][endindex] = score
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 score = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 if hit_per_read == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161 readname = line[1:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 endindex = '1'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164 readname, endindex = line[1:].split('/')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 score = line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 if score: # the last one
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169 if hits.has_key(readname):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170 if len(hits[readname]) == hit_per_read:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171 if hits_score.has_key(readname):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 if hits_score[readname].has_key(endindex):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
173 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
174 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
175 hits_score[readname][endindex] = score
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
176 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
177 hits_score[readname] = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
178 hits_score[readname][endindex] = score
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
179
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
180 # call to all mappings
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
181 for readkey in hits.keys():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
182 if len(hits[readkey]) != hit_per_read: continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
183
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
184 matches = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
185 match_count = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
186
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
187 if hit_per_read == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
188 if len(hits[readkey]['1']) == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
189 matches = [ hits[readkey]['1'] ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
190 match_count = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
191 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
192 end1_data = hits[readkey]['1']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
193 end2_data = hits[readkey]['2']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
194
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
195 for i, end1_hit in enumerate(end1_data):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
196 crin_strand = {'+': False, '-': False}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
197 crin_insertSize = {'+': False, '-': False}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
198
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
199 crin_strand[end1_hit[0]] = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
200 crin_insertSize[end1_hit[0]] = int(end1_hit[2])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
201
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
202 for j, end2_hit in enumerate(end2_data):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
203 crin_strand[end2_hit[0]] = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
204 crin_insertSize[end2_hit[0]] = int(end2_hit[2])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
205
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
206 if end1_hit[-1] != end2_hit[-1] : continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
207
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
208 if crin_strand['+'] and crin_strand['-']:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
209 if (crin_insertSize['-'] - crin_insertSize['+']) <= insertion_size:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
210 matches.append([end1_hit, end2_hit])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
211 match_count += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
212
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
213 if match_count == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
214
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
215 for x, end_data in enumerate(matches[0]):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
216
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
217 end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
218 end_read_start = int(end_read_start) - 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
219
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
220 if end_strand == '-':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
221 refsegment = reverse_complement(refseq[end_chrom][end_chr_start:end_chr_end])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
222 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
223 refsegment = refseq[end_chrom][end_chr_start:end_chr_end]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
224
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
225 match_len = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
226 editindex = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
227 gap_read = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
228
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
229 while editindex < len(end_editstring):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
230
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
231 editchr = end_editstring[editindex]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
232 chrA = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
233 chrB = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
234 locIndex = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
235
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
236 if editchr.isdigit():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
237 editcode = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
238
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
239 while editchr.isdigit() and editindex < len(end_editstring):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
240 editcode += editchr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
241 editindex += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
242 if editindex < len(end_editstring): editchr = end_editstring[editindex]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
243
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
244 for baseIndex in range(int(editcode)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
245 chrA += refsegment[match_len+baseIndex]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
246 chrB = chrA
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
247
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
248 match_len += int(editcode)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
249
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
250 elif editchr == 'x':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
251 # crossover: inserted between the appropriate two bases
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
252 # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
253 # Treated as errors in the reads; Do nothing.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
254 editindex += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
255
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
256 elif editchr.isalpha():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
257 editcode = editchr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
258 editindex += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
259 chrA = refsegment[match_len]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
260 chrB = editcode
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
261 match_len += len(editcode)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
262
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
263 elif editchr == '-':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
264 editcode = editchr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
265 editindex += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
266 chrA = refsegment[match_len]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
267 chrB = editcode
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
268 match_len += len(editcode)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
269 gap_read += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
270
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
271 elif editchr == '(':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
272 editcode = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
273
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
274 while editchr != ')' and editindex < len(end_editstring):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
275 if editindex < len(end_editstring): editchr = end_editstring[editindex]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
276 editcode += editchr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
277 editindex += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
278
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
279 editcode = editcode[1:-1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
280 chrA = '-'*len(editcode)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
281 chrB = editcode
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
282
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
283 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
284 invalid_editstring_char += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
285
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
286 if end_strand == '-':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
287
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
288 chrA = reverse_complement(chrA)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
289 chrB = reverse_complement(chrB)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
290
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
291 pos_line = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
292 rev_line = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
293
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
294 for mappingIndex in range(len(chrA)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
295 # reference
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
296 chrAx = chrA[mappingIndex]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
297 # read
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
298 chrBx = chrB[mappingIndex]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
299
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
300 if chrAx and chrBx and chrBx.upper() != 'N':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
301
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
302 if end_strand == '+':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
303
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
304 chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
305 read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
306
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
307 if chrAx == '-': chrom_loc -= 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
308
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
309 if chrBx == '-':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
310 scoreBx = '-1'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
311 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
312 scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
313
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
314 # 1-based on chrom_loc and read_loc
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
315 pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
316
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
317 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
318
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
319 chrom_loc = end_chr_end-match_len+mappingIndex
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
320 read_loc = end_read_start+match_len-1-mappingIndex-gap_read
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
321
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
322 if chrAx == '-': chrom_loc -= 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
323
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
324 if chrBx == '-':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
325 scoreBx = '-1'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
326 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
327 scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
328
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
329 # 1-based on chrom_loc and read_loc
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
330 rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
331
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
332 if chrom_cov.has_key(end_chrom):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
333
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
334 if chrom_cov[end_chrom].has_key(chrom_loc):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
335 chrom_cov[end_chrom][chrom_loc] += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
336 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
337 chrom_cov[end_chrom][chrom_loc] = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
338
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
339 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
340
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
341 chrom_cov[end_chrom] = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
342 chrom_cov[end_chrom][chrom_loc] = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
343
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
344 if pos_line: temp_table.write('%s\n' %(pos_line.rstrip('\r\n')))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
345 if rev_line: temp_table.write('%s\n' %(rev_line.rstrip('\r\n')))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
346
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
347 temp_table.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
348
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
349 # chrom-wide coverage
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
350 for i, line in enumerate(open(temp_table_name)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
351
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
352 line = line.rstrip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
353 if not line or line.startswith('#'): continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
354
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
355 fields = line.split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
356 chrom = fields[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
357 eachBp = int(fields[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
358 readname = fields[2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
359
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
360 if hit_per_read == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
361 fields[2] = readname.split('/')[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
362
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
363 if chrom_cov[chrom].has_key(eachBp):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
364 outfile.write('%s\t%d\n' %('\t'.join(fields), chrom_cov[chrom][eachBp]))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
365 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
366 outfile.write('%s\t%d\n' %('\t'.join(fields), 0))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
367
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
368 outfile.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
369
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
370 if os.path.exists(temp_table_name): os.remove(temp_table_name)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
371
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
372 if invalid_editstring_char:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
373 print 'Skip ', invalid_editstring_char, ' invalid characters in editstrings'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
374
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
375 return True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
376
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
377 def convert_fastqsolexa_to_fasta_qual(infile_name, query_fasta, query_qual):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
378
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
379 outfile_seq = open( query_fasta, 'w' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
380 outfile_score = open( query_qual, 'w' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
381
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
382 seq_title_startswith = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
383 qual_title_startswith = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
384
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
385 default_coding_value = 64 # Solexa ascii-code
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
386 fastq_block_lines = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
387
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
388 for i, line in enumerate( file( infile_name ) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
389 line = line.rstrip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
390 if not line or line.startswith( '#' ): continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
391
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
392 fastq_block_lines = ( fastq_block_lines + 1 ) % 4
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
393 line_startswith = line[0:1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
394
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
395 if fastq_block_lines == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
396 # first line is @title_of_seq
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
397 if not seq_title_startswith:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
398 seq_title_startswith = line_startswith
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
399
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
400 if line_startswith != seq_title_startswith:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
401 outfile_seq.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
402 outfile_score.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
403 stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
404
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
405 read_title = line[1:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
406 outfile_seq.write( '>%s\n' % line[1:] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
407
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
408 elif fastq_block_lines == 2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
409 # second line is nucleotides
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
410 read_length = len( line )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
411 outfile_seq.write( '%s\n' % line )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
412
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
413 elif fastq_block_lines == 3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
414 # third line is +title_of_qualityscore ( might be skipped )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
415 if not qual_title_startswith:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
416 qual_title_startswith = line_startswith
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
417
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
418 if line_startswith != qual_title_startswith:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
419 outfile_seq.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
420 outfile_score.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
421 stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
422
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
423 quality_title = line[1:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
424 if quality_title and read_title != quality_title:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
425 outfile_seq.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
426 outfile_score.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
427 stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
428
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
429 if not quality_title:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
430 outfile_score.write( '>%s\n' % read_title )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
431 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
432 outfile_score.write( '>%s\n' % line[1:] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
433
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
434 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
435 # fourth line is quality scores
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
436 qual = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
437 fastq_integer = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
438 # peek: ascii or digits?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
439 val = line.split()[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
440 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
441 check = int( val )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
442 fastq_integer = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
443 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
444 fastq_integer = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
445
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
446 if fastq_integer:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
447 # digits
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
448 qual = line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
449 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
450 # ascii
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
451 quality_score_length = len( line )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
452 if quality_score_length == read_length + 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
453 # first char is qual_score_startswith
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
454 qual_score_startswith = ord( line[0:1] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
455 line = line[1:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
456 elif quality_score_length == read_length:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
457 qual_score_startswith = default_coding_value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
458 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
459 stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
460
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
461 for j, char in enumerate( line ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
462 score = ord( char ) - qual_score_startswith # 64
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
463 qual = "%s%s " % ( qual, str( score ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
464
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
465 outfile_score.write( '%s\n' % qual )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
466
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
467 outfile_seq.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
468 outfile_score.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
469
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
470 return True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
471
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
472 def __main__():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
473
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
474 # SHRiMP path
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
475 shrimp = 'rmapper-ls'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
476
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
477 # I/O
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
478 input_target_file = sys.argv[1] # fasta
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
479 shrimp_outfile = sys.argv[2] # shrimp output
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
480 table_outfile = sys.argv[3] # table output
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
481 single_or_paired = sys.argv[4].split(',')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
482
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
483 insertion_size = 600
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
484
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
485 if len(single_or_paired) == 1: # single or paired
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
486 type_of_reads = 'single'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
487 hit_per_read = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
488 input_query = single_or_paired[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
489 query_fasta = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
490 query_qual = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
491
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
492 else: # paired-end
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
493 type_of_reads = 'paired'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
494 hit_per_read = 2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
495 input_query_end1 = single_or_paired[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
496 input_query_end2 = single_or_paired[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
497 insertion_size = int(single_or_paired[2])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
498 query_fasta_end1 = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
499 query_fasta_end2 = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
500 query_qual_end1 = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
501 query_qual_end2 = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
502
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
503 # SHRiMP parameters: total = 15, default values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
504 spaced_seed = '111111011111'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
505 seed_matches_per_window = '2'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
506 seed_hit_taboo_length = '4'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
507 seed_generation_taboo_length = '0'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
508 seed_window_length = '115.0'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
509 max_hits_per_read = '100'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
510 max_read_length = '1000'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
511 kmer = '-1'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
512 sw_match_value = '100'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
513 sw_mismatch_value = '-150'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
514 sw_gap_open_ref = '-400'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
515 sw_gap_open_query = '-400'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
516 sw_gap_ext_ref = '-70'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
517 sw_gap_ext_query = '-70'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
518 sw_hit_threshold = '68.0'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
519
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
520 # TODO: put the threshold on each of these parameters
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
521 if len(sys.argv) > 5:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
522
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
523 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
524 if sys.argv[5].isdigit():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
525 spaced_seed = sys.argv[5]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
526 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
527 stop_err('Error in assigning parameter: Spaced seed.')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
528 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
529 stop_err('Spaced seed must be a combination of 1s and 0s.')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
530
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
531 seed_matches_per_window = sys.argv[6]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
532 seed_hit_taboo_length = sys.argv[7]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
533 seed_generation_taboo_length = sys.argv[8]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
534 seed_window_length = sys.argv[9]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
535 max_hits_per_read = sys.argv[10]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
536 max_read_length = sys.argv[11]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
537 kmer = sys.argv[12]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
538 sw_match_value = sys.argv[13]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
539 sw_mismatch_value = sys.argv[14]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
540 sw_gap_open_ref = sys.argv[15]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
541 sw_gap_open_query = sys.argv[16]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
542 sw_gap_ext_ref = sys.argv[17]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
543 sw_gap_ext_query = sys.argv[18]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
544 sw_hit_threshold = sys.argv[19]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
545
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
546 # temp file for shrimp log file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
547 shrimp_log = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
548
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
549 # convert fastq to fasta and quality score files
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
550 if type_of_reads == 'single':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
551 return_value = convert_fastqsolexa_to_fasta_qual(input_query, query_fasta, query_qual)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
552 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
553 return_value = convert_fastqsolexa_to_fasta_qual(input_query_end1, query_fasta_end1, query_qual_end1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
554 return_value = convert_fastqsolexa_to_fasta_qual(input_query_end2, query_fasta_end2, query_qual_end2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
555
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
556 # SHRiMP command
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
557 if type_of_reads == 'single':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
558 command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
559
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
560 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
561 os.system(command)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
562 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
563 if os.path.exists(query_fasta): os.remove(query_fasta)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
564 if os.path.exists(query_qual): os.remove(query_qual)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
565 stop_err(str(e))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
566
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
567 else: # paired
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
568 command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
569 command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
570
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
571 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
572 os.system(command_end1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
573 os.system(command_end2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
574 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
575 if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
576 if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
577 if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
578 if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
579 stop_err(str(e))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
580
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
581 # check SHRiMP output: count number of lines
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
582 num_hits = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
583 if shrimp_outfile:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
584 for i, line in enumerate(file(shrimp_outfile)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
585 line = line.rstrip('\r\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
586 if not line or line.startswith('#'): continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
587 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
588 fields = line.split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
589 num_hits += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
590 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
591 stop_err(str(e))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
592
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
593 if num_hits == 0: # no hits generated
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
594 err_msg = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
595 if shrimp_log:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
596 for i, line in enumerate(file(shrimp_log)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
597 if line.startswith('error'): # deal with memory error:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
598 err_msg += line # error: realloc failed: Cannot allocate memory
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
599 if re.search('Reads Matched', line): # deal with zero hits
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
600 if int(line[8:].split()[2]) == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
601 err_msg = 'Zero hits found.\n'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
602 stop_err('SHRiMP Failed due to:\n' + err_msg)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
603
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
604 # convert to table
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
605 if type_of_reads == 'single':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
606 return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
607 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
608 return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
609
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
610 # remove temp. files
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
611 if type_of_reads == 'single':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
612 if os.path.exists(query_fasta): os.remove(query_fasta)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
613 if os.path.exists(query_qual): os.remove(query_qual)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
614 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
615 if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
616 if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
617 if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
618 if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
619
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
620 if os.path.exists(shrimp_log): os.remove(shrimp_log)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
621
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
622
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
623 if __name__ == '__main__': __main__()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
624