0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 TODO
|
|
5 1. decrease memory usage
|
|
6 2. multi-fasta fastq file, ex. 454
|
|
7 3. split reads into small chuncks?
|
|
8
|
|
9 SHRiMP wrapper
|
|
10
|
|
11 Inputs:
|
|
12 1. reference seq
|
|
13 2. reads
|
|
14
|
|
15 Outputs:
|
|
16 1. table of 8 columns:
|
|
17 chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage
|
|
18 2. SHRiMP output
|
|
19
|
|
20 Parameters:
|
|
21 -s Spaced Seed (default: 111111011111)
|
|
22 -n Seed Matches per Window (default: 2)
|
|
23 -t Seed Hit Taboo Length (default: 4)
|
|
24 -9 Seed Generation Taboo Length (default: 0)
|
|
25 -w Seed Window Length (default: 115.00%)
|
|
26 -o Maximum Hits per Read (default: 100)
|
|
27 -r Maximum Read Length (default: 1000)
|
|
28 -d Kmer Std. Deviation Limit (default: -1 [None])
|
|
29
|
|
30 -m S-W Match Value (default: 100)
|
|
31 -i S-W Mismatch Value (default: -150)
|
|
32 -g S-W Gap Open Penalty (Reference) (default: -400)
|
|
33 -q S-W Gap Open Penalty (Query) (default: -400)
|
|
34 -e S-W Gap Extend Penalty (Reference) (default: -70)
|
|
35 -f S-W Gap Extend Penalty (Query) (default: -70)
|
|
36 -h S-W Hit Threshold (default: 68.00%)
|
|
37
|
|
38 Command:
|
|
39 %rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold <query> <target> > <output> 2> <log>
|
|
40
|
|
41 SHRiMP output:
|
|
42 >7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13
|
|
43 >7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14
|
|
44 >7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16
|
|
45
|
|
46 """
|
|
47
|
|
48 import os, sys, tempfile, os.path, re
|
|
49
|
|
50 assert sys.version_info[:2] >= (2.4)
|
|
51
|
|
52 def stop_err( msg ):
|
|
53
|
|
54 sys.stderr.write( "%s\n" % msg )
|
|
55 sys.exit()
|
|
56
|
|
57 def reverse_complement(s):
|
|
58
|
|
59 complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":".", "-":"-"}
|
|
60 reversed_s = []
|
|
61 for i in s:
|
|
62 reversed_s.append(complement_dna[i])
|
|
63 reversed_s.reverse()
|
|
64 return "".join(reversed_s)
|
|
65
|
|
66 def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size):
|
|
67
|
|
68 invalid_editstring_char = 0
|
|
69
|
|
70 all_score_file = score_files.split(',')
|
|
71
|
|
72 if len(all_score_file) != hit_per_read: stop_err('One or more query files is missing. Please check your dataset.')
|
|
73
|
|
74 temp_table_name = tempfile.NamedTemporaryFile().name
|
|
75 temp_table = open(temp_table_name, 'w')
|
|
76
|
|
77 outfile = open(table_outfile,'w')
|
|
78
|
|
79 # reference seq: not a single fasta seq
|
|
80 refseq = {}
|
|
81 chrom_cov = {}
|
|
82 seq = ''
|
|
83
|
|
84 for i, line in enumerate(file(ref_file)):
|
|
85 line = line.rstrip()
|
|
86 if not line or line.startswith('#'): continue
|
|
87
|
|
88 if line.startswith('>'):
|
|
89 if seq:
|
|
90 if refseq.has_key(title):
|
|
91 pass
|
|
92 else:
|
|
93 refseq[title] = seq
|
|
94 chrom_cov[title] = {}
|
|
95 seq = ''
|
|
96 title = line[1:]
|
|
97 else:
|
|
98 seq += line
|
|
99 if seq:
|
|
100 if not refseq.has_key(title):
|
|
101 refseq[title] = seq
|
|
102 chrom_cov[title] = {}
|
|
103
|
|
104 # find hits : one end and/or the other
|
|
105 hits = {}
|
|
106 for i, line in enumerate(file(result_file)):
|
|
107 line = line.rstrip()
|
|
108 if not line or line.startswith('#'): continue
|
|
109
|
|
110 #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring
|
|
111 fields = line.split('\t')
|
|
112 readname = fields[0][1:]
|
|
113 chrom = fields[1]
|
|
114 strand = fields[2]
|
|
115 chrom_start = int(fields[3]) - 1
|
|
116 chrom_end = int(fields[4])
|
|
117 read_start = fields[5]
|
|
118 read_end = fields[6]
|
|
119 read_len = fields[7]
|
|
120 score = fields[8]
|
|
121 editstring = fields[9]
|
|
122
|
|
123 if hit_per_read == 1:
|
|
124 endindex = '1'
|
|
125 else:
|
|
126 readname, endindex = readname.split('/')
|
|
127
|
|
128 if hits.has_key(readname):
|
|
129 if hits[readname].has_key(endindex):
|
|
130 hits[readname][endindex].append([strand, editstring, chrom_start, chrom_end, read_start, chrom])
|
|
131 else:
|
|
132 hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
|
|
133 else:
|
|
134 hits[readname] = {}
|
|
135 hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
|
|
136
|
|
137 # find score : one end and the other end
|
|
138 hits_score = {}
|
|
139 readname = ''
|
|
140 score = ''
|
|
141 for num_score_file in range(len(all_score_file)):
|
|
142 score_file = all_score_file[num_score_file]
|
|
143 for i, line in enumerate(file(score_file)):
|
|
144 line = line.rstrip()
|
|
145 if not line or line.startswith('#'): continue
|
|
146
|
|
147 if line.startswith('>'):
|
|
148 if score:
|
|
149 if hits.has_key(readname):
|
|
150 if len(hits[readname]) == hit_per_read:
|
|
151 if hits_score.has_key(readname):
|
|
152 if hits_score[readname].has_key(endindex):
|
|
153 pass
|
|
154 else:
|
|
155 hits_score[readname][endindex] = score
|
|
156 else:
|
|
157 hits_score[readname] = {}
|
|
158 hits_score[readname][endindex] = score
|
|
159 score = ''
|
|
160 if hit_per_read == 1:
|
|
161 readname = line[1:]
|
|
162 endindex = '1'
|
|
163 else:
|
|
164 readname, endindex = line[1:].split('/')
|
|
165 else:
|
|
166 score = line
|
|
167
|
|
168 if score: # the last one
|
|
169 if hits.has_key(readname):
|
|
170 if len(hits[readname]) == hit_per_read:
|
|
171 if hits_score.has_key(readname):
|
|
172 if hits_score[readname].has_key(endindex):
|
|
173 pass
|
|
174 else:
|
|
175 hits_score[readname][endindex] = score
|
|
176 else:
|
|
177 hits_score[readname] = {}
|
|
178 hits_score[readname][endindex] = score
|
|
179
|
|
180 # call to all mappings
|
|
181 for readkey in hits.keys():
|
|
182 if len(hits[readkey]) != hit_per_read: continue
|
|
183
|
|
184 matches = []
|
|
185 match_count = 0
|
|
186
|
|
187 if hit_per_read == 1:
|
|
188 if len(hits[readkey]['1']) == 1:
|
|
189 matches = [ hits[readkey]['1'] ]
|
|
190 match_count = 1
|
|
191 else:
|
|
192 end1_data = hits[readkey]['1']
|
|
193 end2_data = hits[readkey]['2']
|
|
194
|
|
195 for i, end1_hit in enumerate(end1_data):
|
|
196 crin_strand = {'+': False, '-': False}
|
|
197 crin_insertSize = {'+': False, '-': False}
|
|
198
|
|
199 crin_strand[end1_hit[0]] = True
|
|
200 crin_insertSize[end1_hit[0]] = int(end1_hit[2])
|
|
201
|
|
202 for j, end2_hit in enumerate(end2_data):
|
|
203 crin_strand[end2_hit[0]] = True
|
|
204 crin_insertSize[end2_hit[0]] = int(end2_hit[2])
|
|
205
|
|
206 if end1_hit[-1] != end2_hit[-1] : continue
|
|
207
|
|
208 if crin_strand['+'] and crin_strand['-']:
|
|
209 if (crin_insertSize['-'] - crin_insertSize['+']) <= insertion_size:
|
|
210 matches.append([end1_hit, end2_hit])
|
|
211 match_count += 1
|
|
212
|
|
213 if match_count == 1:
|
|
214
|
|
215 for x, end_data in enumerate(matches[0]):
|
|
216
|
|
217 end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data
|
|
218 end_read_start = int(end_read_start) - 1
|
|
219
|
|
220 if end_strand == '-':
|
|
221 refsegment = reverse_complement(refseq[end_chrom][end_chr_start:end_chr_end])
|
|
222 else:
|
|
223 refsegment = refseq[end_chrom][end_chr_start:end_chr_end]
|
|
224
|
|
225 match_len = 0
|
|
226 editindex = 0
|
|
227 gap_read = 0
|
|
228
|
|
229 while editindex < len(end_editstring):
|
|
230
|
|
231 editchr = end_editstring[editindex]
|
|
232 chrA = ''
|
|
233 chrB = ''
|
|
234 locIndex = []
|
|
235
|
|
236 if editchr.isdigit():
|
|
237 editcode = ''
|
|
238
|
|
239 while editchr.isdigit() and editindex < len(end_editstring):
|
|
240 editcode += editchr
|
|
241 editindex += 1
|
|
242 if editindex < len(end_editstring): editchr = end_editstring[editindex]
|
|
243
|
|
244 for baseIndex in range(int(editcode)):
|
|
245 chrA += refsegment[match_len+baseIndex]
|
|
246 chrB = chrA
|
|
247
|
|
248 match_len += int(editcode)
|
|
249
|
|
250 elif editchr == 'x':
|
|
251 # crossover: inserted between the appropriate two bases
|
|
252 # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers)
|
|
253 # Treated as errors in the reads; Do nothing.
|
|
254 editindex += 1
|
|
255
|
|
256 elif editchr.isalpha():
|
|
257 editcode = editchr
|
|
258 editindex += 1
|
|
259 chrA = refsegment[match_len]
|
|
260 chrB = editcode
|
|
261 match_len += len(editcode)
|
|
262
|
|
263 elif editchr == '-':
|
|
264 editcode = editchr
|
|
265 editindex += 1
|
|
266 chrA = refsegment[match_len]
|
|
267 chrB = editcode
|
|
268 match_len += len(editcode)
|
|
269 gap_read += 1
|
|
270
|
|
271 elif editchr == '(':
|
|
272 editcode = ''
|
|
273
|
|
274 while editchr != ')' and editindex < len(end_editstring):
|
|
275 if editindex < len(end_editstring): editchr = end_editstring[editindex]
|
|
276 editcode += editchr
|
|
277 editindex += 1
|
|
278
|
|
279 editcode = editcode[1:-1]
|
|
280 chrA = '-'*len(editcode)
|
|
281 chrB = editcode
|
|
282
|
|
283 else:
|
|
284 invalid_editstring_char += 1
|
|
285
|
|
286 if end_strand == '-':
|
|
287
|
|
288 chrA = reverse_complement(chrA)
|
|
289 chrB = reverse_complement(chrB)
|
|
290
|
|
291 pos_line = ''
|
|
292 rev_line = ''
|
|
293
|
|
294 for mappingIndex in range(len(chrA)):
|
|
295 # reference
|
|
296 chrAx = chrA[mappingIndex]
|
|
297 # read
|
|
298 chrBx = chrB[mappingIndex]
|
|
299
|
|
300 if chrAx and chrBx and chrBx.upper() != 'N':
|
|
301
|
|
302 if end_strand == '+':
|
|
303
|
|
304 chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex
|
|
305 read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read
|
|
306
|
|
307 if chrAx == '-': chrom_loc -= 1
|
|
308
|
|
309 if chrBx == '-':
|
|
310 scoreBx = '-1'
|
|
311 else:
|
|
312 scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]
|
|
313
|
|
314 # 1-based on chrom_loc and read_loc
|
|
315 pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n'
|
|
316
|
|
317 else:
|
|
318
|
|
319 chrom_loc = end_chr_end-match_len+mappingIndex
|
|
320 read_loc = end_read_start+match_len-1-mappingIndex-gap_read
|
|
321
|
|
322 if chrAx == '-': chrom_loc -= 1
|
|
323
|
|
324 if chrBx == '-':
|
|
325 scoreBx = '-1'
|
|
326 else:
|
|
327 scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]
|
|
328
|
|
329 # 1-based on chrom_loc and read_loc
|
|
330 rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line
|
|
331
|
|
332 if chrom_cov.has_key(end_chrom):
|
|
333
|
|
334 if chrom_cov[end_chrom].has_key(chrom_loc):
|
|
335 chrom_cov[end_chrom][chrom_loc] += 1
|
|
336 else:
|
|
337 chrom_cov[end_chrom][chrom_loc] = 1
|
|
338
|
|
339 else:
|
|
340
|
|
341 chrom_cov[end_chrom] = {}
|
|
342 chrom_cov[end_chrom][chrom_loc] = 1
|
|
343
|
|
344 if pos_line: temp_table.write('%s\n' %(pos_line.rstrip('\r\n')))
|
|
345 if rev_line: temp_table.write('%s\n' %(rev_line.rstrip('\r\n')))
|
|
346
|
|
347 temp_table.close()
|
|
348
|
|
349 # chrom-wide coverage
|
|
350 for i, line in enumerate(open(temp_table_name)):
|
|
351
|
|
352 line = line.rstrip()
|
|
353 if not line or line.startswith('#'): continue
|
|
354
|
|
355 fields = line.split()
|
|
356 chrom = fields[0]
|
|
357 eachBp = int(fields[1])
|
|
358 readname = fields[2]
|
|
359
|
|
360 if hit_per_read == 1:
|
|
361 fields[2] = readname.split('/')[0]
|
|
362
|
|
363 if chrom_cov[chrom].has_key(eachBp):
|
|
364 outfile.write('%s\t%d\n' %('\t'.join(fields), chrom_cov[chrom][eachBp]))
|
|
365 else:
|
|
366 outfile.write('%s\t%d\n' %('\t'.join(fields), 0))
|
|
367
|
|
368 outfile.close()
|
|
369
|
|
370 if os.path.exists(temp_table_name): os.remove(temp_table_name)
|
|
371
|
|
372 if invalid_editstring_char:
|
|
373 print 'Skip ', invalid_editstring_char, ' invalid characters in editstrings'
|
|
374
|
|
375 return True
|
|
376
|
|
377 def convert_fastqsolexa_to_fasta_qual(infile_name, query_fasta, query_qual):
|
|
378
|
|
379 outfile_seq = open( query_fasta, 'w' )
|
|
380 outfile_score = open( query_qual, 'w' )
|
|
381
|
|
382 seq_title_startswith = ''
|
|
383 qual_title_startswith = ''
|
|
384
|
|
385 default_coding_value = 64 # Solexa ascii-code
|
|
386 fastq_block_lines = 0
|
|
387
|
|
388 for i, line in enumerate( file( infile_name ) ):
|
|
389 line = line.rstrip()
|
|
390 if not line or line.startswith( '#' ): continue
|
|
391
|
|
392 fastq_block_lines = ( fastq_block_lines + 1 ) % 4
|
|
393 line_startswith = line[0:1]
|
|
394
|
|
395 if fastq_block_lines == 1:
|
|
396 # first line is @title_of_seq
|
|
397 if not seq_title_startswith:
|
|
398 seq_title_startswith = line_startswith
|
|
399
|
|
400 if line_startswith != seq_title_startswith:
|
|
401 outfile_seq.close()
|
|
402 outfile_score.close()
|
|
403 stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
|
|
404
|
|
405 read_title = line[1:]
|
|
406 outfile_seq.write( '>%s\n' % line[1:] )
|
|
407
|
|
408 elif fastq_block_lines == 2:
|
|
409 # second line is nucleotides
|
|
410 read_length = len( line )
|
|
411 outfile_seq.write( '%s\n' % line )
|
|
412
|
|
413 elif fastq_block_lines == 3:
|
|
414 # third line is +title_of_qualityscore ( might be skipped )
|
|
415 if not qual_title_startswith:
|
|
416 qual_title_startswith = line_startswith
|
|
417
|
|
418 if line_startswith != qual_title_startswith:
|
|
419 outfile_seq.close()
|
|
420 outfile_score.close()
|
|
421 stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
|
|
422
|
|
423 quality_title = line[1:]
|
|
424 if quality_title and read_title != quality_title:
|
|
425 outfile_seq.close()
|
|
426 outfile_score.close()
|
|
427 stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )
|
|
428
|
|
429 if not quality_title:
|
|
430 outfile_score.write( '>%s\n' % read_title )
|
|
431 else:
|
|
432 outfile_score.write( '>%s\n' % line[1:] )
|
|
433
|
|
434 else:
|
|
435 # fourth line is quality scores
|
|
436 qual = ''
|
|
437 fastq_integer = True
|
|
438 # peek: ascii or digits?
|
|
439 val = line.split()[0]
|
|
440 try:
|
|
441 check = int( val )
|
|
442 fastq_integer = True
|
|
443 except:
|
|
444 fastq_integer = False
|
|
445
|
|
446 if fastq_integer:
|
|
447 # digits
|
|
448 qual = line
|
|
449 else:
|
|
450 # ascii
|
|
451 quality_score_length = len( line )
|
|
452 if quality_score_length == read_length + 1:
|
|
453 # first char is qual_score_startswith
|
|
454 qual_score_startswith = ord( line[0:1] )
|
|
455 line = line[1:]
|
|
456 elif quality_score_length == read_length:
|
|
457 qual_score_startswith = default_coding_value
|
|
458 else:
|
|
459 stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )
|
|
460
|
|
461 for j, char in enumerate( line ):
|
|
462 score = ord( char ) - qual_score_startswith # 64
|
|
463 qual = "%s%s " % ( qual, str( score ) )
|
|
464
|
|
465 outfile_score.write( '%s\n' % qual )
|
|
466
|
|
467 outfile_seq.close()
|
|
468 outfile_score.close()
|
|
469
|
|
470 return True
|
|
471
|
|
472 def __main__():
|
|
473
|
|
474 # SHRiMP path
|
|
475 shrimp = 'rmapper-ls'
|
|
476
|
|
477 # I/O
|
|
478 input_target_file = sys.argv[1] # fasta
|
|
479 shrimp_outfile = sys.argv[2] # shrimp output
|
|
480 table_outfile = sys.argv[3] # table output
|
|
481 single_or_paired = sys.argv[4].split(',')
|
|
482
|
|
483 insertion_size = 600
|
|
484
|
|
485 if len(single_or_paired) == 1: # single or paired
|
|
486 type_of_reads = 'single'
|
|
487 hit_per_read = 1
|
|
488 input_query = single_or_paired[0]
|
|
489 query_fasta = tempfile.NamedTemporaryFile().name
|
|
490 query_qual = tempfile.NamedTemporaryFile().name
|
|
491
|
|
492 else: # paired-end
|
|
493 type_of_reads = 'paired'
|
|
494 hit_per_read = 2
|
|
495 input_query_end1 = single_or_paired[0]
|
|
496 input_query_end2 = single_or_paired[1]
|
|
497 insertion_size = int(single_or_paired[2])
|
|
498 query_fasta_end1 = tempfile.NamedTemporaryFile().name
|
|
499 query_fasta_end2 = tempfile.NamedTemporaryFile().name
|
|
500 query_qual_end1 = tempfile.NamedTemporaryFile().name
|
|
501 query_qual_end2 = tempfile.NamedTemporaryFile().name
|
|
502
|
|
503 # SHRiMP parameters: total = 15, default values
|
|
504 spaced_seed = '111111011111'
|
|
505 seed_matches_per_window = '2'
|
|
506 seed_hit_taboo_length = '4'
|
|
507 seed_generation_taboo_length = '0'
|
|
508 seed_window_length = '115.0'
|
|
509 max_hits_per_read = '100'
|
|
510 max_read_length = '1000'
|
|
511 kmer = '-1'
|
|
512 sw_match_value = '100'
|
|
513 sw_mismatch_value = '-150'
|
|
514 sw_gap_open_ref = '-400'
|
|
515 sw_gap_open_query = '-400'
|
|
516 sw_gap_ext_ref = '-70'
|
|
517 sw_gap_ext_query = '-70'
|
|
518 sw_hit_threshold = '68.0'
|
|
519
|
|
520 # TODO: put the threshold on each of these parameters
|
|
521 if len(sys.argv) > 5:
|
|
522
|
|
523 try:
|
|
524 if sys.argv[5].isdigit():
|
|
525 spaced_seed = sys.argv[5]
|
|
526 else:
|
|
527 stop_err('Error in assigning parameter: Spaced seed.')
|
|
528 except:
|
|
529 stop_err('Spaced seed must be a combination of 1s and 0s.')
|
|
530
|
|
531 seed_matches_per_window = sys.argv[6]
|
|
532 seed_hit_taboo_length = sys.argv[7]
|
|
533 seed_generation_taboo_length = sys.argv[8]
|
|
534 seed_window_length = sys.argv[9]
|
|
535 max_hits_per_read = sys.argv[10]
|
|
536 max_read_length = sys.argv[11]
|
|
537 kmer = sys.argv[12]
|
|
538 sw_match_value = sys.argv[13]
|
|
539 sw_mismatch_value = sys.argv[14]
|
|
540 sw_gap_open_ref = sys.argv[15]
|
|
541 sw_gap_open_query = sys.argv[16]
|
|
542 sw_gap_ext_ref = sys.argv[17]
|
|
543 sw_gap_ext_query = sys.argv[18]
|
|
544 sw_hit_threshold = sys.argv[19]
|
|
545
|
|
546 # temp file for shrimp log file
|
|
547 shrimp_log = tempfile.NamedTemporaryFile().name
|
|
548
|
|
549 # convert fastq to fasta and quality score files
|
|
550 if type_of_reads == 'single':
|
|
551 return_value = convert_fastqsolexa_to_fasta_qual(input_query, query_fasta, query_qual)
|
|
552 else:
|
|
553 return_value = convert_fastqsolexa_to_fasta_qual(input_query_end1, query_fasta_end1, query_qual_end1)
|
|
554 return_value = convert_fastqsolexa_to_fasta_qual(input_query_end2, query_fasta_end2, query_qual_end2)
|
|
555
|
|
556 # SHRiMP command
|
|
557 if type_of_reads == 'single':
|
|
558 command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
|
|
559
|
|
560 try:
|
|
561 os.system(command)
|
|
562 except Exception, e:
|
|
563 if os.path.exists(query_fasta): os.remove(query_fasta)
|
|
564 if os.path.exists(query_qual): os.remove(query_qual)
|
|
565 stop_err(str(e))
|
|
566
|
|
567 else: # paired
|
|
568 command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
|
|
569 command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log])
|
|
570
|
|
571 try:
|
|
572 os.system(command_end1)
|
|
573 os.system(command_end2)
|
|
574 except Exception, e:
|
|
575 if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
|
|
576 if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
|
|
577 if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
|
|
578 if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
|
|
579 stop_err(str(e))
|
|
580
|
|
581 # check SHRiMP output: count number of lines
|
|
582 num_hits = 0
|
|
583 if shrimp_outfile:
|
|
584 for i, line in enumerate(file(shrimp_outfile)):
|
|
585 line = line.rstrip('\r\n')
|
|
586 if not line or line.startswith('#'): continue
|
|
587 try:
|
|
588 fields = line.split()
|
|
589 num_hits += 1
|
|
590 except Exception, e:
|
|
591 stop_err(str(e))
|
|
592
|
|
593 if num_hits == 0: # no hits generated
|
|
594 err_msg = ''
|
|
595 if shrimp_log:
|
|
596 for i, line in enumerate(file(shrimp_log)):
|
|
597 if line.startswith('error'): # deal with memory error:
|
|
598 err_msg += line # error: realloc failed: Cannot allocate memory
|
|
599 if re.search('Reads Matched', line): # deal with zero hits
|
|
600 if int(line[8:].split()[2]) == 0:
|
|
601 err_msg = 'Zero hits found.\n'
|
|
602 stop_err('SHRiMP Failed due to:\n' + err_msg)
|
|
603
|
|
604 # convert to table
|
|
605 if type_of_reads == 'single':
|
|
606 return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size)
|
|
607 else:
|
|
608 return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size)
|
|
609
|
|
610 # remove temp. files
|
|
611 if type_of_reads == 'single':
|
|
612 if os.path.exists(query_fasta): os.remove(query_fasta)
|
|
613 if os.path.exists(query_qual): os.remove(query_qual)
|
|
614 else:
|
|
615 if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
|
|
616 if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
|
|
617 if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
|
|
618 if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
|
|
619
|
|
620 if os.path.exists(shrimp_log): os.remove(shrimp_log)
|
|
621
|
|
622
|
|
623 if __name__ == '__main__': __main__()
|
|
624
|