comparison lastz_paired_reads_wrapper.py @ 0:96825cee5c25 draft

Uploaded tarball
author devteam
date Mon, 26 Nov 2012 09:48:27 -0500
1 #!/usr/bin/env python
2
3 """
4 Runs Lastz paired read alignment process
5 Written for Lastz v. 1.02.00.
6
7 # Author(s): based on various scripts written by Bob Harris (rsharris@bx.psu.edu),
8 # then tweaked to this form by Greg Von Kuster (greg@bx.psu.edu)
9
10 This tool takes the following input:
11 a. A collection of 454 paired end reads ( a fasta file )
12 b. A linker sequence ( a very small fasta file )
13 c. A reference genome ( nib, 2bit or fasta )
14
15 and uses the following process:
16 1. Split reads into mates: the input to this step is the read file XXX.fasta, and the output is three
17 files: XXX.short.fasta, XXX.long.fasta and XXX.mapping. The mapping file records the information necessary
18 to convert mate coordinates back into the original read, which is needed later in the process.
19
20 2. Align short mates to the reference: this runs lastz against every chromosome. The input is XXX.short.fasta
21 and the reference genome, and the output is a SAM file, XXX.short.sam.
22
23 3. Align long mates to the reference: this runs lastz against every chromosome. The input is XXX.long.fasta
24 and the reference genome, and the output is a SAM file, XXX.long.sam.
25
26 4. Combine, and convert mate coordinates back to read coordinates. The input is XXX.mapping, XXX.short.sam and
27 XXX.long.sam, and the output is XXX.sam.
28
29 usage: lastz_paired_reads_wrapper.py [options]
30 --ref_name: The reference name to change all output matches to
31 --ref_source: The reference is cached or from the history
32 --source_select: Use pre-set or cached reference file
33 --input1: The name of the reference file if using history or reference base name if using cached
34 --input2: The reads file to align
35 --input3: The sequencing linker file
36 --input4: The base quality score 454 file
37 --ref_sequences: The number of sequences in the reference file if using one from history
38 --output: The name of the output file
39 --lastz_seqs_file_dir: Directory of local lastz_seqs.loc file
40 """
41 import optparse, os, subprocess, shutil, sys, tempfile, time
42 from string import maketrans
43
44 from galaxy import eggs
45 import pkg_resources
46 pkg_resources.require( 'bx-python' )
47 from bx.seq.twobit import *
48 from bx.seq.fasta import FastaReader
49 from galaxy.util.bunch import Bunch
50 from galaxy.util import string_as_bool
51
52 # Column indexes for SAM required fields
53 SAM_QNAME_COLUMN = 0
54 SAM_FLAG_COLUMN = 1
55 SAM_RNAME_COLUMN = 2
56 SAM_POS_COLUMN = 3
57 SAM_MAPQ_COLUMN = 4
58 SAM_CIGAR_COLUMN = 5
59 SAM_MRNM_COLUMN = 6
60 SAM_MPOS_COLUMN = 7
61 SAM_ISIZE_COLUMN = 8
62 SAM_SEQ_COLUMN = 9
63 SAM_QUAL_COLUMN = 10
64 SAM_MIN_COLUMNS = 11
65 # SAM bit-encoded flags
66 BAM_FPAIRED = 1 # the read is paired in sequencing, no matter whether it is mapped in a pair
67 BAM_FPROPER_PAIR = 2 # the read is mapped in a proper pair
68 BAM_FUNMAP = 4 # the read itself is unmapped; conflicts with BAM_FPROPER_PAIR
69 BAM_FMUNMAP = 8 # the mate is unmapped
70 BAM_FREVERSE = 16 # the read is mapped to the reverse strand
71 BAM_FMREVERSE = 32 # the mate is mapped to the reverse strand
72 BAM_FREAD1 = 64 # this is read1
73 BAM_FREAD2 = 128 # this is read2
74 BAM_FSECONDARY = 256 # not primary alignment
75 BAM_FQCFAIL = 512 # QC failure
76 BAM_FDUP = 1024 # optical or PCR duplicate
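# Example of how these flags combine: the flag 81 seen in the sample SAM output
# in paired_mate_unmapper()'s docstring is BAM_FPAIRED + BAM_FREVERSE + BAM_FREAD1
# ( 1 + 16 + 64 ), i.e. a paired, reverse-strand, first-of-pair read.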
77
78 # Keep track of all created temporary files so they can be deleted
79 global tmp_file_names
80 tmp_file_names = []
81 # The values in the skipped_lines dict are lists consisting of:
82 # - the number of skipped lines for that error
83 # If not a sequence error:
84 # - the 1st line number on which the error was found
85 # - the text of the 1st line on which the error was found
86 # If a sequence error:
87 # - The number of the sequence in the file
88 # - the sequence name on which the error occurred
89 # We may need to improve dealing with file position and text as
90 # much of it comes from temporary files that are created from the
91 # inputs, and not the inputs themselves, so this could be confusing
92 # to the user.
93 global skipped_lines
94 skipped_lines = dict( bad_interval=[ 0, 0, '' ],
95 inconsistent_read_lengths=[ 0, 0, '' ],
96 inconsistent_reads=[ 0, 0, '' ],
97 inconsistent_sizes=[ 0, 0, '' ],
98 missing_mate=[ 0, 0, '' ],
99 missing_quals=[ 0, 0, '' ],
100 missing_seq=[ 0, 0, '' ],
101 multiple_seqs=[ 0, 0, '' ],
102 no_header=[ 0, 0, '' ],
103 num_fields=[ 0, 0, '' ],
104 reads_paired=[ 0, 0, '' ],
105 sam_flag=[ 0, 0, '' ],
106 sam_headers=[ 0, 0, '' ],
107 sam_min_columns=[ 0, 0, '' ],
108 two_mate_names=[ 0, 0, '' ],
109 wrong_seq_len=[ 0, 0, '' ] )
110 global total_skipped_lines
111 total_skipped_lines = 0
112
113 def stop_err( msg ):
114 sys.stderr.write( "%s" % msg )
115 sys.exit()
116
117 def skip_line( error_key, position, text ):
# Declare the module-level counter, otherwise the += below would raise
# UnboundLocalError; the per-error entries must be lists (not tuples) so the
# item assignments here can mutate them in place.
global total_skipped_lines
118 if not skipped_lines[ error_key ][2]:
119 skipped_lines[ error_key ][1] = position
120 skipped_lines[ error_key ][2] = text
121 skipped_lines[ error_key ][0] += 1
122 total_skipped_lines += 1
123
124 def get_tmp_file_name( dir=None, suffix=None ):
125 """
126 Return a unique temporary file name that can be managed. The
127 file must be manually removed after use.
128 """
129 if dir and suffix:
130 tmp_fd, tmp_name = tempfile.mkstemp( dir=dir, suffix=suffix )
131 elif dir:
132 tmp_fd, tmp_name = tempfile.mkstemp( dir=dir )
133 elif suffix:
134 tmp_fd, tmp_name = tempfile.mkstemp( suffix=suffix )
else:
# Without this branch, tmp_fd and tmp_name would be undefined when neither
# dir nor suffix is given
tmp_fd, tmp_name = tempfile.mkstemp()
135 os.close( tmp_fd )
136 tmp_file_names.append( tmp_name )
137 return tmp_name
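# Typical use of get_tmp_file_name(), as seen throughout this script:
#   tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' )
# The name is queued in tmp_file_names so __main__() can remove it at the end.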
138
139 def run_command( command ):
140 proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE )
# communicate() consumes stderr while waiting for the process to exit, which
# drops the redundant second wait() and avoids the deadlock possible when a
# command fills the stderr PIPE buffer
141 stdout, stderr = proc.communicate()
144 if stderr:
145 stop_err( stderr )
146
147 def split_paired_reads( input2, combined_linker_file_name ):
148 """
149 Given a fasta file of allegedly paired end reads ( input2 ), and a list of intervals
150 showing where the linker is on each read ( combined_linker_file_name ), split the reads into left and right
151 halves.
152
153 The input intervals look like this. Note that they may include multiple intervals for the same read
154 ( which should overlap ), and we use the union of them as the linker interval. Non-overlaps are
155 reported to the user, and those reads are not processed. Starts are origin zero.
156
157 #name start len size
158 FG3OYDA05FTEES 219 42 283
159 FG3OYDA05FVOLL 263 41 416
160 FG3OYDA05FFL7J 81 42 421
161 FG3OYDA05FOQWE 55 42 332
162 FG3OYDA05FV4DW 297 42 388
163 FG3OYDA05FWAQV 325 42 419
164 FG3OYDA05FVLGA 90 42 367
165 FG3OYDA05FWJ71 58 42 276
166
167 The output gives each half-sequence on a separate line, like this. This allows easy sorting of the
168 sequences by length, after the fact.
169
170 219 FG3OYDA05FTEES_L TTTAGTTACACTTAACTCACTTCCATCCTCTAAATACGTGATTACCTTTC...
171 22 FG3OYDA05FTEES_R CCTTCCTTAAGTCCTAAAACTG
172 """
173 # Bob says these should be hard-coded.
174 seq_len_lower_threshold = 17
175 short_mate_cutoff = 50
176 # We need to pass the name of this file back to the caller.
177 tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' )
178 mates_file = file( tmp_mates_file_name, "w+b" )
179 # Read the linker intervals
180 combined_linker_file = file( combined_linker_file_name, "rb" )
181 read_to_linker_dict = {}
182 i = 0
183 for i, line in enumerate( combined_linker_file ):
184 line = line.strip()
185 if line.startswith( "#" ):
186 continue
187 if line.find( '#' ) >= 0:
188 line = line.split( "#", 1 )[0].rstrip()
189 fields = line.split()
190 if len( fields ) != 4:
191 skip_line( 'num_fields', i+1, line )
192 continue
193 name, start, length, size = fields
194 start = int( start )
195 length = int( length )
196 size = int( size )
197 end = start + length
198 if end > size:
199 skip_line( 'bad_interval', i+1, line )
200 continue
201 if name not in read_to_linker_dict:
202 read_to_linker_dict[ name ] = ( start, end, size )
203 continue
204 if read_to_linker_dict[ name ] == None:
205 # Read previously marked as non-overlapping intervals, so skip this sequence - see below
206 continue
207 ( s, e, sz ) = read_to_linker_dict[ name ]
208 if sz != size:
209 skip_line( 'inconsistent_sizes', i+1, name )
210 continue
211 if s > end or e < start:
212 # Non-overlapping intervals, so skip this sequence
213 read_to_linker_dict[ name ] = None
214 continue
215 read_to_linker_dict[ name ] = ( min( s, start ), max( e, end ), size )
216 combined_linker_file.close()
217 # We need to pass the name of this file back to the caller.
218 tmp_mates_mapping_file_name = get_tmp_file_name( suffix='mates.mapping' )
219 mates_mapping_file = file( tmp_mates_mapping_file_name, 'w+b' )
220 # Process the sequences
221 seqs = 0
222 fasta_reader = FastaReader( file( input2, 'rb' ) )
223 while True:
224 seq = fasta_reader.next()
225 if not seq:
226 break
227 seqs += 1
228 if seq.name not in read_to_linker_dict:
229 if seq.length > seq_len_lower_threshold:
230 mates_file.write( "%-3d %s %s\n" % ( seq.length, seq.name, seq.text ) )
231 read_to_linker_dict[ seq.name ] = ""
232 continue
233 if read_to_linker_dict[ seq.name ] == "":
234 skip_line( 'multiple_seqs', seqs, seq.name )
235 continue
236 if read_to_linker_dict[ seq.name ] == None:
237 # Read previously marked as non-overlapping intervals, so skip this sequence - see above
238 continue
239 ( start, end, size ) = read_to_linker_dict[ seq.name ]
240 if seq.length != size:
241 skip_line( 'wrong_seq_len', seqs, seq.name )
242 continue
243 left = seq.text[ :start ]
244 right = seq.text[ end: ]
245 left_is_small = len( left ) <= seq_len_lower_threshold
246 right_is_small = len( right ) <= seq_len_lower_threshold
247 if left_is_small and right_is_small:
248 continue
249 if not left_is_small:
250 mates_file.write( "%-3d %s %s\n" % ( len( left ), seq.name + "_L", left ) )
251 mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_L", seq.name, 0, size - start ) )
252 if not right_is_small:
253 mates_file.write( "%-3d %s %s\n" % ( len( right ), seq.name + "_R", right ) )
254 mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_R", seq.name, end, 0 ) )
255 read_to_linker_dict[ seq.name ] = ""
257 mates_file.close()
258 mates_mapping_file.close()
259 # Create temporary files for short and long mates
260 tmp_mates_short_file_name = get_tmp_file_name( suffix='mates.short' )
261 tmp_mates_long_file_name = get_tmp_file_name( suffix='mates.long' )
262 tmp_mates_short = open( tmp_mates_short_file_name, 'w+b' )
263 tmp_mates_long = open( tmp_mates_long_file_name, 'w+b' )
264 i = 0
265 for i, line in enumerate( file( tmp_mates_file_name, 'rb' ) ):
266 fields = line.split()
267 seq_len = int( fields[0] )
268 seq_name = fields[1]
269 seq_text = fields[2]
270 if seq_len <= short_mate_cutoff:
271 tmp_mates_short.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
272 else:
273 tmp_mates_long.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
274 tmp_mates_short.close()
275 tmp_mates_long.close()
276 return tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name
277
278 def align_mates( input1, ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ):
279 tmp_align_file_names = []
280 if ref_source == 'history':
281 # Reference is a fasta dataset from the history
282 # Create temporary files to contain the output from lastz executions
283 tmp_short_file_name = get_tmp_file_name( suffix='short_out' )
284 tmp_align_file_names.append( tmp_short_file_name )
285 tmp_long_file_name = get_tmp_file_name( suffix='long_out' )
286 tmp_align_file_names.append( tmp_long_file_name )
287 seqs = 0
288 fasta_reader = FastaReader( open( input1 ) )
289 while True:
290 # Read the next sequence from the reference dataset. Note that if the reference contains
291 # a small number of chromosomes this loop is ok, but in many cases the genome has a bunch
292 # of small straggler scaffolds and contigs and it is a computational waste to do each one
293 # of these in its own run. There is an I/O down side to running by subsets (even if they are
294 # one sequence per subset), compared to splitting the reference into sizes of 250 mb. With
295 # the subset action, lastz still has to read and parse the entire file for every run (this
296 # is true for fasta, but for .2bit files it can access each sequence directly within the file,
297 # so the overhead is minimal).
298 """
299 :> output_file (this creates the output file, empty)
300 while there are more sequences to align
301 find the next sequences that add up to 250M, put their names in farf.names
302 lastz ${refFile}[subset=farf.names][multi][unmask] ${matesPath}/${matesFile} ...
303 >> output_file
304 """
305 seq = fasta_reader.next()
306 if not seq:
307 break
308 seqs += 1
309 # Create a temporary file to contain the current sequence as input to lastz.
310 # We're doing this a bit differently here since we could be generating a huge
311 # number of temporary files.
312 tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs )
313 tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' )
314 tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
315 tmp_in_file.close()
316 # Align short mates
317 command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_short_file_name )
318 command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
319 command += '>> %s' % tmp_short_file_name
320 run_command( command )
321 # Align long mates
322 command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_long_file_name )
323 command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
324 command += '>> %s' % tmp_long_file_name
325 run_command( command )
326 # Remove the temporary file that contains the current sequence
327 os.remove( tmp_in_file_name )
328 else:
329 # Reference is a locally cached 2bit file, split lastz calls across number of chroms in 2bit file
330 tbf = TwoBitFile( open( input1, 'rb' ) )
331 for chrom in tbf.keys():
332 # Align short mates
333 tmp_short_file_name = get_tmp_file_name( suffix='short_vs_%s' % chrom )
334 tmp_align_file_names.append( tmp_short_file_name )
335 command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_short_file_name )
336 command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
337 command += '> %s' % tmp_short_file_name
338 run_command( command )
339 # Align long mates
340 tmp_long_file_name = get_tmp_file_name( suffix='long_vs_%s' % chrom )
341 tmp_align_file_names.append( tmp_long_file_name )
342 command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_long_file_name )
343 command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
344 command += '> %s' % tmp_long_file_name
345 run_command( command )
346 return tmp_align_file_names
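# A rough gloss of the lastz settings used above, to the best of our reading of
# lastz's documentation: Z (step) and W / --seed control seeding density, O / E
# are gap open / extend penalties, X / Y are ungapped / gapped drop-off limits,
# K / L are HSP and gapped-alignment score thresholds, and --identity,
# --coverage and --continuity filter the final alignments; --format=softsam-
# emits headerless SAM with soft clipping. The short-mate run seeds more
# densely (Z=1) than the long-mate run (Z=15 W=13).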
347
348 def paired_mate_unmapper( input2, input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, output ):
349 """
350 Given a SAM file corresponding to alignments of *subsegments* of paired 'reads' to a reference sequence,
351 convert the positions on the subsegments to positions on the reads. Also (optionally) add quality values.
352
353 The input file is in SAM format, as shown below. Each line represents the alignment of a part of a read
354 to a reference sequence. Read pairs are indicated by suffixes in their names. Normally, the suffixes _L
355 and _R indicate the left and right mates of reads (this can be overridden with the --left and --right
356 options). Reads that were not mates have no suffix.
357
358 (SAM header lines omitted)
359 F2YP0BU02G7LK5_R 16 chr21 15557360 255 40M * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT *
360 F2YP0BU02HXV58_L 16 chr21 15952091 255 40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat *
361 F2YP0BU02HREML_R 0 chr21 16386077 255 33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc *
362 F2YP0BU02IOF1F_L 0 chr21 17567321 255 7S28M * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC *
363 F2YP0BU02IKX84_R 16 chr21 18491628 255 22M1D18M9S * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt *
364 F2YP0BU02GW5VA_L 16 chr21 20255344 255 6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA *
365 F2YP0BU02JIMJ4_R 0 chr21 22383051 255 19M * 0 0 CCCTTTATCATTTTTTATT *
366 F2YP0BU02IXZGF_L 16 chr21 23094798 255 13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT *
367 F2YP0BU02IODR5_L 0 chr21 30935325 255 37M * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA *
368 F2YP0BU02IMZBL_L 16 chr21 31603486 255 28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG *
369 F2YP0BU02JA9PR_L 16 chr21 31677159 255 23M * 0 0 CACACCTGTAACCCCAGCACTTT *
370 F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT *
371 F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT *
372 F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat *
373 F2YP0BU02JDCF1_L 0 chr21 31816600 255 38M * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT *
374 F2YP0BU02GZ1GO_R 0 chr21 33360122 255 6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC *
375 F2YP0BU02FX387_L 16 chr22 14786201 255 26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT *
376 F2YP0BU02IF2NE_R 0 chr22 16960842 255 40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc *
377 F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA *
378 F2YP0BU02HKC61_R 16 chr22 29516998 255 8S32M * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG *
379 F2YP0BU02FS4EM_R 0 chr22 30159364 255 29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG *
380 F2YP0BU02G197P_L 0 chr22 32044496 255 40M10S * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc *
381 F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG *
382 F2YP0BU02GUB9L_L 16 chr22 49198404 255 16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA *
383
384 The user must provide a mapping file (which might better be called an unmapping file). This file is usually
385 created by split_paired_reads, and tells us how to map the subsegments back to original coordinates in a single
386 read (this means the left and right mates were part of a single read). The mapping file contains four columns.
387 The first two give the mate's name (including the suffix) and the read name. The last two columns describe how
388 much of the full original sequence is missing from the mate. For example, in the read below, the left mate is
389 missing 63 on the right (42 for the linker and 21 for the right half). The right mate is missing 339 on the left.
390
391 left half: TTTCAACATATGCAAATCAATAAATGTAATCCAGCATATAAACAGAACCA
392 AAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAGGCCTTCGGC
393 AAAATTCAACAAAACTCCATGCTAAAACTCTCAATAAGGTATTGATGGGA
394 CATGCCGCATAATAATAAGACATATCTATGACAAACCCACAGCCAATATC
395 ATGCTGAATGCACAAAAATTGGAAGCATTCCCTTTGAAAACTGGCACAAG
396 ACTGGGATGCCCTCTCTCACAACTCCTATTCAACATAGTGTTGGAAG
397 linker: CGTAATAACTTCGTATAGCATACATTATACGAAGTCATACGA
398 right half: CTCCTGCCTCAGCCTCCCGAG
399
400 mate_name read_name offset_to_start offset_from_end
401 F2YP0BU02FS4EM_L F2YP0BU02FS4EM 0 71
402 F2YP0BU02FS4EM_R F2YP0BU02FS4EM 339 0
403
404 The user can also specify a quality scores file, which should look something like this. Quality values are presumed
405 to be PHRED scores, written in space-delimited decimal.
406
407 >F2YP0BU02FS4EM
408 38 38 38 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 38 21 21 21 40
409 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 33
410 32 32 40 40 40 21 21 18 18 21 34 34 31 40 40 40 40 40 40 40 40 40 40 40 40
411 40 40 40 40 40 40 40 40 40 40 40 32 32 32 32 40 40 40 40 40 40 40 34 34 35
412 31 31 28 28 33 33 33 36 36 36 17 17 17 19 26 36 36 36 40 40 40 40 40 33 34
413 34 34 39 39 39 40 40 40 40 40 33 33 34 34 40 40 40 40 40 40 40 39 39 39 40
414 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
415 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 39 39 39 40 40 40 40 40 40
416 40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 26 26 40 40 38 38 37 35 33
417 36 40 19 17 17 17 17 19 19 23 30 20 20 20 23 35 40 36 36 36 36 36 36 36 36
418 39 40 34 20 27 27 35 39 40 37 40 40 40 40 40 40 40 40 40 40 34 34 35 39 40
419 40 40 40 40 40 40 39 39 39 40 40 40 40 36 36 32 32 28 28 29 30 36 40 30 26
420 26 26 34 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39
421 40 39 35 34 34 40 40 40 40 30 30 30 35 40 40 40 40 40 39 39 36 40 40 40 40
422 39 39 39 39 30 30 28 35 35 39 40 40 40 40 40 35 35 35
423 >F2YP0BU02G197P
424 40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 40 40 40 40 40 40
425 40 40 40 40 26 26 26 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
426 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
427 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
428 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 34 34 34 40 40
429 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40
430 40 40 40 40 40 40 40 34 34 34 34 40 40 40 40 34 34 34 34 40 40 40 40 40 40
431 40 40 40 40 40 39 39 39 34 34 34 34 40 40 40 40 39 39 25 25 26 39 40 40 40
432 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
433 33 33 33 33 40 35 21 21 21 30 38 40 40 40 40 40 40 40 40 35 35 30 30 30 40
434 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
435 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40
436 40 40 40 39 39 39 40 40
437 >F2YP0BU02FIING
438 32 32 32 25 25 25 25 24 25 30 31 30 27 27 27 28 28 21 19 19 13 13 13 14 19
439 19 17 19 16 16 25 28 22 21 17 17 18 25 24 25 25 25
440
441 The output file is also SAM:
442
443 (SAM header lines omitted)
444 F2YP0BU02G7LK5 81 chr21 15557360 255 40M303H * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT D>>>>IIIIIIHHG???IIIIIIIIIHHHFFEIH999HII
445 F2YP0BU02HXV58 145 chr21 15952091 255 226H40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat AA===DDDDAAAAD???:::ABBBBBAAA:888ECF;F>>>?8??@
446 F2YP0BU02HREML 65 chr21 16386077 255 320H33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc HH???HHIIIHFHIIIIIIICDDHHIIIIIIHHHHHHH
447 F2YP0BU02IOF1F 129 chr21 17567321 255 7S28M409H * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC 4100<<A>4113:<EFGGGFFFHHHHHHDFFFFED
448 F2YP0BU02IKX84 81 chr21 18491628 255 22M1D18M9S341H * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt ;;;=7@.55------?2?11112GGB=CCCCDIIIIIIIIIHHHHHHII
449 F2YP0BU02GW5VA 145 chr21 20255344 255 286H6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA IIIIIIIHHHIIIIIIICCCCIIIIIIIIIIIIIIIII
450 F2YP0BU02JIMJ4 65 chr21 22383051 255 208H19M * 0 0 CCCTTTATCATTTTTTATT 555544E?GE113344I22
451 F2YP0BU02IXZGF 145 chr21 23094798 255 291H13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT IIIIIIIIIIIGG;;;GGHIIIIIGGGIIIII
452 F2YP0BU02IODR5 129 chr21 30935325 255 37M154H * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA 6...7/--..,30;9<<>@BFFFAAAAHIIIIIH@@@
453 F2YP0BU02IMZBL 145 chr21 31603486 255 342H28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG BB1552222<<>9==8;;?AA=??A???A
454 F2YP0BU02JA9PR 145 chr21 31677159 255 229H23M * 0 0 CACACCTGTAACCCCAGCACTTT IIIIIIIIIIICCCCIIIIIHHH
455 F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
456 F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
457 F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat >8888DFFHHGFHHHH@@?@?DDC96666HIIIFFFFFFFFFFF
458 F2YP0BU02JDCF1 129 chr21 31816600 255 38M103H * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT IIIIIIIIIIIHHHIIHHHIIIIIIIIIIIIIIIIIII
459 F2YP0BU02GZ1GO 65 chr21 33360122 255 76H6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC BBBBD?:688CFFFFFFFFFFFFFFFFFFFFFFFFFFDDBBB51
460 F2YP0BU02FX387 145 chr22 14786201 255 201H26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT IIHHHHHHHHHHHHHFFFFFFFFFFF
461 F2YP0BU02IF2NE 65 chr22 16960842 255 209H40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc BAAADDDDFDDDDDDBBA889<A?4444000@<>AA?9444;;8>77<7-
462 F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA FFF???FFFFFIIIIIIIIIIIIIIIIIIIIIIIHHIIFHFFFGDDB=5
463 F2YP0BU02HKC61 81 chr22 29516998 255 8S32M300H * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG 2444<<<<>?>><40009<:8888?A@AAA==:::DB@AA
464 F2YP0BU02FS4EM 65 chr22 30159364 255 339H29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG IIIIHHEIIIIHHHH??=DDHIIIIIDDD
465 F2YP0BU02G197P 129 chr22 32044496 255 40M10S258H * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc IIIIIIIIIIHHHHHHIIIIIIIIIIIII;;;IIIIIIIIIIIIIIIIII
466 F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG :::9:32267=:114244/...446==<<<?@?:9::::AAA
467 F2YP0BU02GUB9L 145 chr22 49198404 255 176H16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA IIIIIIIIIHAAC;<</////@4F5778;IIIIIIII
468
469 """
470 left_suffix = "_L"
471 right_suffix = "_R"
472 # Read the mapping
473 mate_to_read_dict = {}
474 i = 0
475 for i, line in enumerate( file( tmp_mates_mapping_file_name, 'rb' ) ):
476 line = line.strip()
477 if not line.startswith( "#" ):
478 fields = line.split()
479 if len( fields ) != 4:
480 skip_line( "num_fields", i+1, line )
481 continue
482 mate_name, read_name, s_offset, e_offset = fields
483 if mate_name in mate_to_read_dict:
484 skip_line( 'two_mate_names', i+1, mate_name )
485 continue
486 mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) )
487 # Read sequence data
488 read_to_nucs_dict = {}
489 seqs = 0
490 fasta_reader = FastaReader( file( input2, 'rb' ) )
491 while True:
492 seq = fasta_reader.next()
493 if not seq:
494 break
495 seqs += 1
496 seq_text_upper = seq.text.upper()
497 if seq.name in read_to_nucs_dict:
498 if seq_text_upper != read_to_nucs_dict[ seq.name ]:
499 skip_line( 'inconsistent_reads', seqs, seq.name )
500 continue
501 read_to_nucs_dict[ seq.name ] = seq_text_upper
502 # Read quality data
503 def quality_sequences( f ):
504 seq_name = None
505 seq_quals = None
506 line_number = 0
507 for line in f:
508 line_number += 1
509 line = line.strip()
510 if line.startswith( ">" ):
511 if seq_name != None:
512 yield ( seq_name, seq_quals, seq_line )
513 seq_name = sequence_name( line )
514 seq_line = line_number
515 seq_quals = []
516 elif seq_name is None:
517 skip_line( 'no_header', line_number, line )
518 continue
519 else:
520 seq_quals += [ int( q ) for q in line.split() ]
521 if seq_name is not None:
522 yield ( seq_name, seq_quals, seq_line )
523 def sequence_name( s ):
524 s = s[ 1: ].strip()
525 if not s:
526 return ""
527 else:
528 return s.split()[ 0 ]
529 read_to_quals_dict = {}
530 # TODO: should we use Dan's fastaNamedReader here?
531 for seq_name, quals, line_number in quality_sequences( file( input4 ) ):
532 quals = samify_phred_scores( quals )
533 if seq_name in read_to_quals_dict:
534 if quals != read_to_quals_dict[ seq_name ]:
535 skip_line( 'inconsistent_reads', line_number, seq_name )
536 continue
537 if len( quals ) != len( read_to_nucs_dict.get( seq_name, '' ) ): # .get: a qual name absent from the fasta is skipped, not a KeyError
538 skip_line( 'inconsistent_read_lengths', line_number, seq_name )
539 continue
540 read_to_quals_dict[ seq_name ] = quals
541 # process the SAM file
542 tmp_align_file_names = ' '.join( tmp_align_file_name_list )
543 combined_chrom_file_name = get_tmp_file_name( suffix='combined_chrom' )
544 command = 'cat %s | grep -v "^@" | sort -k 1 > %s' % ( tmp_align_file_names, combined_chrom_file_name )
545 run_command( command )
546 fout = file( output, 'w+b' )
547 has_non_header = False
548 i = 0
549 for i, line in enumerate( file( combined_chrom_file_name, 'rb' ) ):
550 line = line.strip()
551 if line.startswith( "@" ):
552 if has_non_header:
553 skip_line( 'sam_headers', i+1, line )
554 continue
555 fout.write( "%s\n" % line )
556 continue
557 has_non_header = True
558 fields = line.split()
559 num_fields = len( fields )
560 if num_fields < SAM_MIN_COLUMNS:
561 skip_line( 'sam_min_columns', i+1, line )
562 continue
563 # Set flags for mates
564 try:
565 flag = int( fields[ SAM_FLAG_COLUMN ] )
566 except ValueError:
567 skip_line( 'sam_flag', i+1, line )
568 continue
569 if flag & ( BAM_FPAIRED | BAM_FREAD1 | BAM_FREAD2 ):
570 skip_line( 'reads_paired', i+1, line )
571 continue
572 mate_name = fields[ SAM_QNAME_COLUMN ]
573 unmap_it = False
574 half = None
575 if mate_name.endswith( left_suffix ):
576 flag += BAM_FPAIRED + BAM_FREAD2
577 fields[ SAM_FLAG_COLUMN ] = "%d" % flag
578 unmap_it = True
579 half = "L"
580 elif mate_name.endswith( right_suffix ):
581 flag += BAM_FPAIRED + BAM_FREAD1
582 fields[ SAM_FLAG_COLUMN ] = "%d" % flag
583 unmap_it = True
584 half = "R"
585 on_plus_strand = ( flag & BAM_FREVERSE == 0 )
586 # Convert position from mate to read by adding clipping to cigar
587 if not unmap_it:
588 read_name = mate_name
589 else:
590 try:
591 read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ]
592 except KeyError:
593 skip_line( 'missing_mate', i+1, mate_name )
594 continue
595 cigar = fields[ SAM_CIGAR_COLUMN ]
596 cigar_prefix = None
597 cigar_suffix = None
598 if half == "L":
599 if on_plus_strand:
600 if s_offset > 0:
601 cigar_prefix = ( s_offset, "S" )
602 if e_offset > 0:
603 cigar_suffix = ( e_offset, "H" )
604 else:
605 if e_offset > 0:
606 cigar_prefix = ( e_offset, "H" )
607 if s_offset > 0:
608 cigar_suffix = ( s_offset, "S" )
609 elif half == "R":
610 if on_plus_strand:
611 if s_offset > 0:
612 cigar_prefix = ( s_offset, "H" )
613 if e_offset > 0:
614 cigar_suffix = ( e_offset, "S" )
615 else:
616 if e_offset > 0:
617 cigar_prefix = ( e_offset, "S" )
618 if s_offset > 0:
619 cigar_suffix = ( s_offset, "H" )
620 else:
621 if on_plus_strand:
622 if s_offset > 0:
623 cigar_prefix = ( s_offset, "S" )
624 if e_offset > 0:
625 cigar_suffix = ( e_offset, "S" )
626 else:
627 if e_offset > 0:
628 cigar_prefix = ( e_offset, "S" )
629 if s_offset > 0:
630 cigar_suffix = ( s_offset, "S" )
631 if cigar_prefix != None:
632 count, op = cigar_prefix
633 cigar = prefix_cigar( "%d%s" % ( count, op ), cigar )
634 if op == "S":
635 refPos = int( fields[ SAM_POS_COLUMN ] ) - count
636 fields[ SAM_POS_COLUMN ] = "%d" % refPos
637 if cigar_suffix != None:
638 count, op = cigar_suffix
639 cigar = suffix_cigar( cigar, "%d%s" % ( count, op ) )
640 fields[ SAM_QNAME_COLUMN ] = read_name
641 fields[ SAM_CIGAR_COLUMN ] = cigar
642 # Fetch sequence and quality values, and flip/clip them
643 if read_name not in read_to_nucs_dict:
644 skip_line( 'missing_seq', i+1, read_name )
645 continue
646 nucs = read_to_nucs_dict[ read_name ]
647 if not on_plus_strand:
648 nucs = reverse_complement( nucs )
649 quals = None
650 if read_to_quals_dict != None:
651 if read_name not in read_to_quals_dict:
652 skip_line( 'missing_quals', i+1, read_name )
653 continue
654 quals = read_to_quals_dict[ read_name ]
655 if not on_plus_strand:
656 quals = reverse_string( quals )
657 cigar = split_cigar( fields[ SAM_CIGAR_COLUMN ] )
658 nucs, quals = clip_for_cigar( cigar, nucs, quals )
659 fields[ SAM_SEQ_COLUMN ] = nucs
660 if quals != None:
661 fields[ SAM_QUAL_COLUMN ] = quals
662 # Output the line
663 fout.write( "%s\n" % "\t".join( fields ) )
664 fout.close()
665
666 def prefix_cigar( prefix, cigar ):
667 ix = 0
668 while cigar[ ix ].isdigit():
669 ix += 1
670 if cigar[ ix ] != prefix[ -1 ]:
671 return prefix + cigar
672 count = int( prefix[ :-1 ] ) + int( cigar[ :ix ] )
673 return "%d%s%s" % ( count, prefix[ -1 ], cigar[ ix+1: ] )
674
675 def suffix_cigar( cigar, suffix ):
676 if cigar[ -1 ] != suffix[ -1 ]:
677 return cigar + suffix
678 ix = len( cigar ) - 2
679 while cigar[ix].isdigit():
680 ix -= 1
681 ix += 1
682 count = int( cigar[ ix:-1 ] ) + int( suffix[ :-1 ] )
683 return "%s%d%s" % ( cigar[ :ix ], count, suffix[ -1 ] )
684
685 def split_cigar( text ):
686 fields = []
687 field = []
688 for ch in text:
689 if ch not in "MIDHS":
690 field += ch
691 continue
692 if field == []:
693 raise ValueError
694 fields += [ ( int( "".join( field ) ), ch ) ]
695 field = []
696 if field != []:
697 raise ValueError
698 return fields
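# Example: split_cigar( "22M1D18M9S" ) -> [ ( 22, 'M' ), ( 1, 'D' ), ( 18, 'M' ), ( 9, 'S' ) ]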
699
700 def clip_for_cigar( cigar, nucs, quals ):
701 # Hard clip prefix
702 count, op = cigar[0]
703 if op == "H":
704 nucs = nucs[ count: ]
705 if quals != None:
706 quals = quals[ count: ]
707 count, op = cigar[ 1 ]
708 # Soft clip prefix
709 if op == "S":
710 nucs = nucs[ :count ].lower() + nucs[ count: ]
711 # Hard clip suffix
712 count,op = cigar[ -1 ]
713 if op == "H":
714 nucs = nucs[ :-count ]
715 if quals != None:
716 quals = quals[ :-count ]
717 count, op = cigar[ -2 ]
718 # Soft clip suffix
719 if op == "S":
720 nucs = nucs[ :-count ] + nucs[ -count: ].lower()
721 return nucs, quals
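# Example: for CIGAR 8S32M300H ( read F2YP0BU02HKC61 in the sample output
# above ), the trailing 300 bases and their quals are hard-clipped away, and
# the first 8 bases are kept but lowercased, marking them as soft-clipped.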
722
723 def samify_phred_scores( quals ):
724 """
725 Convert a decimal list of phred base-quality scores to a sam quality string.
726 Note that if a quality is outside the dynamic range of sam's ability to
727 represent it, we clip the value to the max allowed. SAM quality scores
728 range from chr(33) to chr(126).
729 """
730 if min( quals ) >= 0 and max( quals ) <= 126-33:
731 return "".join( [ chr( 33 + q ) for q in quals ] )
732 else:
733 return "".join( [ chr( max( 33, min( 126, 33+q ) ) ) for q in quals ] )
734
735 def reverse_complement( nucs ):
736 complementMap = maketrans( "ACGTacgt", "TGCAtgca" )
737 return nucs[ ::-1 ].translate( complementMap )
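# Example: reverse_complement( "AACG" ) -> "CGTT"; the translation table
# covers both cases ( ACGTacgt ), so lowercased bases stay lowercased.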
738
739 def reverse_string( s ):
740 return s[ ::-1 ]
741
742 def __main__():
743 # Parse command line
744 # input1: a reference genome ( 2bit or fasta )
745 # input2: a collection of 454 paired end reads ( a fasta file )
746 # input3: a linker sequence ( a very small fasta file )
747 # input4: a base quality score 454 file ( qual454 )
748 parser = optparse.OptionParser()
749 parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' )
750 parser.add_option( '', '--ref_source', dest='ref_source', help='The reference is cached or from the history' )
751 parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' )
752 parser.add_option( '', '--source_select', dest='source_select', help='Use pre-set or cached reference file' )
753 parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' )
754 parser.add_option( '', '--input2', dest='input2', help='The 454 reads file to align' )
755 parser.add_option( '', '--input3', dest='input3', help='The sequencing linker file' )
756 parser.add_option( '', '--input4', dest='input4', help='The base quality score 454 file' )
757 parser.add_option( '', '--output', dest='output', help='The output file' )
758 parser.add_option( '', '--lastz_seqs_file_dir', dest='lastz_seqs_file_dir', help='Directory of local lastz_seqs.loc file' )
759 ( options, args ) = parser.parse_args()
760
761 # output version # of tool
762 try:
763 tmp = tempfile.NamedTemporaryFile().name
764 tmp_stdout = open( tmp, 'wb' )
765 proc = subprocess.Popen( args='lastz -v', shell=True, stdout=tmp_stdout )
766 tmp_stdout.close()
767 returncode = proc.wait()
768 stdout = None
769 for line in open( tmp_stdout.name, 'rb' ):
770 if line.lower().find( 'version' ) >= 0:
771 stdout = line.strip()
772 break
773 if stdout:
774 sys.stdout.write( '%s\n' % stdout )
775 else:
776 raise Exception
777 except:
778 sys.stdout.write( 'Could not determine Lastz version\n' )
779
780 if options.ref_name:
781 ref_name = '[nickname=%s]' % options.ref_name
782 else:
783 ref_name = ''
784 if options.ref_source == 'history':
785 # Reference is a fasta dataset from the history
786 try:
787 # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
788 error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes."
789 ref_sequences = int( options.ref_sequences )
790 if ref_sequences < 1:
791 stop_err( error_msg )
792 except:
793 stop_err( error_msg )
794 else:
795 ref_sequences = 0
796 tmp_w12_name = get_tmp_file_name( suffix='vs_linker.W12' )
797 tmp_T1_name = get_tmp_file_name( suffix='vs_linker.T1' )
798 # Run lastz twice ( with different options ) on the linker sequence and paired end reads,
799 # looking for the linker ( each run finds some the other doesn't )
800 command = 'lastz %s %s W=12 --notrans --exact=18 --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
801 ( options.input3, options.input2, tmp_w12_name )
802 run_command( command )
803 command = 'lastz %s %s T=1 --match=1,2 O=1 E=2 X=15 K=10 Y=15 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
804 ( options.input3, options.input2, tmp_T1_name )
805 run_command( command )
806 # Combine the alignment output from the two lastz runs
807 tmp_combined_linker_file_name = get_tmp_file_name( suffix='vs_linker' )
808 command = 'cat %s %s | sort -u > %s' % ( tmp_w12_name, tmp_T1_name, tmp_combined_linker_file_name )
809 run_command( command )
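# The four general- columns requested above ( name2, zstart2+, length2, size2 )
# are exactly the fields split_paired_reads() parses: read name, linker start
# ( origin-zero, forward strand ), linker length, and read size.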
810 # Use the alignment info to split reads into left and right mates
811 tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name = split_paired_reads( options.input2, tmp_combined_linker_file_name )
812 # Align mates to the reference - tmp_align_file_names is a list of file names created by align_mates()
813 tmp_align_file_name_list = align_mates( options.input1, options.ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name )
814 # Combine and convert mate coordinates back to read coordinates
815 paired_mate_unmapper( options.input2, options.input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, options.output )
816 # Delete all temporary files
817 for file_name in tmp_file_names:
818 os.remove( file_name )
819 # Handle any invalid lines in the input data
820 if total_skipped_lines:
821 msgs = dict( bad_interval="Bad interval in line",
822 inconsistent_read_lengths="Inconsistent read/quality lengths for seq #",
823 inconsistent_reads="Inconsistent reads for seq #",
824 inconsistent_sizes="Inconsistent sizes for seq #",
825 missing_mate="Mapping file does not include mate on line",
826 missing_quals="Missing quality values for name on line",
827 missing_seq="Missing sequence for name on line",
828 multiple_seqs="Multiple names for seq #",
829 no_header="First quality sequence has no header",
830 num_fields="Must have 4 fields in line",
831 reads_paired="SAM flag indicates reads already paired on line",
832 sam_flag="Bad SAM flag on line",
833 sam_headers="SAM headers on line",
834 sam_min_columns="Need 11 columns on line",
835 two_mate_names="Mate name already seen, line",
836 wrong_seq_len="Size differs from length of seq #" )
837 print "Skipped %d invalid lines: "
838 msg = ""
839 for k, v in skipped_lines.items():
840 if v[0]:
841 # v[0] is the number of times the error occurred
842 # v[1] is the position of the line or sequence in the file
843 # v[2] is the name of the sequence or the text of the line
844 msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] )
845 print msg
846
847 if __name__=="__main__": __main__()