annotate fastq_paired_end_joiner.py @ 1:270a8ed8a300 draft

Uploaded tool version 2.0.0 by Simone Leo.
author devteam
date Mon, 07 Jul 2014 15:37:28 -0400
parents 2793d1d765b9
children 6a7f5da7c76d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
1 """
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
2 Extended version of Dan Blankenberg's fastq joiner ( adds support for
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
3 recent Illumina headers ).
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
4 """
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
5
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
6 import sys, re
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
7 import galaxy_utils.sequence.fastq as fq
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
8
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
9
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
10 class IDManager( object ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
11
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
12 def __init__( self, sep="\t" ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
13 """
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
14 Recent Illumina FASTQ header format::
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
15
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
16 @<COORDS> <FLAGS>
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
17 COORDS = <Instrument>:<Run #>:<Flowcell ID>:<Lane>:<Tile>:<X>:<Y>
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
18 FLAGS = <Read>:<Is Filtered>:<Control Number>:<Index Sequence>
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
19
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
20 where the whitespace character between <COORDS> and <FLAGS> can be
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
21 either a space or a tab.
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
22 """
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
23 self.sep = sep
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
24
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
25 def parse_id( self, identifier ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
26 try:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
27 coords, flags = identifier.strip()[1:].split( self.sep, 1 )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
28 except ValueError:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
29 raise RuntimeError( "bad identifier: %r" % ( identifier, ))
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
30 return coords.split( ":" ), flags.split( ":" )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
31
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
32 def join_id( self, parsed_id ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
33 coords, flags = parsed_id
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
34 return "@%s%s%s" % ( ":".join( coords ), self.sep, ":".join( flags ))
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
35
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
36 def get_read_number( self, parsed_id ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
37 return int( parsed_id[1][0] )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
38
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
39 def set_read_number( self, parsed_id, n ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
40 parsed_id[1][0] = "%d" % n
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
41
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
42 def get_paired_identifier( self, read ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
43 t = self.parse_id( read.identifier )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
44 n = self.get_read_number( t )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
45 if n == 1:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
46 pn = 2
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
47 elif n == 2:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
48 pn = 1
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
49 else:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
50 raise RuntimeError( "Unknown read number '%d'" % n )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
51 self.set_read_number( t, pn )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
52 return self.join_id( t )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
53
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
54
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
55 class FastqJoiner( fq.fastqJoiner ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
56
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
57 def __init__( self, format, force_quality_encoding=None, sep="\t" ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
58 super( FastqJoiner, self ).__init__( format, force_quality_encoding )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
59 self.id_manager = IDManager( sep )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
60
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
61 def join( self, read1, read2 ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
62 force_quality_encoding = self.force_quality_encoding
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
63 if not force_quality_encoding:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
64 if read1.is_ascii_encoded():
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
65 force_quality_encoding = 'ascii'
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
66 else:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
67 force_quality_encoding = 'decimal'
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
68 read1 = read1.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
69 read2 = read2.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
70 #--
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
71 t1, t2 = [ self.id_manager.parse_id( r.identifier ) for r in ( read1, read2 ) ]
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
72 if self.id_manager.get_read_number( t1 ) == 2:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
73 if not self.id_manager.get_read_number( t2 ) == 1:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
74 raise RuntimeError( "input files are not from mated pairs" )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
75 read1, read2 = read2, read1
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
76 t1, t2 = t2, t1
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
77 #--
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
78 rval = fq.FASTQ_FORMATS[self.format]()
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
79 rval.identifier = read1.identifier
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
80 rval.description = "+"
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
81 if len( read1.description ) > 1:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
82 rval.description += rval.identifier[1:]
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
83 if rval.sequence_space == 'color':
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
84 # convert to nuc space, join, then convert back
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
85 rval.sequence = rval.convert_base_to_color_space(
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
86 read1.convert_color_to_base_space( read1.sequence ) +
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
87 read2.convert_color_to_base_space( read2.sequence )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
88 )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
89 else:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
90 rval.sequence = read1.sequence + read2.sequence
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
91 if force_quality_encoding == 'ascii':
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
92 rval.quality = read1.quality + read2.quality
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
93 else:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
94 rval.quality = "%s %s" % (
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
95 read1.quality.strip(), read2.quality.strip()
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
96 )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
97 return rval
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
98
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
99 def get_paired_identifier( self, read ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
100 return self.id_manager.get_paired_identifier( read )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
101
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
102
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
103 def sniff_sep( fastq_fn ):
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
104 header = ""
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
105 with open( fastq_fn ) as f:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
106 while header == "":
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
107 try:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
108 header = f.next().strip()
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
109 except StopIteration:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
110 raise RuntimeError( "%r: empty file" % ( fastq_fn, ) )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
111 return re.search( r"\s", header ).group()
0
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
112
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
113 def main():
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
114 #Read command line arguments
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
115 input1_filename = sys.argv[1]
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
116 input1_type = sys.argv[2] or 'sanger'
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
117 input2_filename = sys.argv[3]
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
118 input2_type = sys.argv[4] or 'sanger'
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
119 output_filename = sys.argv[5]
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
120
1
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
121 fastq_style = sys.argv[6] or 'old'
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
122 #--
0
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
123 if input1_type != input2_type:
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
124 print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type )
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
125
1
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
126 if fastq_style == 'new':
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
127 sep = sniff_sep( input1_filename )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
128 joiner = FastqJoiner( input1_type, sep=sep )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
129 else:
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
130 joiner = fq.fastqJoiner( input1_type )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
131 #--
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
132 input2 = fq.fastqNamedReader( open( input2_filename, 'rb' ), input2_type )
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
133 out = fq.fastqWriter( open( output_filename, 'wb' ), format=input1_type )
0
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
134 i = None
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
135 skip_count = 0
1
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
136 for i, fastq_read in enumerate( fq.fastqReader( open( input1_filename, 'rb' ), format=input1_type ) ):
0
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
137 identifier = joiner.get_paired_identifier( fastq_read )
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
138 fastq_paired = input2.get( identifier )
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
139 if fastq_paired is None:
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
140 skip_count += 1
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
141 else:
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
142 out.write( joiner.join( fastq_read, fastq_paired ) )
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
143 out.close()
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
144
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
145 if i is None:
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
146 print "Your file contains no valid FASTQ reads."
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
147 else:
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
148 print input2.has_data()
1
270a8ed8a300 Uploaded tool version 2.0.0 by Simone Leo.
devteam
parents: 0
diff changeset
149 print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, ( i - skip_count + 1 ) / ( i + 1 ) * 100.0 )
0
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
150
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
151 if __name__ == "__main__":
2793d1d765b9 Imported from capsule None
devteam
parents:
diff changeset
152 main()