comparison tools/fastq/fastq_paired_unpaired.py @ 3:6a14074bc810 draft

Uploaded v0.0.8, automated Biopython dependency handling via ToolShed; MIT license; reST markup for README file.
author peterjc
date Mon, 29 Jul 2013 09:28:55 -0400
parents
children
comparison
equal deleted inserted replaced
2:324775a016ce 3:6a14074bc810
1 #!/usr/bin/env python
2 """Divides a FASTQ into paired and single (orphan reads) as separate files.
3
4 The input file should be a valid FASTQ file which has been sorted so that
5 any partner forward+reverse reads are consecutive. The output files all
6 preserve this sort order. Pairing are recognised based on standard name
7 suffices. See below or run the tool with no arguments for more details.
8
9 Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even
10 Color Space should all work equally well).
11
12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
13 (formerly SCRI), Scotland, UK. All rights reserved.
14
15 See accompanying text file for licence details (MIT license).
16 """
17 import os
18 import sys
19 import re
20 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
21
22 if "-v" in sys.argv or "--version" in sys.argv:
23 print "Version 0.0.8"
24 sys.exit(0)
25
26 def stop_err(msg, err=1):
27 sys.stderr.write(msg.rstrip() + "\n")
28 sys.exit(err)
29
30 msg = """Expect either 3 or 4 arguments, all FASTQ filenames.
31
32 If you want two output files, use four arguments:
33 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger)
34 - Sorted input FASTQ filename,
35 - Output paired FASTQ filename (forward then reverse interleaved),
36 - Output singles FASTQ filename (orphan reads)
37
38 If you want three output files, use five arguments:
39 - FASTQ variant (e.g. sanger, solexa, illumina or cssanger)
40 - Sorted input FASTQ filename,
41 - Output forward paired FASTQ filename,
42 - Output reverse paired FASTQ filename,
43 - Output singles FASTQ filename (orphan reads)
44
45 The input file should be a valid FASTQ file which has been sorted so that
46 any partner forward+reverse reads are consecutive. The output files all
47 preserve this sort order.
48
49 Any reads where the forward/reverse naming suffix used is not recognised
50 are treated as orphan reads. The tool supports the /1 and /2 convention
51 originally used by Illumina, the .f and .r convention, and the Sanger
52 convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html
53 for details), and the new Illumina convention where the reads have the
54 same identifier with the fragment at the start of the description, e.g.
55
56 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA
57 @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA
58
59 Note that this does support multiple forward and reverse reads per template
60 (which is quite common with Sanger sequencing), e.g. this which is sorted
61 alphabetically:
62
63 WTSI_1055_4p17.p1kapIBF
64 WTSI_1055_4p17.p1kpIBF
65 WTSI_1055_4p17.q1kapIBR
66 WTSI_1055_4p17.q1kpIBR
67
68 or this where the reads already come in pairs:
69
70 WTSI_1055_4p17.p1kapIBF
71 WTSI_1055_4p17.q1kapIBR
72 WTSI_1055_4p17.p1kpIBF
73 WTSI_1055_4p17.q1kpIBR
74
75 both become:
76
77 WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR
78 WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR
79 """
80
81 if len(sys.argv) == 5:
82 format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:]
83 elif len(sys.argv) == 6:
84 pairs_fastq = None
85 format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:]
86 else:
87 stop_err(msg)
88
89 format = format.replace("fastq", "").lower()
90 if not format:
91 format="sanger" #safe default
92 elif format not in ["sanger","solexa","illumina","cssanger"]:
93 stop_err("Unrecognised format %s" % format)
94
95 def f_match(name):
96 if name.endswith("/1") or name.endswith(".f"):
97 return True
98
99 #Cope with three widely used suffix naming convensions,
100 #Illumina: /1 or /2
101 #Forward/revered: .f or .r
102 #Sanger, e.g. .p1k and .q1k
103 #See http://staden.sourceforge.net/manual/pregap4_unix_50.html
104 re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
105 re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
106
107 #assert re_f.match("demo/1")
108 assert re_f.search("demo.f")
109 assert re_f.search("demo.s1")
110 assert re_f.search("demo.f1k")
111 assert re_f.search("demo.p1")
112 assert re_f.search("demo.p1k")
113 assert re_f.search("demo.p1lk")
114 assert re_r.search("demo/2")
115 assert re_r.search("demo.r")
116 assert re_r.search("demo.q1")
117 assert re_r.search("demo.q1lk")
118 assert not re_r.search("demo/1")
119 assert not re_r.search("demo.f")
120 assert not re_r.search("demo.p")
121 assert not re_f.search("demo/2")
122 assert not re_f.search("demo.r")
123 assert not re_f.search("demo.q")
124
125 re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$")
126 re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$")
127 assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
128 assert re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
129 assert not re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA")
130 assert not re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA")
131
132
133 count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0
134 in_handle = open(input_fastq)
135 if pairs_fastq:
136 pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format)
137 pairs_r_writer = pairs_f_writer
138 else:
139 pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format)
140 pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format)
141 singles_writer = fastqWriter(open(singles_fastq, "w"), format)
142 last_template, buffered_reads = None, []
143
144 for record in fastqReader(in_handle, format):
145 count += 1
146 name = record.identifier.split(None,1)[0]
147 assert name[0]=="@", record.identifier #Quirk of the Galaxy parser
148 is_forward = False
149 suffix = re_f.search(name)
150 if suffix:
151 #============
152 #Forward read
153 #============
154 template = name[:suffix.start()]
155 is_forward = True
156 elif re_illumina_f.match(record.identifier):
157 template = name #No suffix
158 is_forward = True
159 if is_forward:
160 #print name, "forward", template
161 forward += 1
162 if last_template == template:
163 buffered_reads.append(record)
164 else:
165 #Any old buffered reads are orphans
166 for old in buffered_reads:
167 singles_writer.write(old)
168 singles += 1
169 #Save this read in buffer
170 buffered_reads = [record]
171 last_template = template
172 else:
173 is_reverse = False
174 suffix = re_r.search(name)
175 if suffix:
176 #============
177 #Reverse read
178 #============
179 template = name[:suffix.start()]
180 is_reverse = True
181 elif re_illumina_r.match(record.identifier):
182 template = name #No suffix
183 is_reverse = True
184 if is_reverse:
185 #print name, "reverse", template
186 reverse += 1
187 if last_template == template and buffered_reads:
188 #We have a pair!
189 #If there are multiple buffered forward reads, want to pick
190 #the first one (although we could try and do something more
191 #clever looking at the suffix to match them up...)
192 old = buffered_reads.pop(0)
193 pairs_f_writer.write(old)
194 pairs_r_writer.write(record)
195 pairs += 2
196 else:
197 #As this is a reverse read, this and any buffered read(s) are
198 #all orphans
199 for old in buffered_reads:
200 singles_writer.write(old)
201 singles += 1
202 buffered_reads = []
203 singles_writer.write(record)
204 singles += 1
205 last_template = None
206 else:
207 #===========================
208 #Neither forward nor reverse
209 #===========================
210 singles_writer.write(record)
211 singles += 1
212 neither += 1
213 for old in buffered_reads:
214 singles_writer.write(old)
215 singles += 1
216 buffered_reads = []
217 last_template = None
218 if last_template:
219 #Left over singles...
220 for old in buffered_reads:
221 singles_writer.write(old)
222 singles += 1
223 in_handle.close
224 singles_writer.close()
225 if pairs_fastq:
226 pairs_f_writer.close()
227 assert pairs_r_writer.file.closed
228 else:
229 pairs_f_writer.close()
230 pairs_r_writer.close()
231
232 if neither:
233 print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \
234 % (count, forward, reverse, neither, pairs, singles)
235 else:
236 print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \
237 % (count, forward, reverse, pairs, singles)
238
239 assert count == pairs + singles == forward + reverse + neither, \
240 "%i vs %i+%i=%i vs %i+%i+%i=%i" \
241 % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)