comparison tools/primers/seq_primer_clip.py @ 0:945053d79e60 draft

Uploaded v0.0.8, first public release
author peterjc
date Mon, 29 Apr 2013 06:11:00 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:945053d79e60
1 #!/usr/bin/env python
2 """Looks for the given primer sequences and clips matching SFF reads.
3
4 Takes eight command line options, input read filename, input read format,
5 input primer FASTA filename, type of primers (forward, reverse or reverse-
6 complement), number of mismatches (currently only 0, 1 and 2 are supported),
7 minimum length to keep a read (after primer trimming), should primer-less
8 reads be kept (boolean), and finally the output sequence filename.
9
10 Both the primer and read sequences can contain IUPAC ambiguity codes like N.
11
12 This supports FASTA, FASTQ and SFF sequence files. Colorspace reads are not
13 supported.
14
15 The mismatch parameter does not consider gapped alignemnts, however the
16 special case of missing bases at the very start or end of the read is handled.
17 e.g. a primer sequence CCGACTCGAG will match a read starting CGACTCGAG...
18 if one or more mismatches are allowed.
19
20 This can also be used for stripping off (and optionally filtering on) barcodes.
21
22 Note that only the trim/clip values in the SFF file are changed, not the flow
23 information of the full read sequence.
24
25 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
26 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
27 See accompanying text file for licence details (MIT/BSD style).
28
29 This is version 0.0.8 of the script. Currently it uses Python's regular
30 expression engine for finding the primers, which for my needs is fast enough.
31 """
32 import sys
33 import re
34 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
35 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
36
37 if "-v" in sys.argv or "--version" in sys.argv:
38 print "v0.0.5"
39 sys.exit(0)
40
41 def stop_err(msg, err=1):
42 sys.stderr.write(msg)
43 sys.exit(err)
44
45 try:
46 from Bio.Seq import reverse_complement
47 from Bio.SeqIO.SffIO import SffIterator, SffWriter
48 except ImportError:
49 stop_err("Requires Biopython 1.54 or later")
50 try:
51 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
52 except ImportError:
53 #Prior to Biopython 1.56 this was a private function
54 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
55
56 #Parse Command Line
57 try:
58 in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:]
59 except ValueError:
60 stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
61
62 if in_file == primer_fasta:
63 stop_err("Same file given as both primer sequences and sequences to clip!")
64 if in_file == out_file:
65 stop_err("Same file given as both sequences to clip and output!")
66 if primer_fasta == out_file:
67 stop_err("Same file given as both primer sequences and output!")
68
69 try:
70 mm = int(mm)
71 except ValueError:
72 stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)
73 if mm < 0:
74 stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm)
75 if mm not in [0,1,2]:
76 raise NotImplementedError
77
78 try:
79 min_len = int(min_len)
80 except ValueError:
81 stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)
82 if min_len < 0:
83 stop_err("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len)
84
85
86 if keep_negatives.lower() in ["true", "yes", "on"]:
87 keep_negatives = True
88 elif keep_negatives.lower() in ["false", "no", "off"]:
89 keep_negatives = False
90 else:
91 stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)
92
93
94 if primer_type.lower() == "forward":
95 forward = True
96 rc = False
97 elif primer_type.lower() == "reverse":
98 forward = False
99 rc = False
100 elif primer_type.lower() == "reverse-complement":
101 forward = False
102 rc = True
103 else:
104 stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type)
105
106
107 ambiguous_dna_values = {
108 "A": "A",
109 "C": "C",
110 "G": "G",
111 "T": "T",
112 "M": "ACM",
113 "R": "AGR",
114 "W": "ATW",
115 "S": "CGS",
116 "Y": "CTY",
117 "K": "GTK",
118 "V": "ACGMRSV",
119 "H": "ACTMWYH",
120 "D": "AGTRWKD",
121 "B": "CGTSYKB",
122 "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC]
123 "N": ".",
124 }
125
126 ambiguous_dna_re = {}
127 for letter, values in ambiguous_dna_values.iteritems():
128 if len(values) == 1:
129 ambiguous_dna_re[letter] = values
130 else:
131 ambiguous_dna_re[letter] = "[%s]" % values
132
133
134 def make_reg_ex(seq):
135 return "".join(ambiguous_dna_re[letter] for letter in seq)
136
137 def make_reg_ex_mm(seq, mm):
138 if mm > 2:
139 raise NotImplementedError("At most 2 mismatches allowed!")
140 seq = seq.upper()
141 yield make_reg_ex(seq)
142 for i in range(1,mm+1):
143 #Missing first/last i bases at very start/end of sequence
144 for reg in make_reg_ex_mm(seq[i:], mm-i):
145 yield "^" + reg
146 for reg in make_reg_ex_mm(seq[:-i], mm-i):
147 yield "$" + reg
148 if mm >= 1:
149 for i,letter in enumerate(seq):
150 #We'll use a set to remove any duplicate patterns
151 #if letter not in "NX":
152 pattern = seq[:i] + "N" + seq[i+1:]
153 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
154 % (pattern, len(pattern), seq, len(seq))
155 yield make_reg_ex(pattern)
156 if mm >=2:
157 for i,letter in enumerate(seq):
158 #We'll use a set to remove any duplicate patterns
159 #if letter not in "NX":
160 for k,letter in enumerate(seq[i+1:]):
161 #We'll use a set to remove any duplicate patterns
162 #if letter not in "NX":
163 pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:]
164 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
165 % (pattern, len(pattern), seq, len(seq))
166 yield make_reg_ex(pattern)
167
168 def load_primers_as_re(primer_fasta, mm, rc=False):
169 #Read primer file and record all specified sequences
170 primers = set()
171 in_handle = open(primer_fasta, "rU")
172 reader = fastaReader(in_handle)
173 count = 0
174 for record in reader:
175 if rc:
176 seq = reverse_complement(record.sequence)
177 else:
178 seq = record.sequence
179 #primers.add(re.compile(make_reg_ex(seq)))
180 count += 1
181 for pattern in make_reg_ex_mm(seq, mm):
182 primers.add(pattern)
183 in_handle.close()
184 #Use set to avoid duplicates, sort to have longest first
185 #(so more specific primers found before less specific ones)
186 primers = sorted(set(primers), key=lambda p: -len(p))
187 return count, re.compile("|".join(primers)) #make one monster re!
188
189
190
191 #Read primer file and record all specified sequences
192 count, primer = load_primers_as_re(primer_fasta, mm, rc)
193 print "%i primer sequences" % count
194
195 short_neg = 0
196 short_clipped = 0
197 clipped = 0
198 negs = 0
199
200 if seq_format.lower()=="sff":
201 #SFF is different because we just change the trim points
202 if forward:
203 def process(records):
204 global short_clipped, short_neg, clipped, negs
205 for record in records:
206 left_clip = record.annotations["clip_qual_left"]
207 right_clip = record.annotations["clip_qual_right"]
208 seq = str(record.seq)[left_clip:right_clip].upper()
209 result = primer.search(seq)
210 if result:
211 #Forward primer, take everything after it
212 #so move the left clip along
213 if len(seq) - result.end() >= min_len:
214 record.annotations["clip_qual_left"] = left_clip + result.end()
215 clipped += 1
216 yield record
217 else:
218 short_clipped += 1
219 elif keep_negatives:
220 if len(seq) >= min_len:
221 negs += 1
222 yield record
223 else:
224 short_neg += 1
225 else:
226 def process(records):
227 global short_clipped, short_neg, clipped, negs
228 for record in records:
229 left_clip = record.annotations["clip_qual_left"]
230 right_clip = record.annotations["clip_qual_right"]
231 seq = str(record.seq)[left_clip:right_clip].upper()
232 result = primer.search(seq)
233 if result:
234 #Reverse primer, take everything before it
235 #so move the right clip back
236 new_len = result.start()
237 if new_len >= min_len:
238 record.annotations["clip_qual_right"] = left_clip + new_len
239 clipped += 1
240 yield record
241 else:
242 short_clipped += 1
243 elif keep_negatives:
244 if len(seq) >= min_len:
245 negs += 1
246 yield record
247 else:
248 short_neg += 1
249
250 in_handle = open(in_file, "rb")
251 try:
252 manifest = ReadRocheXmlManifest(in_handle)
253 except ValueError:
254 manifest = None
255 in_handle.seek(0)
256 out_handle = open(out_file, "wb")
257 writer = SffWriter(out_handle, xml=manifest)
258 writer.write_file(process(SffIterator(in_handle)))
259 #End of SFF code
260 elif seq_format.lower().startswith("fastq"):
261 in_handle = open(in_file, "rU")
262 out_handle = open(out_file, "w")
263 reader = fastqReader(in_handle)
264 writer = fastqWriter(out_handle)
265 if forward:
266 for record in reader:
267 seq = record.sequence.upper()
268 result = primer.search(seq)
269 if result:
270 #Forward primer, take everything after it
271 cut = result.end()
272 record.sequence = seq[cut:]
273 if len(record.sequence) >= min_len:
274 record.quality = record.quality[cut:]
275 clipped += 1
276 writer.write(record)
277 else:
278 short_clipped += 1
279 elif keep_negatives:
280 if len(record) >= min_len:
281 negs += 1
282 writer.write(record)
283 else:
284 short_negs += 1
285 else:
286 for record in reader:
287 seq = record.sequence.upper()
288 result = primer.search(seq)
289 if result:
290 #Reverse primer, take everything before it
291 cut = result.start()
292 record.sequence = seq[:cut]
293 if len(record.sequence) >= min_len:
294 record.quality = record.quality[:cut]
295 clipped += 1
296 writer.write(record)
297 else:
298 short_clipped += 1
299 elif keep_negatives:
300 if len(record) >= min_len:
301 negs += 1
302 writer.write(record)
303 else:
304 short_negs += 1
305 elif seq_format.lower()=="fasta":
306 in_handle = open(in_file, "rU")
307 out_handle = open(out_file, "w")
308 reader = fastaReader(in_handle)
309 writer = fastaWriter(out_handle)
310 #Following code is identical to that for FASTQ but without editing qualities
311 if forward:
312 for record in reader:
313 seq = record.sequence.upper()
314 result = primer.search(seq)
315 if result:
316 #Forward primer, take everything after it
317 cut = result.end()
318 record.sequence = seq[cut:]
319 if len(record.sequence) >= min_len:
320 clipped += 1
321 writer.write(record)
322 else:
323 short_clipped += 1
324 elif keep_negatives:
325 if len(record) >= min_len:
326 negs += 1
327 writer.write(record)
328 else:
329 short_negs += 1
330 else:
331 for record in reader:
332 seq = record.sequence.upper()
333 result = primer.search(seq)
334 if result:
335 #Reverse primer, take everything before it
336 cut = result.start()
337 record.sequence = seq[:cut]
338 if len(record.sequence) >= min_len:
339 clipped += 1
340 writer.write(record)
341 else:
342 short_clipped += 1
343 elif keep_negatives:
344 if len(record) >= min_len:
345 negs += 1
346 writer.write(record)
347 else:
348 short_negs += 1
349 else:
350 stop_err("Unsupported file type %r" % seq_format)
351 in_handle.close()
352 out_handle.close()
353
354 print "Kept %i clipped reads," % clipped
355 print "discarded %i short." % short_clipped
356 if keep_negatives:
357 print "Kept %i non-matching reads," % negs
358 print "discarded %i short." % short_neg