Mercurial > repos > peterjc > seq_primer_clip
comparison tools/seq_primer_clip/seq_primer_clip.py @ 4:9b074c1db68e draft
v0.0.14 galaxy_sequence_utils dependency etc
author | peterjc |
---|---|
date | Thu, 02 Feb 2017 11:52:37 -0500 |
parents | ee5acea162a7 |
children | 530c8d6fedd8 |
comparison
equal
deleted
inserted
replaced
3:708ce1e5ac94 | 4:9b074c1db68e |
---|---|
24 | 24 |
25 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | 25 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute |
26 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | 26 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
27 See accompanying text file for licence details (MIT/BSD style). | 27 See accompanying text file for licence details (MIT/BSD style). |
28 | 28 |
29 This is version 0.0.8 of the script. Currently it uses Python's regular | 29 NOTE: Currently it uses Python's regular expression engine for finding the |
30 expression engine for finding the primers, which for my needs is fast enough. | 30 primers, which for my needs is fast enough. |
31 """ | 31 """ |
32 import sys | 32 import sys |
33 import re | 33 import re |
34 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter | 34 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter |
35 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter | 35 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter |
36 | 36 |
37 if "-v" in sys.argv or "--version" in sys.argv: | 37 if "-v" in sys.argv or "--version" in sys.argv: |
38 print "v0.0.5" | 38 print "v0.0.12" |
39 sys.exit(0) | 39 sys.exit(0) |
40 | |
41 def stop_err(msg, err=1): | |
42 sys.stderr.write(msg) | |
43 sys.exit(err) | |
44 | 40 |
45 try: | 41 try: |
46 from Bio.Seq import reverse_complement | 42 from Bio.Seq import reverse_complement |
47 from Bio.SeqIO.SffIO import SffIterator, SffWriter | 43 from Bio.SeqIO.SffIO import SffIterator, SffWriter |
48 except ImportError: | 44 except ImportError: |
49 stop_err("Requires Biopython 1.54 or later") | 45 sys.exit("Requires Biopython 1.54 or later") |
50 try: | 46 try: |
51 from Bio.SeqIO.SffIO import ReadRocheXmlManifest | 47 from Bio.SeqIO.SffIO import ReadRocheXmlManifest |
52 except ImportError: | 48 except ImportError: |
53 #Prior to Biopython 1.56 this was a private function | 49 # Prior to Biopython 1.56 this was a private function |
54 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest | 50 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest |
55 | 51 |
56 #Parse Command Line | 52 # Parse Command Line |
57 try: | 53 try: |
58 in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:] | 54 in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:] |
59 except ValueError: | 55 except ValueError: |
60 stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) | 56 sys.exit("Expected 8 arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv))) |
61 | 57 |
62 if in_file == primer_fasta: | 58 if in_file == primer_fasta: |
63 stop_err("Same file given as both primer sequences and sequences to clip!") | 59 sys.exit("Same file given as both primer sequences and sequences to clip!") |
64 if in_file == out_file: | 60 if in_file == out_file: |
65 stop_err("Same file given as both sequences to clip and output!") | 61 sys.exit("Same file given as both sequences to clip and output!") |
66 if primer_fasta == out_file: | 62 if primer_fasta == out_file: |
67 stop_err("Same file given as both primer sequences and output!") | 63 sys.exit("Same file given as both primer sequences and output!") |
68 | 64 |
69 try: | 65 try: |
70 mm = int(mm) | 66 mm = int(mm) |
71 except ValueError: | 67 except ValueError: |
72 stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm) | 68 sys.exit("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm) |
73 if mm < 0: | 69 if mm < 0: |
74 stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm) | 70 sys.exit("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm) |
75 if mm not in [0,1,2]: | 71 if mm not in [0, 1, 2]: |
76 raise NotImplementedError | 72 raise NotImplementedError |
77 | 73 |
78 try: | 74 try: |
79 min_len = int(min_len) | 75 min_len = int(min_len) |
80 except ValueError: | 76 except ValueError: |
81 stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len) | 77 sys.exit("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len) |
82 if min_len < 0: | 78 if min_len < 0: |
83 stop_err("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len) | 79 sys.exit("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len) |
84 | 80 |
85 | 81 |
86 if keep_negatives.lower() in ["true", "yes", "on"]: | 82 if keep_negatives.lower() in ["true", "yes", "on"]: |
87 keep_negatives = True | 83 keep_negatives = True |
88 elif keep_negatives.lower() in ["false", "no", "off"]: | 84 elif keep_negatives.lower() in ["false", "no", "off"]: |
89 keep_negatives = False | 85 keep_negatives = False |
90 else: | 86 else: |
91 stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives) | 87 sys.exit("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives) |
92 | 88 |
93 | 89 |
94 if primer_type.lower() == "forward": | 90 if primer_type.lower() == "forward": |
95 forward = True | 91 forward = True |
96 rc = False | 92 rc = False |
99 rc = False | 95 rc = False |
100 elif primer_type.lower() == "reverse-complement": | 96 elif primer_type.lower() == "reverse-complement": |
101 forward = False | 97 forward = False |
102 rc = True | 98 rc = True |
103 else: | 99 else: |
104 stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type) | 100 sys.exit("Expected foward, reverse or reverse-complement not %r" % primer_type) |
105 | 101 |
106 | 102 |
107 ambiguous_dna_values = { | 103 ambiguous_dna_values = { |
108 "A": "A", | 104 "A": "A", |
109 "C": "C", | 105 "C": "C", |
117 "K": "GTK", | 113 "K": "GTK", |
118 "V": "ACGMRSV", | 114 "V": "ACGMRSV", |
119 "H": "ACTMWYH", | 115 "H": "ACTMWYH", |
120 "D": "AGTRWKD", | 116 "D": "AGTRWKD", |
121 "B": "CGTSYKB", | 117 "B": "CGTSYKB", |
122 "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC] | 118 "X": ".", # faster than [GATCMRWSYKVVHDBXN] or even [GATC] |
123 "N": ".", | 119 "N": ".", |
124 } | 120 } |
125 | 121 |
126 ambiguous_dna_re = {} | 122 ambiguous_dna_re = {} |
127 for letter, values in ambiguous_dna_values.iteritems(): | 123 for letter, values in ambiguous_dna_values.iteritems(): |
128 if len(values) == 1: | 124 if len(values) == 1: |
129 ambiguous_dna_re[letter] = values | 125 ambiguous_dna_re[letter] = values |
132 | 128 |
133 | 129 |
134 def make_reg_ex(seq): | 130 def make_reg_ex(seq): |
135 return "".join(ambiguous_dna_re[letter] for letter in seq) | 131 return "".join(ambiguous_dna_re[letter] for letter in seq) |
136 | 132 |
133 | |
137 def make_reg_ex_mm(seq, mm): | 134 def make_reg_ex_mm(seq, mm): |
138 if mm > 2: | 135 if mm > 2: |
139 raise NotImplementedError("At most 2 mismatches allowed!") | 136 raise NotImplementedError("At most 2 mismatches allowed!") |
140 seq = seq.upper() | 137 seq = seq.upper() |
141 yield make_reg_ex(seq) | 138 yield make_reg_ex(seq) |
142 for i in range(1,mm+1): | 139 for i in range(1, mm + 1): |
143 #Missing first/last i bases at very start/end of sequence | 140 # Missing first/last i bases at very start/end of sequence |
144 for reg in make_reg_ex_mm(seq[i:], mm-i): | 141 for reg in make_reg_ex_mm(seq[i:], mm - i): |
145 yield "^" + reg | 142 yield "^" + reg |
146 for reg in make_reg_ex_mm(seq[:-i], mm-i): | 143 for reg in make_reg_ex_mm(seq[:-i], mm - i): |
147 yield "$" + reg | 144 yield "$" + reg |
148 if mm >= 1: | 145 if mm >= 1: |
149 for i,letter in enumerate(seq): | 146 for i, letter in enumerate(seq): |
150 #We'll use a set to remove any duplicate patterns | 147 # We'll use a set to remove any duplicate patterns |
151 #if letter not in "NX": | 148 # if letter not in "NX": |
152 pattern = seq[:i] + "N" + seq[i+1:] | 149 pattern = seq[:i] + "N" + seq[i + 1:] |
153 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ | 150 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ |
154 % (pattern, len(pattern), seq, len(seq)) | 151 % (pattern, len(pattern), seq, len(seq)) |
155 yield make_reg_ex(pattern) | 152 yield make_reg_ex(pattern) |
156 if mm >=2: | 153 if mm >= 2: |
157 for i,letter in enumerate(seq): | 154 for i, letter in enumerate(seq): |
158 #We'll use a set to remove any duplicate patterns | 155 # We'll use a set to remove any duplicate patterns |
159 #if letter not in "NX": | 156 # if letter not in "NX": |
160 for k,letter in enumerate(seq[i+1:]): | 157 for k, letter in enumerate(seq[i + 1:]): |
161 #We'll use a set to remove any duplicate patterns | 158 # We'll use a set to remove any duplicate patterns |
162 #if letter not in "NX": | 159 # if letter not in "NX": |
163 pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:] | 160 pattern = seq[:i] + "N" + seq[i + 1:i + 1 + k] + "N" + seq[i + k + 2:] |
164 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ | 161 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ |
165 % (pattern, len(pattern), seq, len(seq)) | 162 % (pattern, len(pattern), seq, len(seq)) |
166 yield make_reg_ex(pattern) | 163 yield make_reg_ex(pattern) |
167 | 164 |
165 | |
168 def load_primers_as_re(primer_fasta, mm, rc=False): | 166 def load_primers_as_re(primer_fasta, mm, rc=False): |
169 #Read primer file and record all specified sequences | 167 # Read primer file and record all specified sequences |
170 primers = set() | 168 primers = set() |
171 in_handle = open(primer_fasta, "rU") | 169 in_handle = open(primer_fasta, "rU") |
172 reader = fastaReader(in_handle) | 170 reader = fastaReader(in_handle) |
173 count = 0 | 171 count = 0 |
174 for record in reader: | 172 for record in reader: |
175 if rc: | 173 if rc: |
176 seq = reverse_complement(record.sequence) | 174 seq = reverse_complement(record.sequence) |
177 else: | 175 else: |
178 seq = record.sequence | 176 seq = record.sequence |
179 #primers.add(re.compile(make_reg_ex(seq))) | 177 # primers.add(re.compile(make_reg_ex(seq))) |
180 count += 1 | 178 count += 1 |
181 for pattern in make_reg_ex_mm(seq, mm): | 179 for pattern in make_reg_ex_mm(seq, mm): |
182 primers.add(pattern) | 180 primers.add(pattern) |
183 in_handle.close() | 181 in_handle.close() |
184 #Use set to avoid duplicates, sort to have longest first | 182 # Use set to avoid duplicates, sort to have longest first |
185 #(so more specific primers found before less specific ones) | 183 # (so more specific primers found before less specific ones) |
186 primers = sorted(set(primers), key=lambda p: -len(p)) | 184 primers = sorted(set(primers), key=lambda p: -len(p)) |
187 return count, re.compile("|".join(primers)) #make one monster re! | 185 return count, re.compile("|".join(primers)) # make one monster re! |
188 | 186 |
189 | 187 |
190 | 188 # Read primer file and record all specified sequences |
191 #Read primer file and record all specified sequences | |
192 count, primer = load_primers_as_re(primer_fasta, mm, rc) | 189 count, primer = load_primers_as_re(primer_fasta, mm, rc) |
193 print "%i primer sequences" % count | 190 print "%i primer sequences" % count |
194 | 191 |
195 short_neg = 0 | 192 short_neg = 0 |
196 short_clipped = 0 | 193 short_clipped = 0 |
197 clipped = 0 | 194 clipped = 0 |
198 negs = 0 | 195 negs = 0 |
199 | 196 |
200 if seq_format.lower()=="sff": | 197 if seq_format.lower() == "sff": |
201 #SFF is different because we just change the trim points | 198 # SFF is different because we just change the trim points |
202 if forward: | 199 if forward: |
203 def process(records): | 200 def process(records): |
204 global short_clipped, short_neg, clipped, negs | 201 global short_clipped, short_neg, clipped, negs |
205 for record in records: | 202 for record in records: |
206 left_clip = record.annotations["clip_qual_left"] | 203 left_clip = record.annotations["clip_qual_left"] |
207 right_clip = record.annotations["clip_qual_right"] | 204 right_clip = record.annotations["clip_qual_right"] |
208 seq = str(record.seq)[left_clip:right_clip].upper() | 205 seq = str(record.seq)[left_clip:right_clip].upper() |
209 result = primer.search(seq) | 206 result = primer.search(seq) |
210 if result: | 207 if result: |
211 #Forward primer, take everything after it | 208 # Forward primer, take everything after it |
212 #so move the left clip along | 209 # so move the left clip along |
213 if len(seq) - result.end() >= min_len: | 210 if len(seq) - result.end() >= min_len: |
214 record.annotations["clip_qual_left"] = left_clip + result.end() | 211 record.annotations["clip_qual_left"] = left_clip + result.end() |
215 clipped += 1 | 212 clipped += 1 |
216 yield record | 213 yield record |
217 else: | 214 else: |
229 left_clip = record.annotations["clip_qual_left"] | 226 left_clip = record.annotations["clip_qual_left"] |
230 right_clip = record.annotations["clip_qual_right"] | 227 right_clip = record.annotations["clip_qual_right"] |
231 seq = str(record.seq)[left_clip:right_clip].upper() | 228 seq = str(record.seq)[left_clip:right_clip].upper() |
232 result = primer.search(seq) | 229 result = primer.search(seq) |
233 if result: | 230 if result: |
234 #Reverse primer, take everything before it | 231 # Reverse primer, take everything before it |
235 #so move the right clip back | 232 # so move the right clip back |
236 new_len = result.start() | 233 new_len = result.start() |
237 if new_len >= min_len: | 234 if new_len >= min_len: |
238 record.annotations["clip_qual_right"] = left_clip + new_len | 235 record.annotations["clip_qual_right"] = left_clip + new_len |
239 clipped += 1 | 236 clipped += 1 |
240 yield record | 237 yield record |
244 if len(seq) >= min_len: | 241 if len(seq) >= min_len: |
245 negs += 1 | 242 negs += 1 |
246 yield record | 243 yield record |
247 else: | 244 else: |
248 short_neg += 1 | 245 short_neg += 1 |
249 | 246 |
250 in_handle = open(in_file, "rb") | 247 in_handle = open(in_file, "rb") |
251 try: | 248 try: |
252 manifest = ReadRocheXmlManifest(in_handle) | 249 manifest = ReadRocheXmlManifest(in_handle) |
253 except ValueError: | 250 except ValueError: |
254 manifest = None | 251 manifest = None |
255 in_handle.seek(0) | 252 in_handle.seek(0) |
256 out_handle = open(out_file, "wb") | 253 out_handle = open(out_file, "wb") |
257 writer = SffWriter(out_handle, xml=manifest) | 254 writer = SffWriter(out_handle, xml=manifest) |
258 writer.write_file(process(SffIterator(in_handle))) | 255 writer.write_file(process(SffIterator(in_handle))) |
259 #End of SFF code | 256 # End of SFF code |
260 elif seq_format.lower().startswith("fastq"): | 257 elif seq_format.lower().startswith("fastq"): |
261 in_handle = open(in_file, "rU") | 258 in_handle = open(in_file, "rU") |
262 out_handle = open(out_file, "w") | 259 out_handle = open(out_file, "w") |
263 reader = fastqReader(in_handle) | 260 reader = fastqReader(in_handle) |
264 writer = fastqWriter(out_handle) | 261 writer = fastqWriter(out_handle) |
265 if forward: | 262 if forward: |
266 for record in reader: | 263 for record in reader: |
267 seq = record.sequence.upper() | 264 seq = record.sequence.upper() |
268 result = primer.search(seq) | 265 result = primer.search(seq) |
269 if result: | 266 if result: |
270 #Forward primer, take everything after it | 267 # Forward primer, take everything after it |
271 cut = result.end() | 268 cut = result.end() |
272 record.sequence = seq[cut:] | 269 record.sequence = seq[cut:] |
273 if len(record.sequence) >= min_len: | 270 if len(record.sequence) >= min_len: |
274 record.quality = record.quality[cut:] | 271 record.quality = record.quality[cut:] |
275 clipped += 1 | 272 clipped += 1 |
279 elif keep_negatives: | 276 elif keep_negatives: |
280 if len(record) >= min_len: | 277 if len(record) >= min_len: |
281 negs += 1 | 278 negs += 1 |
282 writer.write(record) | 279 writer.write(record) |
283 else: | 280 else: |
284 short_negs += 1 | 281 short_neg += 1 |
285 else: | 282 else: |
286 for record in reader: | 283 for record in reader: |
287 seq = record.sequence.upper() | 284 seq = record.sequence.upper() |
288 result = primer.search(seq) | 285 result = primer.search(seq) |
289 if result: | 286 if result: |
290 #Reverse primer, take everything before it | 287 # Reverse primer, take everything before it |
291 cut = result.start() | 288 cut = result.start() |
292 record.sequence = seq[:cut] | 289 record.sequence = seq[:cut] |
293 if len(record.sequence) >= min_len: | 290 if len(record.sequence) >= min_len: |
294 record.quality = record.quality[:cut] | 291 record.quality = record.quality[:cut] |
295 clipped += 1 | 292 clipped += 1 |
299 elif keep_negatives: | 296 elif keep_negatives: |
300 if len(record) >= min_len: | 297 if len(record) >= min_len: |
301 negs += 1 | 298 negs += 1 |
302 writer.write(record) | 299 writer.write(record) |
303 else: | 300 else: |
304 short_negs += 1 | 301 short_neg += 1 |
305 elif seq_format.lower()=="fasta": | 302 elif seq_format.lower() == "fasta": |
306 in_handle = open(in_file, "rU") | 303 in_handle = open(in_file, "rU") |
307 out_handle = open(out_file, "w") | 304 out_handle = open(out_file, "w") |
308 reader = fastaReader(in_handle) | 305 reader = fastaReader(in_handle) |
309 writer = fastaWriter(out_handle) | 306 writer = fastaWriter(out_handle) |
310 #Following code is identical to that for FASTQ but without editing qualities | 307 # Following code is identical to that for FASTQ but without editing qualities |
311 if forward: | 308 if forward: |
312 for record in reader: | 309 for record in reader: |
313 seq = record.sequence.upper() | 310 seq = record.sequence.upper() |
314 result = primer.search(seq) | 311 result = primer.search(seq) |
315 if result: | 312 if result: |
316 #Forward primer, take everything after it | 313 # Forward primer, take everything after it |
317 cut = result.end() | 314 cut = result.end() |
318 record.sequence = seq[cut:] | 315 record.sequence = seq[cut:] |
319 if len(record.sequence) >= min_len: | 316 if len(record.sequence) >= min_len: |
320 clipped += 1 | 317 clipped += 1 |
321 writer.write(record) | 318 writer.write(record) |
324 elif keep_negatives: | 321 elif keep_negatives: |
325 if len(record) >= min_len: | 322 if len(record) >= min_len: |
326 negs += 1 | 323 negs += 1 |
327 writer.write(record) | 324 writer.write(record) |
328 else: | 325 else: |
329 short_negs += 1 | 326 short_neg += 1 |
330 else: | 327 else: |
331 for record in reader: | 328 for record in reader: |
332 seq = record.sequence.upper() | 329 seq = record.sequence.upper() |
333 result = primer.search(seq) | 330 result = primer.search(seq) |
334 if result: | 331 if result: |
335 #Reverse primer, take everything before it | 332 # Reverse primer, take everything before it |
336 cut = result.start() | 333 cut = result.start() |
337 record.sequence = seq[:cut] | 334 record.sequence = seq[:cut] |
338 if len(record.sequence) >= min_len: | 335 if len(record.sequence) >= min_len: |
339 clipped += 1 | 336 clipped += 1 |
340 writer.write(record) | 337 writer.write(record) |
343 elif keep_negatives: | 340 elif keep_negatives: |
344 if len(record) >= min_len: | 341 if len(record) >= min_len: |
345 negs += 1 | 342 negs += 1 |
346 writer.write(record) | 343 writer.write(record) |
347 else: | 344 else: |
348 short_negs += 1 | 345 short_neg += 1 |
349 else: | 346 else: |
350 stop_err("Unsupported file type %r" % seq_format) | 347 sys.exit("Unsupported file type %r" % seq_format) |
351 in_handle.close() | 348 in_handle.close() |
352 out_handle.close() | 349 out_handle.close() |
353 | 350 |
354 print "Kept %i clipped reads," % clipped | 351 print "Kept %i clipped reads," % clipped |
355 print "discarded %i short." % short_clipped | 352 print "discarded %i short." % short_clipped |