comparison tools/seq_primer_clip/seq_primer_clip.py @ 4:9b074c1db68e draft

v0.0.14 galaxy_sequence_utils dependency etc
author peterjc
date Thu, 02 Feb 2017 11:52:37 -0500
parents ee5acea162a7
children 530c8d6fedd8
comparison
equal deleted inserted replaced
3:708ce1e5ac94 4:9b074c1db68e
24 24
25 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute 25 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
26 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. 26 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
27 See accompanying text file for licence details (MIT/BSD style). 27 See accompanying text file for licence details (MIT/BSD style).
28 28
29 This is version 0.0.8 of the script. Currently it uses Python's regular 29 NOTE: Currently it uses Python's regular expression engine for finding the
30 expression engine for finding the primers, which for my needs is fast enough. 30 primers, which for my needs is fast enough.
31 """ 31 """
32 import sys 32 import sys
33 import re 33 import re
34 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter 34 from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
35 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter 35 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
36 36
37 if "-v" in sys.argv or "--version" in sys.argv: 37 if "-v" in sys.argv or "--version" in sys.argv:
38 print "v0.0.5" 38 print "v0.0.12"
39 sys.exit(0) 39 sys.exit(0)
40
41 def stop_err(msg, err=1):
42 sys.stderr.write(msg)
43 sys.exit(err)
44 40
45 try: 41 try:
46 from Bio.Seq import reverse_complement 42 from Bio.Seq import reverse_complement
47 from Bio.SeqIO.SffIO import SffIterator, SffWriter 43 from Bio.SeqIO.SffIO import SffIterator, SffWriter
48 except ImportError: 44 except ImportError:
49 stop_err("Requires Biopython 1.54 or later") 45 sys.exit("Requires Biopython 1.54 or later")
50 try: 46 try:
51 from Bio.SeqIO.SffIO import ReadRocheXmlManifest 47 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
52 except ImportError: 48 except ImportError:
53 #Prior to Biopython 1.56 this was a private function 49 # Prior to Biopython 1.56 this was a private function
54 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest 50 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
55 51
56 #Parse Command Line 52 # Parse Command Line
57 try: 53 try:
58 in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:] 54 in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:]
59 except ValueError: 55 except ValueError:
60 stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) 56 sys.exit("Expected 8 arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
61 57
62 if in_file == primer_fasta: 58 if in_file == primer_fasta:
63 stop_err("Same file given as both primer sequences and sequences to clip!") 59 sys.exit("Same file given as both primer sequences and sequences to clip!")
64 if in_file == out_file: 60 if in_file == out_file:
65 stop_err("Same file given as both sequences to clip and output!") 61 sys.exit("Same file given as both sequences to clip and output!")
66 if primer_fasta == out_file: 62 if primer_fasta == out_file:
67 stop_err("Same file given as both primer sequences and output!") 63 sys.exit("Same file given as both primer sequences and output!")
68 64
69 try: 65 try:
70 mm = int(mm) 66 mm = int(mm)
71 except ValueError: 67 except ValueError:
72 stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm) 68 sys.exit("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)
73 if mm < 0: 69 if mm < 0:
74 stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm) 70 sys.exit("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm)
75 if mm not in [0,1,2]: 71 if mm not in [0, 1, 2]:
76 raise NotImplementedError 72 raise NotImplementedError
77 73
78 try: 74 try:
79 min_len = int(min_len) 75 min_len = int(min_len)
80 except ValueError: 76 except ValueError:
81 stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len) 77 sys.exit("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)
82 if min_len < 0: 78 if min_len < 0:
83 stop_err("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len) 79 sys.exit("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len)
84 80
85 81
86 if keep_negatives.lower() in ["true", "yes", "on"]: 82 if keep_negatives.lower() in ["true", "yes", "on"]:
87 keep_negatives = True 83 keep_negatives = True
88 elif keep_negatives.lower() in ["false", "no", "off"]: 84 elif keep_negatives.lower() in ["false", "no", "off"]:
89 keep_negatives = False 85 keep_negatives = False
90 else: 86 else:
91 stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives) 87 sys.exit("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)
92 88
93 89
94 if primer_type.lower() == "forward": 90 if primer_type.lower() == "forward":
95 forward = True 91 forward = True
96 rc = False 92 rc = False
99 rc = False 95 rc = False
100 elif primer_type.lower() == "reverse-complement": 96 elif primer_type.lower() == "reverse-complement":
101 forward = False 97 forward = False
102 rc = True 98 rc = True
103 else: 99 else:
104 stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type) 100 sys.exit("Expected foward, reverse or reverse-complement not %r" % primer_type)
105 101
106 102
107 ambiguous_dna_values = { 103 ambiguous_dna_values = {
108 "A": "A", 104 "A": "A",
109 "C": "C", 105 "C": "C",
117 "K": "GTK", 113 "K": "GTK",
118 "V": "ACGMRSV", 114 "V": "ACGMRSV",
119 "H": "ACTMWYH", 115 "H": "ACTMWYH",
120 "D": "AGTRWKD", 116 "D": "AGTRWKD",
121 "B": "CGTSYKB", 117 "B": "CGTSYKB",
122 "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC] 118 "X": ".", # faster than [GATCMRWSYKVVHDBXN] or even [GATC]
123 "N": ".", 119 "N": ".",
124 } 120 }
125 121
126 ambiguous_dna_re = {} 122 ambiguous_dna_re = {}
127 for letter, values in ambiguous_dna_values.iteritems(): 123 for letter, values in ambiguous_dna_values.iteritems():
128 if len(values) == 1: 124 if len(values) == 1:
129 ambiguous_dna_re[letter] = values 125 ambiguous_dna_re[letter] = values
132 128
133 129
134 def make_reg_ex(seq): 130 def make_reg_ex(seq):
135 return "".join(ambiguous_dna_re[letter] for letter in seq) 131 return "".join(ambiguous_dna_re[letter] for letter in seq)
136 132
133
137 def make_reg_ex_mm(seq, mm): 134 def make_reg_ex_mm(seq, mm):
138 if mm > 2: 135 if mm > 2:
139 raise NotImplementedError("At most 2 mismatches allowed!") 136 raise NotImplementedError("At most 2 mismatches allowed!")
140 seq = seq.upper() 137 seq = seq.upper()
141 yield make_reg_ex(seq) 138 yield make_reg_ex(seq)
142 for i in range(1,mm+1): 139 for i in range(1, mm + 1):
143 #Missing first/last i bases at very start/end of sequence 140 # Missing first/last i bases at very start/end of sequence
144 for reg in make_reg_ex_mm(seq[i:], mm-i): 141 for reg in make_reg_ex_mm(seq[i:], mm - i):
145 yield "^" + reg 142 yield "^" + reg
146 for reg in make_reg_ex_mm(seq[:-i], mm-i): 143 for reg in make_reg_ex_mm(seq[:-i], mm - i):
147 yield "$" + reg 144 yield "$" + reg
148 if mm >= 1: 145 if mm >= 1:
149 for i,letter in enumerate(seq): 146 for i, letter in enumerate(seq):
150 #We'll use a set to remove any duplicate patterns 147 # We'll use a set to remove any duplicate patterns
151 #if letter not in "NX": 148 # if letter not in "NX":
152 pattern = seq[:i] + "N" + seq[i+1:] 149 pattern = seq[:i] + "N" + seq[i + 1:]
153 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ 150 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
154 % (pattern, len(pattern), seq, len(seq)) 151 % (pattern, len(pattern), seq, len(seq))
155 yield make_reg_ex(pattern) 152 yield make_reg_ex(pattern)
156 if mm >=2: 153 if mm >= 2:
157 for i,letter in enumerate(seq): 154 for i, letter in enumerate(seq):
158 #We'll use a set to remove any duplicate patterns 155 # We'll use a set to remove any duplicate patterns
159 #if letter not in "NX": 156 # if letter not in "NX":
160 for k,letter in enumerate(seq[i+1:]): 157 for k, letter in enumerate(seq[i + 1:]):
161 #We'll use a set to remove any duplicate patterns 158 # We'll use a set to remove any duplicate patterns
162 #if letter not in "NX": 159 # if letter not in "NX":
163 pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:] 160 pattern = seq[:i] + "N" + seq[i + 1:i + 1 + k] + "N" + seq[i + k + 2:]
164 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ 161 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
165 % (pattern, len(pattern), seq, len(seq)) 162 % (pattern, len(pattern), seq, len(seq))
166 yield make_reg_ex(pattern) 163 yield make_reg_ex(pattern)
167 164
165
168 def load_primers_as_re(primer_fasta, mm, rc=False): 166 def load_primers_as_re(primer_fasta, mm, rc=False):
169 #Read primer file and record all specified sequences 167 # Read primer file and record all specified sequences
170 primers = set() 168 primers = set()
171 in_handle = open(primer_fasta, "rU") 169 in_handle = open(primer_fasta, "rU")
172 reader = fastaReader(in_handle) 170 reader = fastaReader(in_handle)
173 count = 0 171 count = 0
174 for record in reader: 172 for record in reader:
175 if rc: 173 if rc:
176 seq = reverse_complement(record.sequence) 174 seq = reverse_complement(record.sequence)
177 else: 175 else:
178 seq = record.sequence 176 seq = record.sequence
179 #primers.add(re.compile(make_reg_ex(seq))) 177 # primers.add(re.compile(make_reg_ex(seq)))
180 count += 1 178 count += 1
181 for pattern in make_reg_ex_mm(seq, mm): 179 for pattern in make_reg_ex_mm(seq, mm):
182 primers.add(pattern) 180 primers.add(pattern)
183 in_handle.close() 181 in_handle.close()
184 #Use set to avoid duplicates, sort to have longest first 182 # Use set to avoid duplicates, sort to have longest first
185 #(so more specific primers found before less specific ones) 183 # (so more specific primers found before less specific ones)
186 primers = sorted(set(primers), key=lambda p: -len(p)) 184 primers = sorted(set(primers), key=lambda p: -len(p))
187 return count, re.compile("|".join(primers)) #make one monster re! 185 return count, re.compile("|".join(primers)) # make one monster re!
188 186
189 187
190 188 # Read primer file and record all specified sequences
191 #Read primer file and record all specified sequences
192 count, primer = load_primers_as_re(primer_fasta, mm, rc) 189 count, primer = load_primers_as_re(primer_fasta, mm, rc)
193 print "%i primer sequences" % count 190 print "%i primer sequences" % count
194 191
195 short_neg = 0 192 short_neg = 0
196 short_clipped = 0 193 short_clipped = 0
197 clipped = 0 194 clipped = 0
198 negs = 0 195 negs = 0
199 196
200 if seq_format.lower()=="sff": 197 if seq_format.lower() == "sff":
201 #SFF is different because we just change the trim points 198 # SFF is different because we just change the trim points
202 if forward: 199 if forward:
203 def process(records): 200 def process(records):
204 global short_clipped, short_neg, clipped, negs 201 global short_clipped, short_neg, clipped, negs
205 for record in records: 202 for record in records:
206 left_clip = record.annotations["clip_qual_left"] 203 left_clip = record.annotations["clip_qual_left"]
207 right_clip = record.annotations["clip_qual_right"] 204 right_clip = record.annotations["clip_qual_right"]
208 seq = str(record.seq)[left_clip:right_clip].upper() 205 seq = str(record.seq)[left_clip:right_clip].upper()
209 result = primer.search(seq) 206 result = primer.search(seq)
210 if result: 207 if result:
211 #Forward primer, take everything after it 208 # Forward primer, take everything after it
212 #so move the left clip along 209 # so move the left clip along
213 if len(seq) - result.end() >= min_len: 210 if len(seq) - result.end() >= min_len:
214 record.annotations["clip_qual_left"] = left_clip + result.end() 211 record.annotations["clip_qual_left"] = left_clip + result.end()
215 clipped += 1 212 clipped += 1
216 yield record 213 yield record
217 else: 214 else:
229 left_clip = record.annotations["clip_qual_left"] 226 left_clip = record.annotations["clip_qual_left"]
230 right_clip = record.annotations["clip_qual_right"] 227 right_clip = record.annotations["clip_qual_right"]
231 seq = str(record.seq)[left_clip:right_clip].upper() 228 seq = str(record.seq)[left_clip:right_clip].upper()
232 result = primer.search(seq) 229 result = primer.search(seq)
233 if result: 230 if result:
234 #Reverse primer, take everything before it 231 # Reverse primer, take everything before it
235 #so move the right clip back 232 # so move the right clip back
236 new_len = result.start() 233 new_len = result.start()
237 if new_len >= min_len: 234 if new_len >= min_len:
238 record.annotations["clip_qual_right"] = left_clip + new_len 235 record.annotations["clip_qual_right"] = left_clip + new_len
239 clipped += 1 236 clipped += 1
240 yield record 237 yield record
244 if len(seq) >= min_len: 241 if len(seq) >= min_len:
245 negs += 1 242 negs += 1
246 yield record 243 yield record
247 else: 244 else:
248 short_neg += 1 245 short_neg += 1
249 246
250 in_handle = open(in_file, "rb") 247 in_handle = open(in_file, "rb")
251 try: 248 try:
252 manifest = ReadRocheXmlManifest(in_handle) 249 manifest = ReadRocheXmlManifest(in_handle)
253 except ValueError: 250 except ValueError:
254 manifest = None 251 manifest = None
255 in_handle.seek(0) 252 in_handle.seek(0)
256 out_handle = open(out_file, "wb") 253 out_handle = open(out_file, "wb")
257 writer = SffWriter(out_handle, xml=manifest) 254 writer = SffWriter(out_handle, xml=manifest)
258 writer.write_file(process(SffIterator(in_handle))) 255 writer.write_file(process(SffIterator(in_handle)))
259 #End of SFF code 256 # End of SFF code
260 elif seq_format.lower().startswith("fastq"): 257 elif seq_format.lower().startswith("fastq"):
261 in_handle = open(in_file, "rU") 258 in_handle = open(in_file, "rU")
262 out_handle = open(out_file, "w") 259 out_handle = open(out_file, "w")
263 reader = fastqReader(in_handle) 260 reader = fastqReader(in_handle)
264 writer = fastqWriter(out_handle) 261 writer = fastqWriter(out_handle)
265 if forward: 262 if forward:
266 for record in reader: 263 for record in reader:
267 seq = record.sequence.upper() 264 seq = record.sequence.upper()
268 result = primer.search(seq) 265 result = primer.search(seq)
269 if result: 266 if result:
270 #Forward primer, take everything after it 267 # Forward primer, take everything after it
271 cut = result.end() 268 cut = result.end()
272 record.sequence = seq[cut:] 269 record.sequence = seq[cut:]
273 if len(record.sequence) >= min_len: 270 if len(record.sequence) >= min_len:
274 record.quality = record.quality[cut:] 271 record.quality = record.quality[cut:]
275 clipped += 1 272 clipped += 1
279 elif keep_negatives: 276 elif keep_negatives:
280 if len(record) >= min_len: 277 if len(record) >= min_len:
281 negs += 1 278 negs += 1
282 writer.write(record) 279 writer.write(record)
283 else: 280 else:
284 short_negs += 1 281 short_neg += 1
285 else: 282 else:
286 for record in reader: 283 for record in reader:
287 seq = record.sequence.upper() 284 seq = record.sequence.upper()
288 result = primer.search(seq) 285 result = primer.search(seq)
289 if result: 286 if result:
290 #Reverse primer, take everything before it 287 # Reverse primer, take everything before it
291 cut = result.start() 288 cut = result.start()
292 record.sequence = seq[:cut] 289 record.sequence = seq[:cut]
293 if len(record.sequence) >= min_len: 290 if len(record.sequence) >= min_len:
294 record.quality = record.quality[:cut] 291 record.quality = record.quality[:cut]
295 clipped += 1 292 clipped += 1
299 elif keep_negatives: 296 elif keep_negatives:
300 if len(record) >= min_len: 297 if len(record) >= min_len:
301 negs += 1 298 negs += 1
302 writer.write(record) 299 writer.write(record)
303 else: 300 else:
304 short_negs += 1 301 short_neg += 1
305 elif seq_format.lower()=="fasta": 302 elif seq_format.lower() == "fasta":
306 in_handle = open(in_file, "rU") 303 in_handle = open(in_file, "rU")
307 out_handle = open(out_file, "w") 304 out_handle = open(out_file, "w")
308 reader = fastaReader(in_handle) 305 reader = fastaReader(in_handle)
309 writer = fastaWriter(out_handle) 306 writer = fastaWriter(out_handle)
310 #Following code is identical to that for FASTQ but without editing qualities 307 # Following code is identical to that for FASTQ but without editing qualities
311 if forward: 308 if forward:
312 for record in reader: 309 for record in reader:
313 seq = record.sequence.upper() 310 seq = record.sequence.upper()
314 result = primer.search(seq) 311 result = primer.search(seq)
315 if result: 312 if result:
316 #Forward primer, take everything after it 313 # Forward primer, take everything after it
317 cut = result.end() 314 cut = result.end()
318 record.sequence = seq[cut:] 315 record.sequence = seq[cut:]
319 if len(record.sequence) >= min_len: 316 if len(record.sequence) >= min_len:
320 clipped += 1 317 clipped += 1
321 writer.write(record) 318 writer.write(record)
324 elif keep_negatives: 321 elif keep_negatives:
325 if len(record) >= min_len: 322 if len(record) >= min_len:
326 negs += 1 323 negs += 1
327 writer.write(record) 324 writer.write(record)
328 else: 325 else:
329 short_negs += 1 326 short_neg += 1
330 else: 327 else:
331 for record in reader: 328 for record in reader:
332 seq = record.sequence.upper() 329 seq = record.sequence.upper()
333 result = primer.search(seq) 330 result = primer.search(seq)
334 if result: 331 if result:
335 #Reverse primer, take everything before it 332 # Reverse primer, take everything before it
336 cut = result.start() 333 cut = result.start()
337 record.sequence = seq[:cut] 334 record.sequence = seq[:cut]
338 if len(record.sequence) >= min_len: 335 if len(record.sequence) >= min_len:
339 clipped += 1 336 clipped += 1
340 writer.write(record) 337 writer.write(record)
343 elif keep_negatives: 340 elif keep_negatives:
344 if len(record) >= min_len: 341 if len(record) >= min_len:
345 negs += 1 342 negs += 1
346 writer.write(record) 343 writer.write(record)
347 else: 344 else:
348 short_negs += 1 345 short_neg += 1
349 else: 346 else:
350 stop_err("Unsupported file type %r" % seq_format) 347 sys.exit("Unsupported file type %r" % seq_format)
351 in_handle.close() 348 in_handle.close()
352 out_handle.close() 349 out_handle.close()
353 350
354 print "Kept %i clipped reads," % clipped 351 print "Kept %i clipped reads," % clipped
355 print "discarded %i short." % short_clipped 352 print "discarded %i short." % short_clipped