Mercurial > repos > peterjc > seq_primer_clip
diff tools/seq_primer_clip/seq_primer_clip.py @ 4:9b074c1db68e draft
v0.0.14 galaxy_sequence_utils dependency etc
author | peterjc |
---|---|
date | Thu, 02 Feb 2017 11:52:37 -0500 |
parents | ee5acea162a7 |
children | 530c8d6fedd8 |
line wrap: on
line diff
--- a/tools/seq_primer_clip/seq_primer_clip.py Thu Nov 21 10:47:23 2013 -0500 +++ b/tools/seq_primer_clip/seq_primer_clip.py Thu Feb 02 11:52:37 2017 -0500 @@ -26,8 +26,8 @@ (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. See accompanying text file for licence details (MIT/BSD style). -This is version 0.0.8 of the script. Currently it uses Python's regular -expression engine for finding the primers, which for my needs is fast enough. +NOTE: Currently it uses Python's regular expression engine for finding the +primers, which for my needs is fast enough. """ import sys import re @@ -35,52 +35,48 @@ from galaxy_utils.sequence.fastq import fastqReader, fastqWriter if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.5" + print "v0.0.12" sys.exit(0) -def stop_err(msg, err=1): - sys.stderr.write(msg) - sys.exit(err) - try: from Bio.Seq import reverse_complement from Bio.SeqIO.SffIO import SffIterator, SffWriter except ImportError: - stop_err("Requires Biopython 1.54 or later") + sys.exit("Requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: - #Prior to Biopython 1.56 this was a private function + # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest -#Parse Command Line +# Parse Command Line try: in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:] except ValueError: - stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) + sys.exit("Expected 8 arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv))) if in_file == primer_fasta: - stop_err("Same file given as both primer sequences and sequences to clip!") + sys.exit("Same file given as both primer sequences and sequences to clip!") if in_file == out_file: - stop_err("Same file given as both sequences to clip and output!") + sys.exit("Same file given as both sequences to clip and output!") if primer_fasta == out_file: - stop_err("Same file given as both primer sequences and output!") + sys.exit("Same file given as both primer sequences and output!") try: mm = int(mm) except ValueError: - stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm) + sys.exit("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm) if mm < 0: - stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm) -if mm not in [0,1,2]: + sys.exit("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm) +if mm not in [0, 1, 2]: raise NotImplementedError try: min_len = int(min_len) except ValueError: - stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len) + sys.exit("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len) if min_len < 0: - stop_err("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len) + sys.exit("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len) if keep_negatives.lower() in ["true", "yes", "on"]: @@ -88,7 +84,7 @@ elif keep_negatives.lower() in ["false", "no", "off"]: keep_negatives = False else: - stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives) + sys.exit("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives) if primer_type.lower() == "forward": @@ -101,7 +97,7 @@ forward = False rc = True else: - stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type) + sys.exit("Expected foward, reverse or reverse-complement not %r" % primer_type) ambiguous_dna_values = { @@ -119,9 +115,9 @@ "H": "ACTMWYH", "D": "AGTRWKD", "B": "CGTSYKB", - "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC] + "X": ".", # faster than [GATCMRWSYKVVHDBXN] or even [GATC] "N": ".", - } +} ambiguous_dna_re = {} for letter, values in ambiguous_dna_values.iteritems(): @@ -134,39 +130,41 @@ def make_reg_ex(seq): return "".join(ambiguous_dna_re[letter] for letter in seq) + def make_reg_ex_mm(seq, mm): if mm > 2: raise NotImplementedError("At most 2 mismatches allowed!") seq = seq.upper() yield make_reg_ex(seq) - for i in range(1,mm+1): - #Missing first/last i bases at very start/end of sequence - for reg in make_reg_ex_mm(seq[i:], mm-i): + for i in range(1, mm + 1): + # Missing first/last i bases at very start/end of sequence + for reg in make_reg_ex_mm(seq[i:], mm - i): yield "^" + reg - for reg in make_reg_ex_mm(seq[:-i], mm-i): + for reg in make_reg_ex_mm(seq[:-i], mm - i): yield "$" + reg if mm >= 1: - for i,letter in enumerate(seq): - #We'll use a set to remove any duplicate patterns - #if letter not in "NX": - pattern = seq[:i] + "N" + seq[i+1:] + for i, letter in enumerate(seq): + # We'll use a set to remove any duplicate patterns + # if letter not in "NX": + pattern = seq[:i] + "N" + seq[i + 1:] assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ % (pattern, len(pattern), seq, len(seq)) yield make_reg_ex(pattern) - if mm >=2: - for i,letter in enumerate(seq): - #We'll use a set to remove any duplicate patterns - #if letter not in "NX": - for k,letter in enumerate(seq[i+1:]): - #We'll use a set to remove any duplicate patterns - #if letter not in "NX": - pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:] + if mm >= 2: + for i, letter in enumerate(seq): + # We'll use a set to remove any duplicate patterns + # if letter not in "NX": + for k, letter in enumerate(seq[i + 1:]): + # We'll use a set to remove any duplicate patterns + # if letter not in "NX": + pattern = seq[:i] + "N" + seq[i + 1:i + 1 + k] + "N" + seq[i + k + 2:] assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ % (pattern, len(pattern), seq, len(seq)) yield make_reg_ex(pattern) + def load_primers_as_re(primer_fasta, mm, rc=False): - #Read primer file and record all specified sequences + # Read primer file and record all specified sequences primers = set() in_handle = open(primer_fasta, "rU") reader = fastaReader(in_handle) @@ -176,19 +174,18 @@ seq = reverse_complement(record.sequence) else: seq = record.sequence - #primers.add(re.compile(make_reg_ex(seq))) + # primers.add(re.compile(make_reg_ex(seq))) count += 1 for pattern in make_reg_ex_mm(seq, mm): primers.add(pattern) in_handle.close() - #Use set to avoid duplicates, sort to have longest first - #(so more specific primers found before less specific ones) + # Use set to avoid duplicates, sort to have longest first + # (so more specific primers found before less specific ones) primers = sorted(set(primers), key=lambda p: -len(p)) - return count, re.compile("|".join(primers)) #make one monster re! + return count, re.compile("|".join(primers)) # make one monster re! - -#Read primer file and record all specified sequences +# Read primer file and record all specified sequences count, primer = load_primers_as_re(primer_fasta, mm, rc) print "%i primer sequences" % count @@ -197,8 +194,8 @@ clipped = 0 negs = 0 -if seq_format.lower()=="sff": - #SFF is different because we just change the trim points +if seq_format.lower() == "sff": + # SFF is different because we just change the trim points if forward: def process(records): global short_clipped, short_neg, clipped, negs @@ -208,8 +205,8 @@ seq = str(record.seq)[left_clip:right_clip].upper() result = primer.search(seq) if result: - #Forward primer, take everything after it - #so move the left clip along + # Forward primer, take everything after it + # so move the left clip along if len(seq) - result.end() >= min_len: record.annotations["clip_qual_left"] = left_clip + result.end() clipped += 1 @@ -231,8 +228,8 @@ seq = str(record.seq)[left_clip:right_clip].upper() result = primer.search(seq) if result: - #Reverse primer, take everything before it - #so move the right clip back + # Reverse primer, take everything before it + # so move the right clip back new_len = result.start() if new_len >= min_len: record.annotations["clip_qual_right"] = left_clip + new_len @@ -246,7 +243,7 @@ yield record else: short_neg += 1 - + in_handle = open(in_file, "rb") try: manifest = ReadRocheXmlManifest(in_handle) @@ -256,7 +253,7 @@ out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) writer.write_file(process(SffIterator(in_handle))) - #End of SFF code + # End of SFF code elif seq_format.lower().startswith("fastq"): in_handle = open(in_file, "rU") out_handle = open(out_file, "w") @@ -267,7 +264,7 @@ seq = record.sequence.upper() result = primer.search(seq) if result: - #Forward primer, take everything after it + # Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:] if len(record.sequence) >= min_len: @@ -281,13 +278,13 @@ negs += 1 writer.write(record) else: - short_negs += 1 + short_neg += 1 else: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: - #Reverse primer, take everything before it + # Reverse primer, take everything before it cut = result.start() record.sequence = seq[:cut] if len(record.sequence) >= min_len: @@ -301,19 +298,19 @@ negs += 1 writer.write(record) else: - short_negs += 1 -elif seq_format.lower()=="fasta": + short_neg += 1 +elif seq_format.lower() == "fasta": in_handle = open(in_file, "rU") out_handle = open(out_file, "w") reader = fastaReader(in_handle) writer = fastaWriter(out_handle) - #Following code is identical to that for FASTQ but without editing qualities + # Following code is identical to that for FASTQ but without editing qualities if forward: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: - #Forward primer, take everything after it + # Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:] if len(record.sequence) >= min_len: @@ -326,13 +323,13 @@ negs += 1 writer.write(record) else: - short_negs += 1 + short_neg += 1 else: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: - #Reverse primer, take everything before it + # Reverse primer, take everything before it cut = result.start() record.sequence = seq[:cut] if len(record.sequence) >= min_len: @@ -345,9 +342,9 @@ negs += 1 writer.write(record) else: - short_negs += 1 + short_neg += 1 else: - stop_err("Unsupported file type %r" % seq_format) + sys.exit("Unsupported file type %r" % seq_format) in_handle.close() out_handle.close()