diff tools/seq_primer_clip/seq_primer_clip.py @ 4:9b074c1db68e draft

v0.0.14 galaxy_sequence_utils dependency etc
author peterjc
date Thu, 02 Feb 2017 11:52:37 -0500
parents ee5acea162a7
children 530c8d6fedd8
line wrap: on
line diff
--- a/tools/seq_primer_clip/seq_primer_clip.py	Thu Nov 21 10:47:23 2013 -0500
+++ b/tools/seq_primer_clip/seq_primer_clip.py	Thu Feb 02 11:52:37 2017 -0500
@@ -26,8 +26,8 @@
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
 See accompanying text file for licence details (MIT/BSD style).
 
-This is version 0.0.8 of the script. Currently it uses Python's regular
-expression engine for finding the primers, which for my needs is fast enough.
+NOTE: Currently it uses Python's regular expression engine for finding the
+primers, which for my needs is fast enough.
 """
 import sys
 import re
@@ -35,52 +35,48 @@
 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
 
 if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.5"
+    print "v0.0.12"
     sys.exit(0)
 
-def stop_err(msg, err=1):
-    sys.stderr.write(msg)
-    sys.exit(err)
-
 try:
     from Bio.Seq import reverse_complement
     from Bio.SeqIO.SffIO import SffIterator, SffWriter
 except ImportError:
-    stop_err("Requires Biopython 1.54 or later")
+    sys.exit("Requires Biopython 1.54 or later")
 try:
     from Bio.SeqIO.SffIO import ReadRocheXmlManifest
 except ImportError:
-    #Prior to Biopython 1.56 this was a private function
+    # Prior to Biopython 1.56 this was a private function
     from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
 
-#Parse Command Line
+# Parse Command Line
 try:
     in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:]
 except ValueError:
-    stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+    sys.exit("Expected 8 arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
 
 if in_file == primer_fasta:
-    stop_err("Same file given as both primer sequences and sequences to clip!")
+    sys.exit("Same file given as both primer sequences and sequences to clip!")
 if in_file == out_file:
-    stop_err("Same file given as both sequences to clip and output!")
+    sys.exit("Same file given as both sequences to clip and output!")
 if primer_fasta == out_file:
-    stop_err("Same file given as both primer sequences and output!")
+    sys.exit("Same file given as both primer sequences and output!")
 
 try:
     mm = int(mm)
 except ValueError:
-    stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)
+    sys.exit("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)
 if mm < 0:
-    stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm)
-if mm not in [0,1,2]:
+    sys.exit("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm)
+if mm not in [0, 1, 2]:
     raise NotImplementedError
 
 try:
     min_len = int(min_len)
 except ValueError:
-    stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)
+    sys.exit("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)
 if min_len < 0:
-    stop_err("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len)
+    sys.exit("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len)
 
 
 if keep_negatives.lower() in ["true", "yes", "on"]:
@@ -88,7 +84,7 @@
 elif keep_negatives.lower() in ["false", "no", "off"]:
     keep_negatives = False
 else:
-    stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)
+    sys.exit("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)
 
 
 if primer_type.lower() == "forward":
@@ -101,7 +97,7 @@
     forward = False
     rc = True
 else:
-    stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type)
+    sys.exit("Expected foward, reverse or reverse-complement not %r" % primer_type)
 
 
 ambiguous_dna_values = {
@@ -119,9 +115,9 @@
     "H": "ACTMWYH",
     "D": "AGTRWKD",
     "B": "CGTSYKB",
-    "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC]
+    "X": ".",  # faster than [GATCMRWSYKVVHDBXN] or even [GATC]
     "N": ".",
-    }
+}
 
 ambiguous_dna_re = {}
 for letter, values in ambiguous_dna_values.iteritems():
@@ -134,39 +130,41 @@
 def make_reg_ex(seq):
     return "".join(ambiguous_dna_re[letter] for letter in seq)
 
+
 def make_reg_ex_mm(seq, mm):
     if mm > 2:
         raise NotImplementedError("At most 2 mismatches allowed!")
     seq = seq.upper()
     yield make_reg_ex(seq)
-    for i in range(1,mm+1):
-        #Missing first/last i bases at very start/end of sequence
-        for reg in make_reg_ex_mm(seq[i:],  mm-i):
+    for i in range(1, mm + 1):
+        # Missing first/last i bases at very start/end of sequence
+        for reg in make_reg_ex_mm(seq[i:], mm - i):
             yield "^" + reg
-        for reg in make_reg_ex_mm(seq[:-i], mm-i):
+        for reg in make_reg_ex_mm(seq[:-i], mm - i):
             yield "$" + reg
     if mm >= 1:
-        for i,letter in enumerate(seq):
-            #We'll use a set to remove any duplicate patterns
-            #if letter not in "NX":
-            pattern = seq[:i] + "N" + seq[i+1:]
+        for i, letter in enumerate(seq):
+            # We'll use a set to remove any duplicate patterns
+            # if letter not in "NX":
+            pattern = seq[:i] + "N" + seq[i + 1:]
             assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
                    % (pattern, len(pattern), seq, len(seq))
             yield make_reg_ex(pattern)
-    if mm >=2:
-        for i,letter in enumerate(seq):
-            #We'll use a set to remove any duplicate patterns
-            #if letter not in "NX":
-            for k,letter in enumerate(seq[i+1:]):
-                #We'll use a set to remove any duplicate patterns
-                #if letter not in "NX":
-                pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:]
+    if mm >= 2:
+        for i, letter in enumerate(seq):
+            # We'll use a set to remove any duplicate patterns
+            # if letter not in "NX":
+            for k, letter in enumerate(seq[i + 1:]):
+                # We'll use a set to remove any duplicate patterns
+                # if letter not in "NX":
+                pattern = seq[:i] + "N" + seq[i + 1:i + 1 + k] + "N" + seq[i + k + 2:]
                 assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
                        % (pattern, len(pattern), seq, len(seq))
                 yield make_reg_ex(pattern)
 
+
 def load_primers_as_re(primer_fasta, mm, rc=False):
-    #Read primer file and record all specified sequences
+    # Read primer file and record all specified sequences
     primers = set()
     in_handle = open(primer_fasta, "rU")
     reader = fastaReader(in_handle)
@@ -176,19 +174,18 @@
             seq = reverse_complement(record.sequence)
         else:
             seq = record.sequence
-        #primers.add(re.compile(make_reg_ex(seq)))
+        # primers.add(re.compile(make_reg_ex(seq)))
         count += 1
         for pattern in make_reg_ex_mm(seq, mm):
             primers.add(pattern)
     in_handle.close()
-    #Use set to avoid duplicates, sort to have longest first
-    #(so more specific primers found before less specific ones)
+    # Use set to avoid duplicates, sort to have longest first
+    # (so more specific primers found before less specific ones)
     primers = sorted(set(primers), key=lambda p: -len(p))
-    return count, re.compile("|".join(primers)) #make one monster re!
+    return count, re.compile("|".join(primers))  # make one monster re!
 
 
-
-#Read primer file and record all specified sequences
+# Read primer file and record all specified sequences
 count, primer = load_primers_as_re(primer_fasta, mm, rc)
 print "%i primer sequences" % count
 
@@ -197,8 +194,8 @@
 clipped = 0
 negs = 0
 
-if seq_format.lower()=="sff":
-    #SFF is different because we just change the trim points
+if seq_format.lower() == "sff":
+    # SFF is different because we just change the trim points
     if forward:
         def process(records):
             global short_clipped, short_neg, clipped, negs
@@ -208,8 +205,8 @@
                 seq = str(record.seq)[left_clip:right_clip].upper()
                 result = primer.search(seq)
                 if result:
-                    #Forward primer, take everything after it
-                    #so move the left clip along
+                    # Forward primer, take everything after it
+                    # so move the left clip along
                     if len(seq) - result.end() >= min_len:
                         record.annotations["clip_qual_left"] = left_clip + result.end()
                         clipped += 1
@@ -231,8 +228,8 @@
                 seq = str(record.seq)[left_clip:right_clip].upper()
                 result = primer.search(seq)
                 if result:
-                    #Reverse primer, take everything before it
-                    #so move the right clip back
+                    # Reverse primer, take everything before it
+                    # so move the right clip back
                     new_len = result.start()
                     if new_len >= min_len:
                         record.annotations["clip_qual_right"] = left_clip + new_len
@@ -246,7 +243,7 @@
                         yield record
                     else:
                         short_neg += 1
-    
+
     in_handle = open(in_file, "rb")
     try:
         manifest = ReadRocheXmlManifest(in_handle)
@@ -256,7 +253,7 @@
     out_handle = open(out_file, "wb")
     writer = SffWriter(out_handle, xml=manifest)
     writer.write_file(process(SffIterator(in_handle)))
-    #End of SFF code
+    # End of SFF code
 elif seq_format.lower().startswith("fastq"):
     in_handle = open(in_file, "rU")
     out_handle = open(out_file, "w")
@@ -267,7 +264,7 @@
             seq = record.sequence.upper()
             result = primer.search(seq)
             if result:
-                #Forward primer, take everything after it
+                # Forward primer, take everything after it
                 cut = result.end()
                 record.sequence = seq[cut:]
                 if len(record.sequence) >= min_len:
@@ -281,13 +278,13 @@
                     negs += 1
                     writer.write(record)
                 else:
-                    short_negs += 1
+                    short_neg += 1
     else:
         for record in reader:
             seq = record.sequence.upper()
             result = primer.search(seq)
             if result:
-                #Reverse primer, take everything before it
+                # Reverse primer, take everything before it
                 cut = result.start()
                 record.sequence = seq[:cut]
                 if len(record.sequence) >= min_len:
@@ -301,19 +298,19 @@
                     negs += 1
                     writer.write(record)
                 else:
-                    short_negs += 1
-elif seq_format.lower()=="fasta":
+                    short_neg += 1
+elif seq_format.lower() == "fasta":
     in_handle = open(in_file, "rU")
     out_handle = open(out_file, "w")
     reader = fastaReader(in_handle)
     writer = fastaWriter(out_handle)
-    #Following code is identical to that for FASTQ but without editing qualities
+    # Following code is identical to that for FASTQ but without editing qualities
     if forward:
         for record in reader:
             seq = record.sequence.upper()
             result = primer.search(seq)
             if result:
-                #Forward primer, take everything after it
+                # Forward primer, take everything after it
                 cut = result.end()
                 record.sequence = seq[cut:]
                 if len(record.sequence) >= min_len:
@@ -326,13 +323,13 @@
                     negs += 1
                     writer.write(record)
                 else:
-                    short_negs += 1
+                    short_neg += 1
     else:
         for record in reader:
             seq = record.sequence.upper()
             result = primer.search(seq)
             if result:
-                #Reverse primer, take everything before it
+                # Reverse primer, take everything before it
                 cut = result.start()
                 record.sequence = seq[:cut]
                 if len(record.sequence) >= min_len:
@@ -345,9 +342,9 @@
                     negs += 1
                     writer.write(record)
                 else:
-                    short_negs += 1
+                    short_neg += 1
 else:
-    stop_err("Unsupported file type %r" % seq_format)
+    sys.exit("Unsupported file type %r" % seq_format)
 in_handle.close()
 out_handle.close()