Mercurial > repos > peterjc > seq_select_by_id

diff tools/seq_select_by_id/seq_select_by_id.py @ 7:a5602454b0ad draft
v0.0.12 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda; Python 3 compatible print function
author: peterjc
date: Thu, 11 May 2017 06:26:05 -0400
parents: 91f55ee8fea5
children: 8e1a90917fa7
--- a/tools/seq_select_by_id/seq_select_by_id.py	Wed May 13 10:56:29 2015 -0400
+++ b/tools/seq_select_by_id/seq_select_by_id.py	Thu May 11 06:26:05 2017 -0400
@@ -16,51 +16,50 @@
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
 
-This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
+This script is copyright 2011-2017 by Peter Cock, The James Hutton Institute UK.
 All rights reserved. See accompanying text file for licence details (MIT
 license).
 """
+
+from __future__ import print_function
+
 import sys
 
-def sys_exit(msg, err=1):
-    sys.stderr.write(msg.rstrip() + "\n")
-    sys.exit(err)
-
 if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.9"
+    print("v0.0.12")
     sys.exit(0)
 
-#Parse Command Line
+# Parse Command Line
 try:
     tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
 except ValueError:
-    sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+    sys.exit("Expected five arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
 try:
     if col_arg.startswith("c"):
-        column = int(col_arg[1:])-1
+        column = int(col_arg[1:]) - 1
     else:
-        column = int(col_arg)-1
+        column = int(col_arg) - 1
 except ValueError:
-    sys_exit("Expected column number, got %s" % col_arg)
+    sys.exit("Expected column number, got %s" % col_arg)
 
 if seq_format == "fastqcssanger":
-    sys_exit("Colorspace FASTQ not supported.")
+    sys.exit("Colorspace FASTQ not supported.")
 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
     seq_format = seq_format.lower()
 elif seq_format.lower().startswith("fastq"):
-    #We don't care how the qualities are encoded    
+    # We don't care how the qualities are encoded
     seq_format = "fastq"
 elif seq_format.lower().startswith("qual"):
-    #We don't care what the scores are
+    # We don't care what the scores are
     seq_format = "qual"
 else:
-    sys_exit("Unrecognised file format %r" % seq_format)
+    sys.exit("Unrecognised file format %r" % seq_format)
 
 
 try:
     from Bio import SeqIO
 except ImportError:
-    sys_exit("Biopython 1.54 or later is required")
+    sys.exit("Biopython 1.54 or later is required")
 
 
 def parse_ids(tabular_file, col):
@@ -84,25 +83,26 @@
     if warn:
         sys.stderr.write(warn)
 
-#Index the sequence file.
-#If very big, could use SeqIO.index_db() to avoid memory bottleneck...
+
+# Index the sequence file.
+# If very big, could use SeqIO.index_db() to avoid memory bottleneck...
 records = SeqIO.index(in_file, seq_format)
-print "Indexed %i sequences" % len(records)
+print("Indexed %i sequences" % len(records))
 
-if seq_format.lower()=="sff":
-    #Special case to try to preserve the XML manifest
+if seq_format.lower() == "sff":
+    # Special case to try to preserve the XML manifest
     try:
-        from Bio.SeqIO.SffIO import SffIterator, SffWriter
+        from Bio.SeqIO.SffIO import SffWriter
     except ImportError:
-        sys_exit("Requires Biopython 1.54 or later")
+        sys.exit("Requires Biopython 1.54 or later")
 
     try:
         from Bio.SeqIO.SffIO import ReadRocheXmlManifest
     except ImportError:
-        #Prior to Biopython 1.56 this was a private function
+        # Prior to Biopython 1.56 this was a private function
         from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
 
-    in_handle = open(in_file, "rb") #must be binary mode!
+    in_handle = open(in_file, "rb")  # must be binary mode!
     try:
         manifest = ReadRocheXmlManifest(in_handle)
     except ValueError:
@@ -112,21 +112,22 @@
     out_handle = open(out_file, "wb")
     writer = SffWriter(out_handle, xml=manifest)
     count = 0
-    #This does have the overhead of parsing into SeqRecord objects,
-    #but doing the header and index at the low level is too fidly.
+    # This does have the overhead of parsing into SeqRecord objects,
+    # but doing the header and index at the low level is too fidly.
+    name = None  # We want the variable to leak from the iterator's scope...
     iterator = (records[name] for name in parse_ids(tabular_file, column))
     try:
         count = writer.write_file(iterator)
     except KeyError, err:
         out_handle.close()
         if name not in records:
-            sys_exit("Identifier %r not found in sequence file" % name)
+            sys.exit("Identifier %r not found in sequence file" % name)
         else:
             raise err
     out_handle.close()
 else:
-    #Avoid overhead of parsing into SeqRecord objects,
-    #just re-use the original formatting from the input file.
+    # Avoid overhead of parsing into SeqRecord objects,
+    # just re-use the original formatting from the input file.
     out_handle = open(out_file, "w")
     count = 0
     for name in parse_ids(tabular_file, column):
@@ -134,8 +135,8 @@
             out_handle.write(records.get_raw(name))
         except KeyError:
             out_handle.close()
-            sys_exit("Identifier %r not found in sequence file" % name)
+            sys.exit("Identifier %r not found in sequence file" % name)
         count += 1
     out_handle.close()
 
-print "Selected %i sequences by ID" % count
+print("Selected %i sequences by ID" % count)
author	peterjc
date	Thu, 11 May 2017 06:26:05 -0400
parents	91f55ee8fea5
children	8e1a90917fa7