diff tools/seq_select_by_id/seq_select_by_id.py @ 6:91f55ee8fea5 draft

v0.0.11; more tests and assorted minor changes
author peterjc
date Wed, 13 May 2015 10:56:29 -0400
parents 6842c0c7bc70
children a5602454b0ad
line wrap: on
line diff
--- a/tools/seq_select_by_id/seq_select_by_id.py	Thu Nov 21 04:54:59 2013 -0500
+++ b/tools/seq_select_by_id/seq_select_by_id.py	Wed May 13 10:56:29 2015 -0400
@@ -19,34 +19,32 @@
 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
 All rights reserved. See accompanying text file for licence details (MIT
 license).
-
-This is version 0.0.6 of the script.
 """
 import sys
 
-def stop_err(msg, err=1):
+def sys_exit(msg, err=1):
     sys.stderr.write(msg.rstrip() + "\n")
     sys.exit(err)
 
 if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.6"
+    print "v0.0.9"
     sys.exit(0)
 
 #Parse Command Line
 try:
     tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
 except ValueError:
-    stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+    sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
 try:
     if col_arg.startswith("c"):
         column = int(col_arg[1:])-1
     else:
         column = int(col_arg)-1
 except ValueError:
-    stop_err("Expected column number, got %s" % col_arg)
+    sys_exit("Expected column number, got %s" % col_arg)
 
 if seq_format == "fastqcssanger":
-    stop_err("Colorspace FASTQ not supported.")
+    sys_exit("Colorspace FASTQ not supported.")
 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
     seq_format = seq_format.lower()
 elif seq_format.lower().startswith("fastq"):
@@ -56,22 +54,35 @@
     #We don't care what the scores are
     seq_format = "qual"
 else:
-    stop_err("Unrecognised file format %r" % seq_format)
+    sys_exit("Unrecognised file format %r" % seq_format)
 
 
 try:
     from Bio import SeqIO
 except ImportError:
-    stop_err("Biopython 1.54 or later is required")
+    sys_exit("Biopython 1.54 or later is required")
 
 
 def parse_ids(tabular_file, col):
-    """Read tabular file and record all specified identifiers."""
+    """Read tabular file and record all specified identifiers.
+
+    Will print a single warning to stderr if any of the fields have
+    non-trailing white space (only the first word will be used as
+    the identifier).
+    """
     handle = open(tabular_file, "rU")
+    warn = False
     for line in handle:
         if line.strip() and not line.startswith("#"):
-            yield line.rstrip("\n").split("\t")[col].strip()
+            field = line.rstrip("\n").split("\t")[col].strip()
+            parts = field.split(None, 1)
+            if len(parts) > 1 and not warn:
+                warn = "WARNING: Some of your identifiers had white space in them, " + \
+                       "using first word only. e.g.:\n%s\n" % field
+            yield parts[0]
     handle.close()
+    if warn:
+        sys.stderr.write(warn)
 
 #Index the sequence file.
 #If very big, could use SeqIO.index_db() to avoid memory bottleneck...
@@ -83,7 +94,7 @@
     try:
         from Bio.SeqIO.SffIO import SffIterator, SffWriter
     except ImportError:
-        stop_err("Requires Biopython 1.54 or later")
+        sys_exit("Requires Biopython 1.54 or later")
 
     try:
         from Bio.SeqIO.SffIO import ReadRocheXmlManifest
@@ -109,7 +120,7 @@
     except KeyError, err:
         out_handle.close()
         if name not in records:
-            stop_err("Identifier %r not found in sequence file" % name)
+            sys_exit("Identifier %r not found in sequence file" % name)
         else:
             raise err
     out_handle.close()
@@ -123,7 +134,7 @@
             out_handle.write(records.get_raw(name))
         except KeyError:
             out_handle.close()
-            stop_err("Identifier %r not found in sequence file" % name)
+            sys_exit("Identifier %r not found in sequence file" % name)
         count += 1
     out_handle.close()