diff tools/seq_select_by_id/seq_select_by_id.py @ 4:6842c0c7bc70 draft

Uploaded v0.0.7, depend on Biopython 1.62, tabs to spaces in XML
author peterjc
date Mon, 28 Oct 2013 05:21:45 -0400
parents
children 91f55ee8fea5
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_select_by_id/seq_select_by_id.py	Mon Oct 28 05:21:45 2013 -0400
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+"""Select FASTA, QUAL, FASTQ or SSF sequences by IDs from a tabular file.
+
+Takes five command line options, tabular filename, ID column number (using
+one based counting), input filename, input type (e.g. FASTA or SFF) and the
+output filename (same format as input sequence file).
+
+When selecting from an SFF file, any Roche XML manifest in the input file is
+preserved in both output files.
+
+This tool is a short Python script which requires Biopython 1.54 or later
+for SFF file support. If you use this tool in scientific work leading to a
+publication, please cite the Biopython application note:
+
+Cock et al 2009. Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
+All rights reserved. See accompanying text file for licence details (MIT
+license).
+
+This is version 0.0.6 of the script.
+"""
+import sys
+
+def stop_err(msg, err=1):
+    sys.stderr.write(msg.rstrip() + "\n")
+    sys.exit(err)
+
+if "-v" in sys.argv or "--version" in sys.argv:
+    print "v0.0.6"
+    sys.exit(0)
+
+#Parse Command Line
+try:
+    tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
+except ValueError:
+    stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+try:
+    if col_arg.startswith("c"):
+        column = int(col_arg[1:])-1
+    else:
+        column = int(col_arg)-1
+except ValueError:
+    stop_err("Expected column number, got %s" % col_arg)
+
+if seq_format == "fastqcssanger":
+    stop_err("Colorspace FASTQ not supported.")
+elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
+    seq_format = seq_format.lower()
+elif seq_format.lower().startswith("fastq"):
+    #We don't care how the qualities are encoded    
+    seq_format = "fastq"
+elif seq_format.lower().startswith("qual"):
+    #We don't care what the scores are
+    seq_format = "qual"
+else:
+    stop_err("Unrecognised file format %r" % seq_format)
+
+
+try:
+    from Bio import SeqIO
+except ImportError:
+    stop_err("Biopython 1.54 or later is required")
+
+
+def parse_ids(tabular_file, col):
+    """Read tabular file and record all specified identifiers."""
+    handle = open(tabular_file, "rU")
+    for line in handle:
+        if line.strip() and not line.startswith("#"):
+            yield line.rstrip("\n").split("\t")[col].strip()
+    handle.close()
+
+#Index the sequence file.
+#If very big, could use SeqIO.index_db() to avoid memory bottleneck...
+records = SeqIO.index(in_file, seq_format)
+print "Indexed %i sequences" % len(records)
+
+if seq_format.lower()=="sff":
+    #Special case to try to preserve the XML manifest
+    try:
+        from Bio.SeqIO.SffIO import SffIterator, SffWriter
+    except ImportError:
+        stop_err("Requires Biopython 1.54 or later")
+
+    try:
+        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
+    except ImportError:
+        #Prior to Biopython 1.56 this was a private function
+        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
+
+    in_handle = open(in_file, "rb") #must be binary mode!
+    try:
+        manifest = ReadRocheXmlManifest(in_handle)
+    except ValueError:
+        manifest = None
+    in_handle.close()
+
+    out_handle = open(out_file, "wb")
+    writer = SffWriter(out_handle, xml=manifest)
+    count = 0
+    #This does have the overhead of parsing into SeqRecord objects,
+    #but doing the header and index at the low level is too fidly.
+    iterator = (records[name] for name in parse_ids(tabular_file, column))
+    try:
+        count = writer.write_file(iterator)
+    except KeyError, err:
+        out_handle.close()
+        if name not in records:
+            stop_err("Identifier %r not found in sequence file" % name)
+        else:
+            raise err
+    out_handle.close()
+else:
+    #Avoid overhead of parsing into SeqRecord objects,
+    #just re-use the original formatting from the input file.
+    out_handle = open(out_file, "w")
+    count = 0
+    for name in parse_ids(tabular_file, column):
+        try:
+            out_handle.write(records.get_raw(name))
+        except KeyError:
+            out_handle.close()
+            stop_err("Identifier %r not found in sequence file" % name)
+        count += 1
+    out_handle.close()
+
+print "Selected %i sequences by ID" % count