view tools/seq_select_by_id/ @ 6:91f55ee8fea5 draft

v0.0.11; more tests and assorting minor changes
author peterjc
date Wed, 13 May 2015 10:56:29 -0400
parents 6842c0c7bc70
children a5602454b0ad
line wrap: on
line source

#!/usr/bin/env python
"""Select FASTA, QUAL, FASTQ or SSF sequences by IDs from a tabular file.

Takes five command line options, tabular filename, ID column number (using
one based counting), input filename, input type (e.g. FASTA or SFF) and the
output filename (same format as input sequence file).

When selecting from an SFF file, any Roche XML manifest in the input file is
preserved in both output files.

This tool is a short Python script which requires Biopython 1.54 or later
for SFF file support. If you use this tool in scientific work leading to a
publication, please cite the Biopython application note:

Cock et al 2009. Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. pmid:19304878.

This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
All rights reserved. See accompanying text file for licence details (MIT
import sys

def sys_exit(msg, err=1):
    sys.stderr.write(msg.rstrip() + "\n")

if "-v" in sys.argv or "--version" in sys.argv:
    print "v0.0.9"

#Parse Command Line
    tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
except ValueError:
    sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
    if col_arg.startswith("c"):
        column = int(col_arg[1:])-1
        column = int(col_arg)-1
except ValueError:
    sys_exit("Expected column number, got %s" % col_arg)

if seq_format == "fastqcssanger":
    sys_exit("Colorspace FASTQ not supported.")
elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
    seq_format = seq_format.lower()
elif seq_format.lower().startswith("fastq"):
    #We don't care how the qualities are encoded    
    seq_format = "fastq"
elif seq_format.lower().startswith("qual"):
    #We don't care what the scores are
    seq_format = "qual"
    sys_exit("Unrecognised file format %r" % seq_format)

    from Bio import SeqIO
except ImportError:
    sys_exit("Biopython 1.54 or later is required")

def parse_ids(tabular_file, col):
    """Read tabular file and record all specified identifiers.

    Will print a single warning to stderr if any of the fields have
    non-trailing white space (only the first word will be used as
    the identifier).
    handle = open(tabular_file, "rU")
    warn = False
    for line in handle:
        if line.strip() and not line.startswith("#"):
            field = line.rstrip("\n").split("\t")[col].strip()
            parts = field.split(None, 1)
            if len(parts) > 1 and not warn:
                warn = "WARNING: Some of your identifiers had white space in them, " + \
                       "using first word only. e.g.:\n%s\n" % field
            yield parts[0]
    if warn:

#Index the sequence file.
#If very big, could use SeqIO.index_db() to avoid memory bottleneck...
records = SeqIO.index(in_file, seq_format)
print "Indexed %i sequences" % len(records)

if seq_format.lower()=="sff":
    #Special case to try to preserve the XML manifest
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys_exit("Requires Biopython 1.54 or later")

        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        #Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    in_handle = open(in_file, "rb") #must be binary mode!
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None

    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    count = 0
    #This does have the overhead of parsing into SeqRecord objects,
    #but doing the header and index at the low level is too fidly.
    iterator = (records[name] for name in parse_ids(tabular_file, column))
        count = writer.write_file(iterator)
    except KeyError, err:
        if name not in records:
            sys_exit("Identifier %r not found in sequence file" % name)
            raise err
    #Avoid overhead of parsing into SeqRecord objects,
    #just re-use the original formatting from the input file.
    out_handle = open(out_file, "w")
    count = 0
    for name in parse_ids(tabular_file, column):
        except KeyError:
            sys_exit("Identifier %r not found in sequence file" % name)
        count += 1

print "Selected %i sequences by ID" % count