Mercurial > repos > peterjc > seq_select_by_id
diff tools/seq_select_by_id/seq_select_by_id.py @ 7:a5602454b0ad draft
v0.0.12 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda; Python 3 compatible print function
author | peterjc |
---|---|
date | Thu, 11 May 2017 06:26:05 -0400 |
parents | 91f55ee8fea5 |
children | 8e1a90917fa7 |
line wrap: on
line diff
--- a/tools/seq_select_by_id/seq_select_by_id.py Wed May 13 10:56:29 2015 -0400 +++ b/tools/seq_select_by_id/seq_select_by_id.py Thu May 11 06:26:05 2017 -0400 @@ -16,51 +16,50 @@ molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. -This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK. +This script is copyright 2011-2017 by Peter Cock, The James Hutton Institute UK. All rights reserved. See accompanying text file for licence details (MIT license). """ + +from __future__ import print_function + import sys -def sys_exit(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.9" + print("v0.0.12") sys.exit(0) -#Parse Command Line +# Parse Command Line try: tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] except ValueError: - sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) + sys.exit("Expected five arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv))) try: if col_arg.startswith("c"): - column = int(col_arg[1:])-1 + column = int(col_arg[1:]) - 1 else: - column = int(col_arg)-1 + column = int(col_arg) - 1 except ValueError: - sys_exit("Expected column number, got %s" % col_arg) + sys.exit("Expected column number, got %s" % col_arg) if seq_format == "fastqcssanger": - sys_exit("Colorspace FASTQ not supported.") + sys.exit("Colorspace FASTQ not supported.") elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: seq_format = seq_format.lower() elif seq_format.lower().startswith("fastq"): - #We don't care how the qualities are encoded + # We don't care how the qualities are encoded seq_format = "fastq" elif seq_format.lower().startswith("qual"): - #We don't care what the scores are + # We don't care what the scores are seq_format = "qual" else: - sys_exit("Unrecognised file format %r" % seq_format) + sys.exit("Unrecognised file format %r" % seq_format) try: from Bio import SeqIO except ImportError: - sys_exit("Biopython 1.54 or later is required") + sys.exit("Biopython 1.54 or later is required") def parse_ids(tabular_file, col): @@ -84,25 +83,26 @@ if warn: sys.stderr.write(warn) -#Index the sequence file. -#If very big, could use SeqIO.index_db() to avoid memory bottleneck... + +# Index the sequence file. +# If very big, could use SeqIO.index_db() to avoid memory bottleneck... records = SeqIO.index(in_file, seq_format) -print "Indexed %i sequences" % len(records) +print("Indexed %i sequences" % len(records)) -if seq_format.lower()=="sff": - #Special case to try to preserve the XML manifest +if seq_format.lower() == "sff": + # Special case to try to preserve the XML manifest try: - from Bio.SeqIO.SffIO import SffIterator, SffWriter + from Bio.SeqIO.SffIO import SffWriter except ImportError: - sys_exit("Requires Biopython 1.54 or later") + sys.exit("Requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: - #Prior to Biopython 1.56 this was a private function + # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest - in_handle = open(in_file, "rb") #must be binary mode! + in_handle = open(in_file, "rb") # must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: @@ -112,21 +112,22 @@ out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) count = 0 - #This does have the overhead of parsing into SeqRecord objects, - #but doing the header and index at the low level is too fidly. + # This does have the overhead of parsing into SeqRecord objects, + # but doing the header and index at the low level is too fidly. + name = None # We want the variable to leak from the iterator's scope... iterator = (records[name] for name in parse_ids(tabular_file, column)) try: count = writer.write_file(iterator) except KeyError, err: out_handle.close() if name not in records: - sys_exit("Identifier %r not found in sequence file" % name) + sys.exit("Identifier %r not found in sequence file" % name) else: raise err out_handle.close() else: - #Avoid overhead of parsing into SeqRecord objects, - #just re-use the original formatting from the input file. + # Avoid overhead of parsing into SeqRecord objects, + # just re-use the original formatting from the input file. out_handle = open(out_file, "w") count = 0 for name in parse_ids(tabular_file, column): @@ -134,8 +135,8 @@ out_handle.write(records.get_raw(name)) except KeyError: out_handle.close() - sys_exit("Identifier %r not found in sequence file" % name) + sys.exit("Identifier %r not found in sequence file" % name) count += 1 out_handle.close() -print "Selected %i sequences by ID" % count +print("Selected %i sequences by ID" % count)