seq_select_by_id: tools/seq_select_by_id/seq_select_by

comparison tools/seq_select_by_id/seq_select_by_id.py @ 7:a5602454b0ad draft

v0.0.12 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda; Python 3 compatible print function

author	peterjc
date	Thu, 11 May 2017 06:26:05 -0400
parents	91f55ee8fea5
children	8e1a90917fa7

comparison

equal deleted inserted replaced

-:91f55ee8fea5
+:a5602454b0ad
 Cock et al 2009. Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
+This script is copyright 2011-2017 by Peter Cock, The James Hutton Institute UK.
 All rights reserved. See accompanying text file for licence details (MIT
 license).
 """
+from __future__ import print_function
 import sys
-def sys_exit(msg, err=1):
-sys.stderr.write(msg.rstrip() + "\n")
-sys.exit(err)
 if "-v" in sys.argv or "--version" in sys.argv:
-print "v0.0.9"
+print("v0.0.12")
 sys.exit(0)
-#Parse Command Line
+# Parse Command Line
 try:
 tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
 except ValueError:
-sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+sys.exit("Expected five arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
 try:
 if col_arg.startswith("c"):
-column = int(col_arg[1:])-1
+column = int(col_arg[1:]) - 1
 else:
-column = int(col_arg)-1
+column = int(col_arg) - 1
 except ValueError:
-sys_exit("Expected column number, got %s" % col_arg)
+sys.exit("Expected column number, got %s" % col_arg)
 if seq_format == "fastqcssanger":
-sys_exit("Colorspace FASTQ not supported.")
+sys.exit("Colorspace FASTQ not supported.")
 elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
 seq_format = seq_format.lower()
 elif seq_format.lower().startswith("fastq"):
-#We don't care how the qualities are encoded
+# We don't care how the qualities are encoded
 seq_format = "fastq"
 elif seq_format.lower().startswith("qual"):
-#We don't care what the scores are
+# We don't care what the scores are
 seq_format = "qual"
 else:
-sys_exit("Unrecognised file format %r" % seq_format)
+sys.exit("Unrecognised file format %r" % seq_format)
 try:
 from Bio import SeqIO
 except ImportError:
-sys_exit("Biopython 1.54 or later is required")
+sys.exit("Biopython 1.54 or later is required")
 def parse_ids(tabular_file, col):
 """Read tabular file and record all specified identifiers.
 yield parts[0]
 handle.close()
 if warn:
 sys.stderr.write(warn)
-#Index the sequence file.
-#If very big, could use SeqIO.index_db() to avoid memory bottleneck...
+# Index the sequence file.
+# If very big, could use SeqIO.index_db() to avoid memory bottleneck...
 records = SeqIO.index(in_file, seq_format)
-print "Indexed %i sequences" % len(records)
+print("Indexed %i sequences" % len(records))
-if seq_format.lower()=="sff":
+if seq_format.lower() == "sff":
-#Special case to try to preserve the XML manifest
+# Special case to try to preserve the XML manifest
 try:
-from Bio.SeqIO.SffIO import SffIterator, SffWriter
+from Bio.SeqIO.SffIO import SffWriter
 except ImportError:
-sys_exit("Requires Biopython 1.54 or later")
+sys.exit("Requires Biopython 1.54 or later")
 try:
 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
 except ImportError:
-#Prior to Biopython 1.56 this was a private function
+# Prior to Biopython 1.56 this was a private function
 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
-in_handle = open(in_file, "rb") #must be binary mode!
+in_handle = open(in_file, "rb")  # must be binary mode!
 try:
 manifest = ReadRocheXmlManifest(in_handle)
 except ValueError:
 manifest = None
 in_handle.close()
 out_handle = open(out_file, "wb")
 writer = SffWriter(out_handle, xml=manifest)
 count = 0
-#This does have the overhead of parsing into SeqRecord objects,
+# This does have the overhead of parsing into SeqRecord objects,
-#but doing the header and index at the low level is too fidly.
+# but doing the header and index at the low level is too fidly.
+name = None  # We want the variable to leak from the iterator's scope...
 iterator = (records[name] for name in parse_ids(tabular_file, column))
 try:
 count = writer.write_file(iterator)
 except KeyError, err:
 out_handle.close()
 if name not in records:
-sys_exit("Identifier %r not found in sequence file" % name)
+sys.exit("Identifier %r not found in sequence file" % name)
 else:
 raise err
 out_handle.close()
 else:
-#Avoid overhead of parsing into SeqRecord objects,
+# Avoid overhead of parsing into SeqRecord objects,
-#just re-use the original formatting from the input file.
+# just re-use the original formatting from the input file.
 out_handle = open(out_file, "w")
 count = 0
 for name in parse_ids(tabular_file, column):
 try:
 out_handle.write(records.get_raw(name))
 except KeyError:
 out_handle.close()
-sys_exit("Identifier %r not found in sequence file" % name)
+sys.exit("Identifier %r not found in sequence file" % name)
 count += 1
 out_handle.close()
-print "Selected %i sequences by ID" % count
+print("Selected %i sequences by ID" % count)

Mercurial > repos > peterjc > seq_select_by_id

comparison tools/seq_select_by_id/seq_select_by_id.py @ 7:a5602454b0ad draft