Mercurial > repos > peterjc > seq_select_by_id
changeset 7:a5602454b0ad draft
v0.0.12 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda; Python 3 compatible print function
author | peterjc |
---|---|
date | Thu, 11 May 2017 06:26:05 -0400 |
parents | 91f55ee8fea5 |
children | 8e1a90917fa7 |
files | tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.py tools/seq_select_by_id/seq_select_by_id.xml tools/seq_select_by_id/tool_dependencies.xml |
diffstat | 4 files changed, 50 insertions(+), 49 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/seq_select_by_id/README.rst Wed May 13 10:56:29 2015 -0400 +++ b/tools/seq_select_by_id/README.rst Thu May 11 06:26:05 2017 -0400 @@ -1,7 +1,7 @@ Galaxy tool to select FASTA, QUAL, FASTQ or SFF sequences by ID =============================================================== -This tool is copyright 2011-2015 by Peter Cock, The James Hutton Institute +This tool is copyright 2011-2017 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below. @@ -85,6 +85,10 @@ - Reorder XML elements (internal change only). - Planemo for Tool Shed upload (``.shed.yml``, internal change only). - Quote filenames in case of spaces (internal change only). +v0.0.12 - Python style changes (internal change only). + - Use ``<command detect_errors="aggressive">`` (internal change only). + - Depends on Biopython 1.67 via legacy Tool Shed package or bioconda. + - Python 3 compatible print function. ======= ====================================================================== @@ -101,17 +105,17 @@ Planemo commands (which requires you have set your Tool Shed access details in ``~/.planemo.yml`` and that you have access rights on the Tool Shed):: - $ planemo shed_upload --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/seq_select_by_id/ + $ planemo shed_update -t testtoolshed --check_diff tools/seq_select_by_id/ ... or:: - $ planemo shed_upload --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/seq_select_by_id/ + $ planemo shed_update -t toolshed --check_diff tools/seq_select_by_id/ ... To just build and check the tar ball, use:: - $ planemo shed_upload --tar_only ~/repositories/pico_galaxy/tools/seq_select_by_id/ + $ planemo shed_upload --tar_only tools/seq_select_by_id/ ... $ tar -tzf shed_upload.tar.gz test-data/k12_hypothetical.fasta
--- a/tools/seq_select_by_id/seq_select_by_id.py Wed May 13 10:56:29 2015 -0400 +++ b/tools/seq_select_by_id/seq_select_by_id.py Thu May 11 06:26:05 2017 -0400 @@ -16,51 +16,50 @@ molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. -This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK. +This script is copyright 2011-2017 by Peter Cock, The James Hutton Institute UK. All rights reserved. See accompanying text file for licence details (MIT license). """ + +from __future__ import print_function + import sys -def sys_exit(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.9" + print("v0.0.12") sys.exit(0) -#Parse Command Line +# Parse Command Line try: tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] except ValueError: - sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) + sys.exit("Expected five arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv))) try: if col_arg.startswith("c"): - column = int(col_arg[1:])-1 + column = int(col_arg[1:]) - 1 else: - column = int(col_arg)-1 + column = int(col_arg) - 1 except ValueError: - sys_exit("Expected column number, got %s" % col_arg) + sys.exit("Expected column number, got %s" % col_arg) if seq_format == "fastqcssanger": - sys_exit("Colorspace FASTQ not supported.") + sys.exit("Colorspace FASTQ not supported.") elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: seq_format = seq_format.lower() elif seq_format.lower().startswith("fastq"): - #We don't care how the qualities are encoded + # We don't care how the qualities are encoded seq_format = "fastq" elif seq_format.lower().startswith("qual"): - #We don't care what the scores are + # We don't care what the scores are seq_format = "qual" else: - sys_exit("Unrecognised file format %r" % seq_format) + sys.exit("Unrecognised file format %r" % seq_format) try: from Bio import SeqIO except ImportError: - sys_exit("Biopython 1.54 or later is required") + sys.exit("Biopython 1.54 or later is required") def parse_ids(tabular_file, col): @@ -84,25 +83,26 @@ if warn: sys.stderr.write(warn) -#Index the sequence file. -#If very big, could use SeqIO.index_db() to avoid memory bottleneck... + +# Index the sequence file. +# If very big, could use SeqIO.index_db() to avoid memory bottleneck... records = SeqIO.index(in_file, seq_format) -print "Indexed %i sequences" % len(records) +print("Indexed %i sequences" % len(records)) -if seq_format.lower()=="sff": - #Special case to try to preserve the XML manifest +if seq_format.lower() == "sff": + # Special case to try to preserve the XML manifest try: - from Bio.SeqIO.SffIO import SffIterator, SffWriter + from Bio.SeqIO.SffIO import SffWriter except ImportError: - sys_exit("Requires Biopython 1.54 or later") + sys.exit("Requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: - #Prior to Biopython 1.56 this was a private function + # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest - in_handle = open(in_file, "rb") #must be binary mode! + in_handle = open(in_file, "rb") # must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: @@ -112,21 +112,22 @@ out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) count = 0 - #This does have the overhead of parsing into SeqRecord objects, - #but doing the header and index at the low level is too fidly. + # This does have the overhead of parsing into SeqRecord objects, + # but doing the header and index at the low level is too fidly. + name = None # We want the variable to leak from the iterator's scope... iterator = (records[name] for name in parse_ids(tabular_file, column)) try: count = writer.write_file(iterator) except KeyError, err: out_handle.close() if name not in records: - sys_exit("Identifier %r not found in sequence file" % name) + sys.exit("Identifier %r not found in sequence file" % name) else: raise err out_handle.close() else: - #Avoid overhead of parsing into SeqRecord objects, - #just re-use the original formatting from the input file. + # Avoid overhead of parsing into SeqRecord objects, + # just re-use the original formatting from the input file. out_handle = open(out_file, "w") count = 0 for name in parse_ids(tabular_file, column): @@ -134,8 +135,8 @@ out_handle.write(records.get_raw(name)) except KeyError: out_handle.close() - sys_exit("Identifier %r not found in sequence file" % name) + sys.exit("Identifier %r not found in sequence file" % name) count += 1 out_handle.close() -print "Selected %i sequences by ID" % count +print("Selected %i sequences by ID" % count)
--- a/tools/seq_select_by_id/seq_select_by_id.xml Wed May 13 10:56:29 2015 -0400 +++ b/tools/seq_select_by_id/seq_select_by_id.xml Thu May 11 06:26:05 2017 -0400 @@ -1,17 +1,13 @@ -<tool id="seq_select_by_id" name="Select sequences by ID" version="0.0.11"> +<tool id="seq_select_by_id" name="Select sequences by ID" version="0.0.12"> <description>from a tabular file</description> <requirements> - <requirement type="package" version="1.62">biopython</requirement> - <requirement type="python-module">Bio</requirement> + <requirement type="package" version="1.67">biopython</requirement> </requirements> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> - <version_command interpreter="python">seq_select_by_id.py --version</version_command> - <command interpreter="python"> -seq_select_by_id.py "$input_tabular" "$column" "$input_file" "$input_file.ext" "$output_file" + <version_command> +python $__tool_directory__/seq_select_by_id.py --version + </version_command> + <command detect_errors="aggressive"> +python $__tool_directory__/seq_select_by_id.py '$input_tabular' '$column' '$input_file' '$input_file.ext' '$output_file' </command> <inputs> <param name="input_file" type="data" format="fasta,qual,fastq,sff" label="Sequence file to select from" help="FASTA, QUAL, FASTQ, or SFF format." />
--- a/tools/seq_select_by_id/tool_dependencies.xml Wed May 13 10:56:29 2015 -0400 +++ b/tools/seq_select_by_id/tool_dependencies.xml Thu May 11 06:26:05 2017 -0400 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <tool_dependency> - <package name="biopython" version="1.62"> - <repository changeset_revision="3e82cbc44886" name="package_biopython_1_62" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <package name="biopython" version="1.67"> + <repository changeset_revision="a42f244cce44" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>