Mercurial > repos > peterjc > seq_filter_by_id
changeset 10:4a7d8ad2a983 draft
Bump Biopython dependency
author | peterjc |
---|---|
date | Thu, 30 Nov 2023 09:50:34 +0000 |
parents | 141612f8c3e3 |
children | 83a19df00eab |
files | tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.py tools/seq_filter_by_id/seq_filter_by_id.xml tools/seq_filter_by_id/tool_dependencies.xml |
diffstat | 4 files changed, 132 insertions(+), 72 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/seq_filter_by_id/README.rst Thu May 11 12:18:52 2017 -0400 +++ b/tools/seq_filter_by_id/README.rst Thu Nov 30 09:50:34 2023 +0000 @@ -1,7 +1,7 @@ Galaxy tool to filter FASTA, FASTQ or SFF sequences by ID ========================================================= -This tool is copyright 2010-2017 by Peter Cock, The James Hutton Institute +This tool is copyright 2010-2023 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below. @@ -76,7 +76,7 @@ v0.0.8 - Simplified XML to apply input format to output data. v0.2.0 - Can supply ID list as a text parameter (instead of in a file) - Using ``optparse`` for the Python command line API. - - Advanced option to ignore paired read suffices. + - Advanced option to ignore paired read suffixes. - Updated dependencies to use Biopython 1.64. v0.2.1 - Use Biopython instead of Galaxy for FASTQ handling. - Tool definition now embeds citation information. @@ -95,6 +95,7 @@ v0.2.7 - Python 3 compatible print function. - Use ``<command detect_errors="aggressive">`` (internal change only). - Single quote command line arguments (internal change only). +v0.2.8 - Bumped Biopython dependency version for Python 3 fixes. ======= ====================================================================== @@ -124,7 +125,7 @@ $ planemo shed_upload --tar_only tools/seq_filter_by_id/ ... - $ tar -tzf shed_upload.tar.gz + $ tar -tzf shed_upload.tar.gz test-data/empty_file.dat test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular
--- a/tools/seq_filter_by_id/seq_filter_by_id.py Thu May 11 12:18:52 2017 -0400 +++ b/tools/seq_filter_by_id/seq_filter_by_id.py Thu Nov 30 09:50:34 2023 +0000 @@ -19,7 +19,7 @@ Cock et al 2009. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. @@ -49,31 +49,66 @@ the -t or --text option. """ parser = OptionParser(usage=usage) -parser.add_option('-i', '--input', dest='input', - default=None, help='Input sequences filename', - metavar="FILE") -parser.add_option('-f', '--format', dest='format', - default=None, - help='Input sequence format (e.g. fasta, fastq, sff)') -parser.add_option('-t', '--text', dest='id_list', - default=None, help="Lists of white space separated IDs (instead of a tabular file)") -parser.add_option('-p', '--positive', dest='output_positive', - default=None, - help='Output filename for matches', - metavar="FILE") -parser.add_option('-n', '--negative', dest='output_negative', - default=None, - help='Output filename for non-matches', - metavar="FILE") -parser.add_option("-l", "--logic", dest="logic", - default="UNION", - help="How to combined multiple ID columns (UNION or INTERSECTION)") -parser.add_option("-s", "--suffix", dest="suffix", - action="store_true", - help="Ignore pair-read suffices for matching names") -parser.add_option("-v", "--version", dest="version", - default=False, action="store_true", - help="Show version and quit") +parser.add_option( + "-i", + "--input", + dest="input", + default=None, + help="Input sequences filename", + metavar="FILE", +) +parser.add_option( + "-f", + "--format", + dest="format", + default=None, + help="Input sequence format (e.g. fasta, fastq, sff)", +) +parser.add_option( + "-t", + "--text", + dest="id_list", + default=None, + help="Lists of white space separated IDs (instead of a tabular file)", +) +parser.add_option( + "-p", + "--positive", + dest="output_positive", + default=None, + help="Output filename for matches", + metavar="FILE", +) +parser.add_option( + "-n", + "--negative", + dest="output_negative", + default=None, + help="Output filename for non-matches", + metavar="FILE", +) +parser.add_option( + "-l", + "--logic", + dest="logic", + default="UNION", + help="How to combined multiple ID columns (UNION or INTERSECTION)", +) +parser.add_option( + "-s", + "--suffix", + dest="suffix", + action="store_true", + help="Ignore pair-read suffixes for matching names", +) +parser.add_option( + "-v", + "--version", + dest="version", + default=False, + action="store_true", + help="Show version and quit", +) options, args = parser.parse_args() @@ -86,7 +121,7 @@ out_positive_file = options.output_positive out_negative_file = options.output_negative logic = options.logic -drop_suffices = bool(options.suffix) +drop_suffixes = bool(options.suffix) if in_file is None or not os.path.isfile(in_file): sys.exit("Missing input file: %r" % in_file) @@ -132,9 +167,14 @@ try: columns = [int(arg) - 1 for arg in cols_arg.split(",")] except ValueError: - sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg) + sys.exit( + "Expected list of columns (comma separated integers), got %r" % cols_arg + ) if min(columns) < 0: - sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) + sys.exit( + "Expect one-based column numbers (not zero-based counting), got %r" + % cols_arg + ) identifiers.append((tabular_file, columns)) name_warn = False @@ -145,12 +185,15 @@ parts = name.split(None, 1) global name_warn if not name_warn and len(parts) > 1: - name_warn = "WARNING: Some of your identifiers had white space in them, " + \ - "using first word only. e.g.:\n%s\n" % name + name_warn = ( + "WARNING: Some of your identifiers had white space in them, " + + "using first word only. e.g.:\n%s\n" % name + ) return parts[0] -if drop_suffices: +if drop_suffixes: + def clean_name(name): """Remove suffix.""" name = check_white_space(name) @@ -158,10 +201,11 @@ if match: # Use the fact this is a suffix, and regular expression will be # anchored to the end of the name: - return name[:match.start()] + return name[: match.start()] else: # Nothing to do return name + assert clean_name("foo/1") == "foo" assert clean_name("foo/2") == "foo" assert clean_name("bar.f") == "bar" @@ -174,19 +218,19 @@ mapped_chars = { - '>': '__gt__', - '<': '__lt__', - "'": '__sq__', - '"': '__dq__', - '[': '__ob__', - ']': '__cb__', - '{': '__oc__', - '}': '__cc__', - '@': '__at__', - '\n': '__cn__', - '\r': '__cr__', - '\t': '__tc__', - '#': '__pd__', + ">": "__gt__", + "<": "__lt__", + "'": "__sq__", + '"': "__dq__", + "[": "__ob__", + "]": "__cb__", + "{": "__oc__", + "}": "__cc__", + "@": "__at__", + "\n": "__cn__", + "\r": "__cr__", + "\t": "__tc__", + "#": "__pd__", } # Read tabular file(s) and record all specified identifiers @@ -225,7 +269,10 @@ name = clean_name(line.rstrip("\n").split("\t")[col]) if name: file_ids.add(name) - print("Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns))) + print( + "Using %i IDs from column %s in tabular file" + % (len(file_ids), ", ".join(str(col + 1) for col in columns)) + ) if ids is None: ids = file_ids if logic == "UNION": @@ -235,15 +282,19 @@ handle.close() if len(identifiers) > 1: if logic == "UNION": - print("Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))) + print( + "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) + ) else: - print("Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))) + print( + "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) + ) if name_warn: sys.stderr.write(name_warn) def crude_fasta_iterator(handle): - """Yields tuples, record ID and the full record as a string.""" + """Parse FASTA file yielding tuples of (name, sequence).""" while True: line = handle.readline() if line == "": @@ -254,8 +305,7 @@ no_id_warned = False while True: if line[0] != ">": - raise ValueError( - "Records in Fasta files should start with '>' character") + raise ValueError("Records in Fasta files should start with '>' character") try: id = line[1:].split(None, 1)[0] except IndexError: @@ -320,6 +370,7 @@ def fastq_filter(in_file, pos_file, neg_file, wanted): """FASTQ filter.""" from Bio.SeqIO.QualityIO import FastqGeneralIterator + handle = open(in_file, "r") if pos_file is not None and neg_file is not None: print("Generating two FASTQ files") @@ -378,13 +429,17 @@ out_handle = open(pos_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) # start again after getting manifest - pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted) + pos_count = writer.write_file( + rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted + ) out_handle.close() if neg_file is not None: out_handle = open(neg_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) # start again - neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted) + neg_count = writer.write_file( + rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted + ) out_handle.close() # And we're done in_handle.close() @@ -395,12 +450,16 @@ if seq_format.lower() == "sff": # Now write filtered SFF file based on IDs wanted - pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) + pos_count, neg_count = sff_filter( + in_file, out_positive_file, out_negative_file, ids + ) # At the time of writing, Galaxy doesn't show SFF file read counts, # so it is useful to put them in stdout and thus shown in job info. elif seq_format.lower() == "fasta": # Write filtered FASTA file based on IDs from tabular file - pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) + pos_count, neg_count = fasta_filter( + in_file, out_positive_file, out_negative_file, ids + ) print("%i with and %i without specified IDs" % (pos_count, neg_count)) elif seq_format.lower().startswith("fastq"): # Write filtered FASTQ file based on IDs from tabular file
--- a/tools/seq_filter_by_id/seq_filter_by_id.xml Thu May 11 12:18:52 2017 -0400 +++ b/tools/seq_filter_by_id/seq_filter_by_id.xml Thu Nov 30 09:50:34 2023 +0000 @@ -1,7 +1,7 @@ -<tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.2.7"> +<tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.2.8"> <description>from a tabular file</description> <requirements> - <requirement type="package" version="1.67">biopython</requirement> + <requirement type="package" version="1.81">biopython</requirement> </requirements> <version_command> python $__tool_directory__/seq_filter_by_id.py --version @@ -30,20 +30,20 @@ <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." /> <conditional name="id_opts"> <param name="id_opts_selector" type="select" label="Filter using the ID list from"> - <option value="tabular" selected="True">tabular file</option> + <option value="tabular" selected="true">tabular file</option> <option value="list">provided list</option> <!-- add UNION or INTERSECTION of multiple tabular files here? --> </param> <when value="tabular"> <param name="input_tabular" type="data" format="tabular" label="Tabular file containing sequence identifiers"/> - <param name="columns" type="data_column" data_ref="input_tabular" multiple="True" numerical="False" + <param name="columns" type="data_column" data_ref="input_tabular" multiple="true" numerical="false" label="Column(s) containing sequence identifiers" help="Multi-select list - hold the appropriate key while clicking to select multiple columns"> <validator type="no_options" message="Pick at least one column"/> </param> </when> <when value="list"> - <param name="id_list" type="text" size="20x80" area="True" format="tabular" + <param name="id_list" type="text" size="20x80" area="true" format="tabular" label="List of sequence identifiers (white space separated)" help="You can use both spaces and new lines to separate your identifiers."> <sanitizer> @@ -69,12 +69,12 @@ </conditional> <conditional name="adv_opts"> <param name="adv_opts_selector" type="select" label="Advanced Options"> - <option value="basic" selected="True">Hide Advanced Options</option> + <option value="basic" selected="true">Hide Advanced Options</option> <option value="advanced">Show Advanced Options</option> </param> <when value="basic" /> <when value="advanced"> - <param name="strip_suffix" type="boolean" value="false" label="Remove typical pair read name suffices when matching identifiers?" help="Will remove suffices including Illumina /1 and /2, Roche 454 .f and .r, and assorted Sanger names like .p* and .q*" /> + <param name="strip_suffix" type="boolean" value="false" label="Remove typical pair read name suffixes when matching identifiers?" help="Will remove suffixes including Illumina /1 and /2, Roche 454 .f and .r, and assorted Sanger names like .p* and .q*" /> </when> </conditional> </inputs> @@ -128,7 +128,7 @@ <param name="adv_opts_selector" value="advanced" /> <param name="strip_suffix" value="true" /> <output name="output_pos" file="sanger-pairs-mixed.fastq" ftype="fastq" /> - <output name="output_neg" file="empty_file.dat" ftype="fastq" /> + <output name="output_neg" file="empty_file.dat" ftype="fastq" /> </test> <test> <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" /> @@ -180,14 +180,14 @@ Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). Galaxy tools and workflows for sequence analysis with applications in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +https://doi.org/10.7717/peerj.167 This tool uses Biopython to read and write SFF files, so you may also wish to cite the Biopython application note (and Galaxy too of course): Cock et al (2009). Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This tool is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id
--- a/tools/seq_filter_by_id/tool_dependencies.xml Thu May 11 12:18:52 2017 -0400 +++ b/tools/seq_filter_by_id/tool_dependencies.xml Thu Nov 30 09:50:34 2023 +0000 @@ -1,6 +1,6 @@ -<?xml version="1.0"?> +<?xml version="1.0" ?> <tool_dependency> <package name="biopython" version="1.67"> - <repository changeset_revision="a42f244cce44" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="a12f73c3b116"/> </package> -</tool_dependency> +</tool_dependency> \ No newline at end of file