Mercurial > repos > peterjc > seq_filter_by_id

--- a/tools/seq_filter_by_id/README.rst	Thu May 11 12:18:52 2017 -0400
+++ b/tools/seq_filter_by_id/README.rst	Thu Nov 30 09:50:34 2023 +0000
@@ -1,7 +1,7 @@
 Galaxy tool to filter FASTA, FASTQ or SFF sequences by ID
 =========================================================

-This tool is copyright 2010-2017 by Peter Cock, The James Hutton Institute
+This tool is copyright 2010-2023 by Peter Cock, The James Hutton Institute
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
 See the licence text below.

@@ -76,7 +76,7 @@
 v0.0.8  - Simplified XML to apply input format to output data.
 v0.2.0  - Can supply ID list as a text parameter (instead of in a file)
         - Using ``optparse`` for the Python command line API.
-        - Advanced option to ignore paired read suffices.
+        - Advanced option to ignore paired read suffixes.
         - Updated dependencies to use Biopython 1.64.
 v0.2.1  - Use Biopython instead of Galaxy for FASTQ handling.
         - Tool definition now embeds citation information.
@@ -95,6 +95,7 @@
 v0.2.7  - Python 3 compatible print function.
         - Use ``<command detect_errors="aggressive">`` (internal change only).
         - Single quote command line arguments (internal change only).
+v0.2.8  - Bumped Biopython dependency version for Python 3 fixes.
 ======= ======================================================================


@@ -124,7 +125,7 @@

     $ planemo shed_upload --tar_only tools/seq_filter_by_id/
     ...
-    $ tar -tzf shed_upload.tar.gz
+    $ tar -tzf shed_upload.tar.gz
     test-data/empty_file.dat
     test-data/k12_hypothetical.fasta
     test-data/k12_hypothetical.tabular
--- a/tools/seq_filter_by_id/seq_filter_by_id.py	Thu May 11 12:18:52 2017 -0400
+++ b/tools/seq_filter_by_id/seq_filter_by_id.py	Thu Nov 30 09:50:34 2023 +0000
@@ -19,7 +19,7 @@

 Cock et al 2009. Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
@@ -49,31 +49,66 @@
 the -t or --text option.
 """
 parser = OptionParser(usage=usage)
-parser.add_option('-i', '--input', dest='input',
-                  default=None, help='Input sequences filename',
-                  metavar="FILE")
-parser.add_option('-f', '--format', dest='format',
-                  default=None,
-                  help='Input sequence format (e.g. fasta, fastq, sff)')
-parser.add_option('-t', '--text', dest='id_list',
-                  default=None, help="Lists of white space separated IDs (instead of a tabular file)")
-parser.add_option('-p', '--positive', dest='output_positive',
-                  default=None,
-                  help='Output filename for matches',
-                  metavar="FILE")
-parser.add_option('-n', '--negative', dest='output_negative',
-                  default=None,
-                  help='Output filename for non-matches',
-                  metavar="FILE")
-parser.add_option("-l", "--logic", dest="logic",
-                  default="UNION",
-                  help="How to combined multiple ID columns (UNION or INTERSECTION)")
-parser.add_option("-s", "--suffix", dest="suffix",
-                  action="store_true",
-                  help="Ignore pair-read suffices for matching names")
-parser.add_option("-v", "--version", dest="version",
-                  default=False, action="store_true",
-                  help="Show version and quit")
+parser.add_option(
+    "-i",
+    "--input",
+    dest="input",
+    default=None,
+    help="Input sequences filename",
+    metavar="FILE",
+)
+parser.add_option(
+    "-f",
+    "--format",
+    dest="format",
+    default=None,
+    help="Input sequence format (e.g. fasta, fastq, sff)",
+)
+parser.add_option(
+    "-t",
+    "--text",
+    dest="id_list",
+    default=None,
+    help="Lists of white space separated IDs (instead of a tabular file)",
+)
+parser.add_option(
+    "-p",
+    "--positive",
+    dest="output_positive",
+    default=None,
+    help="Output filename for matches",
+    metavar="FILE",
+)
+parser.add_option(
+    "-n",
+    "--negative",
+    dest="output_negative",
+    default=None,
+    help="Output filename for non-matches",
+    metavar="FILE",
+)
+parser.add_option(
+    "-l",
+    "--logic",
+    dest="logic",
+    default="UNION",
+    help="How to combined multiple ID columns (UNION or INTERSECTION)",
+)
+parser.add_option(
+    "-s",
+    "--suffix",
+    dest="suffix",
+    action="store_true",
+    help="Ignore pair-read suffixes for matching names",
+)
+parser.add_option(
+    "-v",
+    "--version",
+    dest="version",
+    default=False,
+    action="store_true",
+    help="Show version and quit",
+)

 options, args = parser.parse_args()

@@ -86,7 +121,7 @@
 out_positive_file = options.output_positive
 out_negative_file = options.output_negative
 logic = options.logic
-drop_suffices = bool(options.suffix)
+drop_suffixes = bool(options.suffix)

 if in_file is None or not os.path.isfile(in_file):
     sys.exit("Missing input file: %r" % in_file)
@@ -132,9 +167,14 @@
     try:
         columns = [int(arg) - 1 for arg in cols_arg.split(",")]
     except ValueError:
-        sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg)
+        sys.exit(
+            "Expected list of columns (comma separated integers), got %r" % cols_arg
+        )
     if min(columns) < 0:
-        sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
+        sys.exit(
+            "Expect one-based column numbers (not zero-based counting), got %r"
+            % cols_arg
+        )
     identifiers.append((tabular_file, columns))

 name_warn = False
@@ -145,12 +185,15 @@
     parts = name.split(None, 1)
     global name_warn
     if not name_warn and len(parts) > 1:
-        name_warn = "WARNING: Some of your identifiers had white space in them, " + \
-                    "using first word only. e.g.:\n%s\n" % name
+        name_warn = (
+            "WARNING: Some of your identifiers had white space in them, "
+            + "using first word only. e.g.:\n%s\n" % name
+        )
     return parts[0]


-if drop_suffices:
+if drop_suffixes:
+
     def clean_name(name):
         """Remove suffix."""
         name = check_white_space(name)
@@ -158,10 +201,11 @@
         if match:
             # Use the fact this is a suffix, and regular expression will be
             # anchored to the end of the name:
-            return name[:match.start()]
+            return name[: match.start()]
         else:
             # Nothing to do
             return name
+
     assert clean_name("foo/1") == "foo"
     assert clean_name("foo/2") == "foo"
     assert clean_name("bar.f") == "bar"
@@ -174,19 +218,19 @@


 mapped_chars = {
-    '>': '__gt__',
-    '<': '__lt__',
-    "'": '__sq__',
-    '"': '__dq__',
-    '[': '__ob__',
-    ']': '__cb__',
-    '{': '__oc__',
-    '}': '__cc__',
-    '@': '__at__',
-    '\n': '__cn__',
-    '\r': '__cr__',
-    '\t': '__tc__',
-    '#': '__pd__',
+    ">": "__gt__",
+    "<": "__lt__",
+    "'": "__sq__",
+    '"': "__dq__",
+    "[": "__ob__",
+    "]": "__cb__",
+    "{": "__oc__",
+    "}": "__cc__",
+    "@": "__at__",
+    "\n": "__cn__",
+    "\r": "__cr__",
+    "\t": "__tc__",
+    "#": "__pd__",
 }

 # Read tabular file(s) and record all specified identifiers
@@ -225,7 +269,10 @@
                 name = clean_name(line.rstrip("\n").split("\t")[col])
                 if name:
                     file_ids.add(name)
-    print("Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)))
+    print(
+        "Using %i IDs from column %s in tabular file"
+        % (len(file_ids), ", ".join(str(col + 1) for col in columns))
+    )
     if ids is None:
         ids = file_ids
     if logic == "UNION":
@@ -235,15 +282,19 @@
     handle.close()
 if len(identifiers) > 1:
     if logic == "UNION":
-        print("Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)))
+        print(
+            "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))
+        )
     else:
-        print("Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)))
+        print(
+            "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))
+        )
 if name_warn:
     sys.stderr.write(name_warn)


 def crude_fasta_iterator(handle):
-    """Yields tuples, record ID and the full record as a string."""
+    """Parse FASTA file yielding tuples of (record ID, full record string)."""
     while True:
         line = handle.readline()
         if line == "":
@@ -254,8 +305,7 @@
     no_id_warned = False
     while True:
         if line[0] != ">":
-            raise ValueError(
-                "Records in Fasta files should start with '>' character")
+            raise ValueError("Records in Fasta files should start with '>' character")
         try:
             id = line[1:].split(None, 1)[0]
         except IndexError:
@@ -320,6 +370,7 @@
 def fastq_filter(in_file, pos_file, neg_file, wanted):
     """FASTQ filter."""
     from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
     handle = open(in_file, "r")
     if pos_file is not None and neg_file is not None:
         print("Generating two FASTQ files")
@@ -378,13 +429,17 @@
         out_handle = open(pos_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
         in_handle.seek(0)  # start again after getting manifest
-        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
+        pos_count = writer.write_file(
+            rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted
+        )
         out_handle.close()
     if neg_file is not None:
         out_handle = open(neg_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
         in_handle.seek(0)  # start again
-        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
+        neg_count = writer.write_file(
+            rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted
+        )
         out_handle.close()
     # And we're done
     in_handle.close()
@@ -395,12 +450,16 @@

 if seq_format.lower() == "sff":
     # Now write filtered SFF file based on IDs wanted
-    pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
+    pos_count, neg_count = sff_filter(
+        in_file, out_positive_file, out_negative_file, ids
+    )
     # At the time of writing, Galaxy doesn't show SFF file read counts,
     # so it is useful to put them in stdout and thus shown in job info.
 elif seq_format.lower() == "fasta":
     # Write filtered FASTA file based on IDs from tabular file
-    pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
+    pos_count, neg_count = fasta_filter(
+        in_file, out_positive_file, out_negative_file, ids
+    )
     print("%i with and %i without specified IDs" % (pos_count, neg_count))
 elif seq_format.lower().startswith("fastq"):
     # Write filtered FASTQ file based on IDs from tabular file
--- a/tools/seq_filter_by_id/seq_filter_by_id.xml	Thu May 11 12:18:52 2017 -0400
+++ b/tools/seq_filter_by_id/seq_filter_by_id.xml	Thu Nov 30 09:50:34 2023 +0000
@@ -1,7 +1,7 @@
-<tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.2.7">
+<tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.2.8">
     <description>from a tabular file</description>
     <requirements>
-        <requirement type="package" version="1.67">biopython</requirement>
+        <requirement type="package" version="1.81">biopython</requirement>
     </requirements>
     <version_command>
 python $__tool_directory__/seq_filter_by_id.py --version
@@ -30,20 +30,20 @@
         <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." />
         <conditional name="id_opts">
             <param name="id_opts_selector" type="select" label="Filter using the ID list from">
-                <option value="tabular" selected="True">tabular file</option>
+                <option value="tabular" selected="true">tabular file</option>
                 <option value="list">provided list</option>
                 <!-- add UNION or INTERSECTION of multiple tabular files here? -->
             </param>
             <when value="tabular">
                 <param name="input_tabular" type="data" format="tabular" label="Tabular file containing sequence identifiers"/>
-                <param name="columns" type="data_column" data_ref="input_tabular" multiple="True" numerical="False"
+                <param name="columns" type="data_column" data_ref="input_tabular" multiple="true" numerical="false"
                        label="Column(s) containing sequence identifiers"
                        help="Multi-select list - hold the appropriate key while clicking to select multiple columns">
                     <validator type="no_options" message="Pick at least one column"/>
                 </param>
             </when>
             <when value="list">
-                <param name="id_list" type="text" size="20x80" area="True" format="tabular"
+                <param name="id_list" type="text" size="20x80" area="true" format="tabular"
                        label="List of sequence identifiers (white space separated)"
                        help="You can use both spaces and new lines to separate your identifiers.">
                     <sanitizer>
@@ -69,12 +69,12 @@
         </conditional>
         <conditional name="adv_opts">
             <param name="adv_opts_selector" type="select" label="Advanced Options">
-              <option value="basic" selected="True">Hide Advanced Options</option>
+              <option value="basic" selected="true">Hide Advanced Options</option>
               <option value="advanced">Show Advanced Options</option>
             </param>
             <when value="basic" />
             <when value="advanced">
-                <param name="strip_suffix" type="boolean" value="false" label="Remove typical pair read name suffices when matching identifiers?" help="Will remove suffices including Illumina /1 and /2, Roche 454 .f and .r, and assorted Sanger names like .p* and .q*" />
+                <param name="strip_suffix" type="boolean" value="false" label="Remove typical pair read name suffixes when matching identifiers?" help="Will remove suffixes including Illumina /1 and /2, Roche 454 .f and .r, and assorted Sanger names like .p* and .q*" />
             </when>
         </conditional>
     </inputs>
@@ -128,7 +128,7 @@
             <param name="adv_opts_selector" value="advanced" />
             <param name="strip_suffix" value="true" />
             <output name="output_pos" file="sanger-pairs-mixed.fastq" ftype="fastq" />
-	    <output name="output_neg" file="empty_file.dat" ftype="fastq" />
+            <output name="output_neg" file="empty_file.dat" ftype="fastq" />
         </test>
         <test>
             <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" />
@@ -180,14 +180,14 @@
 Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
 Galaxy tools and workflows for sequence analysis with applications
 in molecular plant pathology. PeerJ 1:e167
-http://dx.doi.org/10.7717/peerj.167
+https://doi.org/10.7717/peerj.167

 This tool uses Biopython to read and write SFF files, so you may also wish to
 cite the Biopython application note (and Galaxy too of course):

 Cock et al (2009). Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

 This tool is available to install into other Galaxy Instances via the Galaxy
 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id
--- a/tools/seq_filter_by_id/tool_dependencies.xml	Thu May 11 12:18:52 2017 -0400
+++ b/tools/seq_filter_by_id/tool_dependencies.xml	Thu Nov 30 09:50:34 2023 +0000
@@ -1,6 +1,6 @@
-<?xml version="1.0"?>
+<?xml version="1.0" ?>
 <tool_dependency>
     <package name="biopython" version="1.67">
-        <repository changeset_revision="a42f244cce44" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+        <repository name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="a12f73c3b116"/>
     </package>
-</tool_dependency>
+</tool_dependency>
\ No newline at end of file