Mercurial > repos > peterjc > seq_filter_by_mapping

--- a/tools/seq_filter_by_mapping/README.rst	Wed May 13 11:08:58 2015 -0400
+++ b/tools/seq_filter_by_mapping/README.rst	Wed May 10 13:16:44 2017 -0400
@@ -1,7 +1,7 @@
 Galaxy tool to filter FASTA, FASTQ or SFF sequences by SAM/BAM mapping
 ======================================================================

-This tool is copyright 2014-2015 by Peter Cock, The James Hutton Institute
+This tool is copyright 2014-2017 by Peter Cock, The James Hutton Institute
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
 See the licence text below.

@@ -66,6 +66,10 @@
 v0.0.4  - Use the ``format_source=...`` tag.
         - Reorder XML elements (internal change only).
         - Planemo for Tool Shed upload (``.shed.yml``, internal change only).
+v0.0.5  - Python script cleanups (internal change only).
+        - Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
+        - Use ``<command detect_errors="aggressive">`` (internal change only).
+        - Single quote command line arguments (internal change only).
 ======= ======================================================================


@@ -82,17 +86,17 @@
 Planemo commands (which requires you have set your Tool Shed access details in
 ``~/.planemo.yml`` and that you have access rights on the Tool Shed)::

-    $ planemo shed_upload --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/
+    $ planemo shed_update -t testtoolshed --check_diff tools/seq_filter_by_mapping/
     ...

 or::

-    $ planemo shed_upload --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/
+    $ planemo shed_update -t toolshed --check_diff tools/seq_filter_by_mapping/
     ...

 To just build and check the tar ball, use::

-    $ planemo shed_upload --tar_only  ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/
+    $ planemo shed_upload --tar_only tools/seq_filter_by_mapping/
     ...
     $ tar -tzf shed_upload.tar.gz
     test-data/SRR639755_mito_pairs.fastq.gz
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py	Wed May 13 11:08:58 2015 -0400
+++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py	Wed May 10 13:16:44 2017 -0400
@@ -18,17 +18,15 @@

 Use -v or --version to get the version, -h or --help for help.
 """
+
 import os
-import sys
 import re
 import subprocess
+import sys
+
 from optparse import OptionParser

-def sys_exit(msg, err=1):
-    sys.stderr.write(msg.rstrip() + "\n")
-    sys.exit(err)
-
-#Parse Command Line
+# Parse Command Line
 usage = """Use as follows:

 $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings]
@@ -64,7 +62,7 @@
 options, args = parser.parse_args()

 if options.version:
-    print "v0.0.3"
+    print "v0.0.5"
     sys.exit(0)

 in_file = options.input
@@ -74,27 +72,27 @@
 pair_mode = options.pair_mode

 if in_file is None or not os.path.isfile(in_file):
-    sys_exit("Missing input file: %r" % in_file)
+    sys.exit("Missing input file: %r" % in_file)
 if out_positive_file is None and out_negative_file is None:
-    sys_exit("Neither output file requested")
+    sys.exit("Neither output file requested")
 if seq_format is None:
-    sys_exit("Missing sequence format")
+    sys.exit("Missing sequence format")
 if pair_mode not in ["lax", "strict"]:
-    sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode)
+    sys.exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode)
 for mapping in args:
     if not os.path.isfile(mapping):
-        sys_exit("Mapping file %r not found" % mapping)
+        sys.exit("Mapping file %r not found" % mapping)
 if not args:
-    sys_exit("At least one SAM/BAM mapping file is required")
+    sys.exit("At least one SAM/BAM mapping file is required")


-#Cope with three widely used suffix naming convensions,
-#Illumina: /1 or /2
-#Forward/revered: .f or .r
-#Sanger, e.g. .p1k and .q1k
-#See http://staden.sourceforge.net/manual/pregap4_unix_50.html
-#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
-#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
+# Cope with three widely used suffix naming convensions,
+# Illumina: /1 or /2
+# Forward/revered: .f or .r
+# Sanger, e.g. .p1k and .q1k
+# See http://staden.sourceforge.net/manual/pregap4_unix_50.html
+# re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
+# re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
 assert re_suffix.search("demo.f")
 assert re_suffix.search("demo.s1")
@@ -107,6 +105,7 @@
 assert re_suffix.search("demo.q1")
 assert re_suffix.search("demo.q1lk")

+
 def clean_name(name):
     """Remove suffix."""
     match = re_suffix.search(name)
@@ -117,6 +116,8 @@
     else:
         # Nothing to do
         return name
+
+
 assert clean_name("foo/1") == "foo"
 assert clean_name("foo/2") == "foo"
 assert clean_name("bar.f") == "bar"
@@ -124,20 +125,22 @@
 assert clean_name("baz.p1") == "baz"
 assert clean_name("baz.q2") == "baz"

-mapped_chars = { '>' :'__gt__',
-                 '<' :'__lt__',
-                 "'" :'__sq__',
-                 '"' :'__dq__',
-                 '[' :'__ob__',
-                 ']' :'__cb__',
-                 '{' :'__oc__',
-                 '}' :'__cc__',
-                 '@' : '__at__',
-                 '\n' : '__cn__',
-                 '\r' : '__cr__',
-                 '\t' : '__tc__',
-                 '#' : '__pd__'
-                 }
+mapped_chars = {
+    '>': '__gt__',
+    '<': '__lt__',
+    "'": '__sq__',
+    '"': '__dq__',
+    '[': '__ob__',
+    ']': '__cb__',
+    '{': '__oc__',
+    '}': '__cc__',
+    '@': '__at__',
+    '\n': '__cn__',
+    '\r': '__cr__',
+    '\t': '__tc__',
+    '#': '__pd__',
+}
+

 def load_mapping_ids(filename, pair_mode, ids):
     """Parse SAM/BAM file, updating given set of ids.
@@ -189,7 +192,7 @@
         if child.returncode:
             msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode,
                                                             filename, stderr)
-            sys_exit(msg.strip(), child.returncode)
+            sys.exit(msg.strip(), child.returncode)
     else:
         handle.close()

@@ -204,12 +207,13 @@
 if len(ids) < 10:
     print("Looking for %s" % ", ".join(sorted(ids)))

+
 def crude_fasta_iterator(handle):
     """Yields tuples, record ID and the full record as a string."""
     while True:
         line = handle.readline()
         if line == "":
-            return # Premature end of file, or just empty?
+            return  # Premature end of file, or just empty?
         if line[0] == ">":
             break

@@ -236,16 +240,16 @@
             line = handle.readline()
         yield id, "".join(lines)
         if not line:
-            return # StopIteration
+            return  # StopIteration


 def fasta_filter(in_file, pos_file, neg_file, wanted):
     """FASTA filter producing 60 character line wrapped outout."""
     pos_count = neg_count = 0
-    #Galaxy now requires Python 2.5+ so can use with statements,
+    # Galaxy now requires Python 2.5+ so can use with statements,
     with open(in_file) as in_handle:
-        #Doing the if statement outside the loop for speed
-        #(with the downside of three very similar loops).
+        # Doing the if statement outside the loop for speed
+        # (with the downside of three very similar loops).
         if pos_file is not None and neg_file is not None:
             print "Generating two FASTA files"
             with open(pos_file, "w") as pos_handle:
@@ -284,14 +288,14 @@
     from Bio.SeqIO.QualityIO import FastqGeneralIterator
     pos_count = neg_count = 0
     handle = open(in_file, "r")
-    if out_positive_file is not None and out_negative_file is not None:
+    if pos_file is not None and neg_file is not None:
         print "Generating two FASTQ files"
-        positive_handle = open(out_positive_file, "w")
-        negative_handle = open(out_negative_file, "w")
+        positive_handle = open(pos_file, "w")
+        negative_handle = open(neg_file, "w")
         print in_file
         for title, seq, qual in FastqGeneralIterator(handle):
             # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
-            if clean_name(title.split(None, 1)[0]) in ids:
+            if clean_name(title.split(None, 1)[0]) in wanted:
                 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                 pos_count += 1
             else:
@@ -299,21 +303,21 @@
                 neg_count += 1
         positive_handle.close()
         negative_handle.close()
-    elif out_positive_file is not None:
+    elif pos_file is not None:
         print "Generating matching FASTQ file"
-        positive_handle = open(out_positive_file, "w")
+        positive_handle = open(pos_file, "w")
         for title, seq, qual in FastqGeneralIterator(handle):
-            if clean_name(title.split(None, 1)[0]) in ids:
+            if clean_name(title.split(None, 1)[0]) in wanted:
                 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                 pos_count += 1
             else:
                 neg_count += 1
         positive_handle.close()
-    elif out_negative_file is not None:
+    elif neg_file is not None:
         print "Generating non-matching FASTQ file"
-        negative_handle = open(out_negative_file, "w")
+        negative_handle = open(neg_file, "w")
         for title, seq, qual in FastqGeneralIterator(handle):
-            if clean_name(title.split(None, 1)[0]) in ids:
+            if clean_name(title.split(None, 1)[0]) in wanted:
                 pos_count += 1
             else:
                 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
@@ -328,48 +332,48 @@
     try:
         from Bio.SeqIO.SffIO import SffIterator, SffWriter
     except ImportError:
-        sys_exit("SFF filtering requires Biopython 1.54 or later")
+        sys.exit("SFF filtering requires Biopython 1.54 or later")

     try:
         from Bio.SeqIO.SffIO import ReadRocheXmlManifest
     except ImportError:
-        #Prior to Biopython 1.56 this was a private function
+        # Prior to Biopython 1.56 this was a private function
         from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

-    in_handle = open(in_file, "rb") #must be binary mode!
+    in_handle = open(in_file, "rb")  # must be binary mode!
     try:
         manifest = ReadRocheXmlManifest(in_handle)
     except ValueError:
         manifest = None

-    #This makes two passes though the SFF file with isn't so efficient,
-    #but this makes the code simple.
+    # This makes two passes though the SFF file with isn't so efficient,
+    # but this makes the code simple.
     pos_count = neg_count = 0
-    if out_positive_file is not None:
-        out_handle = open(out_positive_file, "wb")
+    if pos_file is not None:
+        out_handle = open(pos_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
-        in_handle.seek(0) #start again after getting manifest
-        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
+        in_handle.seek(0)  # start again after getting manifest
+        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
         out_handle.close()
-    if out_negative_file is not None:
-        out_handle = open(out_negative_file, "wb")
+    if neg_file is not None:
+        out_handle = open(neg_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
-        in_handle.seek(0) #start again
-        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
+        in_handle.seek(0)  # start again
+        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
         out_handle.close()
-    #And we're done
+    # And we're done
     in_handle.close()
     return pos_count, neg_count


-if seq_format.lower()=="sff":
+if seq_format.lower() == "sff":
     sequence_filter = sff_filter
-elif seq_format.lower()=="fasta":
+elif seq_format.lower() == "fasta":
     sequence_filter = fasta_filter
 elif seq_format.lower().startswith("fastq"):
     sequence_filter = fastq_filter
 else:
-    sys_exit("Unsupported file type %r" % seq_format)
+    sys.exit("Unsupported file type %r" % seq_format)

 pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids)
 print("%i mapped and %i unmapped reads." % (pos_count, neg_count))
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml	Wed May 13 11:08:58 2015 -0400
+++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml	Wed May 10 13:16:44 2017 -0400
@@ -1,28 +1,23 @@
-<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.4">
+<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.5">
     <description>from SAM/BAM file</description>
     <requirements>
-        <requirement type="package" version="1.64">biopython</requirement>
-        <requirement type="python-module">Bio</requirement>
-        <requirement type="binary">samtools</requirement>
+        <requirement type="package" version="1.67">biopython</requirement>
         <requirement type="package" version="0.1.19">samtools</requirement>
     </requirements>
-    <stdio>
-        <!-- Anything other than zero is an error -->
-        <exit_code range="1:" />
-        <exit_code range=":-1" />
-    </stdio>
-    <version_command interpreter="python">seq_filter_by_mapping.py --version</version_command>
-    <command interpreter="python">
-seq_filter_by_mapping.py -i "$input_file" -f "$input_file.ext" -m $pair_mode
+    <version_command>
+python $__tool_directory__/seq_filter_by_mapping.py --version
+    </version_command>
+    <command detect_errors="aggressive">
+python $__tool_directory__/seq_filter_by_mapping.py -i '$input_file' -f '$input_file.ext' -m $pair_mode
 #if $output_choice_cond.output_choice=="both"
- -p $output_pos -n $output_neg
+ -p '$output_pos' -n '$output_neg'
 #elif $output_choice_cond.output_choice=="pos"
- -p $output_pos
+ -p '$output_pos'
 #elif $output_choice_cond.output_choice=="neg"
- -n $output_neg
+ -n '$output_neg'
 #end if
 ## Now loop over all the mapping files
-#for i in $mapping_file#${i} #end for#
+#for i in $mapping_file#'${i}' #end for#
     </command>
     <inputs>
         <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." />
--- a/tools/seq_filter_by_mapping/tool_dependencies.xml	Wed May 13 11:08:58 2015 -0400
+++ b/tools/seq_filter_by_mapping/tool_dependencies.xml	Wed May 10 13:16:44 2017 -0400
@@ -1,9 +1,9 @@
 <?xml version="1.0"?>
 <tool_dependency>
-    <package name="biopython" version="1.64">
-        <repository changeset_revision="5477a05cc158" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+    <package name="biopython" version="1.67">
+        <repository changeset_revision="a42f244cce44" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
     <package name="samtools" version="0.1.19">
-        <repository changeset_revision="96aab723499f" name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="c9bd782f5342" name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>