Mercurial > repos > peterjc > seq_filter_by_mapping

--- a/tools/seq_filter_by_mapping/README.rst	Wed May 17 09:24:01 2017 -0400
+++ b/tools/seq_filter_by_mapping/README.rst	Fri Apr 16 22:22:47 2021 +0000
@@ -71,6 +71,7 @@
         - Use ``<command detect_errors="aggressive">`` (internal change only).
         - Single quote command line arguments (internal change only).
 v0.0.6  - Python 3 compatible print function.
+v0.0.7  - Script works on Python 2 and 3 (fixed input file mode)
 ======= ======================================================================
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py	Wed May 17 09:24:01 2017 -0400
+++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py	Fri Apr 16 22:22:47 2021 +0000
@@ -10,7 +10,7 @@

 Cock et al 2009. Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

 This script is copyright 2014 by Peter Cock, The James Hutton Institute
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
@@ -40,31 +40,57 @@
 Multiple SAM/BAM mapping files may be given.
 """
 parser = OptionParser(usage=usage)
-parser.add_option('-i', '--input', dest='input',
-                  default=None, help='Input sequences filename',
-                  metavar="FILE")
-parser.add_option('-f', '--format', dest='format',
-                  default=None,
-                  help='Input sequence format (e.g. fasta, fastq, sff)')
-parser.add_option('-p', '--positive', dest='output_positive',
-                  default=None,
-                  help='Output filename for mapping reads',
-                  metavar="FILE")
-parser.add_option('-n', '--negative', dest='output_negative',
-                  default=None,
-                  help='Output filename for non-mapping reads',
-                  metavar="FILE")
-parser.add_option("-m", "--pair-mode", dest="pair_mode",
-                  default="lax",
-                  help="How to treat paired reads (lax or strict, default lax)")
-parser.add_option("-v", "--version", dest="version",
-                  default=False, action="store_true",
-                  help="Show version and quit")
+parser.add_option(
+    "-i",
+    "--input",
+    dest="input",
+    default=None,
+    help="Input sequences filename",
+    metavar="FILE",
+)
+parser.add_option(
+    "-f",
+    "--format",
+    dest="format",
+    default=None,
+    help="Input sequence format (e.g. fasta, fastq, sff)",
+)
+parser.add_option(
+    "-p",
+    "--positive",
+    dest="output_positive",
+    default=None,
+    help="Output filename for mapping reads",
+    metavar="FILE",
+)
+parser.add_option(
+    "-n",
+    "--negative",
+    dest="output_negative",
+    default=None,
+    help="Output filename for non-mapping reads",
+    metavar="FILE",
+)
+parser.add_option(
+    "-m",
+    "--pair-mode",
+    dest="pair_mode",
+    default="lax",
+    help="How to treat paired reads (lax or strict, default lax)",
+)
+parser.add_option(
+    "-v",
+    "--version",
+    dest="version",
+    default=False,
+    action="store_true",
+    help="Show version and quit",
+)

 options, args = parser.parse_args()

 if options.version:
-    print("v0.0.6")
+    print("v0.0.7")
     sys.exit(0)

 in_file = options.input
@@ -114,7 +140,7 @@
     if match:
         # Use the fact this is a suffix, and regular expression will be
         # anchored to the end of the name:
-        return name[:match.start()]
+        return name[: match.start()]
     else:
         # Nothing to do
         return name
@@ -128,19 +154,19 @@
 assert clean_name("baz.q2") == "baz"

 mapped_chars = {
-    '>': '__gt__',
-    '<': '__lt__',
-    "'": '__sq__',
-    '"': '__dq__',
-    '[': '__ob__',
-    ']': '__cb__',
-    '{': '__oc__',
-    '}': '__cc__',
-    '@': '__at__',
-    '\n': '__cn__',
-    '\r': '__cr__',
-    '\t': '__tc__',
-    '#': '__pd__',
+    ">": "__gt__",
+    "<": "__lt__",
+    "'": "__sq__",
+    '"': "__dq__",
+    "[": "__ob__",
+    "]": "__cb__",
+    "{": "__oc__",
+    "}": "__cc__",
+    "@": "__at__",
+    "\n": "__cn__",
+    "\r": "__cr__",
+    "\t": "__tc__",
+    "#": "__pd__",
 }


@@ -149,21 +175,25 @@

     Parses BAM files via call out to samtools view command.
     """
-    handle = open(filename, "rb")
-    magic = handle.read(4)
+    with open(filename, "rb") as handle:
+        magic = handle.read(4)
+
     if magic == b"\x1f\x8b\x08\x04":
         # Presumably a BAM file then...
-        handle.close()
         # Call samtools view, don't need header so no -h added:
-        child = subprocess.Popen(["samtools", "view", filename],
-                                 stdin=None,
-                                 stdout=subprocess.PIPE,
-                                 stderr=subprocess.PIPE)
+        child = subprocess.Popen(
+            ["samtools", "view", filename],
+            universal_newlines=True,
+            stdin=None,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
         handle = child.stdout
     else:
         # Presumably a SAM file...
         child = None
-        handle.seek(0)
+        # Open in text mode
+        handle = open(filename)
     # Handle should now contain SAM records
     for line in handle:
         # Ignore header lines
@@ -178,7 +208,7 @@
                     ids.add(qname)
             elif pair_mode == "strict":
                 # For paired reads, require BOTH be mapped.
-                if (flag & 0x4):
+                if flag & 0x4:
                     # This is unmapped, ignore it
                     pass
                 elif not (flag & 0x1):
@@ -192,8 +222,11 @@
         stdout, stderr = child.communicate()
         assert child.returncode is not None
         if child.returncode:
-            msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode,
-                                                            filename, stderr)
+            msg = "Error %i from 'samtools view %s'\n%s" % (
+                child.returncode,
+                filename,
+                stderr,
+            )
             sys.exit(msg.strip(), child.returncode)
     else:
         handle.close()
@@ -211,7 +244,7 @@


 def crude_fasta_iterator(handle):
-    """Yields tuples, record ID and the full record as a string."""
+    """Yield tuples, record ID and the full record as a string."""
     while True:
         line = handle.readline()
         if line == "":
@@ -222,8 +255,7 @@
     no_id_warned = False
     while True:
         if line[0] != ">":
-            raise ValueError(
-                "Records in Fasta files should start with '>' character")
+            raise ValueError("Records in Fasta files should start with '>' character")
         try:
             id = line[1:].split(None, 1)[0]
         except IndexError:
@@ -288,6 +320,7 @@
 def fastq_filter(in_file, pos_file, neg_file, wanted):
     """FASTQ filter."""
     from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
     pos_count = neg_count = 0
     handle = open(in_file, "r")
     if pos_file is not None and neg_file is not None:
@@ -355,13 +388,17 @@
         out_handle = open(pos_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
         in_handle.seek(0)  # start again after getting manifest
-        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
+        pos_count = writer.write_file(
+            rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted
+        )
         out_handle.close()
     if neg_file is not None:
         out_handle = open(neg_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
         in_handle.seek(0)  # start again
-        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
+        neg_count = writer.write_file(
+            rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted
+        )
         out_handle.close()
     # And we're done
     in_handle.close()
@@ -377,7 +414,9 @@
 else:
     sys.exit("Unsupported file type %r" % seq_format)

-pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids)
+pos_count, neg_count = sequence_filter(
+    in_file, out_positive_file, out_negative_file, ids
+)
 print("%i mapped and %i unmapped reads." % (pos_count, neg_count))
 fraction = float(pos_count) * 100.0 / float(pos_count + neg_count)
 print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction))
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml	Wed May 17 09:24:01 2017 -0400
+++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml	Fri Apr 16 22:22:47 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.6">
+<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.7">
     <description>from SAM/BAM file</description>
     <requirements>
         <requirement type="package" version="1.67">biopython</requirement>
@@ -21,7 +21,7 @@
     </command>
     <inputs>
         <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." />
-	<param name="mapping_file" type="data" format="sam,bam" multiple="true" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." />
+        <param name="mapping_file" type="data" format="sam,bam" multiple="true" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." />
         <conditional name="output_choice_cond">
             <param name="output_choice" type="select" label="Output mapped reads, unmapped reads, or both?">
                 <option value="both">Both mapped and unmapped reads, as two files</option>
@@ -33,12 +33,12 @@
             <when value="pos" />
             <when value="neg" />
         </conditional>
-	<param name="pair_mode" type="select" label="Paired read treatment">
+        <param name="pair_mode" type="select" label="Paired read treatment">
             <option value="lax" selected="true">Treat as a pair, allow either read to be mapped</option>
-	    <option value="strict">Treat as a pair, require both reads to be mapped</option>
-	    <!-- The following would actually be more work as have to store qname/1 and qname/2 separately for filter...
+            <option value="strict">Treat as a pair, require both reads to be mapped</option>
+            <!-- The following would actually be more work as have to store qname/1 and qname/2 separately for filter...
             <option value="solo">Treat independently (will split partners when only one maps)</option>
-	    -->
+            -->
         </param>
     </inputs>
     <outputs>
@@ -50,19 +50,19 @@
         </data>
     </outputs>
     <tests>
-	<test>
+        <test>
             <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" />
             <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" />
             <param name="pair_mode" value="lax" />
             <param name="output_choice" value="pos" />
             <output name="output_pos" file="SRR639755_sample_lax.fastq" ftype="fastqsanger" />
         </test>
-	<test>
+        <test>
             <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" />
             <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" />
             <param name="pair_mode" value="strict" />
             <param name="output_choice" value="pos" />
-            <output name="output_pos" file="SRR639755_sample_strict.fastq" ftype="fastqsanger" />
+            <output name="output_pos" file="SRR639755_sample_strict.fastq" ftype="fastqsanger" />
         </test>
     </tests>
     <help>
@@ -97,7 +97,7 @@

 Cock et al (2009). Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

 This tool is available to install into other Galaxy Instances via the Galaxy
 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping
--- a/tools/seq_filter_by_mapping/tool_dependencies.xml	Wed May 17 09:24:01 2017 -0400
+++ b/tools/seq_filter_by_mapping/tool_dependencies.xml	Fri Apr 16 22:22:47 2021 +0000
@@ -1,9 +1,9 @@
-<?xml version="1.0"?>
+<?xml version="1.0" ?>
 <tool_dependency>
     <package name="biopython" version="1.67">
-        <repository changeset_revision="a12f73c3b116" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+        <repository name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="a12f73c3b116"/>
     </package>
     <package name="samtools" version="0.1.19">
-        <repository changeset_revision="c9bd782f5342" name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+        <repository name="package_samtools_0_1_19" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="c9bd782f5342"/>
     </package>
-</tool_dependency>
+</tool_dependency>
\ No newline at end of file