changeset 1:84e483325b04 draft

"make_nr v0.0.2"
author peterjc
date Thu, 18 Mar 2021 12:48:57 +0000
parents c84f12187af9
children 0c71cd1cd99a
files test-data/empty.fasta tools/make_nr/README.rst tools/make_nr/make_nr.py tools/make_nr/make_nr.xml tools/make_nr/tool_dependencies.xml
diffstat 4 files changed, 57 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/tools/make_nr/README.rst	Fri Nov 09 11:00:03 2018 -0500
+++ b/tools/make_nr/README.rst	Thu Mar 18 12:48:57 2021 +0000
@@ -83,7 +83,9 @@
 ======= ======================================================================
 Version Changes
 ------- ----------------------------------------------------------------------
-v0.0.0  - Initial version
+v0.0.2  - Fixed bug writing files when there were no duplicates
+v0.0.1  - Added option to sort merged IDs, and support for gzipped files
+v0.0.0  - Initial version (not published to main Galaxy Tool Shed)
 ======= ======================================================================
 
 
--- a/tools/make_nr/make_nr.py	Fri Nov 09 11:00:03 2018 -0500
+++ b/tools/make_nr/make_nr.py	Thu Mar 18 12:48:57 2021 +0000
@@ -12,13 +12,14 @@
 
 import gzip
 import os
+import shutil
 import sys
 
 from optparse import OptionParser
 
 
 if "-v" in sys.argv or "--version" in sys.argv:
-    print("v0.0.1")
+    print("v0.0.2")
     sys.exit(0)
 
 
@@ -45,17 +46,30 @@
 """
 
 parser = OptionParser(usage=usage)
-parser.add_option("-s", "--sep", dest="sep",
-                  default=";",
-                  help="Separator character for combining identifiers "
-                  "of duplicated records e.g. '|' or ';' (required)")
-parser.add_option("-a", "--alphasort", action="store_true",
-                  help="When merging duplicated records sort their "
-                  "identifiers alphabetically before combining them. "
-                  "Default is input file order.")
-parser.add_option("-o", "--output", dest="output",
-                  default="/dev/stdout", metavar="FILE",
-                  help="Output filename (defaults to stdout)")
+parser.add_option(
+    "-s",
+    "--sep",
+    dest="sep",
+    default=";",
+    help="Separator character for combining identifiers "
+    "of duplicated records e.g. '|' or ';' (required)",
+)
+parser.add_option(
+    "-a",
+    "--alphasort",
+    action="store_true",
+    help="When merging duplicated records sort their "
+    "identifiers alphabetically before combining them. "
+    "Default is input file order.",
+)
+parser.add_option(
+    "-o",
+    "--output",
+    dest="output",
+    default="/dev/stdout",
+    metavar="FILE",
+    help="Output filename (defaults to stdout)",
+)
 options, args = parser.parse_args()
 
 if not args:
@@ -66,7 +80,7 @@
     """Open a possibly gzipped text file."""
     with open(filename, "rb") as h:
         magic = h.read(2)
-    if magic == b'\x1f\x8b':
+    if magic == b"\x1f\x8b":
         return gzip.open(filename, "rt")
     else:
         return open(filename)
@@ -121,13 +135,25 @@
                             continue
                         # TODO - line wrapping
                         handle.write(">%s\n%s\n" % (title, seq))
-        sys.stderr.write("%i unique entries; removed %i duplicates "
-                         "leaving %i representative records\n"
-                         % (unique, len(duplicates), len(representatives)))
+        sys.stderr.write(
+            "%i unique entries; removed %i duplicates "
+            "leaving %i representative records\n"
+            % (unique, len(duplicates), len(representatives))
+        )
+    elif len(input_fasta) == 1:
+        # Single file, no titles to merge; copied as-is (gzipped input stays gzipped)
+        shutil.copy(os.path.abspath(input_fasta[0]), output_fasta)
+        sys.stderr.write("No perfect duplicates in file, %i unique entries\n" % unique)
     else:
-        os.symlink(os.path.abspath(input_fasta), output_fasta)
-        sys.stderr.write("No perfect duplicates in file, %i unique entries\n"
-                         % unique)
+        with open(output_fasta, "w") as handle:
+            for f in input_fasta:
+                with gzip_open(f) as in_handle:
+                    for title, seq in SimpleFastaParser(in_handle):
+                        handle.write(">%s\n%s\n" % (title, seq))
+        sys.stderr.write(
+            "No perfect duplicates in %i files, %i unique entries\n"
+            % (len(input_fasta), unique)
+        )
 
 
 make_nr(args, options.output, options.sep, options.alphasort)
--- a/tools/make_nr/make_nr.xml	Fri Nov 09 11:00:03 2018 -0500
+++ b/tools/make_nr/make_nr.xml	Thu Mar 18 12:48:57 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="make_nr" name="Make FASTA non-redundant" version="0.0.1">
+<tool id="make_nr" name="Make FASTA non-redundant" version="0.0.2">
     <description>by combining duplicated sequences</description>
     <requirements>
         <requirement type="package" version="1.67">biopython</requirement>
@@ -51,6 +51,14 @@
             <param name="alphasort" value="-a"/>
             <output name="output" file="deduplicate.sortids.fasta" ftype="fasta"/>
         </test>
+        <test>
+            <param name="input" value="empty.fasta" ftype="fasta"/>
+            <output name="output" file="empty.fasta" ftype="fasta"/>
+        </test>
+        <test>
+            <param name="input" value="empty.fasta,empty.fasta" ftype="fasta"/>
+            <output name="output" file="empty.fasta" ftype="fasta"/>
+        </test>
     </tests>
     <help>
 **What it does**
--- a/tools/make_nr/tool_dependencies.xml	Fri Nov 09 11:00:03 2018 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0" ?>
-<tool_dependency>
-    <package name="biopython" version="1.67">
-        <repository changeset_revision="a12f73c3b116" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu"/>
-    </package>
-</tool_dependency>
\ No newline at end of file