changeset 6:03e134cae41a draft

v0.2.3, ignore blank lines in ID file (contributed by Gildas Le Corguillé)
author peterjc
date Tue, 17 May 2016 05:59:24 -0400
parents 832c1fd57852
children fb1313d79396
files tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.py tools/seq_filter_by_id/seq_filter_by_id.xml tools/seq_filter_by_id/tool_dependencies.xml
diffstat 4 files changed, 100 insertions(+), 95 deletions(-)
--- a/tools/seq_filter_by_id/README.rst	Wed May 13 11:03:57 2015 -0400
+++ b/tools/seq_filter_by_id/README.rst	Tue May 17 05:59:24 2016 -0400
@@ -86,6 +86,9 @@
 v0.2.2  - Use the ``format_source=...`` tag.
         - Reorder XML elements (internal change only).
         - Planemo for Tool Shed upload (``.shed.yml``, internal change only).
+v0.2.3  - Ignore blank lines in ID file (contributed by Gildas Le Corguillé).
+        - Defensive quoting of filenames etc. in the command definition
+          (internal change only).
 ======= ======================================================================
 
 
@@ -103,17 +106,17 @@
 Planemo commands (which requires that you have set your Tool Shed access details in
 ``~/.planemo.yml`` and that you have access rights on the Tool Shed)::
 
-    $ planemo shed_upload --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/venn_list/
+    $ planemo shed_update -t testtoolshed --check_diff tools/seq_filter_by_id/ -m "New release"
     ...
 
 or::
 
-    $ planemo shed_upload --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/venn_list/
+    $ planemo shed_update -t toolshed --check_diff tools/seq_filter_by_id/ -m "New release"
     ...
 
 To just build and check the tar ball, use::
 
-    $ planemo shed_upload --tar_only  ~/repositories/pico_galaxy/tools/venn_list/
+    $ planemo shed_upload --tar_only tools/seq_filter_by_id/
     ...
     $ tar -tzf shed_upload.tar.gz 
     test-data/empty_file.dat
--- a/tools/seq_filter_by_id/seq_filter_by_id.py	Wed May 13 11:03:57 2015 -0400
+++ b/tools/seq_filter_by_id/seq_filter_by_id.py	Tue May 17 05:59:24 2016 -0400
@@ -32,11 +32,7 @@
 import re
 from optparse import OptionParser
 
-def sys_exit(msg, err=1):
-    sys.stderr.write(msg.rstrip() + "\n")
-    sys.exit(err)
-
-#Parse Command Line
+# Parse Command Line
 usage = """Use as follows:
 
 $ python seq_filter_by_id.py [options] tab1 cols1 [, tab2 cols2, ...]
@@ -78,7 +74,7 @@
 options, args = parser.parse_args()
 
 if options.version:
-    print "v0.2.1"
+    print "v0.2.3"
     sys.exit(0)
 
 in_file = options.input
@@ -89,28 +85,28 @@
 drop_suffices = bool(options.suffix)
 
 if in_file is None or not os.path.isfile(in_file):
-    sys_exit("Missing input file: %r" % in_file)
+    sys.exit("Missing input file: %r" % in_file)
 if out_positive_file is None and out_negative_file is None:
-    sys_exit("Neither output file requested")
+    sys.exit("Neither output file requested")
 if seq_format is None:
-    sys_exit("Missing sequence format")
+    sys.exit("Missing sequence format")
 if logic not in ["UNION", "INTERSECTION"]:
-    sys_exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic)
+    sys.exit("Logic argument should be 'UNION' or 'INTERSECTION', not %r" % logic)
 if options.id_list and args:
-    sys_exit("Cannot accepted IDs via both -t and as tabular files")
+    sys.exit("Cannot accept IDs both via -t and as tabular files")
 elif not options.id_list and not args:
-    sys_exit("Expected matched pairs of tabular files and columns (or -t given)")
+    sys.exit("Expected matched pairs of tabular files and columns (or -t given)")
 if len(args) % 2:
-    sys_exit("Expected matched pairs of tabular files and columns, not: %r" % args)
+    sys.exit("Expected matched pairs of tabular files and columns, not: %r" % args)
 
 
-#Cope with three widely used suffix naming convensions,
-#Illumina: /1 or /2
-#Forward/revered: .f or .r
-#Sanger, e.g. .p1k and .q1k
-#See http://staden.sourceforge.net/manual/pregap4_unix_50.html
-#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
-#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
+# Cope with three widely used suffix naming conventions,
+# Illumina: /1 or /2
+# Forward/reverse: .f or .r
+# Sanger, e.g. .p1k and .q1k
+# See http://staden.sourceforge.net/manual/pregap4_unix_50.html
+# re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
+# re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
 assert re_suffix.search("demo.f")
 assert re_suffix.search("demo.s1")
@@ -125,19 +121,21 @@
 
 identifiers = []
 for i in range(len(args) // 2):
-    tabular_file = args[2*i]
-    cols_arg = args[2*i + 1]
+    tabular_file = args[2 * i]
+    cols_arg = args[2 * i + 1]
     if not os.path.isfile(tabular_file):
-        sys_exit("Missing tabular identifier file %r" % tabular_file)
+        sys.exit("Missing tabular identifier file %r" % tabular_file)
     try:
-        columns = [int(arg)-1 for arg in cols_arg.split(",")]
+        columns = [int(arg) - 1 for arg in cols_arg.split(",")]
     except ValueError:
-        sys_exit("Expected list of columns (comma separated integers), got %r" % cols_arg)
+        sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg)
     if min(columns) < 0:
-        sys_exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
+        sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
     identifiers.append((tabular_file, columns))
 
 name_warn = False
+
+
 def check_white_space(name):
     parts = name.split(None, 1)
     global name_warn
@@ -169,28 +167,29 @@
     clean_name = check_white_space
 
 
-mapped_chars = { '>' :'__gt__',
-                 '<' :'__lt__',
-                 "'" :'__sq__',
-                 '"' :'__dq__',
-                 '[' :'__ob__',
-                 ']' :'__cb__',
-                 '{' :'__oc__',
-                 '}' :'__cc__',
-                 '@' : '__at__',
-                 '\n' : '__cn__',
-                 '\r' : '__cr__',
-                 '\t' : '__tc__',
-                 '#' : '__pd__'
-                 }
+mapped_chars = {
+    '>': '__gt__',
+    '<': '__lt__',
+    "'": '__sq__',
+    '"': '__dq__',
+    '[': '__ob__',
+    ']': '__cb__',
+    '{': '__oc__',
+    '}': '__cc__',
+    '@': '__at__',
+    '\n': '__cn__',
+    '\r': '__cr__',
+    '\t': '__tc__',
+    '#': '__pd__',
+    }
 
-#Read tabular file(s) and record all specified identifiers
-ids = None #Will be a set
+# Read tabular file(s) and record all specified identifiers
+ids = None  # Will be a set
 if options.id_list:
     assert not identifiers
     ids = set()
     id_list = options.id_list
-    #Galaxy turns \r into __cr__ (CR) etc
+    # Galaxy turns \r into __cr__ (CR) etc
     for k in mapped_chars:
         id_list = id_list.replace(mapped_chars[k], k)
     for x in options.id_list.split():
@@ -199,22 +198,24 @@
 for tabular_file, columns in identifiers:
     file_ids = set()
     handle = open(tabular_file, "rU")
-    if len(columns)>1:
-        #General case of many columns
+    if len(columns) > 1:
+        # General case of many columns
         for line in handle:
             if line.startswith("#"):
-                #Ignore comments
+                # Ignore comments
                 continue
             parts = line.rstrip("\n").split("\t")
             for col in columns:
                 file_ids.add(clean_name(parts[col]))
     else:
-        #Single column, special case speed up
+        # Single column, special case speed up
         col = columns[0]
         for line in handle:
+            if not line.strip():  # skip blank lines
+                continue
             if not line.startswith("#"):
                 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col]))
-    print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col+1) for col in columns))
+    print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns))
     if ids is None:
         ids = file_ids
     if logic == "UNION":
@@ -230,12 +231,13 @@
 if name_warn:
     sys.stderr.write(name_warn)
 
+
 def crude_fasta_iterator(handle):
     """Yields tuples, record ID and the full record as a string."""
     while True:
         line = handle.readline()
         if line == "":
-            return # Premature end of file, or just empty?
+            return  # Premature end of file, or just empty?
         if line[0] == ">":
             break
 
@@ -262,16 +264,16 @@
             line = handle.readline()
         yield id, "".join(lines)
         if not line:
-            return # StopIteration
+            return  # StopIteration
 
 
 def fasta_filter(in_file, pos_file, neg_file, wanted):
     """FASTA filter producing 60 character line wrapped output."""
     pos_count = neg_count = 0
-    #Galaxy now requires Python 2.5+ so can use with statements,
+    # Galaxy now requires Python 2.5+ so can use with statements,
     with open(in_file) as in_handle:
-        #Doing the if statement outside the loop for speed
-        #(with the downside of three very similar loops).
+        # Doing the if statement outside the loop for speed
+        # (with the downside of three very similar loops).
         if pos_file is not None and neg_file is not None:
             print "Generating two FASTA files"
             with open(pos_file, "w") as pos_handle:
@@ -309,31 +311,31 @@
     """FASTQ filter."""
     from Bio.SeqIO.QualityIO import FastqGeneralIterator
     handle = open(in_file, "r")
-    if out_positive_file is not None and out_negative_file is not None:
+    if pos_file is not None and neg_file is not None:
         print "Generating two FASTQ files"
-        positive_handle = open(out_positive_file, "w")
-        negative_handle = open(out_negative_file, "w")
+        positive_handle = open(pos_file, "w")
+        negative_handle = open(neg_file, "w")
         print in_file
         for title, seq, qual in FastqGeneralIterator(handle):
             print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
-            if clean_name(title.split(None, 1)[0]) in ids:
+            if clean_name(title.split(None, 1)[0]) in wanted:
                 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
             else:
                 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
         positive_handle.close()
         negative_handle.close()
-    elif out_positive_file is not None:
+    elif pos_file is not None:
         print "Generating matching FASTQ file"
-        positive_handle = open(out_positive_file, "w")
+        positive_handle = open(pos_file, "w")
         for title, seq, qual in FastqGeneralIterator(handle):
-            if clean_name(title.split(None, 1)[0]) in ids:
+            if clean_name(title.split(None, 1)[0]) in wanted:
                 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
         positive_handle.close()
-    elif out_negative_file is not None:
+    elif neg_file is not None:
         print "Generating non-matching FASTQ file"
-        negative_handle = open(out_negative_file, "w")
+        negative_handle = open(neg_file, "w")
         for title, seq, qual in FastqGeneralIterator(handle):
-            if clean_name(title.split(None, 1)[0]) not in ids:
+            if clean_name(title.split(None, 1)[0]) not in wanted:
                 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
         negative_handle.close()
     handle.close()
@@ -345,48 +347,48 @@
     try:
         from Bio.SeqIO.SffIO import SffIterator, SffWriter
     except ImportError:
-        sys_exit("SFF filtering requires Biopython 1.54 or later")
+        sys.exit("SFF filtering requires Biopython 1.54 or later")
 
     try:
         from Bio.SeqIO.SffIO import ReadRocheXmlManifest
     except ImportError:
-        #Prior to Biopython 1.56 this was a private function
+        # Prior to Biopython 1.56 this was a private function
         from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
 
-    in_handle = open(in_file, "rb") #must be binary mode!
+    in_handle = open(in_file, "rb")  # must be binary mode!
     try:
         manifest = ReadRocheXmlManifest(in_handle)
     except ValueError:
         manifest = None
 
-    #This makes two passes though the SFF file with isn't so efficient,
-    #but this makes the code simple.
+    # This makes two passes through the SFF file which isn't so efficient,
+    # but this makes the code simple.
     pos_count = neg_count = 0
-    if out_positive_file is not None:
-        out_handle = open(out_positive_file, "wb")
+    if pos_file is not None:
+        out_handle = open(pos_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
-        in_handle.seek(0) #start again after getting manifest
-        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
+        in_handle.seek(0)  # start again after getting manifest
+        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
         out_handle.close()
-    if out_negative_file is not None:
-        out_handle = open(out_negative_file, "wb")
+    if neg_file is not None:
+        out_handle = open(neg_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
-        in_handle.seek(0) #start again
-        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
+        in_handle.seek(0)  # start again
+        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
         out_handle.close()
-    #And we're done
+    # And we're done
     in_handle.close()
-    #At the time of writing, Galaxy doesn't show SFF file read counts,
-    #so it is useful to put them in stdout and thus shown in job info.
+    # At the time of writing, Galaxy doesn't show SFF file read counts,
+    # so it is useful to print them to stdout so they are shown in the job info.
     return pos_count, neg_count
 
 
-if seq_format.lower()=="sff":
+if seq_format.lower() == "sff":
     # Now write filtered SFF file based on IDs wanted
     pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
     # At the time of writing, Galaxy doesn't show SFF file read counts,
     # so it is useful to print them to stdout so they are shown in the job info.
-elif seq_format.lower()=="fasta":
+elif seq_format.lower() == "fasta":
     # Write filtered FASTA file based on IDs from tabular file
     pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
     print "%i with and %i without specified IDs" % (pos_count, neg_count)
@@ -395,4 +397,4 @@
     fastq_filter(in_file, out_positive_file, out_negative_file, ids)
     # This does not currently track the counts
 else:
-    sys_exit("Unsupported file type %r" % seq_format)
+    sys.exit("Unsupported file type %r" % seq_format)
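
The core change above can be summarised with a short standalone sketch (illustrative only; read_ids is a hypothetical helper, not a function in seq_filter_by_id.py): blank lines and comment lines in the tabular ID file are now skipped before the identifier in the chosen one-based column is collected.

    # Illustrative sketch of the v0.2.3 behaviour; read_ids is a hypothetical
    # helper, not part of seq_filter_by_id.py.
    def read_ids(tabular_file, column=1):
        """Return the set of IDs found in the given one-based column."""
        ids = set()
        with open(tabular_file) as handle:
            for line in handle:
                if not line.strip():      # skip blank lines (new in v0.2.3)
                    continue
                if line.startswith("#"):  # skip comment lines, as before
                    continue
                ids.add(line.rstrip("\n").split("\t")[column - 1])
        return ids

For example, read_ids("ids.tabular") returns the identifiers from column one, ignoring any empty or commented-out rows.
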
--- a/tools/seq_filter_by_id/seq_filter_by_id.xml	Wed May 13 11:03:57 2015 -0400
+++ b/tools/seq_filter_by_id/seq_filter_by_id.xml	Tue May 17 05:59:24 2016 -0400
@@ -1,4 +1,4 @@
-<tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.2.2">
+<tool id="seq_filter_by_id" name="Filter sequences by ID" version="0.2.3">
     <description>from a tabular file</description>
     <requirements>
         <requirement type="package" version="1.64">biopython</requirement>
@@ -12,17 +12,17 @@
     <version_command interpreter="python">seq_filter_by_id.py --version</version_command>
     <command interpreter="python">
 seq_filter_by_id.py -i "$input_file" -f "$input_file.ext"
-#if $output_choice_cond.output_choice=="both"
- -p $output_pos -n $output_neg
-#elif $output_choice_cond.output_choice=="pos"
- -p $output_pos
-#elif $output_choice_cond.output_choice=="neg"
- -n $output_neg
+#if str($output_choice_cond.output_choice)=="both"
+ -p "$output_pos" -n "$output_neg"
+#elif str($output_choice_cond.output_choice)=="pos"
+ -p "$output_pos"
+#elif str($output_choice_cond.output_choice)=="neg"
+ -n "$output_neg"
 #end if
-#if $adv_opts.adv_opts_selector=="advanced" and $adv_opts.strip_suffix
+#if str($adv_opts.adv_opts_selector)=="advanced" and $adv_opts.strip_suffix
  -s
 #end if
-#if $id_opts.id_opts_selector=="tabular":
+#if str($id_opts.id_opts_selector)=="tabular":
 ## TODO - Decide on best way to expose multiple ID files via the XML wrapper.
 ## Single tabular file, can call the Python script with either UNION or INTERSECTION
 -l UNION "$id_opts.input_tabular" "$id_opts.columns"
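
The extra double quotes in the command template above are the "defensive quoting" noted in the README: without them, a dataset path containing spaces or shell metacharacters would be split into several arguments. A minimal sketch of the same idea in plain Python (illustrative only; Galaxy itself builds the command from the Cheetah template, not via shlex):

    # Illustrative only: why the wrapper now quotes filenames. shlex.quote
    # keeps an awkward path as a single shell argument (Python 3).
    import shlex

    input_file = "/tmp/my data (copy).fasta"   # hypothetical path with spaces
    output_pos = "/tmp/matched.fasta"          # hypothetical output path

    cmd = "python seq_filter_by_id.py -i {} -f fasta -p {} -l UNION ids.tabular 1".format(
        shlex.quote(input_file), shlex.quote(output_pos))
    print(cmd)
    # python seq_filter_by_id.py -i '/tmp/my data (copy).fasta' -f fasta -p /tmp/matched.fasta -l UNION ids.tabular 1
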
--- a/tools/seq_filter_by_id/tool_dependencies.xml	Wed May 13 11:03:57 2015 -0400
+++ b/tools/seq_filter_by_id/tool_dependencies.xml	Tue May 17 05:59:24 2016 -0400
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
     <package name="biopython" version="1.64">
-        <repository changeset_revision="5477a05cc158" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="b64c8edb7e45" name="package_biopython_1_64" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>
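
The revision bump above only re-pins the Tool Shed package that supplies Biopython 1.64; the script itself needs Biopython 1.54 or later for SFF support. A small sketch (illustrative, not part of the repository) of checking that the installed dependency is usable:

    # Illustrative check, not part of the repository: confirm the Biopython
    # provided by the dependency is importable and new enough for SFF filtering.
    import sys
    try:
        from Bio import __version__ as biopython_version
        from Bio.SeqIO.SffIO import SffIterator, SffWriter  # needs Biopython 1.54+
    except ImportError:
        sys.exit("SFF filtering requires Biopython 1.54 or later")
    print("Using Biopython %s" % biopython_version)
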