Mercurial > repos > earlhaminst > hcluster_sg_parser

--- a/hcluster_sg_parser.pl	Mon Dec 12 07:12:23 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-#!/usr/bin/perl
-#
-use strict;
-use warnings;
-# A simple perl parser to convert hcluster_sg 3-column output into list of ids in separate files
-# hcluster_sg_parser.pl <file>
-
-my $file1 = $ARGV[0];
-open my $fh1, '<', $file1;
-
-while (my $line = <$fh1>) {
-    chomp $line;
-    my @row = split(/\t/, $line);
-
-    my $cluster_id = $row[0];
-    my $id_list = $row[2];
-    # Change commas to newlines
-    $id_list =~ s/\,/\n/g;
-
-    my $outfile = $cluster_id."_output.txt";
-    open(my $fh, '>', $outfile) or die "Could not open file '$outfile' for writing: $!";
-    print $fh $id_list;
-    close $fh;
-}
-close $fh1;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hcluster_sg_parser.py	Fri Jan 20 06:13:23 2017 -0500
@@ -0,0 +1,36 @@
+"""
+A simple parser to convert the hcluster_sg 3-column output into lists of IDs, one list for each cluster.
+
+When a minimum and/or maximum number of cluster elements are specified, the IDs contained in the filtered-out clusters are collected in the "discarded IDs" output dataset.
+
+Usage:
+
+python hcluster_sg_parser.py [-m <N>] [-M <N>] <file> <discarded_out>
+"""
+import optparse
+import sys
+
+
+def main():
+    parser = optparse.OptionParser()
+    parser.add_option('-m', '--min', type='int', default=0, help='Minimum number of cluster elements')
+    parser.add_option('-M', '--max', type='int', default=sys.maxsize, help='Maximum number of cluster elements')
+    options, args = parser.parse_args()
+
+    with open(args[1], 'w') as discarded_out:
+        with open(args[0]) as fh:
+            for line in fh:
+                line = line.rstrip()
+                (cluster_id, n_ids, id_list) = line.split('\t')
+                n_ids = int(n_ids)
+                id_list = id_list.replace(',', '\n')
+                if n_ids >= options.min and n_ids <= options.max:
+                    outfile = cluster_id + '_output.txt'
+                    with open(outfile, 'w') as f:
+                        f.write(id_list)
+                else:
+                    discarded_out.write(id_list)
+
+
+if __name__ == "__main__":
+    main()
--- a/hcluster_sg_parser.xml	Mon Dec 12 07:12:23 2016 -0500
+++ b/hcluster_sg_parser.xml	Fri Jan 20 06:13:23 2017 -0500
@@ -1,18 +1,27 @@
-<tool id="hcluster_sg_parser" name="hcluster_sg_parser" version="0.1.1">
-    <description>Converts hcluster_sg 3-column output into lists of ids</description>
+<tool id="hcluster_sg_parser" name="hcluster_sg_parser" version="0.2.0">
+    <description>converts hcluster_sg 3-column output into lists of IDs</description>
     <command>
 <![CDATA[
-perl $__tool_directory__/hcluster_sg_parser.pl
-$inputFile
+python '$__tool_directory__/hcluster_sg_parser.py' '$inputFile'
+#if str($min_elems)
+    -m $min_elems
+#end if
+#if str($max_elems)
+    -M $max_elems
+#end if
+'$discarded'
 ]]>
     </command>
     <inputs>
         <param name="inputFile" type="data" format="tabular" label="hcluster output file in 3-column format" help="3-columns format: cluster_id cluster-size cluster-members" />
+        <param name="min_elems" type="integer" value="" min="0" optional="true" label="Minimum number of cluster elements" />
+        <param name="max_elems" type="integer" value="" min="2" optional="true" label="Maximum number of cluster elements" />
     </inputs>
     <outputs>
         <collection name="ids_lists" type="list" label="${tool.name} on ${on_string}">
             <discover_datasets pattern="(?P&lt;designation&gt;.+)_output\.txt" ext="txt" />
         </collection>
+        <data name="discarded" format="txt" label="${tool.name} on ${on_string}: discarded IDs" />
     </outputs>
     <tests>
         <test>
@@ -23,11 +32,23 @@
                 <element name="2" file="2_output.txt" ftype="txt" />
                 <element name="3" file="3_output.txt" ftype="txt" />
             </output_collection>
+            <output name="discarded" file="empty.txt" />
+        </test>
+        <test>
+            <param name="inputFile" ftype="tabular" value="hcluster_sg.tabular" />
+            <param name="min_elems" value="6" />
+            <output_collection name="ids_lists" type="list">
+                <element name="0" file="0_output.txt" ftype="txt" />
+                <element name="1" file="1_output.txt" ftype="txt" />
+            </output_collection>
+            <output name="discarded" file="discarded.txt" />
         </test>
     </tests>
     <help>
 <![CDATA[
-Simple wrapper for hcluster_sg output parser.
+A simple parser to convert the hcluster_sg 3-column output into lists of IDs, one list for each cluster.
+
+When a minimum and/or maximum number of cluster elements are specified, the IDs contained in the filtered-out clusters are collected in the "discarded IDs" output dataset.
 ]]>
     </help>
     <citations>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/discarded.txt	Fri Jan 20 06:13:23 2017 -0500
@@ -0,0 +1,10 @@
+74
+68
+2
+24
+58
+82
+18
+9
+12
+39