Mercurial > repos > earlhaminst > hcluster_sg_parser
changeset 1:17aa68582a05 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/hcluster_sg_parser commit a79c8f1799189754eae80aede6fbe5428570f36b
author | earlhaminst |
---|---|
date | Fri, 20 Jan 2017 06:13:23 -0500 |
parents | dbc49bd1a3e9 |
children | 0a33fd8ead70 |
files | hcluster_sg_parser.pl hcluster_sg_parser.py hcluster_sg_parser.xml test-data/discarded.txt test-data/empty.txt |
diffstat | 4 files changed, 72 insertions(+), 30 deletions(-) [+] |
line wrap: on
line diff
--- a/hcluster_sg_parser.pl Mon Dec 12 07:12:23 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -#!/usr/bin/perl -# -use strict; -use warnings; -# A simple perl parser to convert hcluster_sg 3-column output into list of ids in separate files -# hcluster_sg_parser.pl <file> - -my $file1 = $ARGV[0]; -open my $fh1, '<', $file1; - -while (my $line = <$fh1>) { - chomp $line; - my @row = split(/\t/, $line); - - my $cluster_id = $row[0]; - my $id_list = $row[2]; - # Change commas to newlines - $id_list =~ s/\,/\n/g; - - my $outfile = $cluster_id."_output.txt"; - open(my $fh, '>', $outfile) or die "Could not open file '$outfile' for writing: $!"; - print $fh $id_list; - close $fh; -} -close $fh1;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hcluster_sg_parser.py Fri Jan 20 06:13:23 2017 -0500 @@ -0,0 +1,36 @@ +""" +A simple parser to convert the hcluster_sg 3-column output into lists of IDs, one list for each cluster. + +When a minimum and/or maximum number of cluster elements are specified, the IDs contained in the filtered-out clusters are collected in the "discarded IDS" output dataset. + +Usage: + +python hcluster_sg_parser.py [-m <N>] [-M <N>] <file> <discarded_out> +""" +import optparse +import sys + + +def main(): + parser = optparse.OptionParser() + parser.add_option('-m', '--min', type='int', default=0, help='Minimum number of cluster elements') + parser.add_option('-M', '--max', type='int', default=sys.maxsize, help='Maximum number of cluster elements') + options, args = parser.parse_args() + + with open(args[1], 'w') as discarded_out: + with open(args[0]) as fh: + for line in fh: + line = line.rstrip() + (cluster_id, n_ids, id_list) = line.split('\t') + n_ids = int(n_ids) + id_list = id_list.replace(',', '\n') + if n_ids >= options.min and n_ids <= options.max: + outfile = cluster_id + '_output.txt' + with open(outfile, 'w') as f: + f.write(id_list) + else: + discarded_out.write(id_list) + + +if __name__ == "__main__": + main()
--- a/hcluster_sg_parser.xml Mon Dec 12 07:12:23 2016 -0500 +++ b/hcluster_sg_parser.xml Fri Jan 20 06:13:23 2017 -0500 @@ -1,18 +1,27 @@ -<tool id="hcluster_sg_parser" name="hcluster_sg_parser" version="0.1.1"> - <description>Converts hcluster_sg 3-column output into lists of ids</description> +<tool id="hcluster_sg_parser" name="hcluster_sg_parser" version="0.2.0"> + <description>converts hcluster_sg 3-column output into lists of IDs</description> <command> <![CDATA[ -perl $__tool_directory__/hcluster_sg_parser.pl -$inputFile +python '$__tool_directory__/hcluster_sg_parser.py' '$inputFile' +#if str($min_elems) + -m $min_elems +#end if +#if str($max_elems) + -M $max_elems +#end if +'$discarded' ]]> </command> <inputs> <param name="inputFile" type="data" format="tabular" label="hcluster output file in 3-column format" help="3-columns format: cluster_id cluster-size cluster-members" /> + <param name="min_elems" type="integer" value="" min="0" optional="true" label="Minimum number of cluster elements" /> + <param name="max_elems" type="integer" value="" min="2" optional="true" label="Maximum number of cluster elements" /> </inputs> <outputs> <collection name="ids_lists" type="list" label="${tool.name} on ${on_string}"> <discover_datasets pattern="(?P<designation>.+)_output\.txt" ext="txt" /> </collection> + <data name="discarded" format="txt" label="${tool.name} on ${on_string}: discarded IDs" /> </outputs> <tests> <test> @@ -23,11 +32,23 @@ <element name="2" file="2_output.txt" ftype="txt" /> <element name="3" file="3_output.txt" ftype="txt" /> </output_collection> + <output name="discarded" file="empty.txt" /> + </test> + <test> + <param name="inputFile" ftype="tabular" value="hcluster_sg.tabular" /> + <param name="min_elems" value="6" /> + <output_collection name="ids_lists" type="list"> + <element name="0" file="0_output.txt" ftype="txt" /> + <element name="1" file="1_output.txt" ftype="txt" /> + </output_collection> + <output name="discarded" file="discarded.txt" /> </test> </tests> <help> <![CDATA[ -Simple wrapper for hcluster_sg output parser. +A simple parser to convert the hcluster_sg 3-column output into lists of IDs, one list for each cluster. + +When a minimum and/or maximum number of cluster elements are specified, the IDs contained in the filtered-out clusters are collected in the "discarded IDS" output dataset. ]]> </help> <citations>