# HG changeset patch
# User nml
# Date 1510177130 18000
# Node ID 1855203c2e6c79f145d2ad09baef66bb32b24fb5
planemo upload for repository https://github.com/phac-nml/galaxy_tools commit 132092ff7fe1c4810d1221054419389180b81657
diff -r 000000000000 -r 1855203c2e6c combine_stats.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_stats.pl Wed Nov 08 16:38:50 2017 -0500
@@ -0,0 +1,104 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use autodie;
+use Getopt::Long;
+
+# Combine a list of assembly stats files into one tab-separated table in
+# which each row is one strain.
+#
+# Usage: combine_stats.pl --stats NAME=FILE [--stats NAME=FILE ...] --output FILE
+
+my ($files, $output) = prepare_inputs();
+
+# Process strains in sorted order so the output is reproducible.
+my @strains = sort keys %{$files};
+
+open my $out, '>', $output;
+
+# The header row is taken from the first strain's file only, so all input
+# files are expected to report the same statistics in the same order.
+my $print_header = 1;
+foreach my $name (@strains) {
+    process($name, $files->{$name}, $out, $print_header);
+    $print_header = 0;
+}
+
+close $out;
+
+exit;
+
+# Parse one assembly stats report and append its row to the output table.
+#
+#   $name   - strain name, written in the first column
+#   $file   - path of the "key: value" stats report to read
+#   $out    - filehandle opened for writing
+#   $header - true to print the header row before the values row
+sub process {
+    my ($name, $file, $out, $header) = @_;
+
+    my @header = ('Strain');
+    my @values = ($name);
+
+    open my $in, '<', $file;
+    LINE:
+    while (my $line = <$in>) {
+        chomp $line;
+        next LINE if length($line) == 0;
+
+        # The repeat sections at the end of the report are not wanted.
+        last LINE if $line =~ /Simple Din.*repeats/;
+
+        # A limit of 2 keeps any further ':' characters inside the value.
+        my ($key, $value) = split /:/, $line, 2;
+
+        # Lines without a ':' carry no value at all.
+        next LINE if !defined $value;
+
+        # trim out the tabs
+        $key   =~ s/\t//g;
+        $value =~ s/\t//g;
+
+        # Section titles end in ':' and have an empty value. Test the length
+        # rather than the truth so that a value of "0" keeps its column.
+        next LINE if length($value) == 0;
+
+        push @header, $key;
+        push @values, $value;
+    }
+    close $in;
+
+    # Check to see if we are printing out the header.
+    if ($header) {
+        print {$out} join("\t", @header), "\n";
+    }
+    print {$out} join("\t", @values), "\n";
+
+    return;
+}
+
+# Read the command line options.
+#
+# Returns a reference to a hash mapping strain name to stats file path,
+# followed by the output file path. Dies when the options are invalid, no
+# stats file is given or no output file is given.
+sub prepare_inputs {
+    my ($output, %files);
+
+    GetOptions(
+        'stats=s'  => \%files,
+        'output=s' => \$output,
+    ) or die "Invalid options given\n";
+
+    if (!%files) {
+        die "No files given\n";
+    }
+
+    # open() with an undefined file name silently writes to an anonymous
+    # temporary file, so a missing --output has to be rejected here.
+    if (!defined $output || length($output) == 0) {
+        die "No output file given\n";
+    }
+
+    return (\%files, $output);
+}
diff -r 000000000000 -r 1855203c2e6c combine_stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_stats.xml Wed Nov 08 16:38:50 2017 -0500
@@ -0,0 +1,45 @@
+
+ Combines List Collection Assembly Statistics
+
+ perl-getopt-long
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @ARTICLE{a1,
+ title = {Combine AssemblyStats},
+ author = {Mariam Iskander, Philip Mabon},
+ url = {https://github.com/phac-nml/galaxy_tools/}
+ }
+ }
+
+
diff -r 000000000000 -r 1855203c2e6c test-data/first.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/first.txt Wed Nov 08 16:38:50 2017 -0500
@@ -0,0 +1,34 @@
+Statistics for contig lengths:
+ Min contig length: 1,134
+ Max contig length: 101,601
+ Mean contig length: 24480.24
+ Standard deviation of contig length: 23362.52
+ Median contig length: 16,139
+ N50 contig length: 45,147
+
+Statistics for numbers of contigs:
+ Number of contigs: 186
+ Number of contigs >=1kb: 186
+ Number of contigs in N50: 35
+
+Statistics for bases in the contigs:
+ Number of bases in all contigs: 4,553,325
+ Number of bases in contigs >=1kb: 4,553,325
+ GC Content of contigs: 47.56 %
+
+Simple Dinucleotide repeats:
+ Number of contigs with over 70% dinucleotode repeats: 0.00 % (0 contigs)
+ AT: 0.00 % (0 contigs)
+ CG: 0.00 % (0 contigs)
+ AC: 0.00 % (0 contigs)
+ TG: 0.00 % (0 contigs)
+ AG: 0.00 % (0 contigs)
+ TC: 0.00 % (0 contigs)
+
+Simple mononucleotide repeats:
+ Number of contigs with over 50% mononucleotode repeats: 0.00 % (0 contigs)
+ AA: 0.00 % (0 contigs)
+ TT: 0.00 % (0 contigs)
+ CC: 0.00 % (0 contigs)
+ GG: 0.00 % (0 contigs)
+
diff -r 000000000000 -r 1855203c2e6c test-data/results.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/results.tabular Wed Nov 08 16:38:50 2017 -0500 @@ -0,0 +1,3 @@ +Strain Min contig length Max contig length Mean contig length Standard deviation of contig length Median contig length N50 contig length Number of contigs Number of contigs >=1kb Number of contigs in N50 Number of bases in all contigs Number of bases in contigs >=1kb GC Content of contigs +first 1,134 101,601 24480.24 23362.52 16,139 45,147 186 186 35 4,553,325 4,553,325 47.56 % +second 1,134 101,601 24967.61 23081.09 16,588 44,563 185 185 37 4,619,008 4,619,008 47.50 % diff -r 000000000000 -r 1855203c2e6c test-data/second.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/second.txt Wed Nov 08 16:38:50 2017 -0500 @@ -0,0 +1,34 @@ +Statistics for contig lengths: + Min contig length: 1,134 + Max contig length: 101,601 + Mean contig length: 24967.61 + Standard deviation of contig length: 23081.09 + Median contig length: 16,588 + N50 contig length: 44,563 + +Statistics for numbers of contigs: + Number of contigs: 185 + Number of contigs >=1kb: 185 + Number of contigs in N50: 37 + +Statistics for bases in the contigs: + Number of bases in all contigs: 4,619,008 + Number of bases in contigs >=1kb: 4,619,008 + GC Content of contigs: 47.50 % + +Simple Dinucleotide repeats: + Number of contigs with over 70% dinucleotode repeats: 0.00 % (0 contigs) + AT: 0.00 % (0 contigs) + CG: 0.00 % (0 contigs) + AC: 0.00 % (0 contigs) + TG: 0.00 % (0 contigs) + AG: 0.00 % (0 contigs) + TC: 0.00 % (0 contigs) + +Simple mononucleotide repeats: + Number of contigs with over 50% mononucleotode repeats: 0.00 % (0 contigs) + AA: 0.00 % (0 contigs) + TT: 0.00 % (0 contigs) + CC: 0.00 % (0 contigs) + GG: 0.00 % (0 contigs) +