diff VelvetOptimiser-2.1.7_modified/VelvetOpt/Utils.pm @ 0:50ae1360fbbe default tip

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author konradpaszkiewicz
date Tue, 07 Jun 2011 18:07:56 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VelvetOptimiser-2.1.7_modified/VelvetOpt/Utils.pm	Tue Jun 07 18:07:56 2011 -0400
@@ -0,0 +1,218 @@
+#
+#       VelvetOpt::Utils.pm
+#
+#       Copyright 2008,2009,2010 Simon Gladman <simon.gladman@csiro.au>
+#
+#       This program is free software; you can redistribute it and/or modify
+#       it under the terms of the GNU General Public License as published by
+#       the Free Software Foundation; either version 2 of the License, or
+#       (at your option) any later version.
+#
+#       This program is distributed in the hope that it will be useful,
+#       but WITHOUT ANY WARRANTY; without even the implied warranty of
+#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#       GNU General Public License for more details.
+#
+#       You should have received a copy of the GNU General Public License
+#       along with this program; if not, write to the Free Software
+#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#       MA 02110-1301, USA.
+
+#		Version 2.1.3
+
+#	Changes for Version 2.0.1
+#	Added Mikael Brandstrom Durling's numCpus and freeMem for the Mac.
+#
+#	Changes for Version 2.1.0
+#	Fixed bug in estExpCov so it now correctly uses all short read categories not just the first two.
+#
+#	Changes for Version 2.1.2
+#	Fixed bug in estExpCov so it now won't take columns with "N/A" or "Inf" into account
+#
+#	Changes for Version 2.1.3
+#	Changed the minimum contig size to use for estimating expected coverage to 3*kmer size -1 and set the minimum coverage to 2 instead of 0.
+#	This should get rid of exp_covs of 1 when it should be very high for assembling reads that weren't ampped to a reference using one of the standard read mapping programs
+
+
+package VelvetOpt::Utils;
+
+use strict;
+use lib "/usr/local/lib/perl5/site_perl/5.8.8";
+use warnings;
+use POSIX qw(ceil floor);
+use Carp;
+use List::Util qw(max);
+use Bio::SeqIO;
+
+# 	num_cpu
+#	It returns the number of cpus present in the system if linux.
+#	If it is MAC then it returns the number of cores present.
+#	If the OS is not linux or Mac then it returns 1.
+#	Written by Torsten Seemann 2009 (linux) and Mikael Brandstrom Durling 2009 (Mac).
+
+sub num_cpu {
+    if ( $^O =~ m/linux/i ) {
+        my ($num) = qx(grep -c ^processor /proc/cpuinfo);
+        chomp $num;
+        return $num if $num =~ m/^\d+/;
+    }
+	elsif( $^O =~ m/darwin/i){
+		my ($num) = qx(system_profiler SPHardwareDataType | grep Cores);
+		$num =~ /.*Cores: (\d+)/;
+		$num =$1;
+		return $num;
+	}
+    return 1;
+}
+
+#	free_mem
+#	Returns the current amount of free memory
+#	Mac Section written by Mikael Brandstrom Durling 2009 (Mac).
+
+sub free_mem {
+	if( $^O =~ m/linux/i ) {
+		my $x       = `free | grep '^Mem:' | sed 's/  */~/g' | cut -d '~' -f 4,7`;
+		my @tmp     = split "~", $x;
+		my $total   = $tmp[0] + $tmp[1];
+		my $totalGB = $total / 1024 / 1024;
+		return $totalGB;
+	}
+	elsif( $^O =~ m/darwin/i){
+		my ($tmp) = qx(vm_stat | grep size);
+		$tmp =~ /.*size of (\d+) bytes.*/;
+		my $page_size = $1;
+		($tmp) = qx(vm_stat | grep free);
+		$tmp =~ /[^0-9]+(\d+).*/;
+		my $free_pages = $1;
+		my $totalGB = ($free_pages * $page_size) / 1024 / 1024 / 1024;
+		return $totalGB;
+	}
+}
+
+#	estExpCov
+#   it returns the expected coverage of short reads from an assembly by
+#   performing a math mode on the stats.txt file supplied..  It looks at
+#	all the short_cov? columns..  Uses minimum contig length and minimum coverage.
+#	needs the stats.txt file path and name, and the k-value used in the assembly.
+#	Original algorithm by Torsten Seemann 2009 under the GPL.
+#	Adapted by Simon Gladman 2009.
+#	It does a weighted mode...
+
+sub estExpCov {
+    use List::Util qw(max);
+    my $file   = shift;
+    my $kmer   = shift;
+    my $minlen = 3 * $kmer - 1;
+    my $mincov = 2;
+    my $fh;
+    unless ( open IN, $file ) {
+        croak "Unable to open $file for exp_cov determination.\n";
+    }
+    my @cov;
+    while (<IN>) {
+        chomp;
+        my @x = split m/\t/;
+		my $len = scalar @x;
+        next unless @x >= 7;
+        next unless $x[1] =~ m/^\d+$/;
+        next unless $x[1] >= $minlen;
+		
+		#add all the short_cov columns..
+		my $cov = 0;
+		for(my $i = 5; $i < $len; $i += 2){
+			if($x[$i] =~ /\d/){
+				$cov += $x[$i];
+			}
+		}
+        next unless $cov > $mincov;
+        push @cov, ( ( int($cov) ) x $x[1] );
+    }
+
+    my %freq_of;
+    map { $freq_of{$_}++ } @cov;
+    my $mode = 0;
+    $freq_of{$mode} = 0;            # sentinel
+    for my $x ( keys %freq_of ) {
+        $mode = $x if $freq_of{$x} > $freq_of{$mode};
+    }
+    return $mode;
+}
+
+#	estVelvetMemUse
+#	returns the estimated memory usage for velvet in GB
+
+sub estVelvetMemUse {
+	my ($readsize, $genomesize, $numreads, $k) = @_;
+	my $velvetgmem = -109635 + 18977*$readsize + 86326*$genomesize + 233353*$numreads - 51092*$k;
+	my $out = ($velvetgmem/1024) / 1024;
+	return $out;
+}
+
+#	getReadSizeNum
+#	returns the number of reads and average size in the short and shortPaired categories...
+
+sub getReadSizeNum {
+	my $f = shift;
+	my %reads;
+	my $num = 0;
+	my $currentfiletype = "fasta";
+	#first pull apart the velveth string and get the short and shortpaired filenames...
+	my @l = split /\s+/, $f;
+	my $i = 0;
+	foreach (@l){
+		if(/^-/){
+			if(/^-fasta/){
+				$currentfiletype = "fasta";
+			}
+			elsif(/^-fastq/){
+				$currentfiletype = "fastq";
+			}
+			elsif(/(-eland)|(-gerald)|(-fasta.gz)|(-fastq.gz)/) {
+				croak "Cannot estimate memory usage from file types other than fasta or fastq..\n";
+			}
+		}
+		elsif(-r $_){
+			my $file = $_;
+			if($currentfiletype eq "fasta"){
+				my $x = `grep -c "^>" $file`;
+				chomp($x);
+				$num += $x;
+				my $l = &getReadLength($file, 'Fasta');
+				$reads{$l} += $x;
+				print STDERR "File: $file has $x reads of length $l\n";
+			}
+			else {
+				my $x = `grep -c "^@" $file`;
+				chomp($x);
+				$num += $x;
+				my $l = &getReadLength($file, 'Fastq');
+				$reads{$l} += $x;
+				print STDERR "File: $file has $x reads of length $l\n";
+			}
+		}
+		$i ++;
+	}
+	my $totlength = 0;
+	foreach my $k (keys %reads){
+		$totlength += ($reads{$k} * $k);
+	}
+	
+	
+	my @results;
+	push @results, floor($totlength/$num);
+	push @results, ($num/1000000);
+	printf STDERR "Total reads: %.1f million. Avg length: %.1f\n",($num/1000000), ($totlength/$num);
+	return @results;
+}
+
+# getReadLength - returns the length of the first read in a file of type fasta or fastq..
+#
+sub getReadLength {
+	my ($f, $t) = @_;
+	my $sio = Bio::SeqIO->new(-file => $f, -format => $t);
+	my $s = $sio->next_seq() or croak "Something went bad while reading file $f!\n";
+	return $s->length;
+}
+
+return 1;
+