Mercurial > repos > konradpaszkiewicz > velvetoptimiser
diff VelvetOptimiser-2.1.7_modified/VelvetOpt/Utils.pm @ 0:50ae1360fbbe default tip
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | konradpaszkiewicz |
---|---|
date | Tue, 07 Jun 2011 18:07:56 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VelvetOptimiser-2.1.7_modified/VelvetOpt/Utils.pm Tue Jun 07 18:07:56 2011 -0400 @@ -0,0 +1,218 @@ +# +# VelvetOpt::Utils.pm +# +# Copyright 2008,2009,2010 Simon Gladman <simon.gladman@csiro.au> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +# Version 2.1.3 + +# Changes for Version 2.0.1 +# Added Mikael Brandstrom Durling's numCpus and freeMem for the Mac. +# +# Changes for Version 2.1.0 +# Fixed bug in estExpCov so it now correctly uses all short read categories not just the first two. +# +# Changes for Version 2.1.2 +# Fixed bug in estExpCov so it now won't take columns with "N/A" or "Inf" into account +# +# Changes for Version 2.1.3 +# Changed the minimum contig size to use for estimating expected coverage to 3*kmer size -1 and set the minimum coverage to 2 instead of 0. +# This should get rid of exp_covs of 1 when it should be very high for assembling reads that weren't ampped to a reference using one of the standard read mapping programs + + +package VelvetOpt::Utils; + +use strict; +use lib "/usr/local/lib/perl5/site_perl/5.8.8"; +use warnings; +use POSIX qw(ceil floor); +use Carp; +use List::Util qw(max); +use Bio::SeqIO; + +# num_cpu +# It returns the number of cpus present in the system if linux. +# If it is MAC then it returns the number of cores present. +# If the OS is not linux or Mac then it returns 1. +# Written by Torsten Seemann 2009 (linux) and Mikael Brandstrom Durling 2009 (Mac). + +sub num_cpu { + if ( $^O =~ m/linux/i ) { + my ($num) = qx(grep -c ^processor /proc/cpuinfo); + chomp $num; + return $num if $num =~ m/^\d+/; + } + elsif( $^O =~ m/darwin/i){ + my ($num) = qx(system_profiler SPHardwareDataType | grep Cores); + $num =~ /.*Cores: (\d+)/; + $num =$1; + return $num; + } + return 1; +} + +# free_mem +# Returns the current amount of free memory +# Mac Section written by Mikael Brandstrom Durling 2009 (Mac). + +sub free_mem { + if( $^O =~ m/linux/i ) { + my $x = `free | grep '^Mem:' | sed 's/ */~/g' | cut -d '~' -f 4,7`; + my @tmp = split "~", $x; + my $total = $tmp[0] + $tmp[1]; + my $totalGB = $total / 1024 / 1024; + return $totalGB; + } + elsif( $^O =~ m/darwin/i){ + my ($tmp) = qx(vm_stat | grep size); + $tmp =~ /.*size of (\d+) bytes.*/; + my $page_size = $1; + ($tmp) = qx(vm_stat | grep free); + $tmp =~ /[^0-9]+(\d+).*/; + my $free_pages = $1; + my $totalGB = ($free_pages * $page_size) / 1024 / 1024 / 1024; + return $totalGB; + } +} + +# estExpCov +# it returns the expected coverage of short reads from an assembly by +# performing a math mode on the stats.txt file supplied.. It looks at +# all the short_cov? columns.. Uses minimum contig length and minimum coverage. +# needs the stats.txt file path and name, and the k-value used in the assembly. +# Original algorithm by Torsten Seemann 2009 under the GPL. +# Adapted by Simon Gladman 2009. +# It does a weighted mode... + +sub estExpCov { + use List::Util qw(max); + my $file = shift; + my $kmer = shift; + my $minlen = 3 * $kmer - 1; + my $mincov = 2; + my $fh; + unless ( open IN, $file ) { + croak "Unable to open $file for exp_cov determination.\n"; + } + my @cov; + while (<IN>) { + chomp; + my @x = split m/\t/; + my $len = scalar @x; + next unless @x >= 7; + next unless $x[1] =~ m/^\d+$/; + next unless $x[1] >= $minlen; + + #add all the short_cov columns.. + my $cov = 0; + for(my $i = 5; $i < $len; $i += 2){ + if($x[$i] =~ /\d/){ + $cov += $x[$i]; + } + } + next unless $cov > $mincov; + push @cov, ( ( int($cov) ) x $x[1] ); + } + + my %freq_of; + map { $freq_of{$_}++ } @cov; + my $mode = 0; + $freq_of{$mode} = 0; # sentinel + for my $x ( keys %freq_of ) { + $mode = $x if $freq_of{$x} > $freq_of{$mode}; + } + return $mode; +} + +# estVelvetMemUse +# returns the estimated memory usage for velvet in GB + +sub estVelvetMemUse { + my ($readsize, $genomesize, $numreads, $k) = @_; + my $velvetgmem = -109635 + 18977*$readsize + 86326*$genomesize + 233353*$numreads - 51092*$k; + my $out = ($velvetgmem/1024) / 1024; + return $out; +} + +# getReadSizeNum +# returns the number of reads and average size in the short and shortPaired categories... + +sub getReadSizeNum { + my $f = shift; + my %reads; + my $num = 0; + my $currentfiletype = "fasta"; + #first pull apart the velveth string and get the short and shortpaired filenames... + my @l = split /\s+/, $f; + my $i = 0; + foreach (@l){ + if(/^-/){ + if(/^-fasta/){ + $currentfiletype = "fasta"; + } + elsif(/^-fastq/){ + $currentfiletype = "fastq"; + } + elsif(/(-eland)|(-gerald)|(-fasta.gz)|(-fastq.gz)/) { + croak "Cannot estimate memory usage from file types other than fasta or fastq..\n"; + } + } + elsif(-r $_){ + my $file = $_; + if($currentfiletype eq "fasta"){ + my $x = `grep -c "^>" $file`; + chomp($x); + $num += $x; + my $l = &getReadLength($file, 'Fasta'); + $reads{$l} += $x; + print STDERR "File: $file has $x reads of length $l\n"; + } + else { + my $x = `grep -c "^@" $file`; + chomp($x); + $num += $x; + my $l = &getReadLength($file, 'Fastq'); + $reads{$l} += $x; + print STDERR "File: $file has $x reads of length $l\n"; + } + } + $i ++; + } + my $totlength = 0; + foreach my $k (keys %reads){ + $totlength += ($reads{$k} * $k); + } + + + my @results; + push @results, floor($totlength/$num); + push @results, ($num/1000000); + printf STDERR "Total reads: %.1f million. Avg length: %.1f\n",($num/1000000), ($totlength/$num); + return @results; +} + +# getReadLength - returns the length of the first read in a file of type fasta or fastq.. +# +sub getReadLength { + my ($f, $t) = @_; + my $sio = Bio::SeqIO->new(-file => $f, -format => $t); + my $s = $sio->next_seq() or croak "Something went bad while reading file $f!\n"; + return $s->length; +} + +return 1; +