comparison VelvetOptimiser-2.1.7_modified/VelvetOpt/Utils.pm @ 0:50ae1360fbbe default tip

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author konradpaszkiewicz
date Tue, 07 Jun 2011 18:07:56 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:50ae1360fbbe
1 #
2 # VelvetOpt::Utils.pm
3 #
4 # Copyright 2008,2009,2010 Simon Gladman <simon.gladman@csiro.au>
5 #
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 # MA 02110-1301, USA.
20
21 # Version 2.1.3
22
23 # Changes for Version 2.0.1
24 # Added Mikael Brandstrom Durling's numCpus and freeMem for the Mac.
25 #
26 # Changes for Version 2.1.0
27 # Fixed bug in estExpCov so it now correctly uses all short read categories not just the first two.
28 #
29 # Changes for Version 2.1.2
30 # Fixed bug in estExpCov so it now won't take columns with "N/A" or "Inf" into account
31 #
32 # Changes for Version 2.1.3
33 # Changed the minimum contig size to use for estimating expected coverage to 3*kmer size -1 and set the minimum coverage to 2 instead of 0.
34 # This should get rid of exp_covs of 1 when it should be very high for assembling reads that weren't ampped to a reference using one of the standard read mapping programs
35
36
37 package VelvetOpt::Utils;
38
39 use strict;
40 use lib "/usr/local/lib/perl5/site_perl/5.8.8";
41 use warnings;
42 use POSIX qw(ceil floor);
43 use Carp;
44 use List::Util qw(max);
45 use Bio::SeqIO;
46
47 # num_cpu
48 # It returns the number of cpus present in the system if linux.
49 # If it is MAC then it returns the number of cores present.
50 # If the OS is not linux or Mac then it returns 1.
51 # Written by Torsten Seemann 2009 (linux) and Mikael Brandstrom Durling 2009 (Mac).
52
53 sub num_cpu {
54 if ( $^O =~ m/linux/i ) {
55 my ($num) = qx(grep -c ^processor /proc/cpuinfo);
56 chomp $num;
57 return $num if $num =~ m/^\d+/;
58 }
59 elsif( $^O =~ m/darwin/i){
60 my ($num) = qx(system_profiler SPHardwareDataType | grep Cores);
61 $num =~ /.*Cores: (\d+)/;
62 $num =$1;
63 return $num;
64 }
65 return 1;
66 }
67
68 # free_mem
69 # Returns the current amount of free memory
70 # Mac Section written by Mikael Brandstrom Durling 2009 (Mac).
71
72 sub free_mem {
73 if( $^O =~ m/linux/i ) {
74 my $x = `free | grep '^Mem:' | sed 's/ */~/g' | cut -d '~' -f 4,7`;
75 my @tmp = split "~", $x;
76 my $total = $tmp[0] + $tmp[1];
77 my $totalGB = $total / 1024 / 1024;
78 return $totalGB;
79 }
80 elsif( $^O =~ m/darwin/i){
81 my ($tmp) = qx(vm_stat | grep size);
82 $tmp =~ /.*size of (\d+) bytes.*/;
83 my $page_size = $1;
84 ($tmp) = qx(vm_stat | grep free);
85 $tmp =~ /[^0-9]+(\d+).*/;
86 my $free_pages = $1;
87 my $totalGB = ($free_pages * $page_size) / 1024 / 1024 / 1024;
88 return $totalGB;
89 }
90 }
91
92 # estExpCov
93 # it returns the expected coverage of short reads from an assembly by
94 # performing a math mode on the stats.txt file supplied.. It looks at
95 # all the short_cov? columns.. Uses minimum contig length and minimum coverage.
96 # needs the stats.txt file path and name, and the k-value used in the assembly.
97 # Original algorithm by Torsten Seemann 2009 under the GPL.
98 # Adapted by Simon Gladman 2009.
99 # It does a weighted mode...
100
101 sub estExpCov {
102 use List::Util qw(max);
103 my $file = shift;
104 my $kmer = shift;
105 my $minlen = 3 * $kmer - 1;
106 my $mincov = 2;
107 my $fh;
108 unless ( open IN, $file ) {
109 croak "Unable to open $file for exp_cov determination.\n";
110 }
111 my @cov;
112 while (<IN>) {
113 chomp;
114 my @x = split m/\t/;
115 my $len = scalar @x;
116 next unless @x >= 7;
117 next unless $x[1] =~ m/^\d+$/;
118 next unless $x[1] >= $minlen;
119
120 #add all the short_cov columns..
121 my $cov = 0;
122 for(my $i = 5; $i < $len; $i += 2){
123 if($x[$i] =~ /\d/){
124 $cov += $x[$i];
125 }
126 }
127 next unless $cov > $mincov;
128 push @cov, ( ( int($cov) ) x $x[1] );
129 }
130
131 my %freq_of;
132 map { $freq_of{$_}++ } @cov;
133 my $mode = 0;
134 $freq_of{$mode} = 0; # sentinel
135 for my $x ( keys %freq_of ) {
136 $mode = $x if $freq_of{$x} > $freq_of{$mode};
137 }
138 return $mode;
139 }
140
141 # estVelvetMemUse
142 # returns the estimated memory usage for velvet in GB
143
144 sub estVelvetMemUse {
145 my ($readsize, $genomesize, $numreads, $k) = @_;
146 my $velvetgmem = -109635 + 18977*$readsize + 86326*$genomesize + 233353*$numreads - 51092*$k;
147 my $out = ($velvetgmem/1024) / 1024;
148 return $out;
149 }
150
151 # getReadSizeNum
152 # returns the number of reads and average size in the short and shortPaired categories...
153
154 sub getReadSizeNum {
155 my $f = shift;
156 my %reads;
157 my $num = 0;
158 my $currentfiletype = "fasta";
159 #first pull apart the velveth string and get the short and shortpaired filenames...
160 my @l = split /\s+/, $f;
161 my $i = 0;
162 foreach (@l){
163 if(/^-/){
164 if(/^-fasta/){
165 $currentfiletype = "fasta";
166 }
167 elsif(/^-fastq/){
168 $currentfiletype = "fastq";
169 }
170 elsif(/(-eland)|(-gerald)|(-fasta.gz)|(-fastq.gz)/) {
171 croak "Cannot estimate memory usage from file types other than fasta or fastq..\n";
172 }
173 }
174 elsif(-r $_){
175 my $file = $_;
176 if($currentfiletype eq "fasta"){
177 my $x = `grep -c "^>" $file`;
178 chomp($x);
179 $num += $x;
180 my $l = &getReadLength($file, 'Fasta');
181 $reads{$l} += $x;
182 print STDERR "File: $file has $x reads of length $l\n";
183 }
184 else {
185 my $x = `grep -c "^@" $file`;
186 chomp($x);
187 $num += $x;
188 my $l = &getReadLength($file, 'Fastq');
189 $reads{$l} += $x;
190 print STDERR "File: $file has $x reads of length $l\n";
191 }
192 }
193 $i ++;
194 }
195 my $totlength = 0;
196 foreach my $k (keys %reads){
197 $totlength += ($reads{$k} * $k);
198 }
199
200
201 my @results;
202 push @results, floor($totlength/$num);
203 push @results, ($num/1000000);
204 printf STDERR "Total reads: %.1f million. Avg length: %.1f\n",($num/1000000), ($totlength/$num);
205 return @results;
206 }
207
208 # getReadLength - returns the length of the first read in a file of type fasta or fastq..
209 #
210 sub getReadLength {
211 my ($f, $t) = @_;
212 my $sio = Bio::SeqIO->new(-file => $f, -format => $t);
213 my $s = $sio->next_seq() or croak "Something went bad while reading file $f!\n";
214 return $s->length;
215 }
216
217 return 1;
218