Mercurial > repos > konradpaszkiewicz > velvetoptimiser
comparison VelvetOptimiser-2.1.7_modified/VelvetOpt/Utils.pm @ 0:50ae1360fbbe default tip
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | konradpaszkiewicz |
---|---|
date | Tue, 07 Jun 2011 18:07:56 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:50ae1360fbbe |
---|---|
1 # | |
2 # VelvetOpt::Utils.pm | |
3 # | |
4 # Copyright 2008,2009,2010 Simon Gladman <simon.gladman@csiro.au> | |
5 # | |
6 # This program is free software; you can redistribute it and/or modify | |
7 # it under the terms of the GNU General Public License as published by | |
8 # the Free Software Foundation; either version 2 of the License, or | |
9 # (at your option) any later version. | |
10 # | |
11 # This program is distributed in the hope that it will be useful, | |
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 # GNU General Public License for more details. | |
15 # | |
16 # You should have received a copy of the GNU General Public License | |
17 # along with this program; if not, write to the Free Software | |
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
19 # MA 02110-1301, USA. | |
20 | |
21 # Version 2.1.3 | |
22 | |
23 # Changes for Version 2.0.1 | |
24 # Added Mikael Brandstrom Durling's numCpus and freeMem for the Mac. | |
25 # | |
26 # Changes for Version 2.1.0 | |
27 # Fixed bug in estExpCov so it now correctly uses all short read categories not just the first two. | |
28 # | |
29 # Changes for Version 2.1.2 | |
30 # Fixed bug in estExpCov so it now won't take columns with "N/A" or "Inf" into account | |
31 # | |
32 # Changes for Version 2.1.3 | |
33 # Changed the minimum contig size to use for estimating expected coverage to 3*kmer size -1 and set the minimum coverage to 2 instead of 0. | |
34 # This should get rid of exp_covs of 1 when it should be very high for assembling reads that weren't ampped to a reference using one of the standard read mapping programs | |
35 | |
36 | |
37 package VelvetOpt::Utils; | |
38 | |
39 use strict; | |
40 use lib "/usr/local/lib/perl5/site_perl/5.8.8"; | |
41 use warnings; | |
42 use POSIX qw(ceil floor); | |
43 use Carp; | |
44 use List::Util qw(max); | |
45 use Bio::SeqIO; | |
46 | |
47 # num_cpu | |
48 # It returns the number of cpus present in the system if linux. | |
49 # If it is MAC then it returns the number of cores present. | |
50 # If the OS is not linux or Mac then it returns 1. | |
51 # Written by Torsten Seemann 2009 (linux) and Mikael Brandstrom Durling 2009 (Mac). | |
52 | |
53 sub num_cpu { | |
54 if ( $^O =~ m/linux/i ) { | |
55 my ($num) = qx(grep -c ^processor /proc/cpuinfo); | |
56 chomp $num; | |
57 return $num if $num =~ m/^\d+/; | |
58 } | |
59 elsif( $^O =~ m/darwin/i){ | |
60 my ($num) = qx(system_profiler SPHardwareDataType | grep Cores); | |
61 $num =~ /.*Cores: (\d+)/; | |
62 $num =$1; | |
63 return $num; | |
64 } | |
65 return 1; | |
66 } | |
67 | |
68 # free_mem | |
69 # Returns the current amount of free memory | |
70 # Mac Section written by Mikael Brandstrom Durling 2009 (Mac). | |
71 | |
72 sub free_mem { | |
73 if( $^O =~ m/linux/i ) { | |
74 my $x = `free | grep '^Mem:' | sed 's/ */~/g' | cut -d '~' -f 4,7`; | |
75 my @tmp = split "~", $x; | |
76 my $total = $tmp[0] + $tmp[1]; | |
77 my $totalGB = $total / 1024 / 1024; | |
78 return $totalGB; | |
79 } | |
80 elsif( $^O =~ m/darwin/i){ | |
81 my ($tmp) = qx(vm_stat | grep size); | |
82 $tmp =~ /.*size of (\d+) bytes.*/; | |
83 my $page_size = $1; | |
84 ($tmp) = qx(vm_stat | grep free); | |
85 $tmp =~ /[^0-9]+(\d+).*/; | |
86 my $free_pages = $1; | |
87 my $totalGB = ($free_pages * $page_size) / 1024 / 1024 / 1024; | |
88 return $totalGB; | |
89 } | |
90 } | |
91 | |
92 # estExpCov | |
93 # it returns the expected coverage of short reads from an assembly by | |
94 # performing a math mode on the stats.txt file supplied.. It looks at | |
95 # all the short_cov? columns.. Uses minimum contig length and minimum coverage. | |
96 # needs the stats.txt file path and name, and the k-value used in the assembly. | |
97 # Original algorithm by Torsten Seemann 2009 under the GPL. | |
98 # Adapted by Simon Gladman 2009. | |
99 # It does a weighted mode... | |
100 | |
101 sub estExpCov { | |
102 use List::Util qw(max); | |
103 my $file = shift; | |
104 my $kmer = shift; | |
105 my $minlen = 3 * $kmer - 1; | |
106 my $mincov = 2; | |
107 my $fh; | |
108 unless ( open IN, $file ) { | |
109 croak "Unable to open $file for exp_cov determination.\n"; | |
110 } | |
111 my @cov; | |
112 while (<IN>) { | |
113 chomp; | |
114 my @x = split m/\t/; | |
115 my $len = scalar @x; | |
116 next unless @x >= 7; | |
117 next unless $x[1] =~ m/^\d+$/; | |
118 next unless $x[1] >= $minlen; | |
119 | |
120 #add all the short_cov columns.. | |
121 my $cov = 0; | |
122 for(my $i = 5; $i < $len; $i += 2){ | |
123 if($x[$i] =~ /\d/){ | |
124 $cov += $x[$i]; | |
125 } | |
126 } | |
127 next unless $cov > $mincov; | |
128 push @cov, ( ( int($cov) ) x $x[1] ); | |
129 } | |
130 | |
131 my %freq_of; | |
132 map { $freq_of{$_}++ } @cov; | |
133 my $mode = 0; | |
134 $freq_of{$mode} = 0; # sentinel | |
135 for my $x ( keys %freq_of ) { | |
136 $mode = $x if $freq_of{$x} > $freq_of{$mode}; | |
137 } | |
138 return $mode; | |
139 } | |
140 | |
141 # estVelvetMemUse | |
142 # returns the estimated memory usage for velvet in GB | |
143 | |
144 sub estVelvetMemUse { | |
145 my ($readsize, $genomesize, $numreads, $k) = @_; | |
146 my $velvetgmem = -109635 + 18977*$readsize + 86326*$genomesize + 233353*$numreads - 51092*$k; | |
147 my $out = ($velvetgmem/1024) / 1024; | |
148 return $out; | |
149 } | |
150 | |
151 # getReadSizeNum | |
152 # returns the number of reads and average size in the short and shortPaired categories... | |
153 | |
154 sub getReadSizeNum { | |
155 my $f = shift; | |
156 my %reads; | |
157 my $num = 0; | |
158 my $currentfiletype = "fasta"; | |
159 #first pull apart the velveth string and get the short and shortpaired filenames... | |
160 my @l = split /\s+/, $f; | |
161 my $i = 0; | |
162 foreach (@l){ | |
163 if(/^-/){ | |
164 if(/^-fasta/){ | |
165 $currentfiletype = "fasta"; | |
166 } | |
167 elsif(/^-fastq/){ | |
168 $currentfiletype = "fastq"; | |
169 } | |
170 elsif(/(-eland)|(-gerald)|(-fasta.gz)|(-fastq.gz)/) { | |
171 croak "Cannot estimate memory usage from file types other than fasta or fastq..\n"; | |
172 } | |
173 } | |
174 elsif(-r $_){ | |
175 my $file = $_; | |
176 if($currentfiletype eq "fasta"){ | |
177 my $x = `grep -c "^>" $file`; | |
178 chomp($x); | |
179 $num += $x; | |
180 my $l = &getReadLength($file, 'Fasta'); | |
181 $reads{$l} += $x; | |
182 print STDERR "File: $file has $x reads of length $l\n"; | |
183 } | |
184 else { | |
185 my $x = `grep -c "^@" $file`; | |
186 chomp($x); | |
187 $num += $x; | |
188 my $l = &getReadLength($file, 'Fastq'); | |
189 $reads{$l} += $x; | |
190 print STDERR "File: $file has $x reads of length $l\n"; | |
191 } | |
192 } | |
193 $i ++; | |
194 } | |
195 my $totlength = 0; | |
196 foreach my $k (keys %reads){ | |
197 $totlength += ($reads{$k} * $k); | |
198 } | |
199 | |
200 | |
201 my @results; | |
202 push @results, floor($totlength/$num); | |
203 push @results, ($num/1000000); | |
204 printf STDERR "Total reads: %.1f million. Avg length: %.1f\n",($num/1000000), ($totlength/$num); | |
205 return @results; | |
206 } | |
207 | |
208 # getReadLength - returns the length of the first read in a file of type fasta or fastq.. | |
209 # | |
210 sub getReadLength { | |
211 my ($f, $t) = @_; | |
212 my $sio = Bio::SeqIO->new(-file => $f, -format => $t); | |
213 my $s = $sio->next_seq() or croak "Something went bad while reading file $f!\n"; | |
214 return $s->length; | |
215 } | |
216 | |
217 return 1; | |
218 |