comparison fasta_summary.pl @ 0:ad2b274663f8 draft

planemo upload for repository https://github.com/phac-nml/galaxy_tools commit 3f9ae719338c7c8db81d645b8ee09727e2d9ce23
author nml
date Tue, 07 Nov 2017 12:28:31 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ad2b274663f8
1 #!/usr/bin/env perl
2
3 #==============================================================================================
4
5 # Script to output statistsics and histograms for reads and contigs/isotigs
6
7
8 # Outputs include:
9 # Mean, N50, StdDev or reads or contig lengths,
10 # Mean and Modal read or contig lengths.
11 # Number of reads or contigs > 1 kb in length
12 # Summed contig length (by number of contigs, in sorted order)
13 # Histogram of read or contig lengths,
14 # Graph of sums of read-lengths
15 # File of reads or contigs sorted by read or contig length
16 # Test for mono/di-nucelotide repeats
17 # Randomly selected reads or contigs
18
19
20 # Needs gnuplot installed to create the histograms:
21 # On Fedora/Redhat linux: sudo yum install gnuplot
22 # On Ubuntu/Debian: sudo apt-get install gnuplot
23
24 # Uses a linux pipe to call gnu-plot directly, rather than as a separate shell script.
25
26 # Original written by Sujai Kumar, 2008-09-05 University of Edinburgh
27 # Modified by Stephen: 29-Apr-2009:
28 # Last changed by Stephen: 9-Aug-2010
29
30
31 # Usage: fasta_summary.pl -i infile.fasta -o process_reads -t read OR contig OR isotig (to use 'read' or 'contig' or 'isotig' in the output table & graphs. Isotig is for 'runAssembly -cdna ...' output file '454Isotigs.fna') [-r 1 to indicate count simple nucleotide repeats] [-n number of random reads to output] [-c cutoff_length] [-l 1 to indicate output the longest read] [-f (s or t or w) for spacer, tab or wiki format table output.]
32
33 # Note: The parameters above in the [] are optional.
34
35 # eg: fasta_summary.pl -i myfile.fasta -o process_reads -t read
36 # Where:
37 # -i reads or contigs as input, in fasta format.
38 # -o output_dir (created if it doesn't exist)
39 # -t read, contig or isotig
40
41 # Gives back
42 # - N50
43 # - num of contigs > 1 kb
44 # - num of contigs
45 # - Read or Contig Histogram and graphs.
46 # - Summed contig length (by number of contigs, in sorted order)
47
48 #==============================================================================================
49
50
51 use strict;
52 use warnings;
53 use Getopt::Long;
54
55 my $infile;
56 my $output_dir;
57 my $type='read'; # Defaults to 'read' at present
58 my $repeats=1;
59 my $num_random_reads_to_output=0;
60 my $cutoff_length=-1; # -1 means won't check this cutoff
61 my $longest_read=-1; # -1 mean's don't output the sequence for the longest read.
62 my $doCommify=1; # Outputs statistics numbers in format: 9,999,999
63 my $format="t"; # "s"=spaces between columns, "t"=tabs between columns, "w"=wiki '||' and '|'.
64 my $bucket1=0; # For optional exact length histogram distribution as asked for by JH.
65
66 if ($#ARGV==-1) {die "
67 Usage:
68
69 fasta_summary.pl -i infile.fasta -o output_dir -t ( read | contig | isotig ) [ -r 0 ] [ -n num_reads ] [ -c cutoff_length ] [ -l 1 ] [ -d 0 ] [ -f (w | t ) ] [ -bucket1 ]
70
71 where:
72
73 -i or -infile infile.fasta : input fatsa file of raeds, contigs or isotigs,
74
75 -o or -output_dir output_directory : directory to put output stats and graphs into.
76
77 -t or -type (read or contig or isotig) : for displaying the graph title, where type is 'read' or 'contig' or 'isotig'.
78
79 -r or -repeats 0 or 1 : 1=count number of reads that contain over 70% simple mono-nucleotide and di-nucleotide repeat bases; 0=don't count.
80
81 -n or -number num_reads : For outputting specified number of randomly selected reads or contigs.
82
83 -c or -cutoff cutoff_length : Give a number of reads to do extra analysis (calculating again the number of reads and number of bases in reads above this length)
84
85 -l or -longest 0 or 1 : 1=Output the longest read; 0= don't output the longest read
86
87 -d or -doCommify 0 or 1 : Output numbers formatted with commas to make easier to read: 0=no commas, default=1
88
89 -f or -format w or t : w=wiki_format (ie. table with || and | for column dividers), t=tabs between column symbols for the wiki pages, default is spaces between columns.
90
91 -b or -bucket1 : To also output histogram file of exact read lengths (ie. bucket size of 1)
92
93
94 eg: For 454 reads: fasta_summary.pl -i RawReads.fna -o read_stats -t read
95 For 454 isotigs: fasta_summary.pl -i 454Isotigs.fna -o isotig_stats -t isotig
96
97 ";}
98
99 GetOptions (
100 "infile=s" => \$infile,
101 "output_dir=s" => \$output_dir,
102 "type=s" => \$type, ## type is 'read' or 'contig' or 'isotig' - for displaying the graph title
103 "repeats=i" => \$repeats, # To count simple repeats
104 "number=i" => \$num_random_reads_to_output, # For outputting specified number of random reads
105 "cutoff=i" => \$cutoff_length, # Give a number of reads to do extra analysis (calculating again the number of reads and number of bases in reads above this length)
106 "longest=i" => \$longest_read, # Output the longest read.
107 "doCommify=i" => \$doCommify, # Output numbers formatted with commas to make easier to read: 0=no commas, default=1
108 "format=s" => \$format, # "w"=wiki_format (ie. table with || and | for column dividers), "t"=tabs between column symbols for the wiki pages, default is spaces between columns.
109 "bucket1" => \$bucket1, # To also output histogram file of exact read lengths (ie. bucket size of 1)
110 );
111 if ($#ARGV>-1) {die "Unused options specified: @ARGV\n";}
112 if ( (! defined $infile) || ($infile eq '') ) {die "\nPlease give input fasta file, preceeded by the '-i' option\n\n";}
113 if ( (! defined $output_dir) || ($output_dir eq '') ) {die "Please give output_directory, preceeded by the '-o' option\n\n";}
114 if ( (! defined $type) || (($type ne 'contig') && ($type ne 'read') && ($type ne 'isotig')) ) {die "ERROR: On commandline: -t type must be 'contig' or 'read' or 'isotig'\n\n";}
115
116
117 my ($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab);
118 if ($format eq 's') {($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab)=('',' ','', '',' ','', "\n",'');}
119 elsif ($format eq 't') {($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab)=("\t","\t",'', "","\t",'', "\n",'');} # There is correctly a tab for the $L, but not the $Lh.
120 elsif ($format eq 'w') {($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab)=('| ',' | ',' |', '|| ',' || ',' ||', '|| ',' || ');}
121 else {die "\nInvalid output format code: '$format'. Should be 's', 't' or 'w'.\n\n";}
122
123 ### create output_dir if it doesn't exist
124 if (-d $output_dir) {
125 print STDERR " Directory '$output_dir' exists, so the existing fasta_summary.pl output files will be overwritten\n";
126 } else {
127 mkdir $output_dir;
128 print STDERR " Directory '$output_dir' created\n";
129 }
130
131 my $gc_count = 0;
132
133 #--------------- Read in contigs from fasta file -------------------
134
135 open INFILE, "<$infile" or die "Failed to open file: '$infile' : $!";
136 open STATS, ">$output_dir/stats.txt" or die "Failed to open $output_dir/stats.txt: '' : $!";
137
138 my $header = <INFILE>;
139 if (! defined($header) ) {print "\n** ERROR: First line of input fasta file is undefined - so file must be empty **\n\n"; print STATS "No sequences found\n"; exit 1;}
140 if ($header!~/^>/) {print "\nERROR: First line of input fasta file should start with '>' BUT first line is: '$header'\n"; print STATS "No sequences found\n"; exit 1;}
141
142 my $seq = "";
143 my @sequences;
144
145 while (<INFILE>) {
146 if (/^>/) {
147 push @sequences, [length($seq), $header, $seq];
148 $header = $_;
149 $seq = "";
150 } else {
151 chomp;
152 $gc_count += tr/gcGC/gcGC/;
153 $seq .= $_;
154 }
155 }
156 push @sequences, [length($seq), $header, $seq];
157 close INFILE;
158 if ($#sequences==-1) {print "\nERROR: There are zero sequences in the input file: $infile\n\n"; print STATS "No sequences found\n"; exit 1;}
159
160
161 my $all_contigs_length=0;
162 foreach (@sequences) {$all_contigs_length += $_->[0];}
163 if ($all_contigs_length==0) {print "\nERROR: Length of all contigs is zero\n\n"; exit 2;}
164
165 # Find number and number of bases in reads greater than the optional cut-off length given at command-line.
166 my $num_reads_above_cutoff=0;
167 my $num_of_bases_in_reads_above_cutoff=0;
168 if ($cutoff_length>0)
169 {
170 foreach (@sequences)
171 {
172 if ($_->[0]>=$cutoff_length) {$num_of_bases_in_reads_above_cutoff+= $_->[0]; $num_reads_above_cutoff++;}
173 }
174 }
175
176
177 #--------------- Gather Plots Data, Find N50, Print sorted contig file -------------------
178
179 my $summed_contig_length = 0;
180 my @summed_contig_data; # <-- For graph of summed length (in number of bases) versus number of contigs.
181 my @summed_contig_data_contigLens; # <-- Added by SJBridgett to get graph of summed contig length versus min. contig length included (ie. X-axis is sort of inverse of above)
182
183 my $contig1k_count = 0;
184 my $contig1k_length = 0;
185
186 open SORTED, ">$output_dir/sorted_contigs.fa" or die $!;
187
188 # top row in stats file
189 #print STATS "N50\nMax contig size\nNumber of bases in contigs\nNumber of contigs\nNumber of contigs >=1kb\nNumber of contigs in N50\nNumber of bases in contigs >=1kb\nGC Content of contigs\n";
190
191 my $N50size=-1;
192 my $N50_contigs = 0;
193
194 my @sorted_by_contig_length = sort {$b->[0] <=> $a->[0]} @sequences;
195
196 ### variables and initialization for histogram (stored in @bins)
197 my $max = $sorted_by_contig_length[0][0];
198 my $mean= $all_contigs_length/($#sequences+1); # <-- Added by Stephen Bridgett. Note: as $# gives the highest index number, so add 1 as arrays are zero-based.
199
200 # Calculate standard deviation
201 my $sumsquares = 0;
202 foreach (@sequences) {$sumsquares += ($_->[0] - $mean) ** 2;} # <-- Taken from John's "mean_fasta_length.pl" script.
203 my $stddev = ( $sumsquares/($#sequences+1) ) ** 0.5;
204
205 my $min = 0;
206 # Aim for approximately 100 bins, so
207
208 my $bin_size=1;
209 my $min_max_range=$max - $min;
210 # $bin_size = ($min_max_range)/(99); # (99 is 100-1) so 1000/100
211 if ($min_max_range>=100000000) {$bin_size=1000000;}
212 elsif ($min_max_range>=10000000) {$bin_size=100000;}
213 elsif ($min_max_range>=1000000) {$bin_size=10000;}
214 elsif ($min_max_range>=100000) {$bin_size=1000;}
215 elsif ($min_max_range>=10000) {$bin_size=100;}
216 else {$bin_size=10;} # elsif ($min_max_range>=1000) {$bin_size=10;}
217 #elsif ($min_max_range>=100) {$bin_size=1;}
218 #elsif ($min_max_range>=10) {}
219 #elsif ($min_max_range>=1) {}
220 # WAS: my $bin_size = ($type eq 'contig') ? 1000 : 10;
221
222 my @bins;
223 $#bins = int(($min_max_range)/$bin_size) + 1; # <-- Set the bins array size.
224 foreach (@bins) {$_ = 0};
225
226 foreach (@sorted_by_contig_length) {
227
228 my $curr_contig_length = $_->[0];
229 push @summed_contig_data_contigLens, $curr_contig_length; # <-- added by Stephen.
230
231 $bins[int(($curr_contig_length + 1 - $min)/$bin_size)]++;
232
233 $summed_contig_length += $curr_contig_length;
234 push @summed_contig_data, $summed_contig_length;
235
236 ### sorted contigs file
237 print SORTED $_->[1] . $_->[2] . "\n";
238
239 if ($curr_contig_length >= 1000) {
240 $contig1k_count++;
241 $contig1k_length += $curr_contig_length;
242 }
243
244 $N50_contigs++ unless ($N50size>-1); # Was unless $N50_found
245
246 if ($summed_contig_length > ($all_contigs_length / 2) and $N50size == -1) {
247 $N50size = $curr_contig_length;
248 }
249 }
250
251
252 if ($bucket1!=0)
253 {
254 =pod
255 # This firsdt method works and agress with the second, but the lengths are in reverse order, at the @sorted_by_contig_length array was sorted with longest contig first.
256 open BUCKET1, ">$output_dir/lengths_hist1.txt" or die "Failed to open file '$output_dir/lengths_hist1.txt' : $!\n";
257 print BUCKET1 "Length\tFrequency\n";
258 my $len=-1;
259 my $count=0;
260 foreach (@sorted_by_contig_length)
261 {
262 if ( $len != $_->[0] ) {if ($len>-1) {print BUCKET1 "$len\t$count\n";} $len=$_->[0]; $count=0;}
263 $count++;
264 }
265 if ($len>-1) {print BUCKET1 "$len\t$count\n";} # Print length of final length grouping.
266 close BUCKET1;
267 =cut
268 open BUCKET1, ">$output_dir/lengths_hist1_with_zeros.txt" or die "Failed to open file '$output_dir/lengths_hist1_with_zeros.txt' : $!\n";
269 print BUCKET1 "Length\tFrequency\n";
270 my @bucket=(); # To check the result by using array.
271 foreach (@sequences)
272 {
273 my $len=$_->[0];
274 if (defined $bucket[$len]) {$bucket[$len]++;}
275 else {$bucket[$len]=1;}
276 }
277 for (my $i=0; $i<=$#bucket; $i++)
278 # for (my $i=$#bucket; $i>=0; $i--) # <-- for reverse order
279 {
280 if (defined $bucket[$i]) {print BUCKET1 "$i\t$bucket[$i]\n";}
281 else {print BUCKET1 "$i\t0\n";} # Can uncomment this later if don't want zeros in the output.
282 }
283 close BUCKET1;
284 }
285
286
287 my $type_plural=$type.'s';
288 print STATS $Lh."Statistics for $type lengths:".$Mhnotab.$Rh."\n";
289 print STATS $L."Min $type length:".$M.&commify_if($sorted_by_contig_length[$#sequences][0],$doCommify).$R."\n";
290 print STATS $L."Max $type length:".$M.&commify_if($max,$doCommify).$R."\n";
291 printf STATS $L."Mean %s length:".$M."%.2f".$R."\n", $type,$mean; # <-- Added by Stephen Bridgett, April 2009.
292 printf STATS $L."Standard deviation of %s length:".$M."%.2f".$R."\n", $type,$stddev; ## <-- Added by Stephen Bridgett, May 2009.
293 print STATS $L."Median $type length:".$M.&commify_if($sorted_by_contig_length[int($#sequences/2)][0],$doCommify).$R."\n";
294 print STATS $L."N50 $type length:".$M.&commify_if($N50size,$doCommify).$R."\n";
295
296 print STATS $Lhnewline."Statistics for numbers of $type_plural:".$Mhnotab.$Rh."\n";
297 print STATS $L."Number of $type_plural:".$M.&commify_if($#sequences+1,$doCommify).$R."\n";
298 print STATS $L."Number of $type_plural >=1kb:".$M.&commify_if($contig1k_count,$doCommify).$R."\n";
299 print STATS $L."Number of $type_plural in N50:".$M.&commify_if($N50_contigs,$doCommify).$R."\n";
300
301 print STATS $Lhnewline."Statistics for bases in the $type_plural:".$Mhnotab.$Rh."\n";
302 print STATS $L."Number of bases in all $type_plural:".$M.&commify_if($all_contigs_length,$doCommify).$R."\n";
303 print STATS $L."Number of bases in $type_plural >=1kb:".$M.&commify_if($contig1k_length,$doCommify).$R."\n";
304 printf STATS $L."GC Content of %s:".$M."%.2f %%".$R."\n", $type_plural,(100*$gc_count/$all_contigs_length);
305
306 if ($cutoff_length>0)
307 {
308 print STATS $Lhnewline."Statistics for $type_plural >= $cutoff_length bp in length:".$Mhnotab.$Rh."\n";
309 print STATS $L."Number of $type_plural >= $cutoff_length bp:".$M.&commify($num_reads_above_cutoff,$doCommify).$R."\n";
310 print STATS $L."\tNumber of bases in $type_plural >= $cutoff_length bp:".$M.&commify($num_of_bases_in_reads_above_cutoff,$doCommify).$R."\n";
311 }
312
313 if ($repeats==1) {&countRepeats();}
314
315 print STATS "\n";
316
317
318 # Output random selection of reads if requested on commandline:
319 my $fastaLineLen=60; # <-- The line length used for 454 sffinfo output, but could use a value read from input file (but be careful not to read a short line)
320 if ($num_random_reads_to_output>0)
321 {
322 my @randlist;
323 if ($num_random_reads_to_output<($#sequences+1))
324 {
325 print STATS "\nSome randomly selected reads:\n\n";
326 @randlist= &getListOfRandomNumbers($#sequences, $num_random_reads_to_output); # Don't use ($#sequences + 1), just ($#sequences) otherwise would be outside the array.
327 }
328 else # Just print all the sequences:
329 {
330 print STATS "\nAll ".($#sequences+1)." reads:\n\n";
331 for (my $i=0;$i<=$#sequences;$i++) {push @randlist,$i;}
332 }
333 &print_sequences(\@randlist)
334 }
335
336
337 # Print the longest read:
338 if ($longest_read>0)
339 {
340 my $length_of_longest_read=-1;
341 my @longest_read=();
342 my $i=0;
343 foreach (@sequences)
344 {
345 if ($_->[0]>$length_of_longest_read) {$length_of_longest_read=$_->[0]; $longest_read[0]=$i;}
346 $i++;
347 }
348 if ($length_of_longest_read>0) {print STATS "\nLongest read:\n"}
349 &print_sequences(\@longest_read);
350 }
351
352
353 =pod
354 print STATS "\n$type\tSummed\nlength\tlength\n"; # <-- Added by Stephen Bridgett, but better to produce a graph instead.
355
356 my $i=0;
357 foreach (@summed_contig_data) {
358 # print STATS $sorted_by_contig_length[$i]->[0]."\t".$summed_contig_data_contigLens[$i]."\t".$_."\t".$summed_contig_data[$i]."\n";
359 print STATS $sorted_by_contig_length[$i]->[0]."\t".$_."\n";
360 $i++;
361 }
362 =cut
363
364 open SUMMED, ">$output_dir/summed_contig_lengths.dat" or die $!;
365 print SUMMED join "\n",@summed_contig_data;
366 close SUMMED;
367
368 open HISTOGRAMBINS, ">$output_dir/histogram_bins.dat" or die $!;
369 my $bin_size_counter = 0;
370 foreach (@bins) {
371 print HISTOGRAMBINS eval($bin_size_counter++ * $bin_size + $bin_size/2) . "\t$_\n";
372 }
373 close HISTOGRAMBINS;
374
375
376 # Graph of cumulative (summed) number of reads on y-axis, versus length of read (decending order) on x-axis
377 open SUMREAD_READLEN, ">$output_dir/sum_reads_vs_read_len.dat" or die $!;
378 #my $read_counter= 0;
379 my $read_counter= $#sorted_by_contig_length+1;
380
381 foreach (@sorted_by_contig_length) {
382 # $read_counter++;
383 $read_counter--;
384 print SUMREAD_READLEN "$_->[0]\n"; # $read_counter
385 }
386 close SUMREAD_READLEN;
387
388
389
390
391 my $properType=ucfirst($type); # Makes the first letter an upper case letter, ie. 'Config' or 'Read'
392 #if ($type eq 'contig')
393 # {
394 # print the outcome of the gnu_plot as may have a write permissions error sometimes.
395 my $YHistogramScaleType = ($type eq 'read') ? '' : 'log y'; # Not using log scale for reads, just for contig/isotigs.
396 &plot_graph('histogram', "$output_dir/histogram_bins.dat", "Histogram of $type lengths", "$properType length", "Number of $type_plural", '0.9', $YHistogramScaleType);
397 &plot_graph('line', "$output_dir/summed_contig_lengths.dat", "Summed $type lengths", "Number of $type_plural", "Summed $type length in bases", '0.9', '');
398 &plot_graph('line', "$output_dir/sum_reads_vs_read_len.dat", "X-axis gives the Number of $type_plural that are greater than the $properType-length given on the Y-axis", "$properType length", "Cummulative number of $type_plural", '0.9', '');
399
400 =pod
401 # print `gnuplot_histogram.sh $output_dir/histogram_bins.dat`;
402
403 &plot_graph("$output_dir/summed_contig_lengths.dat", 'Summed contig lengths', 'Number of contigs', 'Summed contig length in bases', '0.9', '');
404 # print `gnuplot_summedcontigs.sh $output_dir/summed_contig_lengths.dat`;
405
406 &plot_graph('line', "$output_dir/sum_reads_vs_read_len.dat", 'X-axis gives the Number of contigs that are greater than the Contig-length given on the Y-axis', 'Contig length', 'Cummulative number of contigs', '0.9', '');
407 # print `gnuplot_sum_contig_vs_contig_len.sh $output_dir/sum_reads_vs_read_len.dat`;
408 }
409 elsif ($type eq 'read')
410 {
411
412 # print `gnuplot_readshistogram_logY.sh $output_dir/histogram_bins.dat`; # There's also optionally a "...._linearY.sh"
413
414 &plot_graph('line', "$output_dir/summed_contig_lengths.dat",'Summed read lengths', 'Number of reads', 'Summed read length in bases', '0.9', '');
415 # print `gnuplot_summedreads.sh $output_dir/summed_contig_lengths.dat`;
416
417 &plot_graph('line', "$output_dir/sum_reads_vs_read_len.dat", 'X-axis gives the Number of reads that are greater than the Read-length given on the Y-axis', 'Read length', 'Cummulative number of reads', '0.9', '');
418 # print `gnuplot_sum_reads_vs_read_len.sh $output_dir/sum_reads_vs_read_len.dat`;
419 }
420 else {die "\n** ERROR: Invalid type='$type' **\n\n";}
421 =cut
422
423 close SORTED;
424 close STATS;
425
426
427 # Use pipe to plot directly with gnuplot, rather than calling a separate shell script:
428 # http://www.vioan.ro/wp/2008/09/30/calling-gnuplot-from-perl/
429 # http://forums.devshed.com/perl-programming-6/plotting-with-gnuplot-within-perl-script-549682.html
430 # Another option is the perl module: GnuplotIF: http://lug.fh-swf.de/perl/GnuplotIF.html OR: http://lug.fh-swf.de/perl/
431
432 # PlPlot: Perl: http://search.cpan.org/~dhunt/PDL-Graphics-PLplot-0.47/plplot.pd
433 # http://plplot.sourceforge.net/
434 # dislin: http://www.mps.mpg.de/dislin/overview.html
435 # MathGL: http://mathgl.sourceforge.net/index.html
436
437
438 sub plot_graph
439 {
440 # Plots a histogram or xy-line graph
441 # Parameters: GraphType (histogram/line) DataFile, Title, X-label, Y-label, Y-range
442 # Graphfile should end with '.png'
443 # The $yloglinear is 'log y' for log, or '' for linear
444 my ($graphtype, $datafile, $title,$xlabel,$ylabel,$yrange,$yloglinear)=@_; # yrange for reads: 0.1, and for contigs: 0.9
445
446 my $graphstyle='';
447 if ($graphtype eq 'histogram') {$graphstyle="plot \"$datafile\" using 1:2 with boxes";}
448 elsif ($graphtype eq 'line') {$graphstyle="plot \"$datafile\" using 1 with lines";}
449 else {die "\n** ERROR: Invalid graphtype='$graphtype'\n\n";}
450 my $yloglinearscale= ($yloglinear eq '') ? '' : "set $yloglinear";
451
452 # To capture any errors that are normally sent from gnuplot to stderr, could use: open3 pipe interface:
453 # http://www.clintoneast.com/articles/perl-open3-example.php
454 # http://hell.org.ua/Docs/oreilly/perl2/prog/ch16_03.htm
455 # But the following should be fine, as the stderr will display when running the script anyway.
456 # If needed a simpler way would be to sent the output to a file using eg: open (GNUPLOT, "|gnuplot > gnu_out.txt 2>&1") or die .... The read the resulting file.
457 open (GNUPLOT, "|gnuplot") or die "\n**ERROR: Failed to open gnuplot : $!\n\n **";
458 print GNUPLOT <<ENDPLOT;
459 set terminal png
460 set output "$datafile.png"
461 set nokey
462 $yloglinearscale
463 set xlabel "$xlabel"
464 set ylabel "$ylabel"
465 set yrange [$yrange:]
466 set title "$title"
467 $graphstyle
468 ENDPLOT
469 close(GNUPLOT);
470 if ($? != 0) {print "\n** WARNING: GNUplot pipe returned non-zero status: '$?'\n\n";} # $? is the status returned by the last pipe close (or backtick or system operator)
471 if (! -e "$datafile.png") {die "\n** ERROR: Failed to create '$datafile.png'**\n\n";}
472
473 =pod
474 #PNG
475 set term png small xFFFFFF
476 set output "$file.png"
477 set size 1 ,1
478 set nokey
479 set data style line
480 set xlabel "frequency" font "VeraMono,10"
481 set title "Fast Fourier Transform" font "VeraMono,10"
482 set grid xtics ytics
483 set xtics 100
484 plot "$file" using 1:2 w lines 1
485 =cut
486
487 #WAS PREVIOUSLY AS .sh script
488 =pod
489 # The 'gnuplot_readshistogram_logY.sh' is:
490 set terminal png
491 set output "$1.png"
492 set log y
493 set xlabel "Read length"
494 set ylabel "Frequency"
495 set yrange [0.9:]
496 set title "Histogram of read lengths"
497 plot "$1" using 1:2 with boxes
498 =cut
499 }
500
501
502 # Was previously a separate .sh file:
503 =pod
504 #!/bin/sh
505 gnuplot << EOF
506 set terminal png
507 set output "$1.png"
508 set xlabel "Number of contigs"
509 set ylabel "Summed contig length in bases"
510 set yrange [0.9:]
511 set title "Summed contig lengths"
512 plot "$1" using 1 with lines
513 EOF
514 =cut
515
516
517
518
519 # Added function to count number of simple dinucleotide repeats:
520 sub countRepeats
521 {
522 # To count the number of sequences that contain mostly repeats.
523 # This would be faster if called a C program on the file.
524
525 # Common simple repeats are listed here: http://www.bioinfo.de/isb/2005/05/0041/
526 # Dinucleotide
527 # AT/TA
528 # AC/TG
529 # AG/TC
530 # CG/GC
531 # Trinucleotide
532 # AAT/TTA
533 # CTA/GAT
534 # ATG/TAC
535 # ACT/TGA
536 # CTC/GAG
537 # AGG/TCC
538 # CAG/GTC
539 # AAG/TTC
540 # ATA/TAT
541 # CAA/GTT
542 # AGC/TCG
543 # ACA/TGT
544 # ACG/TGC
545 # AGA/TCT
546 # ACC/TGG
547 # Other
548 # Tetranucleotide
549 # AAAT
550 # AAAC
551 # CACG
552 # AACA
553 # AATA
554 # AAGA
555 # TGAA
556 # AAAG
557 # ACAT
558 # AATG
559 # AGCC
560 # Other
561 # Pentanucleotide
562 # AAAAC
563 # AATTG
564 # GCTAA
565 # ATAAT
566 # AAAAT
567 # AAACA
568 # ATATA
569 # TTGCC
570 # Other
571
572 # I also add mono-nucleotide repeats: - ie. just all T's, or A's, etc
573 # Just consider the dinucleotide repeats for now:
574 my ($ATseq,$CGseq,$ACseq,$TGseq,$AGseq,$TCseq)=(0,0,0,0,0,0);
575 my ($AAseq,$TTseq,$CCseq,$GGseq)=(0,0,0,0);
576 foreach (@sequences)
577 {
578 my $seq_len=$_->[0];
579 my $seq=$_->[2]; # This copy might be slow, maybe should just stick with using the reference.
580 my $mnt=0.35*$seq_len; # Mononucleotide threshold: HERE 0.35 also means 70%; 0.4 means 80% dinucleotide repeats, as really one base so 0.5 = 100%
581 my $dnt=0.35*$seq_len; # Dinucleotide threshold: 0.35 means 70%; 0.4 means 80% dinucleotide repeats, as two bases so 0.5 = 100%
582 my ($AT,$CG,$AC,$TG,$AG,$TC)=(0,0,0,0,0,0);
583 my ($AA,$TT,$CC,$GG)=(0,0,0,0);
584 # See: http://www.allinterview.com/showanswers/76719.html
585
586 # AT/TA seems most common repeat so process it first to save time:
587 while ($seq=~/AT/g) {$AT++;} if ($AT>$dnt) {$ATseq++; next;} # AT is same as TA. If has 80% AT's then won't have 80% AC etc.
588 while ($seq=~/CG/g) {$CG++;} if ($CG>$dnt) {$CGseq++; next;} # CG is same as GC.
589 # AC,TG
590 while ($seq=~/AC/g) {$AC++;} if ($AC>$dnt) {$ACseq++; next;}
591 while ($seq=~/TG/g) {$TG++;} if ($TG>$dnt) {$TGseq++; next;}
592 # AG/TC
593 while ($seq=~/AG/g) {$AG++;} if ($AG>$dnt) {$AGseq++; next;}
594 while ($seq=~/TC/g) {$TC++;} if ($TC>$dnt) {$TCseq++; next;}
595
596 # Added my simple mononucleotde repeat count:
597 while ($seq=~/AA/g) {$AA++;} if ($AA>$mnt) {$AAseq++; next;}
598 while ($seq=~/TT/g) {$TT++;} if ($TT>$mnt) {$TTseq++; next;}
599 while ($seq=~/CC/g) {$CC++;} if ($CC>$mnt) {$CCseq++; next;}
600 while ($seq=~/GG/g) {$GG++;} if ($GG>$mnt) {$GGseq++; next;}
601 }
602
603 my $num_seq=($#sequences+1);
604 my $total_din_repeats_seq= $ACseq+$TGseq+$ATseq+$AGseq+$TCseq+$CGseq;
605 my $percent_din_repeats=100*$total_din_repeats_seq/$num_seq;
606 print STATS "\nSimple Dinucleotide repeats:\n";
607 printf STATS "\tNumber of %s with over 70%% dinucleotode repeats:\t%.2f %% (%d %s)\n", $type_plural, $percent_din_repeats, $total_din_repeats_seq, $type_plural;
608 printf STATS "\tAT:\t%.2f %% (%d %s)\n", (100*$ATseq/$num_seq),$ATseq,$type_plural;
609 printf STATS "\tCG:\t%.2f %% (%d %s)\n", (100*$CGseq/$num_seq),$CGseq,$type_plural;
610 printf STATS "\tAC:\t%.2f %% (%d %s)\n", (100*$ACseq/$num_seq),$ACseq,$type_plural;
611 printf STATS "\tTG:\t%.2f %% (%d %s)\n", (100*$TGseq/$num_seq),$TGseq,$type_plural;
612 printf STATS "\tAG:\t%.2f %% (%d %s)\n", (100*$AGseq/$num_seq),$AGseq,$type_plural;
613 printf STATS "\tTC:\t%.2f %% (%d %s)\n", (100*$TCseq/$num_seq),$TCseq,$type_plural;
614
615 my $total_mono_repeats_seq= $AAseq+$TTseq+$CCseq+$GGseq;
616 my $percent_mono_repeats=100*$total_mono_repeats_seq/$num_seq;
617 print STATS "\nSimple mononucleotide repeats:\n";
618 printf STATS "\tNumber of %s with over 50%% mononucleotode repeats:\t%.2f %% (%d %s)\n", $type_plural, $percent_mono_repeats, $total_mono_repeats_seq, $type_plural;
619 printf STATS "\tAA:\t%.2f %% (%d %s)\n", (100*$AAseq/$num_seq),$AAseq,$type_plural;
620 printf STATS "\tTT:\t%.2f %% (%d %s)\n", (100*$TTseq/$num_seq),$TTseq,$type_plural;
621 printf STATS "\tCC:\t%.2f %% (%d %s)\n", (100*$CCseq/$num_seq),$CCseq,$type_plural;
622 printf STATS "\tGG:\t%.2f %% (%d %s)\n", (100*$GGseq/$num_seq),$GGseq,$type_plural;
623
624 return $percent_din_repeats+$percent_mono_repeats;
625 }
626
627
628 sub commify_if
629 {
630 # If doCommify is >0 then converts output to commas.
631 # Formats '1234567890.01' with commas as "1,234,567,890.01
632 # Based on: http://www.perlmonks.org/?node_id=110137
633 my ($number,$doCommify)=@_;
634 if ($doCommify > 0) {$number =~ s/(\d)(?=(\d{3})+(\D|$))/$1\,/g;}
635 return $number;
636 }
637
638
639 #--------------- Produce ordered list of random numbers -------------------
640 # This is copied from: my_random_contigs.pl
641
642 sub getListOfRandomNumbers
643 {
644 # Use: @list=getListOfRandomNumbers(200,20); to return sorted list of 20 numbers in range from 0 to 200 inclusive.
645 my %list2=();
646 my $i=0;
647 my $MaxNumber=$_[0];
648 my $NumToPick=$_[1];
649 while ($i<$NumToPick)
650 {
651 my $intRand = int(rand($MaxNumber+1)); # For Zero-based perl-arrays. The +1 means this generates random integers between 0 and $MaxNumber. (See: http://perldoc.perl.org/functions/rand.html )
652 if ($intRand>$MaxNumber) {$intRand=$MaxNumber} # Just to be extra sure that don't exceed $MaxNumber.
653 if ( !exists($list2{$intRand}) ) {$list2{$intRand}=1; $i++;}
654 }
655 #foreach my $key(keys %list2) {print "$key ";}
656 # Sort the list of numbers:
657 #my @SortedList2 = sort { $a <=> $b } keys(%list2);
658 #return @SortedList2;
659 return (sort { $a <=> $b } keys(%list2));
660 #print "Sorted list of ".$NumToPick." random numbers:\n";
661 #foreach my $num(@SortedList2) {print "$num\n";}
662 #print "\n\n";
663 }
664
665
666 sub print_sequences
667 {
668 # Print the sequences wrapping sequences using index array, at line length of '$fastaLineLen' characters:
669 # Uses the global '@sequences' array.
670 my $sequence_indexes_list=$_[0]; # This is an array reference, not the array itself.
671 foreach my $num(@{$sequence_indexes_list})
672 {
673 #print "$num (max=$#sequences)\n";
674 print STATS $sequences[$num]->[1]; # Prints the header, no "\n" needed after it.
675 my $pos=0;
676 my $seqlen=$sequences[$num]->[0];
677 while ($pos<$seqlen)
678 {
679 print STATS substr($sequences[$num]->[2],$pos,$fastaLineLen)."\n";
680 $pos+=$fastaLineLen;
681 }
682 print STATS "\n";
683 }
684 }
685
686
687 =pod
688 # Some test runs for mono-nucleotides and dinucelotides:
689 >FUOMOGO01AQV42DUMMYA length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
690 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
691 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
692 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
693 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
694 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
695 >FUOMOGO01AQV42DUMMYB length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
696 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
697 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
698 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
699 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
700 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
701 >FUOMOGO01AQV42DUMMYC length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
702 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
703 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
704 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
705 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
706 >FUOMOGO01AQV42DUMMYD length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
707 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
708 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
709
710 >FUOMOGO01AQV42 length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
711 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
712 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
713 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
714 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
715 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
716 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT
717 >FUOMOGO01AUK0D length=214 xy=0231_0843 region=1 run=R_2009_04_23_17_54_06_
718 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
719 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
720 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
721 ACACACACACACACACACACACGACGACGACGAC
722 >FUOMOGO01AUB7C length=64 xy=0228_1718 region=1 run=R_2009_04_23_17_54_06_
723 ATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGTACG
724 TACG
725 >FUOMOGO01AU00B length=213 xy=0236_1097 region=1 run=R_2009_04_23_17_54_06_
726 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
727 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
728 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
729 ACACACACACACACACACACGACGACGACGACG
730 >FUOMOGO01ATYRT length=169 xy=0224_0695 region=1 run=R_2009_04_23_17_54_06_
731 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
732 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
733 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT
734 >FUOMOGO01ARMLN length=400 xy=0197_2201 region=1 run=R_2009_04_23_17_54_06_
735 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
736 TATAGTAGTAGTAGTATATATATATATATATATATATATATATATATATATATATATATA
737 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
738 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
739 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
740 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
741 TATATATATATATATATATATATATATATATATATATATA
742 >FUOMOGO01AVGRX length=44 xy=0241_1051 region=1 run=R_2009_04_23_17_54_06_
743 TATATATATATATATATATATATATATATATATATATATATATA
744 >FUOMOGO01ASZ6K length=315 xy=0213_0922 region=1 run=R_2009_04_23_17_54_06_
745 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
746 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
747 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
748 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
749 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
750 TGTGTGTGTGTGTGT
751 >FUOMOGO01ARSZF length=65 xy=0199_2281 region=1 run=R_2009_04_23_17_54_06_
752 TATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGTAC
753 GTACG
754 >FUOMOGO01AYV8U length=49 xy=0280_1324 region=1 run=R_2009_04_23_17_54_06_
755 ATATATATATATATATATATATATATATATATATATATATATATATATA
756 >FUOMOGO01AYV9X length=40 xy=0280_1363 region=1 run=R_2009_04_23_17_54_06_
757 TATATATATATATATATATATATATATATATATATATATA
758 >FUOMOGO01AUX4M length=40 xy=0235_1460 region=1 run=R_2009_04_23_17_54_06_
759 TATATATATATATATATATATATATATATATATATATATA
760 >FUOMOGO01AWOTU length=54 xy=0255_0800 region=1 run=R_2009_04_23_17_54_06_
761 ATATATATATATATATATATATATATATATATATATATATATATATATATAGTA
762 >FUOMOGO01A11TC length=66 xy=0316_1054 region=1 run=R_2009_04_23_17_54_06_
763 ATATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGTA
764 CGTACG
765 >FUOMOGO01ASRJP length=401 xy=0210_2019 region=1 run=R_2009_04_23_17_54_06_
766 TATATATATATATATATATATATATATATATATATATATATATATATATATATAGTATAT
767 AGTAGTAGTAGTATATATATATATATATATATATATATATATATATATATATATATATAT
768 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT
769 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT
770 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT
771 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT
772 ATATATATATATATATATATATATATATATATATATATATA
773 >FUOMOGO01AU1ZH length=67 xy=0236_2363 region=1 run=R_2009_04_23_17_54_06_
774 TATATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGT
775 ACGTACG
776 =cut