annotate COG/bac-genomics-scripts/calc_fastq-stats/calc_fastq-stats.pl @ 14:5a5c9a6b047b draft

Uploaded
author dereeper
date Tue, 10 Dec 2024 16:20:53 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 # POD #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 =pod
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 =head1 NAME
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 C<calc_fastq-stats.pl> - basic statistics for bases and reads in a FASTQ file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 =head1 SYNOPSIS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 C<perl calc_fastq-stats.pl -i reads.fastq>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 C<gzip -dc reads.fastq.gz | perl calc_fastq-stats.pl -i ->
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 =head1 DESCRIPTION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 Calculates some simple statistics, like individual and total base
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 counts, GC content, and basic stats for the read lengths, and
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 read/base qualities in a FASTQ file. The GC content calculation does
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 not include 'N's. Stats are printed to C<STDOUT> and optionally to an
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 output file.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 Because the quality of a read degrades over its length with all NGS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 machines, it is advisable to also plot the quality for each cycle as
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 implemented in tools like
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 L<FastQC|http://www.bioinformatics.babraham.ac.uk/projects/fastqc/>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 or the L<fastx-toolkit|http://hannonlab.cshl.edu/fastx_toolkit/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 If the sequence and the quality values are interrupted by line
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 breaks (i.e. a read is B<not> represented by four lines), please fix
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 with Heng Li's L<seqtk|https://github.com/lh3/seqtk>:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 C<seqtk seq -l 0 infile.fastq E<gt> outfile.fastq>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 An alternative tool, which is a lot faster, is B<fastq-stats> from
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 L<ea-utils|https://code.google.com/p/ea-utils/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 =head1 OPTIONS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 =head2 Mandatory options
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 =item B<-i>=I<str>, B<-input>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 Input FASTQ file or piped STDIN (-) from a gzipped file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 =item B<-q>=I<int>, B<-qual_offset>=I<int>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 ASCII quality offset of the Phred (Sanger) quality values [default 33]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 =head2 Optional options
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 =item B<-h>, B<-help>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 Help (perldoc POD)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 =item B<-c>=I<int>, B<-coverage_limit>=I<int>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 Number of bases to sample from the top of the file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 =item B<-n>=I<int>, B<-num_read>=I<int>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 Number of reads to sample from the top of the file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 =item B<-o>=I<str>, B<-output>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 Print stats in addition to C<STDOUT> to the specified output file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 =item B<-v>, B<-version>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 Print version number to C<STDERR>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 =head1 OUTPUT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 =item C<STDOUT>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 Calculated stats are printed to C<STDOUT>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 =item F<(outfile)>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 Optional outfile for stats
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 =head1 EXAMPLES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 =item C<zcat reads.fastq.gz | perl calc_fastq-stats.pl -i - -q 64 -c 175000000 -n 3000000>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 =head1 DEPENDENCIES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 If the following modules are not installed get them from
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 L<CPAN|http://www.cpan.org/>:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 =item C<Statistics::Descriptive>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 Perl module to calculate basic descriptive statistics
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 =item C<Statistics::Descriptive::Discrete>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 Perl module to calculate descriptive statistics for discrete data sets
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 =item C<Statistics::Descriptive::Weighted>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 Perl module to calculate descriptive statistics for weighted variates
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 =head1 VERSION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 0.1 28-10-2014
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 =head1 AUTHOR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 Andreas Leimbach aleimba[at]gmx[dot]de
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 =head1 LICENSE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 This program is free software: you can redistribute it and/or modify
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 it under the terms of the GNU General Public License as published by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 the Free Software Foundation; either version 3 (GPLv3) of the License,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 or (at your option) any later version.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 This program is distributed in the hope that it will be useful, but
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 WITHOUT ANY WARRANTY; without even the implied warranty of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 General Public License for more details.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 You should have received a copy of the GNU General Public License
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 along with this program. If not, see L<http://www.gnu.org/licenses/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 =cut
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 # MAIN #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 use warnings;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 use autodie;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 use Getopt::Long;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 use Pod::Usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 eval { # check if modules are installed
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165 require Statistics::Descriptive; # module of basic descriptive statistics
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166 Statistics::Descriptive->import();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167 require Statistics::Descriptive::Discrete; # module for statistics with 'discrete' values
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 Statistics::Descriptive::Discrete->import();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 require Statistics::Descriptive::Weighted; # module for weighted statistics (includes approximations for quantiles in contrast to 'Statistics::Descriptive::Discrete')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170 Statistics::Descriptive::Weighted->import();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 }; # semi-colon needed
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172 if ($@) { # if module(s) not installed die with error message
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 die "\n### Fatal error: One or several of the required statistical Perl modules 'Statistics::Descriptive', 'Statistics::Descriptive::Weighted', or 'Statistics::Descriptive::Discrete' are not installed! Please install from CPAN!\n\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176 ### Get the options with Getopt::Long
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 my $Input_File; # either input FASTQ file or STDIN
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 my $Output_File; # stats are printed to STDOUT, optionally to this output file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179 my $Qual_Ascii_Offset = 33; # ASCII quality offset for Phred (Sanger) quality scores (see below)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180 my $Coverage_Limit; # number of bases to sample
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181 my $Num_Read; # number of reads to sample
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 my $VERSION = 0.1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183 my ($Opt_Version, $Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184 GetOptions ('input=s' => \$Input_File,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185 'output:s' => \$Output_File,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186 'qual_offset=i' => \$Qual_Ascii_Offset,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 'coverage_limit:i' => \$Coverage_Limit,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188 'num_read:i' => \$Num_Read,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189 'version' => \$Opt_Version,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
190 'help|?' => \$Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
191
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
192
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
193
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
194 ### Run perldoc on POD
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
195 pod2usage(-verbose => 2) if ($Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
196 die "$0 $VERSION\n" if ($Opt_Version);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
197 if (!$Input_File) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
198 my $warning = "\n### Fatal error: Option '-i' or its argument is missing!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
199 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
200 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
203
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
204 ### Open FASTQ file or accept STDIN, open optional output file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205 my $Input_Fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206 if ($Input_File eq '-') { # file input via STDIN
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
207 $Input_Fh = *STDIN; # capture typeglob of STDIN
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
208 } else { # input via FASTQ file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
209 open ($Input_Fh, "<", "$Input_File");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
210 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
211 my $Output_Fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
212 if ($Output_File) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
213 file_exist($Output_File); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
214 open ($Output_Fh, ">", "$Output_File");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
215 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
216
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
217
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
218
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
219 ### Parse read data in FASTQ file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
220 my ($A, $C, $G, $T, $U, $N);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
221 $A = $C = $G = $T = $U = $N = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
222 my $Total_Bases = 0; # total number of bases
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
223
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
224 my $Line_Count = 0; # number of lines in the FASTQ file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
225 my $Read_Count = 0; # number of reads
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
226 my @Read_Lengths; # length of each individual read
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
227
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
228 my %Base_Quality_Counts; # quality counts for each individual base
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
229 my @Mean_Read_Quals; # average qualities for each read
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
230
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
231 my $I = 1; # mio. counter for processing status message
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
232 while (<$Input_Fh>) { # FASTQ "usually" format uses four lines per sequence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
233 chomp;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
234 $Read_Count++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
235 $Line_Count++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
236
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
237 # status message
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
238 if ($Read_Count/1000000 == $I) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
239 print STDERR "$I Mio reads processed ...\r"; # carriage return to overwrite messages and not clutter STDERR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
240 $I++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
241 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
242
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
243 # sequence identifier/read name (line 1)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
244 # e.g.: @H108:287:D0M79ACXX:7:1101:5248:1997 1:N:0:TAGGCATGAGAGTAGA
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
245 # @ <instrument‐name>:<run ID>:<flowcell ID>:<lane‐number>:<tile‐number>:<x‐pos>:<y‐pos> <read number>:<is filtered>:<control number>:<barcode sequence>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
246 # [Illumina, CASAVA v1.8 Changes, 05.01.2011]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
247 my $seq_id = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
248 die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@/);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
249 $seq_id =~ s/^@(.+)/$1/; # remove '@' to make comparable to $qual_id
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
250
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
251 # DNA sequence of the read (line 2)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
252 my $seq = read_line(); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
253 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
254 base_count($seq, $seq_id); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
255 push(@Read_Lengths, length $seq);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
256 $Total_Bases += length $seq;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
257
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
258 # optional sequence ID of quality (line 3)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
259 my $qual_id = read_line(); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
260 die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
261 $qual_id =~ s/^\+(.*)/$1/; # if optional ID is present check if equal to $seq_id in line 1
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
262 die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
263
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
264 # ASCII quality values (line 4)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
265 # The quality scores (Q; from Illumina 1.3 onwards: Phred [i.e. Sanger] Q = -10 log10(Pe); with Pe being estimated probability for base calling error) are transformed from integer to a single ASCII character for brevity so that a string can represent all of the quality scores within a read. ASCII offset of 33 (Illumina 1.8+; from Illumina 1.3+ was offset of 64). Q + 33 = ASCII. [Illumina, CASAVA v1.8 Changes, 05.01.2011]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
266 my $qual = read_line(); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
267 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
268 die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
269 die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
270
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
271 my $total_read_qual = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
272 foreach (split('', $qual)) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
273 my $phred_qual = ord($_)-$Qual_Ascii_Offset; # ord converts a character to its ASCII value (an integer), which has to be offset to get the Phred quality value
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
274 $Base_Quality_Counts{$phred_qual}++; # count individual base qualities
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
275 $total_read_qual += $phred_qual;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
276 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
277
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
278 my $mean_read_qual = sprintf("%.2f", $total_read_qual/length $qual);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
279 push(@Mean_Read_Quals, $mean_read_qual);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
280
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
281 if ($Coverage_Limit && $Total_Bases >= $Coverage_Limit) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
282 print STDERR "\nReached coverage limit of '$Coverage_Limit bp' at read number '$Read_Count' with '$Total_Bases bp'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
283 last;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
284 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
285 last if ($Num_Read && $Read_Count == $Num_Read); # skip rest of reads if $num_read is reached
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
286 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
287 print "\n"; # get rid of the carriage return for status messages
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
288 close $Input_Fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
289
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
290
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
291
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
292 ### Final sanity checks
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
293 die "\n### Fatal error:\nFASTQ file doesn't contain four lines for each read, which is an accepted convention in the FASTQ specs! If the sequence and the quality values are interrupted by line breaks please fix with Heng Li's 'seqtk' (https://github.com/lh3/seqtk):\nseqtk seq -l 0 infile.fastq > outfile.fastq\n" if ($Line_Count/4 != $Read_Count); # should be already covered as the parser assumes four lines per read (see above)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
294 my $Total_Nucs = $A + $C + $G + $T + $U + $N;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
295 die "\n### Fatal error:\nThe total number of bases '$Total_Nucs bp' is not equal to the read length sum '$Total_Bases bp'!\nAre there degenerate IUPAC bases (except for 'N') or non-nucleotide characters in the sequence (which is not allowed according to FASTQ specs)?\n" if ($Total_Nucs != $Total_Bases); # should be already covered in subroutine 'base_count' and checks in the parser above
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
296
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
297
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
298
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
299 ### Read length stats
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
300 my %Read_Lengths_Stats; # store stats for read lengths
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
301 stats_full(\@Read_Lengths, \%Read_Lengths_Stats); # subroutine for 'Statistics::Descriptive'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
302 print "#Read stats:\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
303 print_out_std("number_of_reads\t$Read_Count\n"); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
304 print_out_std("min_read_length\t$Read_Lengths_Stats{'min'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
305 print_out_std("max_read_length\t$Read_Lengths_Stats{'max'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
306 print_out_std("mean_read_length\t$Read_Lengths_Stats{'mean'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
307 print_out_std("q1_read_length\t$Read_Lengths_Stats{'q1'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
308 print_out_std("median_read_length\t$Read_Lengths_Stats{'median'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
309 print_out_std("q3_read_length\t$Read_Lengths_Stats{'q3'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
310 print_out_std("stdev_read_length\t$Read_Lengths_Stats{'sd'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
311
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
312
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
313
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
314 ### Mean read Phred quality stats
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
315 my %Mean_Read_Quals_Stats; # store mean quality stats for reads
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
316 stats_full(\@Mean_Read_Quals, \%Mean_Read_Quals_Stats); # subroutine for 'Statistics::Descriptive'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
317 print "#Read quality stats:\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
318 print_out_std("min_read_qual\t$Mean_Read_Quals_Stats{'min'}\n"); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
319 print_out_std("max_read_qual\t$Mean_Read_Quals_Stats{'max'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
320 print_out_std("mean_read_qual\t$Mean_Read_Quals_Stats{'mean'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
321 print_out_std("q1_read_qual\t$Mean_Read_Quals_Stats{'q1'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
322 print_out_std("median_read_qual\t$Mean_Read_Quals_Stats{'median'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
323 print_out_std("q3_read_qual\t$Mean_Read_Quals_Stats{'q3'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
324 print_out_std("stdev_read_qual\t$Mean_Read_Quals_Stats{'sd'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
325
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
326
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
327
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
328 ### Total bases Phred quality stats
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
329 my %Base_Quals_Stats; # store quality stats for individual bases
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
330 stats_discrete_full(\%Base_Quality_Counts, \%Base_Quals_Stats); # subroutine for 'Statistics::Descriptive::Discrete' and 'Statistics::Descriptive::Weighted'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
331 print "#Base quality stats:\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
332 print_out_std("min_base_qual\t$Base_Quals_Stats{'min'}\n"); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
333 print_out_std("max_base_qual\t$Base_Quals_Stats{'max'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
334 print_out_std("mean_base_qual\t$Base_Quals_Stats{'mean'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
335 print_out_std("q1_base_qual\t$Base_Quals_Stats{'q1'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
336 print_out_std("median_base_qual\t$Base_Quals_Stats{'median'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
337 print_out_std("q3_base_qual\t$Base_Quals_Stats{'q3'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
338 print_out_std("stdev_base_qual\t$Base_Quals_Stats{'sd'}\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
339
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
340
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
341
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
342 ### Base stats
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
343 print "#Base count stats:\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
344 print_out_std("number_of_bases\t$Total_Bases\n"); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
345 print_out_std("A_count\t$A\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
346 print_out_std("C_count\t$C\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
347 print_out_std("G_count\t$G\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
348 print_out_std("T_count\t$T\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
349 if ($U) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
350 print STDERR "Nucleotide 'U' found in the sequences, RNA?\n" if ($U);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
351 print_out_std("U_count\t$U\n") if ($U);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
352 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
353 print_out_std("N_count\t$N\n") if ($N);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
354 my $GC_Content = sprintf ("%.2f", (($C + $G)/$Total_Bases)*100);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
355 print_out_std("GC_content_[%]\t$GC_Content\n");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
356 close Output_Fh if ($Output_File);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
357
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
358
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
359 exit;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
360
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
361 #############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
362 #Subroutines#
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
363 #############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
364
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
365 ### Subroutine to count bases
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
366 sub base_count {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
367 my ($seq, $seq_id) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
368 die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
369 $A += ($seq =~ tr/[aA]//);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
370 $C += ($seq =~ tr/[cC]//);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
371 $G += ($seq =~ tr/[gG]//);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
372 $T += ($seq =~ tr/[tT]//);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
373 $U += ($seq =~ tr/[uU]//);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
374 $N += ($seq =~ tr/[nN]//);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
375 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
376 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
377
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
378
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
379
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
380 ### Subroutine to test for file existence and give warning to STDERR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
381 sub file_exist {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
382 my $file = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
383 if (-e $file) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
384 warn "\nThe result file '$file' exists already and will be overwritten!\n\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
385 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
386 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
387 return 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
388 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
389
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
390
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
391
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
392 ### Subroutine to read in a line from input, chomp, and count it
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
393 sub read_line {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
394 my $line = <$Input_Fh>;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
395 $Line_Count++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
396 chomp $line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
397 return $line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
398 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
399
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
400
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
401
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
402 ### Subroutine to calculate stats with modules 'Statistics::Descriptive::Discrete' and 'Statistics::Descriptive::Weighted' for large data sets, which overburden RAM and/or module 'Statistics::Descriptive'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
403 sub stats_discrete_full {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
404 my ($data_hash_ref, $hash_stat_ref) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
405
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
406 # convert data to needed formats
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
407 my @discrete_tuple; # for 'Statistics::Descriptive::Discrete'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
408 my (@values, @weights); # for 'Statistics::Descriptive::Weighted'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
409 foreach (sort {$a <=> $b} keys %$data_hash_ref) { # de-reference hash ref
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
410 push(@discrete_tuple, ($_, $data_hash_ref->{$_})); # expects first value then weight, de-reference
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
411 push(@values, $_);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
412 push(@weights, $data_hash_ref->{$_});
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
413 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
414
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
415 my $stat_discrete = Statistics::Descriptive::Discrete->new();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
416 $stat_discrete->add_data_tuple(@discrete_tuple);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
417 $hash_stat_ref->{'mean'} = sprintf("%.2f", $stat_discrete->mean()); # rounded to two decimal places
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
418 $hash_stat_ref->{'median'} = sprintf("%.2f", $stat_discrete->median());
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
419 $hash_stat_ref->{'sd'} = sprintf("%.2f", $stat_discrete->standard_deviation());
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
420 $hash_stat_ref->{'min'} = $stat_discrete->min();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
421 $hash_stat_ref->{'max'} = $stat_discrete->max();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
422
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
423 # quantile approximations only implemented in 'Statistics::Descriptive::Weighted' not in 'Statistics::Descriptive::Discrete'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
424 my $stat_weighted = Statistics::Descriptive::Weighted::Full->new();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
425 $stat_weighted->add_data(\@values, \@weights); # add data as array refs
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
426 $hash_stat_ref->{'q1'} = sprintf("%.2f", $stat_weighted->quantile(.25)); # Q1, first quartile (25th percentile)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
427 $hash_stat_ref->{'q3'} = sprintf("%.2f", $stat_weighted->quantile(.75)); # Q3, third quartile (75th percentile)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
428 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
429 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
430
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
431
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
432
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
433 ### Subroutine to calculate stats with module 'Statistics::Descriptive'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
434 sub stats_full {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
435 my ($data_array_ref, $hash_stat_ref) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
436 my $stat = Statistics::Descriptive::Full->new();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
437 $stat->add_data(@$data_array_ref); # de-reference array ref
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
438 $hash_stat_ref->{'mean'} = sprintf("%.2f", $stat->mean());
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
439 $hash_stat_ref->{'median'} = sprintf("%.2f", $stat->median());
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
440 $hash_stat_ref->{'sd'} = sprintf("%.2f", $stat->standard_deviation());
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
441 $hash_stat_ref->{'min'} = $stat->min();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
442 $hash_stat_ref->{'max'} = $stat->max();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
443 $hash_stat_ref->{'q1'} = sprintf("%.2f", $stat->quantile(1)); # Q1, first quartile (25th percentile)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
444 $hash_stat_ref->{'q3'} = sprintf("%.2f", $stat->quantile(3)); # Q3, third quartile (75th percentile)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
445 # $stat->clear(); # remove stats from the module for next round; not needed because of 'new()' initialisation at begin of subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
446 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
447 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
448
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
449
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
450
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
451 ### Subroutine to print to STDOUT and optionally to output file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
452 sub print_out_std {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
453 print "@_"; # print line to STDOUT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
454 print $Output_Fh "@_" if ($Output_File); # additionally, print to optional output file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
455 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
456 }