view COG/bac-genomics-scripts/calc_fastq-stats/calc_fastq-stats.pl @ 13:152d7c43478b draft default tip

Uploaded
author dereeper
date Thu, 30 May 2024 20:07:55 +0000
parents e42d30da7a74
children
line wrap: on
line source

#!/usr/bin/perl

#######
# POD #
#######

=pod

=head1 NAME

C<calc_fastq-stats.pl> - basic statistics for bases and reads in a FASTQ file

=head1 SYNOPSIS

C<perl calc_fastq-stats.pl -i reads.fastq>

B<or>

C<gzip -dc reads.fastq.gz | perl calc_fastq-stats.pl -i ->

=head1 DESCRIPTION

Calculates some simple statistics, like individual and total base
counts, GC content, and basic stats for the read lengths, and
read/base qualities in a FASTQ file. The GC content calculation does
not include 'N's. Stats are printed to C<STDOUT> and optionally to an
output file.

Because the quality of a read degrades over its length with all NGS
machines, it is advisable to also plot the quality for each cycle as
implemented in tools like
L<FastQC|http://www.bioinformatics.babraham.ac.uk/projects/fastqc/>
or the L<fastx-toolkit|http://hannonlab.cshl.edu/fastx_toolkit/>.

If the sequence and the quality values are interrupted by line
breaks (i.e. a read is B<not> represented by four lines), please fix
with Heng Li's L<seqtk|https://github.com/lh3/seqtk>:

C<seqtk seq -l 0 infile.fastq E<gt> outfile.fastq>

An alternative tool, which is a lot faster, is B<fastq-stats> from
L<ea-utils|https://code.google.com/p/ea-utils/>.

=head1 OPTIONS

=head2 Mandatory options

=over 20

=item B<-i>=I<str>, B<-input>=I<str>

Input FASTQ file or piped STDIN (-) from a gzipped file

=item B<-q>=I<int>, B<-qual_offset>=I<int>

ASCII quality offset of the Phred (Sanger) quality values [default 33]

=back

=head2 Optional options

=over 20

=item B<-h>, B<-help>

Help (perldoc POD)

=item B<-c>=I<int>, B<-coverage_limit>=I<int>

Number of bases to sample from the top of the file

=item B<-n>=I<int>, B<-num_read>=I<int>

Number of reads to sample from the top of the file

=item B<-o>=I<str>, B<-output>=I<str>

Print stats in addition to C<STDOUT> to the specified output file

=item B<-v>, B<-version>

Print version number to C<STDERR>

=back

=head1 OUTPUT

=over 20

=item C<STDOUT>

Calculated stats are printed to C<STDOUT>

=item F<(outfile)>

Optional outfile for stats

=back

=head1 EXAMPLES

=over

=item C<zcat reads.fastq.gz | perl calc_fastq-stats.pl -i - -q 64 -c 175000000 -n 3000000>

=back

=head1 DEPENDENCIES

If the following modules are not installed get them from
L<CPAN|http://www.cpan.org/>:

=over 20

=item C<Statistics::Descriptive>

Perl module to calculate basic descriptive statistics

=item C<Statistics::Descriptive::Discrete>

Perl module to calculate descriptive statistics for discrete data sets

=item C<Statistics::Descriptive::Weighted>

Perl module to calculate descriptive statistics for weighted variates

=back

=head1 VERSION

 0.1                                                       28-10-2014

=head1 AUTHOR

 Andreas Leimbach                               aleimba[at]gmx[dot]de

=head1 LICENSE

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 (GPLv3) of the License,
or (at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see L<http://www.gnu.org/licenses/>.

=cut


########
# MAIN #
########

use strict;
use warnings;
use autodie;
use Getopt::Long;
use Pod::Usage;
eval { # check if modules are installed
    require Statistics::Descriptive; # module of basic descriptive statistics
    Statistics::Descriptive->import();
    require Statistics::Descriptive::Discrete; # module for statistics with 'discrete' values
    Statistics::Descriptive::Discrete->import();
    require Statistics::Descriptive::Weighted; # module for weighted statistics (includes approximations for quantiles in contrast to 'Statistics::Descriptive::Discrete')
    Statistics::Descriptive::Weighted->import();
}; # semi-colon needed
if ($@) { # if module(s) not installed die with error message
    die "\n### Fatal error: One or several of the required statistical Perl modules 'Statistics::Descriptive', 'Statistics::Descriptive::Weighted', or 'Statistics::Descriptive::Discrete' are not installed! Please install from CPAN!\n\n";
}

### Get the options with Getopt::Long
my $Input_File; # either input FASTQ file or STDIN
my $Output_File; # stats are printed to STDOUT, optionally to this output file
my $Qual_Ascii_Offset = 33; # ASCII quality offset for Phred (Sanger) quality scores (see below)
my $Coverage_Limit; # number of bases to sample
my $Num_Read; # number of reads to sample
my $VERSION = 0.1;
my ($Opt_Version, $Opt_Help);
GetOptions ('input=s' => \$Input_File,
            'output:s' => \$Output_File,
            'qual_offset=i' => \$Qual_Ascii_Offset,
            'coverage_limit:i' => \$Coverage_Limit,
            'num_read:i' => \$Num_Read,
            'version' => \$Opt_Version,
            'help|?' => \$Opt_Help);



### Run perldoc on POD
pod2usage(-verbose => 2) if ($Opt_Help);
die "$0 $VERSION\n" if ($Opt_Version);
if (!$Input_File) {
    my $warning = "\n### Fatal error: Option '-i' or its argument is missing!\n";
    pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
}



### Open FASTQ file or accept STDIN, open optional output file
my $Input_Fh;
if ($Input_File eq '-') { # file input via STDIN
    $Input_Fh = *STDIN; # capture typeglob of STDIN
} else { # input via FASTQ file
    open ($Input_Fh, "<", "$Input_File");
}
my $Output_Fh;
if ($Output_File) {
    file_exist($Output_File); # subroutine
    open ($Output_Fh, ">", "$Output_File");
}



### Parse read data in FASTQ file
my ($A, $C, $G, $T, $U, $N);
$A = $C = $G = $T = $U = $N = 0;
my $Total_Bases = 0; # total number of bases

my $Line_Count = 0; # number of lines in the FASTQ file
my $Read_Count = 0; # number of reads
my @Read_Lengths; # length of each individual read

my %Base_Quality_Counts; # quality counts for each individual base
my @Mean_Read_Quals; # average qualities for each read

my $I = 1; # mio. counter for processing status message
while (<$Input_Fh>) { # FASTQ "usually" format uses four lines per sequence
    chomp;
    $Read_Count++;
    $Line_Count++;

    # status message
    if ($Read_Count/1000000 == $I) {
        print STDERR "$I Mio reads processed ...\r"; # carriage return to overwrite messages and not clutter STDERR
        $I++;
    }

    # sequence identifier/read name (line 1)
    # e.g.: @H108:287:D0M79ACXX:7:1101:5248:1997 1:N:0:TAGGCATGAGAGTAGA
    # @ <instrument‐name>:<run ID>:<flowcell ID>:<lane‐number>:<tile‐number>:<x‐pos>:<y‐pos> <read number>:<is filtered>:<control number>:<barcode sequence>
    # [Illumina, CASAVA v1.8 Changes, 05.01.2011]
    my $seq_id = $_;
    die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@/);
    $seq_id =~ s/^@(.+)/$1/; # remove '@' to make comparable to $qual_id

    # DNA sequence of the read (line 2)
    my $seq = read_line(); # subroutine
    die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/);
    base_count($seq, $seq_id); # subroutine
    push(@Read_Lengths, length $seq);
    $Total_Bases += length $seq;

    # optional sequence ID of quality (line 3)
    my $qual_id = read_line(); # subroutine
    die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/);
    $qual_id =~ s/^\+(.*)/$1/; # if optional ID is present check if equal to $seq_id in line 1
    die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id);

    # ASCII quality values (line 4)
    # The quality scores (Q; from Illumina 1.3 onwards: Phred [i.e. Sanger] Q = -10 log10(Pe); with Pe being estimated probability for base calling error) are transformed from integer to a single ASCII character for brevity so that a string can represent all of the quality scores within a read. ASCII offset of 33 (Illumina 1.8+; from Illumina 1.3+ was offset of 64). Q + 33 = ASCII. [Illumina, CASAVA v1.8 Changes, 05.01.2011]
    my $qual = read_line(); # subroutine
    die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/);
    die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/);
    die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq);

    my $total_read_qual = 0;
    foreach (split('', $qual)) {
        my $phred_qual = ord($_)-$Qual_Ascii_Offset; # ord converts a character to its ASCII value (an integer), which has to be offset to get the Phred quality value
        $Base_Quality_Counts{$phred_qual}++; # count individual base qualities
        $total_read_qual += $phred_qual;
    }

    my $mean_read_qual = sprintf("%.2f", $total_read_qual/length $qual);
    push(@Mean_Read_Quals, $mean_read_qual);

    if ($Coverage_Limit && $Total_Bases >= $Coverage_Limit) {
        print STDERR "\nReached coverage limit of '$Coverage_Limit bp' at read number '$Read_Count' with '$Total_Bases bp'!\n";
        last;
    }
    last if ($Num_Read && $Read_Count == $Num_Read); # skip rest of reads if $num_read is reached
}
print "\n"; # get rid of the carriage return for status messages
close $Input_Fh;



### Final sanity checks
die "\n### Fatal error:\nFASTQ file doesn't contain four lines for each read, which is an accepted convention in the FASTQ specs! If the sequence and the quality values are interrupted by line breaks please fix with Heng Li's 'seqtk' (https://github.com/lh3/seqtk):\nseqtk seq -l 0 infile.fastq > outfile.fastq\n" if ($Line_Count/4 != $Read_Count); # should be already covered as the parser assumes four lines per read (see above)
my $Total_Nucs = $A + $C + $G + $T + $U + $N;
die "\n### Fatal error:\nThe total number of bases '$Total_Nucs bp' is not equal to the read length sum '$Total_Bases bp'!\nAre there degenerate IUPAC bases (except for 'N') or non-nucleotide characters in the sequence (which is not allowed according to FASTQ specs)?\n" if ($Total_Nucs != $Total_Bases); # should be already covered in subroutine 'base_count' and checks in the parser above



### Read length stats
my %Read_Lengths_Stats; # store stats for read lengths
stats_full(\@Read_Lengths, \%Read_Lengths_Stats); # subroutine for 'Statistics::Descriptive'
print "#Read stats:\n";
print_out_std("number_of_reads\t$Read_Count\n"); # subroutine
print_out_std("min_read_length\t$Read_Lengths_Stats{'min'}\n");
print_out_std("max_read_length\t$Read_Lengths_Stats{'max'}\n");
print_out_std("mean_read_length\t$Read_Lengths_Stats{'mean'}\n");
print_out_std("q1_read_length\t$Read_Lengths_Stats{'q1'}\n");
print_out_std("median_read_length\t$Read_Lengths_Stats{'median'}\n");
print_out_std("q3_read_length\t$Read_Lengths_Stats{'q3'}\n");
print_out_std("stdev_read_length\t$Read_Lengths_Stats{'sd'}\n");



### Mean read Phred quality stats
my %Mean_Read_Quals_Stats; # store mean quality stats for reads
stats_full(\@Mean_Read_Quals, \%Mean_Read_Quals_Stats); # subroutine for 'Statistics::Descriptive'
print "#Read quality stats:\n";
print_out_std("min_read_qual\t$Mean_Read_Quals_Stats{'min'}\n"); # subroutine
print_out_std("max_read_qual\t$Mean_Read_Quals_Stats{'max'}\n");
print_out_std("mean_read_qual\t$Mean_Read_Quals_Stats{'mean'}\n");
print_out_std("q1_read_qual\t$Mean_Read_Quals_Stats{'q1'}\n");
print_out_std("median_read_qual\t$Mean_Read_Quals_Stats{'median'}\n");
print_out_std("q3_read_qual\t$Mean_Read_Quals_Stats{'q3'}\n");
print_out_std("stdev_read_qual\t$Mean_Read_Quals_Stats{'sd'}\n");



### Total bases Phred quality stats
my %Base_Quals_Stats; # store quality stats for individual bases
stats_discrete_full(\%Base_Quality_Counts, \%Base_Quals_Stats); # subroutine for 'Statistics::Descriptive::Discrete' and 'Statistics::Descriptive::Weighted'
print "#Base quality stats:\n";
print_out_std("min_base_qual\t$Base_Quals_Stats{'min'}\n"); # subroutine
print_out_std("max_base_qual\t$Base_Quals_Stats{'max'}\n");
print_out_std("mean_base_qual\t$Base_Quals_Stats{'mean'}\n");
print_out_std("q1_base_qual\t$Base_Quals_Stats{'q1'}\n");
print_out_std("median_base_qual\t$Base_Quals_Stats{'median'}\n");
print_out_std("q3_base_qual\t$Base_Quals_Stats{'q3'}\n");
print_out_std("stdev_base_qual\t$Base_Quals_Stats{'sd'}\n");



### Base stats
print "#Base count stats:\n";
print_out_std("number_of_bases\t$Total_Bases\n"); # subroutine
print_out_std("A_count\t$A\n");
print_out_std("C_count\t$C\n");
print_out_std("G_count\t$G\n");
print_out_std("T_count\t$T\n");
if ($U) {
    print STDERR "Nucleotide 'U' found in the sequences, RNA?\n" if ($U);
    print_out_std("U_count\t$U\n") if ($U);
}
print_out_std("N_count\t$N\n") if ($N);
my $GC_Content = sprintf ("%.2f", (($C + $G)/$Total_Bases)*100);
print_out_std("GC_content_[%]\t$GC_Content\n");
close Output_Fh if ($Output_File);


exit;

#############
#Subroutines#
#############

### Subroutine to count bases
sub base_count {
    my ($seq, $seq_id) = @_;
    die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i);
    $A += ($seq =~ tr/[aA]//);
    $C += ($seq =~ tr/[cC]//);
    $G += ($seq =~ tr/[gG]//);
    $T += ($seq =~ tr/[tT]//);
    $U += ($seq =~ tr/[uU]//);
    $N += ($seq =~ tr/[nN]//);
    return 1;
}



### Subroutine to test for file existence and give warning to STDERR
sub file_exist {
    my $file = shift;
    if (-e $file) {
        warn "\nThe result file '$file' exists already and will be overwritten!\n\n";
        return 1;
    }
    return 0;
}



### Subroutine to read in a line from input, chomp, and count it
sub read_line {
    my $line = <$Input_Fh>;
    $Line_Count++;
    chomp $line;
    return $line;
}



### Subroutine to calculate stats with modules 'Statistics::Descriptive::Discrete' and 'Statistics::Descriptive::Weighted' for large data sets, which overburden RAM and/or module 'Statistics::Descriptive'
sub stats_discrete_full {
    my ($data_hash_ref, $hash_stat_ref) = @_;

    # convert data to needed formats
    my @discrete_tuple; # for 'Statistics::Descriptive::Discrete'
    my (@values, @weights); # for 'Statistics::Descriptive::Weighted'
    foreach (sort {$a <=> $b} keys %$data_hash_ref) { # de-reference hash ref
        push(@discrete_tuple, ($_, $data_hash_ref->{$_})); # expects first value then weight, de-reference
        push(@values, $_);
        push(@weights, $data_hash_ref->{$_});
    }

    my $stat_discrete = Statistics::Descriptive::Discrete->new();
    $stat_discrete->add_data_tuple(@discrete_tuple);
    $hash_stat_ref->{'mean'} = sprintf("%.2f", $stat_discrete->mean()); # rounded to two decimal places
    $hash_stat_ref->{'median'} = sprintf("%.2f", $stat_discrete->median());
    $hash_stat_ref->{'sd'} = sprintf("%.2f", $stat_discrete->standard_deviation());
    $hash_stat_ref->{'min'} = $stat_discrete->min();
    $hash_stat_ref->{'max'} = $stat_discrete->max();

    # quantile approximations only implemented in 'Statistics::Descriptive::Weighted' not in 'Statistics::Descriptive::Discrete'
    my $stat_weighted = Statistics::Descriptive::Weighted::Full->new();
    $stat_weighted->add_data(\@values, \@weights); # add data as array refs
    $hash_stat_ref->{'q1'} = sprintf("%.2f", $stat_weighted->quantile(.25)); # Q1, first quartile (25th percentile)
    $hash_stat_ref->{'q3'} = sprintf("%.2f", $stat_weighted->quantile(.75)); # Q3, third quartile (75th percentile)
    return 1;
}



### Subroutine to calculate stats with module 'Statistics::Descriptive'
sub stats_full {
    my ($data_array_ref, $hash_stat_ref) = @_;
    my $stat = Statistics::Descriptive::Full->new();
    $stat->add_data(@$data_array_ref); # de-reference array ref
    $hash_stat_ref->{'mean'} = sprintf("%.2f", $stat->mean());
    $hash_stat_ref->{'median'} = sprintf("%.2f", $stat->median());
    $hash_stat_ref->{'sd'} = sprintf("%.2f", $stat->standard_deviation());
    $hash_stat_ref->{'min'} = $stat->min();
    $hash_stat_ref->{'max'} = $stat->max();
    $hash_stat_ref->{'q1'} = sprintf("%.2f", $stat->quantile(1)); # Q1, first quartile (25th percentile)
    $hash_stat_ref->{'q3'} = sprintf("%.2f", $stat->quantile(3)); # Q3, third quartile (75th percentile)
    # $stat->clear(); # remove stats from the module for next round; not needed because of 'new()' initialisation at begin of subroutine
    return 1;
}



### Subroutine to print to STDOUT and optionally to output file
sub print_out_std {
    print "@_"; # print line to STDOUT
    print $Output_Fh "@_" if ($Output_File); # additionally, print to optional output file
    return 1;
}