Mercurial > repos > dereeper > pangenome_explorer
diff COG/bac-genomics-scripts/sample_fastx-txt/sample_fastx-txt.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/COG/bac-genomics-scripts/sample_fastx-txt/sample_fastx-txt.pl Thu May 30 11:52:25 2024 +0000 @@ -0,0 +1,405 @@ +#!/usr/bin/perl + +####### +# POD # +####### + +=pod + +=head1 NAME + +C<sample_fastx-txt.pl> - random subsampling of FASTA, FASTQ, or TEXT files + +=head1 SYNOPSIS + +C<perl sample_fastx-txt.pl -i infile.fasta -n 100 E<gt> subsample.fasta> + +B<or> + +C<zcat reads.fastq.gz | perl sample_fastx-txt.pl -i - -n 100000 +E<gt> subsample.fastq> + +=head1 DESCRIPTION + +Randomly subsample FASTA, FASTQ, and TEXT files. + +Empty lines in the input files will be skipped and not included in +sampling. Format TEXT assumes one entry per single line. FASTQ +format assumes B<four> lines per read, if this is not the case run +the FASTQ file through L<C<fastx_fix.pl>|/fastx_fix> or use Heng +Li's L<C<seqtk seq>|https://github.com/lh3/seqtk>: + +C<seqtk seq -l 0 infile.fq E<gt> outfile.fq> + +The file type is detected automatically. However, if automatic +detection fails, TEXT format is assumed. As a last resort, you can +set the file type manually with option B<-f>. + +This script is an implementation of the I<reservoir sampling> +algorithm (or I<Algorithm R (3.4.2)>) described in Donald Knuth's +L<I<The Art of Computer Programming>|https://en.wikipedia.org/wiki/The_Art_of_Computer_Programming>. +It is designed to randomly pull a small sample size from a +(potential) huge input file of indeterminate size, which +(potentially) doesn't fit into main memory. The beauty of reservoir +sampling is that it requires only one pass through the input file. +The memory consumption of the algorithm is proportional to the +sample size, thus large sample sizes will consume lots of memory as +the whole sample will be held in memory. On the other hand, the size +of the initial file is irrelevant. + +An alternative tool, which is a lot faster, is C<seqtk sample> from +the L<I<seqtk toolkit>|https://github.com/lh3/seqtk>. + +=head1 OPTIONS + +=head2 Mandatory options + +=over 20 + +=item B<-i>=I<str>, B<-input>=I<str> + +Input FASTA/Q or TEXT file, or piped C<STDIN> (-) + +=item B<-n>=I<int>, B<-num>=I<int> + +Number of entries/reads to subsample + +=back + +=head2 Optional options + +=over 20 + +=item B<-h>, B<-help> + +Help (perldoc POD) + +=item B<-f>=I<fasta|fastq|text>, B<-file_type>=I<fasta|fastq|text> + +Set the file type manually + +=item B<-s>=I<int>, B<-seed>=I<int> + +Set starting random seed. For B<paired-end> read data use the B<same +random seed> for both FASTQ files with option B<-s> to retain +pairing (see L<"EXAMPLES"> below). + +=item B<-t>=I<int>, B<-title_skip>=I<int> + +Skip the specified number of header lines in TEXT files before +subsampling and append them again afterwards. If you want to get rid +of the header as well, pipe the subsample output to +L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see +C<man sed> and L<"EXAMPLES"> below). + +=item B<-v>, B<-version> + +Print version number to C<STDERR> + +=back + +=head1 OUTPUT + +=over 20 + +=item C<STDOUT> + +The subsample of the input file is printed to C<STDOUT>. Redirect or +pipe into another tool as needed. + +=back + +=head1 EXAMPLES + +=over + +=item C<perl sample_fastx-txt.pl -i read-pair_1.fq -n 1000000 -s 123 +E<gt> sub-pair_1.fq> + +=item C<perl sample_fastx-txt.pl -i read-pair_2.fq -n 1000000 -s 123 +E<gt> sub-pair_2.fq> + +=item C<perl sample_fastx-txt.pl -i infile.txt -n 100 -f text -t 3 +E<gt> subsample.txt> + +=item C<perl sample_fastx-txt.pl -i infile.txt -n 350 -t 2 | sed +'1,2d' E<gt> sub_no-header.txt> + +=back + +=head1 VERSION + + 0.1 18-11-2014 + +=head1 AUTHOR + + Andreas Leimbach aleimba[at]gmx[dot]de + +=head1 ACKNOWLEDGEMENTS + +I got the idea for reservoir sampling from Sean Eddy's keynote at +the Janelia meeting on L<I<High Throughput Sequencing for +Neuroscience>|http://cryptogenomicon.wordpress.com/2014/11/01/high-throughput-sequencing-for-neuroscience/> +which he posted in his blog +L<I<Cryptogenomicon>|http://cryptogenomicon.wordpress.com/>. The +L<I<Wikipedia +article>|https://en.wikipedia.org/wiki/Reservoir_sampling> and the +L<I<PerlMonks>|http://www.perlmonks.org/index.pl?node_id=177092> +implementation helped a lot, as well. + +=head1 LICENSE + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 (GPLv3) of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see L<http://www.gnu.org/licenses/>. + +=cut + + +######## +# MAIN # +######## + +use strict; +use warnings; +use autodie; +use Getopt::Long; +use Pod::Usage; + +### Get options with Getopt::Long +my $Input_File; # input file to subsample from +my $Sample_Num; # number of sequence entries/reads/lines to sample +my $File_Type; # set file type; otherwise detect file type by file extension, or by looking at the first line of the file if input is piped from STDIN +my $Seed; # give starting seed number for 'rand' to make results repeatable; also needed for paired FASTA/Q files +my $Title_Skip; # optionally skip given number of header lines of TEXT files +my $VERSION = 0.1; +my ($Opt_Version, $Opt_Help); +GetOptions ('input=s' => \$Input_File, + 'num=i' => \$Sample_Num, + 'file_type=s' => \$File_Type, + 'seed=i' => \$Seed, + 'title_skip=i' => \$Title_Skip, + 'version' => \$Opt_Version, + 'help|?' => \$Opt_Help); + + + +### Run perldoc on POD +pod2usage(-verbose => 2) if ($Opt_Help); +die "$0 $VERSION\n" if ($Opt_Version); +if (!$Input_File || !$Sample_Num) { + my $warning = "\n### Fatal error: Options '-i' or '-n' or their arguments are missing!\n"; + pod2usage(-verbose => 1, -message => $warning, -exitval => 2); +} +die "\n### Fatal error:\nUnknown file type '$File_Type' given with option '-f'. Please choose from either 'fasta', 'fastq', or 'text'!\n" if ($File_Type && $File_Type !~ /(fasta|fastq|text)/i); + + + +### Pipe input from STDIN or open input file +my $Input_Fh; +if ($Input_File eq '-') { # file input via STDIN + $Input_Fh = *STDIN; # capture typeglob of STDIN +} else { # input via input file + open ($Input_Fh, "<", "$Input_File"); + get_file_type('ext') if (!$File_Type); # subroutine to determine file type; mode 'ext' (extension) for input files +} + + + +### Reservoir sampling +my @Rsvr_Array; # sample reservoir (array for TEXT files, array-in-array for FASTA/Q files) +my @Title; # store header for TEXT files to append to the file later on, see option flag '-title_skip' +my $Next_Fasta_ID; # for multi-line FASTA input files to store next entry header/ID line while parsing in subroutine 'get_fastx_entry' + +# 1. fill reservoir +while (<$Input_Fh>) { + if (/^\s*$/) { # skip empty lines in input + warn "\n### FASTQ file includes empty lines, which is unusual. Sampling FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i); + next; + } + chomp; + get_file_type('stdin', $_) if (!$File_Type && $Input_File eq '-' && $. == 1); # subroutine to determine file type; mode 'stdin' piped input with first line of file + warn "\n### Ignoring option '-t|-title_skip' as it has no effect on FASTA/Q files! Use the option only for TEXT files!\n\n" if($Title_Skip && $File_Type !~ /text/i && $. == 1); + + # FASTA file + if ($File_Type =~ /fasta/i) { + $_ = get_fastx_entry($_); # subroutine to read one FASTA sequence entry (seq in multi-line or not), returns anonymous array + + # FASTQ file + } elsif ($File_Type =~ /fastq/i) { + $_ = get_fastx_entry($_); # subroutine to read one FASTQ read composed of FOUR mandatory lines, returns reference to array + + # "single-line" TEXT file + } elsif ($File_Type =~ /text/i) { + warn "\n### Sure this is a TEXT file? The first line suspiciously looks like a FASTA ID/header line:\n$_\nBut, proceeding with file type 'text' ...\n\n" if ($. == 1 && $_ =~ /^>.*/); + warn "\n### Sure this is a TEXT file? The first line suspiciously looks like a FASTQ sequence ID line:\n$_\nBut, proceeding with file type 'text' ...\n\n" if ($. == 1 && $_ =~ /^@.+/); + if ($Title_Skip) { # skip possible header lines of TEXT file + while ($Title_Skip) { + push(@Title, $_); # store header line + chomp($_ = <$Input_Fh>); + $Title_Skip--; + } + } + } + + push(@Rsvr_Array, $_); # fill reservoir + last if (@Rsvr_Array == $Sample_Num); # reservoir is filled +} +die "\n### Fatal error:\nInsufficient records in input for sample size '$Sample_Num'!\n" if (@Rsvr_Array < $Sample_Num); + +# 2. randomly replace elements in the reservoir with a decreasing probability +srand($Seed) if ($Seed); # force seed value +while (<$Input_Fh>) { + if (/^\s*$/) { + warn "\n### FASTQ file includes empty lines, which is unusual. Sampling FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i); + next; + } + my $rand_num = int(rand($.)); # choose an integer between 0 and eof-1; inclusive because array zero-based + + # replace elements in reservoir array + if ($rand_num < @Rsvr_Array) { + chomp; + + # FASTA file + if ($File_Type =~ /fasta/i) { + $_ = get_fastx_entry($_); # subroutine + + # FASTQ file + } elsif ($File_Type =~ /fastq/i) { + $_ = get_fastx_entry($_); # subroutine + } + $Rsvr_Array[$rand_num] = $_; # TEXT files are single-line based + + } elsif ($rand_num >= @Rsvr_Array) { # skip residual entry/read lines of FASTA/Q files + if ($File_Type =~ /fasta/i) { + get_fastx_entry($_); # subroutine without storing returning anonymous array (to skip FASTA multi-line seq) + + } elsif ($File_Type =~ /fastq/i) { # skip second, third, and fourth line of FASTQ file + <$Input_Fh>; <$Input_Fh>; <$Input_Fh>; + } + } +} +close $Input_Fh; + + + +### Print subsample to STDOUT +if (@Title) { # put header back in TEXT output + foreach (@Title) { + print "$_\n"; + } +} +foreach (@Rsvr_Array) { + if ($File_Type =~ /fast/i) { # for both FASTA/Q file types + foreach (@$_) { # de-reference array-ref for iteration + print "$_\n"; + } + + # single-line TEXT file + } else { + print "$_\n"; + } +} + +exit; + + +############### +# Subroutines # +############### + +### Get sequence entries/reads from FASTA/Q file +sub get_fastx_entry { + my $line = shift; + + # possible multi-line seq in FASTA + if ($File_Type =~ /fasta/i) { + my ($seq, $header); + if ($. == 1) { # first line of file + die "\n### Fatal error:\nNot a FASTA input file, first line of file should be a FASTA ID/header line and start with a '>':\n$line\n" if ($line !~ /^>/); + $header = $line; + } elsif ($Next_Fasta_ID) { + $header = $Next_Fasta_ID; + $seq = $line; + } + while (<$Input_Fh>) { + chomp; + $Next_Fasta_ID = $_ if (/^>/); # store ID/header for next seq entry + return [$header, $seq] if (/^>/); # return anonymous array with current header and seq + $seq .= $_; # concatenate multi-line seq + } + return [$header, $seq] if eof; + + # FASTQ: FOUR lines for each FASTQ read (seq-ID, sequence, qual-ID [optional], qual) + } elsif ($File_Type =~ /fastq/i) { + my @fastq_read; + + # read name/ID, line 1 + my $seq_id = $line; + die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@.+/); + push(@fastq_read, $seq_id); + $seq_id =~ s/^@//; # remove '@' to make comparable to $qual_id + + # sequence, line 2 + chomp (my $seq = <$Input_Fh>); + die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/); + die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i); + push(@fastq_read, $seq); + + # optional quality ID, line 3 + chomp (my $qual_id = <$Input_Fh>); + die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/); + push(@fastq_read, $qual_id); + $qual_id =~ s/^\+//; # if optional ID is present check if equal to $seq_id in line 1 + die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id); + + # quality, line 4 + chomp (my $qual = <$Input_Fh>); + die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/); + die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/); + die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq); + push(@fastq_read, $qual); + + return \@fastq_read; # return array-ref + } +} + + + +### Determine file type (FASTA, FASTQ, or TEXT) +sub get_file_type { + my ($mode, $line) = @_; # mode either 'ext' or 'stdin' ('stdin' needs first line of input file) + + # determine file type via file extension + if ($mode eq 'ext') { + if ($Input_File =~ /.+\.(fa|fas|fasta|ffn|fna|frn|fsa)$/) { # use "|fsa)(\.gz)*$" if unzip inside script + $File_Type = 'fasta'; + } elsif ($Input_File =~ /.+\.(fastq|fq)$/) { + $File_Type = 'fastq'; + } else { + $File_Type = 'text'; + } + + # determine by looking at first line of file + } elsif ($mode eq 'stdin') { + if ($line =~ /^>.*/) { + $File_Type = 'fasta'; + } elsif ($line =~ /^@.+/) { + $File_Type = 'fastq'; + } else { + $File_Type = 'text'; + } + } + + print STDERR "Detected file type: $File_Type\n"; + return 1; +}