comparison COG/bac-genomics-scripts/sample_fastx-txt/sample_fastx-txt.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 #######
4 # POD #
5 #######
6
7 =pod
8
9 =head1 NAME
10
11 C<sample_fastx-txt.pl> - random subsampling of FASTA, FASTQ, or TEXT files
12
13 =head1 SYNOPSIS
14
15 C<perl sample_fastx-txt.pl -i infile.fasta -n 100 E<gt> subsample.fasta>
16
17 B<or>
18
19 C<zcat reads.fastq.gz | perl sample_fastx-txt.pl -i - -n 100000
20 E<gt> subsample.fastq>
21
22 =head1 DESCRIPTION
23
24 Randomly subsample FASTA, FASTQ, and TEXT files.
25
26 Empty lines in the input files will be skipped and not included in
27 sampling. Format TEXT assumes one entry per single line. FASTQ
28 format assumes B<four> lines per read, if this is not the case run
29 the FASTQ file through L<C<fastx_fix.pl>|/fastx_fix> or use Heng
30 Li's L<C<seqtk seq>|https://github.com/lh3/seqtk>:
31
32 C<seqtk seq -l 0 infile.fq E<gt> outfile.fq>
33
34 The file type is detected automatically. However, if automatic
35 detection fails, TEXT format is assumed. As a last resort, you can
36 set the file type manually with option B<-f>.
37
38 This script is an implementation of the I<reservoir sampling>
39 algorithm (or I<Algorithm R (3.4.2)>) described in Donald Knuth's
40 L<I<The Art of Computer Programming>|https://en.wikipedia.org/wiki/The_Art_of_Computer_Programming>.
41 It is designed to randomly pull a small sample size from a
42 (potential) huge input file of indeterminate size, which
43 (potentially) doesn't fit into main memory. The beauty of reservoir
44 sampling is that it requires only one pass through the input file.
45 The memory consumption of the algorithm is proportional to the
46 sample size, thus large sample sizes will consume lots of memory as
47 the whole sample will be held in memory. On the other hand, the size
48 of the initial file is irrelevant.
49
50 An alternative tool, which is a lot faster, is C<seqtk sample> from
51 the L<I<seqtk toolkit>|https://github.com/lh3/seqtk>.
52
53 =head1 OPTIONS
54
55 =head2 Mandatory options
56
57 =over 20
58
59 =item B<-i>=I<str>, B<-input>=I<str>
60
61 Input FASTA/Q or TEXT file, or piped C<STDIN> (-)
62
63 =item B<-n>=I<int>, B<-num>=I<int>
64
65 Number of entries/reads to subsample
66
67 =back
68
69 =head2 Optional options
70
71 =over 20
72
73 =item B<-h>, B<-help>
74
75 Help (perldoc POD)
76
77 =item B<-f>=I<fasta|fastq|text>, B<-file_type>=I<fasta|fastq|text>
78
79 Set the file type manually
80
81 =item B<-s>=I<int>, B<-seed>=I<int>
82
83 Set starting random seed. For B<paired-end> read data use the B<same
84 random seed> for both FASTQ files with option B<-s> to retain
85 pairing (see L<"EXAMPLES"> below).
86
87 =item B<-t>=I<int>, B<-title_skip>=I<int>
88
89 Skip the specified number of header lines in TEXT files before
90 subsampling and append them again afterwards. If you want to get rid
91 of the header as well, pipe the subsample output to
92 L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see
93 C<man sed> and L<"EXAMPLES"> below).
94
95 =item B<-v>, B<-version>
96
97 Print version number to C<STDERR>
98
99 =back
100
101 =head1 OUTPUT
102
103 =over 20
104
105 =item C<STDOUT>
106
107 The subsample of the input file is printed to C<STDOUT>. Redirect or
108 pipe into another tool as needed.
109
110 =back
111
112 =head1 EXAMPLES
113
114 =over
115
116 =item C<perl sample_fastx-txt.pl -i read-pair_1.fq -n 1000000 -s 123
117 E<gt> sub-pair_1.fq>
118
119 =item C<perl sample_fastx-txt.pl -i read-pair_2.fq -n 1000000 -s 123
120 E<gt> sub-pair_2.fq>
121
122 =item C<perl sample_fastx-txt.pl -i infile.txt -n 100 -f text -t 3
123 E<gt> subsample.txt>
124
125 =item C<perl sample_fastx-txt.pl -i infile.txt -n 350 -t 2 | sed
126 '1,2d' E<gt> sub_no-header.txt>
127
128 =back
129
130 =head1 VERSION
131
132 0.1 18-11-2014
133
134 =head1 AUTHOR
135
136 Andreas Leimbach aleimba[at]gmx[dot]de
137
138 =head1 ACKNOWLEDGEMENTS
139
140 I got the idea for reservoir sampling from Sean Eddy's keynote at
141 the Janelia meeting on L<I<High Throughput Sequencing for
142 Neuroscience>|http://cryptogenomicon.wordpress.com/2014/11/01/high-throughput-sequencing-for-neuroscience/>
143 which he posted in his blog
144 L<I<Cryptogenomicon>|http://cryptogenomicon.wordpress.com/>. The
145 L<I<Wikipedia
146 article>|https://en.wikipedia.org/wiki/Reservoir_sampling> and the
147 L<I<PerlMonks>|http://www.perlmonks.org/index.pl?node_id=177092>
148 implementation helped a lot, as well.
149
150 =head1 LICENSE
151
152 This program is free software: you can redistribute it and/or modify
153 it under the terms of the GNU General Public License as published by
154 the Free Software Foundation; either version 3 (GPLv3) of the
155 License, or (at your option) any later version.
156
157 This program is distributed in the hope that it will be useful, but
158 WITHOUT ANY WARRANTY; without even the implied warranty of
159 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
160 General Public License for more details.
161
162 You should have received a copy of the GNU General Public License
163 along with this program. If not, see L<http://www.gnu.org/licenses/>.
164
165 =cut
166
167
168 ########
169 # MAIN #
170 ########
171
172 use strict;
173 use warnings;
174 use autodie;
175 use Getopt::Long;
176 use Pod::Usage;
177
178 ### Get options with Getopt::Long
179 my $Input_File; # input file to subsample from
180 my $Sample_Num; # number of sequence entries/reads/lines to sample
181 my $File_Type; # set file type; otherwise detect file type by file extension, or by looking at the first line of the file if input is piped from STDIN
182 my $Seed; # give starting seed number for 'rand' to make results repeatable; also needed for paired FASTA/Q files
183 my $Title_Skip; # optionally skip given number of header lines of TEXT files
184 my $VERSION = 0.1;
185 my ($Opt_Version, $Opt_Help);
186 GetOptions ('input=s' => \$Input_File,
187 'num=i' => \$Sample_Num,
188 'file_type=s' => \$File_Type,
189 'seed=i' => \$Seed,
190 'title_skip=i' => \$Title_Skip,
191 'version' => \$Opt_Version,
192 'help|?' => \$Opt_Help);
193
194
195
196 ### Run perldoc on POD
197 pod2usage(-verbose => 2) if ($Opt_Help);
198 die "$0 $VERSION\n" if ($Opt_Version);
199 if (!$Input_File || !$Sample_Num) {
200 my $warning = "\n### Fatal error: Options '-i' or '-n' or their arguments are missing!\n";
201 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
202 }
203 die "\n### Fatal error:\nUnknown file type '$File_Type' given with option '-f'. Please choose from either 'fasta', 'fastq', or 'text'!\n" if ($File_Type && $File_Type !~ /(fasta|fastq|text)/i);
204
205
206
207 ### Pipe input from STDIN or open input file
208 my $Input_Fh;
209 if ($Input_File eq '-') { # file input via STDIN
210 $Input_Fh = *STDIN; # capture typeglob of STDIN
211 } else { # input via input file
212 open ($Input_Fh, "<", "$Input_File");
213 get_file_type('ext') if (!$File_Type); # subroutine to determine file type; mode 'ext' (extension) for input files
214 }
215
216
217
218 ### Reservoir sampling
219 my @Rsvr_Array; # sample reservoir (array for TEXT files, array-in-array for FASTA/Q files)
220 my @Title; # store header for TEXT files to append to the file later on, see option flag '-title_skip'
221 my $Next_Fasta_ID; # for multi-line FASTA input files to store next entry header/ID line while parsing in subroutine 'get_fastx_entry'
222
223 # 1. fill reservoir
224 while (<$Input_Fh>) {
225 if (/^\s*$/) { # skip empty lines in input
226 warn "\n### FASTQ file includes empty lines, which is unusual. Sampling FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i);
227 next;
228 }
229 chomp;
230 get_file_type('stdin', $_) if (!$File_Type && $Input_File eq '-' && $. == 1); # subroutine to determine file type; mode 'stdin' piped input with first line of file
231 warn "\n### Ignoring option '-t|-title_skip' as it has no effect on FASTA/Q files! Use the option only for TEXT files!\n\n" if($Title_Skip && $File_Type !~ /text/i && $. == 1);
232
233 # FASTA file
234 if ($File_Type =~ /fasta/i) {
235 $_ = get_fastx_entry($_); # subroutine to read one FASTA sequence entry (seq in multi-line or not), returns anonymous array
236
237 # FASTQ file
238 } elsif ($File_Type =~ /fastq/i) {
239 $_ = get_fastx_entry($_); # subroutine to read one FASTQ read composed of FOUR mandatory lines, returns reference to array
240
241 # "single-line" TEXT file
242 } elsif ($File_Type =~ /text/i) {
243 warn "\n### Sure this is a TEXT file? The first line suspiciously looks like a FASTA ID/header line:\n$_\nBut, proceeding with file type 'text' ...\n\n" if ($. == 1 && $_ =~ /^>.*/);
244 warn "\n### Sure this is a TEXT file? The first line suspiciously looks like a FASTQ sequence ID line:\n$_\nBut, proceeding with file type 'text' ...\n\n" if ($. == 1 && $_ =~ /^@.+/);
245 if ($Title_Skip) { # skip possible header lines of TEXT file
246 while ($Title_Skip) {
247 push(@Title, $_); # store header line
248 chomp($_ = <$Input_Fh>);
249 $Title_Skip--;
250 }
251 }
252 }
253
254 push(@Rsvr_Array, $_); # fill reservoir
255 last if (@Rsvr_Array == $Sample_Num); # reservoir is filled
256 }
257 die "\n### Fatal error:\nInsufficient records in input for sample size '$Sample_Num'!\n" if (@Rsvr_Array < $Sample_Num);
258
259 # 2. randomly replace elements in the reservoir with a decreasing probability
260 srand($Seed) if ($Seed); # force seed value
261 while (<$Input_Fh>) {
262 if (/^\s*$/) {
263 warn "\n### FASTQ file includes empty lines, which is unusual. Sampling FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i);
264 next;
265 }
266 my $rand_num = int(rand($.)); # choose an integer between 0 and eof-1; inclusive because array zero-based
267
268 # replace elements in reservoir array
269 if ($rand_num < @Rsvr_Array) {
270 chomp;
271
272 # FASTA file
273 if ($File_Type =~ /fasta/i) {
274 $_ = get_fastx_entry($_); # subroutine
275
276 # FASTQ file
277 } elsif ($File_Type =~ /fastq/i) {
278 $_ = get_fastx_entry($_); # subroutine
279 }
280 $Rsvr_Array[$rand_num] = $_; # TEXT files are single-line based
281
282 } elsif ($rand_num >= @Rsvr_Array) { # skip residual entry/read lines of FASTA/Q files
283 if ($File_Type =~ /fasta/i) {
284 get_fastx_entry($_); # subroutine without storing returning anonymous array (to skip FASTA multi-line seq)
285
286 } elsif ($File_Type =~ /fastq/i) { # skip second, third, and fourth line of FASTQ file
287 <$Input_Fh>; <$Input_Fh>; <$Input_Fh>;
288 }
289 }
290 }
291 close $Input_Fh;
292
293
294
295 ### Print subsample to STDOUT
296 if (@Title) { # put header back in TEXT output
297 foreach (@Title) {
298 print "$_\n";
299 }
300 }
301 foreach (@Rsvr_Array) {
302 if ($File_Type =~ /fast/i) { # for both FASTA/Q file types
303 foreach (@$_) { # de-reference array-ref for iteration
304 print "$_\n";
305 }
306
307 # single-line TEXT file
308 } else {
309 print "$_\n";
310 }
311 }
312
313 exit;
314
315
316 ###############
317 # Subroutines #
318 ###############
319
320 ### Get sequence entries/reads from FASTA/Q file
321 sub get_fastx_entry {
322 my $line = shift;
323
324 # possible multi-line seq in FASTA
325 if ($File_Type =~ /fasta/i) {
326 my ($seq, $header);
327 if ($. == 1) { # first line of file
328 die "\n### Fatal error:\nNot a FASTA input file, first line of file should be a FASTA ID/header line and start with a '>':\n$line\n" if ($line !~ /^>/);
329 $header = $line;
330 } elsif ($Next_Fasta_ID) {
331 $header = $Next_Fasta_ID;
332 $seq = $line;
333 }
334 while (<$Input_Fh>) {
335 chomp;
336 $Next_Fasta_ID = $_ if (/^>/); # store ID/header for next seq entry
337 return [$header, $seq] if (/^>/); # return anonymous array with current header and seq
338 $seq .= $_; # concatenate multi-line seq
339 }
340 return [$header, $seq] if eof;
341
342 # FASTQ: FOUR lines for each FASTQ read (seq-ID, sequence, qual-ID [optional], qual)
343 } elsif ($File_Type =~ /fastq/i) {
344 my @fastq_read;
345
346 # read name/ID, line 1
347 my $seq_id = $line;
348 die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@.+/);
349 push(@fastq_read, $seq_id);
350 $seq_id =~ s/^@//; # remove '@' to make comparable to $qual_id
351
352 # sequence, line 2
353 chomp (my $seq = <$Input_Fh>);
354 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/);
355 die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i);
356 push(@fastq_read, $seq);
357
358 # optional quality ID, line 3
359 chomp (my $qual_id = <$Input_Fh>);
360 die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/);
361 push(@fastq_read, $qual_id);
362 $qual_id =~ s/^\+//; # if optional ID is present check if equal to $seq_id in line 1
363 die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id);
364
365 # quality, line 4
366 chomp (my $qual = <$Input_Fh>);
367 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/);
368 die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/);
369 die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq);
370 push(@fastq_read, $qual);
371
372 return \@fastq_read; # return array-ref
373 }
374 }
375
376
377
378 ### Determine file type (FASTA, FASTQ, or TEXT)
379 sub get_file_type {
380 my ($mode, $line) = @_; # mode either 'ext' or 'stdin' ('stdin' needs first line of input file)
381
382 # determine file type via file extension
383 if ($mode eq 'ext') {
384 if ($Input_File =~ /.+\.(fa|fas|fasta|ffn|fna|frn|fsa)$/) { # use "|fsa)(\.gz)*$" if unzip inside script
385 $File_Type = 'fasta';
386 } elsif ($Input_File =~ /.+\.(fastq|fq)$/) {
387 $File_Type = 'fastq';
388 } else {
389 $File_Type = 'text';
390 }
391
392 # determine by looking at first line of file
393 } elsif ($mode eq 'stdin') {
394 if ($line =~ /^>.*/) {
395 $File_Type = 'fasta';
396 } elsif ($line =~ /^@.+/) {
397 $File_Type = 'fastq';
398 } else {
399 $File_Type = 'text';
400 }
401 }
402
403 print STDERR "Detected file type: $File_Type\n";
404 return 1;
405 }