3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<sample_fastx-txt.pl> - random subsampling of FASTA, FASTQ, or TEXT files
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl sample_fastx-txt.pl -i infile.fasta -n 100 E<gt> subsample.fasta>
|
|
16
|
|
17 B<or>
|
|
18
|
|
19 C<zcat reads.fastq.gz | perl sample_fastx-txt.pl -i - -n 100000
|
|
20 E<gt> subsample.fastq>
|
|
21
|
|
22 =head1 DESCRIPTION
|
|
23
|
|
24 Randomly subsample FASTA, FASTQ, and TEXT files.
|
|
25
|
|
26 Empty lines in the input files will be skipped and not included in
|
|
27 sampling. Format TEXT assumes one entry per single line. FASTQ
|
|
28 format assumes B<four> lines per read, if this is not the case run
|
|
29 the FASTQ file through L<C<fastx_fix.pl>|/fastx_fix> or use Heng
|
|
30 Li's L<C<seqtk seq>|https://github.com/lh3/seqtk>:
|
|
31
|
|
32 C<seqtk seq -l 0 infile.fq E<gt> outfile.fq>
|
|
33
|
|
34 The file type is detected automatically. However, if automatic
|
|
35 detection fails, TEXT format is assumed. As a last resort, you can
|
|
36 set the file type manually with option B<-f>.
|
|
37
|
|
38 This script is an implementation of the I<reservoir sampling>
|
|
39 algorithm (or I<Algorithm R (3.4.2)>) described in Donald Knuth's
|
|
40 L<I<The Art of Computer Programming>|https://en.wikipedia.org/wiki/The_Art_of_Computer_Programming>.
|
|
41 It is designed to randomly pull a small sample size from a
|
|
42 (potential) huge input file of indeterminate size, which
|
|
43 (potentially) doesn't fit into main memory. The beauty of reservoir
|
|
44 sampling is that it requires only one pass through the input file.
|
|
45 The memory consumption of the algorithm is proportional to the
|
|
46 sample size, thus large sample sizes will consume lots of memory as
|
|
47 the whole sample will be held in memory. On the other hand, the size
|
|
48 of the initial file is irrelevant.
|
|
49
|
|
50 An alternative tool, which is a lot faster, is C<seqtk sample> from
|
|
51 the L<I<seqtk toolkit>|https://github.com/lh3/seqtk>.
|
|
52
|
|
53 =head1 OPTIONS
|
|
54
|
|
55 =head2 Mandatory options
|
|
56
|
|
57 =over 20
|
|
58
|
|
59 =item B<-i>=I<str>, B<-input>=I<str>
|
|
60
|
|
61 Input FASTA/Q or TEXT file, or piped C<STDIN> (-)
|
|
62
|
|
63 =item B<-n>=I<int>, B<-num>=I<int>
|
|
64
|
|
65 Number of entries/reads to subsample
|
|
66
|
|
67 =back
|
|
68
|
|
69 =head2 Optional options
|
|
70
|
|
71 =over 20
|
|
72
|
|
73 =item B<-h>, B<-help>
|
|
74
|
|
75 Help (perldoc POD)
|
|
76
|
|
77 =item B<-f>=I<fasta|fastq|text>, B<-file_type>=I<fasta|fastq|text>
|
|
78
|
|
79 Set the file type manually
|
|
80
|
|
81 =item B<-s>=I<int>, B<-seed>=I<int>
|
|
82
|
|
83 Set starting random seed. For B<paired-end> read data use the B<same
|
|
84 random seed> for both FASTQ files with option B<-s> to retain
|
|
85 pairing (see L<"EXAMPLES"> below).
|
|
86
|
|
87 =item B<-t>=I<int>, B<-title_skip>=I<int>
|
|
88
|
|
89 Skip the specified number of header lines in TEXT files before
|
|
90 subsampling and append them again afterwards. If you want to get rid
|
|
91 of the header as well, pipe the subsample output to
|
|
92 L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see
|
|
93 C<man sed> and L<"EXAMPLES"> below).
|
|
94
|
|
95 =item B<-v>, B<-version>
|
|
96
|
|
97 Print version number to C<STDERR>
|
|
98
|
|
99 =back
|
|
100
|
|
101 =head1 OUTPUT
|
|
102
|
|
103 =over 20
|
|
104
|
|
105 =item C<STDOUT>
|
|
106
|
|
107 The subsample of the input file is printed to C<STDOUT>. Redirect or
|
|
108 pipe into another tool as needed.
|
|
109
|
|
110 =back
|
|
111
|
|
112 =head1 EXAMPLES
|
|
113
|
|
114 =over
|
|
115
|
|
116 =item C<perl sample_fastx-txt.pl -i read-pair_1.fq -n 1000000 -s 123
|
|
117 E<gt> sub-pair_1.fq>
|
|
118
|
|
119 =item C<perl sample_fastx-txt.pl -i read-pair_2.fq -n 1000000 -s 123
|
|
120 E<gt> sub-pair_2.fq>
|
|
121
|
|
122 =item C<perl sample_fastx-txt.pl -i infile.txt -n 100 -f text -t 3
|
|
123 E<gt> subsample.txt>
|
|
124
|
|
125 =item C<perl sample_fastx-txt.pl -i infile.txt -n 350 -t 2 | sed
|
|
126 '1,2d' E<gt> sub_no-header.txt>
|
|
127
|
|
128 =back
|
|
129
|
|
130 =head1 VERSION
|
|
131
|
|
132 0.1 18-11-2014
|
|
133
|
|
134 =head1 AUTHOR
|
|
135
|
|
136 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
137
|
|
138 =head1 ACKNOWLEDGEMENTS
|
|
139
|
|
140 I got the idea for reservoir sampling from Sean Eddy's keynote at
|
|
141 the Janelia meeting on L<I<High Throughput Sequencing for
|
|
142 Neuroscience>|http://cryptogenomicon.wordpress.com/2014/11/01/high-throughput-sequencing-for-neuroscience/>
|
|
143 which he posted in his blog
|
|
144 L<I<Cryptogenomicon>|http://cryptogenomicon.wordpress.com/>. The
|
|
145 L<I<Wikipedia
|
|
146 article>|https://en.wikipedia.org/wiki/Reservoir_sampling> and the
|
|
147 L<I<PerlMonks>|http://www.perlmonks.org/index.pl?node_id=177092>
|
|
148 implementation helped a lot, as well.
|
|
149
|
|
150 =head1 LICENSE
|
|
151
|
|
152 This program is free software: you can redistribute it and/or modify
|
|
153 it under the terms of the GNU General Public License as published by
|
|
154 the Free Software Foundation; either version 3 (GPLv3) of the
|
|
155 License, or (at your option) any later version.
|
|
156
|
|
157 This program is distributed in the hope that it will be useful, but
|
|
158 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
159 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
160 General Public License for more details.
|
|
161
|
|
162 You should have received a copy of the GNU General Public License
|
|
163 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
164
|
|
165 =cut
|
|
166
|
|
167
|
|
168 ########
|
|
169 # MAIN #
|
|
170 ########
|
|
171
|
|
172 use strict;
|
|
173 use warnings;
|
|
174 use autodie;
|
|
175 use Getopt::Long;
|
|
176 use Pod::Usage;
|
|
177
|
|
178 ### Get options with Getopt::Long
|
|
179 my $Input_File; # input file to subsample from
|
|
180 my $Sample_Num; # number of sequence entries/reads/lines to sample
|
|
181 my $File_Type; # set file type; otherwise detect file type by file extension, or by looking at the first line of the file if input is piped from STDIN
|
|
182 my $Seed; # give starting seed number for 'rand' to make results repeatable; also needed for paired FASTA/Q files
|
|
183 my $Title_Skip; # optionally skip given number of header lines of TEXT files
|
|
184 my $VERSION = 0.1;
|
|
185 my ($Opt_Version, $Opt_Help);
|
|
186 GetOptions ('input=s' => \$Input_File,
|
|
187 'num=i' => \$Sample_Num,
|
|
188 'file_type=s' => \$File_Type,
|
|
189 'seed=i' => \$Seed,
|
|
190 'title_skip=i' => \$Title_Skip,
|
|
191 'version' => \$Opt_Version,
|
|
192 'help|?' => \$Opt_Help);
|
|
193
|
|
194
|
|
195
|
|
196 ### Run perldoc on POD
|
|
197 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
198 die "$0 $VERSION\n" if ($Opt_Version);
|
|
199 if (!$Input_File || !$Sample_Num) {
|
|
200 my $warning = "\n### Fatal error: Options '-i' or '-n' or their arguments are missing!\n";
|
|
201 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
202 }
|
|
203 die "\n### Fatal error:\nUnknown file type '$File_Type' given with option '-f'. Please choose from either 'fasta', 'fastq', or 'text'!\n" if ($File_Type && $File_Type !~ /(fasta|fastq|text)/i);
|
|
204
|
|
205
|
|
206
|
|
207 ### Pipe input from STDIN or open input file
|
|
208 my $Input_Fh;
|
|
209 if ($Input_File eq '-') { # file input via STDIN
|
|
210 $Input_Fh = *STDIN; # capture typeglob of STDIN
|
|
211 } else { # input via input file
|
|
212 open ($Input_Fh, "<", "$Input_File");
|
|
213 get_file_type('ext') if (!$File_Type); # subroutine to determine file type; mode 'ext' (extension) for input files
|
|
214 }
|
|
215
|
|
216
|
|
217
|
|
218 ### Reservoir sampling
|
|
219 my @Rsvr_Array; # sample reservoir (array for TEXT files, array-in-array for FASTA/Q files)
|
|
220 my @Title; # store header for TEXT files to append to the file later on, see option flag '-title_skip'
|
|
221 my $Next_Fasta_ID; # for multi-line FASTA input files to store next entry header/ID line while parsing in subroutine 'get_fastx_entry'
|
|
222
|
|
223 # 1. fill reservoir
|
|
224 while (<$Input_Fh>) {
|
|
225 if (/^\s*$/) { # skip empty lines in input
|
|
226 warn "\n### FASTQ file includes empty lines, which is unusual. Sampling FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i);
|
|
227 next;
|
|
228 }
|
|
229 chomp;
|
|
230 get_file_type('stdin', $_) if (!$File_Type && $Input_File eq '-' && $. == 1); # subroutine to determine file type; mode 'stdin' piped input with first line of file
|
|
231 warn "\n### Ignoring option '-t|-title_skip' as it has no effect on FASTA/Q files! Use the option only for TEXT files!\n\n" if($Title_Skip && $File_Type !~ /text/i && $. == 1);
|
|
232
|
|
233 # FASTA file
|
|
234 if ($File_Type =~ /fasta/i) {
|
|
235 $_ = get_fastx_entry($_); # subroutine to read one FASTA sequence entry (seq in multi-line or not), returns anonymous array
|
|
236
|
|
237 # FASTQ file
|
|
238 } elsif ($File_Type =~ /fastq/i) {
|
|
239 $_ = get_fastx_entry($_); # subroutine to read one FASTQ read composed of FOUR mandatory lines, returns reference to array
|
|
240
|
|
241 # "single-line" TEXT file
|
|
242 } elsif ($File_Type =~ /text/i) {
|
|
243 warn "\n### Sure this is a TEXT file? The first line suspiciously looks like a FASTA ID/header line:\n$_\nBut, proceeding with file type 'text' ...\n\n" if ($. == 1 && $_ =~ /^>.*/);
|
|
244 warn "\n### Sure this is a TEXT file? The first line suspiciously looks like a FASTQ sequence ID line:\n$_\nBut, proceeding with file type 'text' ...\n\n" if ($. == 1 && $_ =~ /^@.+/);
|
|
245 if ($Title_Skip) { # skip possible header lines of TEXT file
|
|
246 while ($Title_Skip) {
|
|
247 push(@Title, $_); # store header line
|
|
248 chomp($_ = <$Input_Fh>);
|
|
249 $Title_Skip--;
|
|
250 }
|
|
251 }
|
|
252 }
|
|
253
|
|
254 push(@Rsvr_Array, $_); # fill reservoir
|
|
255 last if (@Rsvr_Array == $Sample_Num); # reservoir is filled
|
|
256 }
|
|
257 die "\n### Fatal error:\nInsufficient records in input for sample size '$Sample_Num'!\n" if (@Rsvr_Array < $Sample_Num);
|
|
258
|
|
259 # 2. randomly replace elements in the reservoir with a decreasing probability
|
|
260 srand($Seed) if ($Seed); # force seed value
|
|
261 while (<$Input_Fh>) {
|
|
262 if (/^\s*$/) {
|
|
263 warn "\n### FASTQ file includes empty lines, which is unusual. Sampling FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i);
|
|
264 next;
|
|
265 }
|
|
266 my $rand_num = int(rand($.)); # choose an integer between 0 and eof-1; inclusive because array zero-based
|
|
267
|
|
268 # replace elements in reservoir array
|
|
269 if ($rand_num < @Rsvr_Array) {
|
|
270 chomp;
|
|
271
|
|
272 # FASTA file
|
|
273 if ($File_Type =~ /fasta/i) {
|
|
274 $_ = get_fastx_entry($_); # subroutine
|
|
275
|
|
276 # FASTQ file
|
|
277 } elsif ($File_Type =~ /fastq/i) {
|
|
278 $_ = get_fastx_entry($_); # subroutine
|
|
279 }
|
|
280 $Rsvr_Array[$rand_num] = $_; # TEXT files are single-line based
|
|
281
|
|
282 } elsif ($rand_num >= @Rsvr_Array) { # skip residual entry/read lines of FASTA/Q files
|
|
283 if ($File_Type =~ /fasta/i) {
|
|
284 get_fastx_entry($_); # subroutine without storing returning anonymous array (to skip FASTA multi-line seq)
|
|
285
|
|
286 } elsif ($File_Type =~ /fastq/i) { # skip second, third, and fourth line of FASTQ file
|
|
287 <$Input_Fh>; <$Input_Fh>; <$Input_Fh>;
|
|
288 }
|
|
289 }
|
|
290 }
|
|
291 close $Input_Fh;
|
|
292
|
|
293
|
|
294
|
|
295 ### Print subsample to STDOUT
|
|
296 if (@Title) { # put header back in TEXT output
|
|
297 foreach (@Title) {
|
|
298 print "$_\n";
|
|
299 }
|
|
300 }
|
|
301 foreach (@Rsvr_Array) {
|
|
302 if ($File_Type =~ /fast/i) { # for both FASTA/Q file types
|
|
303 foreach (@$_) { # de-reference array-ref for iteration
|
|
304 print "$_\n";
|
|
305 }
|
|
306
|
|
307 # single-line TEXT file
|
|
308 } else {
|
|
309 print "$_\n";
|
|
310 }
|
|
311 }
|
|
312
|
|
313 exit;
|
|
314
|
|
315
|
|
316 ###############
|
|
317 # Subroutines #
|
|
318 ###############
|
|
319
|
|
320 ### Get sequence entries/reads from FASTA/Q file
|
|
321 sub get_fastx_entry {
|
|
322 my $line = shift;
|
|
323
|
|
324 # possible multi-line seq in FASTA
|
|
325 if ($File_Type =~ /fasta/i) {
|
|
326 my ($seq, $header);
|
|
327 if ($. == 1) { # first line of file
|
|
328 die "\n### Fatal error:\nNot a FASTA input file, first line of file should be a FASTA ID/header line and start with a '>':\n$line\n" if ($line !~ /^>/);
|
|
329 $header = $line;
|
|
330 } elsif ($Next_Fasta_ID) {
|
|
331 $header = $Next_Fasta_ID;
|
|
332 $seq = $line;
|
|
333 }
|
|
334 while (<$Input_Fh>) {
|
|
335 chomp;
|
|
336 $Next_Fasta_ID = $_ if (/^>/); # store ID/header for next seq entry
|
|
337 return [$header, $seq] if (/^>/); # return anonymous array with current header and seq
|
|
338 $seq .= $_; # concatenate multi-line seq
|
|
339 }
|
|
340 return [$header, $seq] if eof;
|
|
341
|
|
342 # FASTQ: FOUR lines for each FASTQ read (seq-ID, sequence, qual-ID [optional], qual)
|
|
343 } elsif ($File_Type =~ /fastq/i) {
|
|
344 my @fastq_read;
|
|
345
|
|
346 # read name/ID, line 1
|
|
347 my $seq_id = $line;
|
|
348 die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@.+/);
|
|
349 push(@fastq_read, $seq_id);
|
|
350 $seq_id =~ s/^@//; # remove '@' to make comparable to $qual_id
|
|
351
|
|
352 # sequence, line 2
|
|
353 chomp (my $seq = <$Input_Fh>);
|
|
354 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/);
|
|
355 die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i);
|
|
356 push(@fastq_read, $seq);
|
|
357
|
|
358 # optional quality ID, line 3
|
|
359 chomp (my $qual_id = <$Input_Fh>);
|
|
360 die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/);
|
|
361 push(@fastq_read, $qual_id);
|
|
362 $qual_id =~ s/^\+//; # if optional ID is present check if equal to $seq_id in line 1
|
|
363 die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id);
|
|
364
|
|
365 # quality, line 4
|
|
366 chomp (my $qual = <$Input_Fh>);
|
|
367 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/);
|
|
368 die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/);
|
|
369 die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq);
|
|
370 push(@fastq_read, $qual);
|
|
371
|
|
372 return \@fastq_read; # return array-ref
|
|
373 }
|
|
374 }
|
|
375
|
|
376
|
|
377
|
|
378 ### Determine file type (FASTA, FASTQ, or TEXT)
|
|
379 sub get_file_type {
|
|
380 my ($mode, $line) = @_; # mode either 'ext' or 'stdin' ('stdin' needs first line of input file)
|
|
381
|
|
382 # determine file type via file extension
|
|
383 if ($mode eq 'ext') {
|
|
384 if ($Input_File =~ /.+\.(fa|fas|fasta|ffn|fna|frn|fsa)$/) { # use "|fsa)(\.gz)*$" if unzip inside script
|
|
385 $File_Type = 'fasta';
|
|
386 } elsif ($Input_File =~ /.+\.(fastq|fq)$/) {
|
|
387 $File_Type = 'fastq';
|
|
388 } else {
|
|
389 $File_Type = 'text';
|
|
390 }
|
|
391
|
|
392 # determine by looking at first line of file
|
|
393 } elsif ($mode eq 'stdin') {
|
|
394 if ($line =~ /^>.*/) {
|
|
395 $File_Type = 'fasta';
|
|
396 } elsif ($line =~ /^@.+/) {
|
|
397 $File_Type = 'fastq';
|
|
398 } else {
|
|
399 $File_Type = 'text';
|
|
400 }
|
|
401 }
|
|
402
|
|
403 print STDERR "Detected file type: $File_Type\n";
|
|
404 return 1;
|
|
405 }
|