Mercurial > repos > dereeper > pangenome_explorer
comparison COG/bac-genomics-scripts/sample_fastx-txt/sample_fastx-txt.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 ####### | |
4 # POD # | |
5 ####### | |
6 | |
7 =pod | |
8 | |
9 =head1 NAME | |
10 | |
11 C<sample_fastx-txt.pl> - random subsampling of FASTA, FASTQ, or TEXT files | |
12 | |
13 =head1 SYNOPSIS | |
14 | |
15 C<perl sample_fastx-txt.pl -i infile.fasta -n 100 E<gt> subsample.fasta> | |
16 | |
17 B<or> | |
18 | |
19 C<zcat reads.fastq.gz | perl sample_fastx-txt.pl -i - -n 100000 | |
20 E<gt> subsample.fastq> | |
21 | |
22 =head1 DESCRIPTION | |
23 | |
24 Randomly subsample FASTA, FASTQ, and TEXT files. | |
25 | |
26 Empty lines in the input files will be skipped and not included in | |
27 sampling. Format TEXT assumes one entry per single line. FASTQ | |
28 format assumes B<four> lines per read, if this is not the case run | |
29 the FASTQ file through L<C<fastx_fix.pl>|/fastx_fix> or use Heng | |
30 Li's L<C<seqtk seq>|https://github.com/lh3/seqtk>: | |
31 | |
32 C<seqtk seq -l 0 infile.fq E<gt> outfile.fq> | |
33 | |
34 The file type is detected automatically. However, if automatic | |
35 detection fails, TEXT format is assumed. As a last resort, you can | |
36 set the file type manually with option B<-f>. | |
37 | |
38 This script is an implementation of the I<reservoir sampling> | |
39 algorithm (or I<Algorithm R (3.4.2)>) described in Donald Knuth's | |
40 L<I<The Art of Computer Programming>|https://en.wikipedia.org/wiki/The_Art_of_Computer_Programming>. | |
41 It is designed to randomly pull a small sample size from a | |
42 (potential) huge input file of indeterminate size, which | |
43 (potentially) doesn't fit into main memory. The beauty of reservoir | |
44 sampling is that it requires only one pass through the input file. | |
45 The memory consumption of the algorithm is proportional to the | |
46 sample size, thus large sample sizes will consume lots of memory as | |
47 the whole sample will be held in memory. On the other hand, the size | |
48 of the initial file is irrelevant. | |
49 | |
50 An alternative tool, which is a lot faster, is C<seqtk sample> from | |
51 the L<I<seqtk toolkit>|https://github.com/lh3/seqtk>. | |
52 | |
53 =head1 OPTIONS | |
54 | |
55 =head2 Mandatory options | |
56 | |
57 =over 20 | |
58 | |
59 =item B<-i>=I<str>, B<-input>=I<str> | |
60 | |
61 Input FASTA/Q or TEXT file, or piped C<STDIN> (-) | |
62 | |
63 =item B<-n>=I<int>, B<-num>=I<int> | |
64 | |
65 Number of entries/reads to subsample | |
66 | |
67 =back | |
68 | |
69 =head2 Optional options | |
70 | |
71 =over 20 | |
72 | |
73 =item B<-h>, B<-help> | |
74 | |
75 Help (perldoc POD) | |
76 | |
77 =item B<-f>=I<fasta|fastq|text>, B<-file_type>=I<fasta|fastq|text> | |
78 | |
79 Set the file type manually | |
80 | |
81 =item B<-s>=I<int>, B<-seed>=I<int> | |
82 | |
83 Set starting random seed. For B<paired-end> read data use the B<same | |
84 random seed> for both FASTQ files with option B<-s> to retain | |
85 pairing (see L<"EXAMPLES"> below). | |
86 | |
87 =item B<-t>=I<int>, B<-title_skip>=I<int> | |
88 | |
89 Skip the specified number of header lines in TEXT files before | |
90 subsampling and append them again afterwards. If you want to get rid | |
91 of the header as well, pipe the subsample output to | |
92 L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see | |
93 C<man sed> and L<"EXAMPLES"> below). | |
94 | |
95 =item B<-v>, B<-version> | |
96 | |
97 Print version number to C<STDERR> | |
98 | |
99 =back | |
100 | |
101 =head1 OUTPUT | |
102 | |
103 =over 20 | |
104 | |
105 =item C<STDOUT> | |
106 | |
107 The subsample of the input file is printed to C<STDOUT>. Redirect or | |
108 pipe into another tool as needed. | |
109 | |
110 =back | |
111 | |
112 =head1 EXAMPLES | |
113 | |
114 =over | |
115 | |
116 =item C<perl sample_fastx-txt.pl -i read-pair_1.fq -n 1000000 -s 123 | |
117 E<gt> sub-pair_1.fq> | |
118 | |
119 =item C<perl sample_fastx-txt.pl -i read-pair_2.fq -n 1000000 -s 123 | |
120 E<gt> sub-pair_2.fq> | |
121 | |
122 =item C<perl sample_fastx-txt.pl -i infile.txt -n 100 -f text -t 3 | |
123 E<gt> subsample.txt> | |
124 | |
125 =item C<perl sample_fastx-txt.pl -i infile.txt -n 350 -t 2 | sed | |
126 '1,2d' E<gt> sub_no-header.txt> | |
127 | |
128 =back | |
129 | |
130 =head1 VERSION | |
131 | |
132 0.1 18-11-2014 | |
133 | |
134 =head1 AUTHOR | |
135 | |
136 Andreas Leimbach aleimba[at]gmx[dot]de | |
137 | |
138 =head1 ACKNOWLEDGEMENTS | |
139 | |
140 I got the idea for reservoir sampling from Sean Eddy's keynote at | |
141 the Janelia meeting on L<I<High Throughput Sequencing for | |
142 Neuroscience>|http://cryptogenomicon.wordpress.com/2014/11/01/high-throughput-sequencing-for-neuroscience/> | |
143 which he posted in his blog | |
144 L<I<Cryptogenomicon>|http://cryptogenomicon.wordpress.com/>. The | |
145 L<I<Wikipedia | |
146 article>|https://en.wikipedia.org/wiki/Reservoir_sampling> and the | |
147 L<I<PerlMonks>|http://www.perlmonks.org/index.pl?node_id=177092> | |
148 implementation helped a lot, as well. | |
149 | |
150 =head1 LICENSE | |
151 | |
152 This program is free software: you can redistribute it and/or modify | |
153 it under the terms of the GNU General Public License as published by | |
154 the Free Software Foundation; either version 3 (GPLv3) of the | |
155 License, or (at your option) any later version. | |
156 | |
157 This program is distributed in the hope that it will be useful, but | |
158 WITHOUT ANY WARRANTY; without even the implied warranty of | |
159 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
160 General Public License for more details. | |
161 | |
162 You should have received a copy of the GNU General Public License | |
163 along with this program. If not, see L<http://www.gnu.org/licenses/>. | |
164 | |
165 =cut | |
166 | |
167 | |
168 ######## | |
169 # MAIN # | |
170 ######## | |
171 | |
172 use strict; | |
173 use warnings; | |
174 use autodie; | |
175 use Getopt::Long; | |
176 use Pod::Usage; | |
177 | |
178 ### Get options with Getopt::Long | |
179 my $Input_File; # input file to subsample from | |
180 my $Sample_Num; # number of sequence entries/reads/lines to sample | |
181 my $File_Type; # set file type; otherwise detect file type by file extension, or by looking at the first line of the file if input is piped from STDIN | |
182 my $Seed; # give starting seed number for 'rand' to make results repeatable; also needed for paired FASTA/Q files | |
183 my $Title_Skip; # optionally skip given number of header lines of TEXT files | |
184 my $VERSION = 0.1; | |
185 my ($Opt_Version, $Opt_Help); | |
186 GetOptions ('input=s' => \$Input_File, | |
187 'num=i' => \$Sample_Num, | |
188 'file_type=s' => \$File_Type, | |
189 'seed=i' => \$Seed, | |
190 'title_skip=i' => \$Title_Skip, | |
191 'version' => \$Opt_Version, | |
192 'help|?' => \$Opt_Help); | |
193 | |
194 | |
195 | |
196 ### Run perldoc on POD | |
197 pod2usage(-verbose => 2) if ($Opt_Help); | |
198 die "$0 $VERSION\n" if ($Opt_Version); | |
199 if (!$Input_File || !$Sample_Num) { | |
200 my $warning = "\n### Fatal error: Options '-i' or '-n' or their arguments are missing!\n"; | |
201 pod2usage(-verbose => 1, -message => $warning, -exitval => 2); | |
202 } | |
203 die "\n### Fatal error:\nUnknown file type '$File_Type' given with option '-f'. Please choose from either 'fasta', 'fastq', or 'text'!\n" if ($File_Type && $File_Type !~ /(fasta|fastq|text)/i); | |
204 | |
205 | |
206 | |
207 ### Pipe input from STDIN or open input file | |
208 my $Input_Fh; | |
209 if ($Input_File eq '-') { # file input via STDIN | |
210 $Input_Fh = *STDIN; # capture typeglob of STDIN | |
211 } else { # input via input file | |
212 open ($Input_Fh, "<", "$Input_File"); | |
213 get_file_type('ext') if (!$File_Type); # subroutine to determine file type; mode 'ext' (extension) for input files | |
214 } | |
215 | |
216 | |
217 | |
218 ### Reservoir sampling | |
219 my @Rsvr_Array; # sample reservoir (array for TEXT files, array-in-array for FASTA/Q files) | |
220 my @Title; # store header for TEXT files to append to the file later on, see option flag '-title_skip' | |
221 my $Next_Fasta_ID; # for multi-line FASTA input files to store next entry header/ID line while parsing in subroutine 'get_fastx_entry' | |
222 | |
223 # 1. fill reservoir | |
224 while (<$Input_Fh>) { | |
225 if (/^\s*$/) { # skip empty lines in input | |
226 warn "\n### FASTQ file includes empty lines, which is unusual. Sampling FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i); | |
227 next; | |
228 } | |
229 chomp; | |
230 get_file_type('stdin', $_) if (!$File_Type && $Input_File eq '-' && $. == 1); # subroutine to determine file type; mode 'stdin' piped input with first line of file | |
231 warn "\n### Ignoring option '-t|-title_skip' as it has no effect on FASTA/Q files! Use the option only for TEXT files!\n\n" if($Title_Skip && $File_Type !~ /text/i && $. == 1); | |
232 | |
233 # FASTA file | |
234 if ($File_Type =~ /fasta/i) { | |
235 $_ = get_fastx_entry($_); # subroutine to read one FASTA sequence entry (seq in multi-line or not), returns anonymous array | |
236 | |
237 # FASTQ file | |
238 } elsif ($File_Type =~ /fastq/i) { | |
239 $_ = get_fastx_entry($_); # subroutine to read one FASTQ read composed of FOUR mandatory lines, returns reference to array | |
240 | |
241 # "single-line" TEXT file | |
242 } elsif ($File_Type =~ /text/i) { | |
243 warn "\n### Sure this is a TEXT file? The first line suspiciously looks like a FASTA ID/header line:\n$_\nBut, proceeding with file type 'text' ...\n\n" if ($. == 1 && $_ =~ /^>.*/); | |
244 warn "\n### Sure this is a TEXT file? The first line suspiciously looks like a FASTQ sequence ID line:\n$_\nBut, proceeding with file type 'text' ...\n\n" if ($. == 1 && $_ =~ /^@.+/); | |
245 if ($Title_Skip) { # skip possible header lines of TEXT file | |
246 while ($Title_Skip) { | |
247 push(@Title, $_); # store header line | |
248 chomp($_ = <$Input_Fh>); | |
249 $Title_Skip--; | |
250 } | |
251 } | |
252 } | |
253 | |
254 push(@Rsvr_Array, $_); # fill reservoir | |
255 last if (@Rsvr_Array == $Sample_Num); # reservoir is filled | |
256 } | |
257 die "\n### Fatal error:\nInsufficient records in input for sample size '$Sample_Num'!\n" if (@Rsvr_Array < $Sample_Num); | |
258 | |
259 # 2. randomly replace elements in the reservoir with a decreasing probability | |
260 srand($Seed) if ($Seed); # force seed value | |
261 while (<$Input_Fh>) { | |
262 if (/^\s*$/) { | |
263 warn "\n### FASTQ file includes empty lines, which is unusual. Sampling FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i); | |
264 next; | |
265 } | |
266 my $rand_num = int(rand($.)); # choose an integer between 0 and eof-1; inclusive because array zero-based | |
267 | |
268 # replace elements in reservoir array | |
269 if ($rand_num < @Rsvr_Array) { | |
270 chomp; | |
271 | |
272 # FASTA file | |
273 if ($File_Type =~ /fasta/i) { | |
274 $_ = get_fastx_entry($_); # subroutine | |
275 | |
276 # FASTQ file | |
277 } elsif ($File_Type =~ /fastq/i) { | |
278 $_ = get_fastx_entry($_); # subroutine | |
279 } | |
280 $Rsvr_Array[$rand_num] = $_; # TEXT files are single-line based | |
281 | |
282 } elsif ($rand_num >= @Rsvr_Array) { # skip residual entry/read lines of FASTA/Q files | |
283 if ($File_Type =~ /fasta/i) { | |
284 get_fastx_entry($_); # subroutine without storing returning anonymous array (to skip FASTA multi-line seq) | |
285 | |
286 } elsif ($File_Type =~ /fastq/i) { # skip second, third, and fourth line of FASTQ file | |
287 <$Input_Fh>; <$Input_Fh>; <$Input_Fh>; | |
288 } | |
289 } | |
290 } | |
291 close $Input_Fh; | |
292 | |
293 | |
294 | |
295 ### Print subsample to STDOUT | |
296 if (@Title) { # put header back in TEXT output | |
297 foreach (@Title) { | |
298 print "$_\n"; | |
299 } | |
300 } | |
301 foreach (@Rsvr_Array) { | |
302 if ($File_Type =~ /fast/i) { # for both FASTA/Q file types | |
303 foreach (@$_) { # de-reference array-ref for iteration | |
304 print "$_\n"; | |
305 } | |
306 | |
307 # single-line TEXT file | |
308 } else { | |
309 print "$_\n"; | |
310 } | |
311 } | |
312 | |
313 exit; | |
314 | |
315 | |
316 ############### | |
317 # Subroutines # | |
318 ############### | |
319 | |
320 ### Get sequence entries/reads from FASTA/Q file | |
321 sub get_fastx_entry { | |
322 my $line = shift; | |
323 | |
324 # possible multi-line seq in FASTA | |
325 if ($File_Type =~ /fasta/i) { | |
326 my ($seq, $header); | |
327 if ($. == 1) { # first line of file | |
328 die "\n### Fatal error:\nNot a FASTA input file, first line of file should be a FASTA ID/header line and start with a '>':\n$line\n" if ($line !~ /^>/); | |
329 $header = $line; | |
330 } elsif ($Next_Fasta_ID) { | |
331 $header = $Next_Fasta_ID; | |
332 $seq = $line; | |
333 } | |
334 while (<$Input_Fh>) { | |
335 chomp; | |
336 $Next_Fasta_ID = $_ if (/^>/); # store ID/header for next seq entry | |
337 return [$header, $seq] if (/^>/); # return anonymous array with current header and seq | |
338 $seq .= $_; # concatenate multi-line seq | |
339 } | |
340 return [$header, $seq] if eof; | |
341 | |
342 # FASTQ: FOUR lines for each FASTQ read (seq-ID, sequence, qual-ID [optional], qual) | |
343 } elsif ($File_Type =~ /fastq/i) { | |
344 my @fastq_read; | |
345 | |
346 # read name/ID, line 1 | |
347 my $seq_id = $line; | |
348 die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@.+/); | |
349 push(@fastq_read, $seq_id); | |
350 $seq_id =~ s/^@//; # remove '@' to make comparable to $qual_id | |
351 | |
352 # sequence, line 2 | |
353 chomp (my $seq = <$Input_Fh>); | |
354 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/); | |
355 die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i); | |
356 push(@fastq_read, $seq); | |
357 | |
358 # optional quality ID, line 3 | |
359 chomp (my $qual_id = <$Input_Fh>); | |
360 die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/); | |
361 push(@fastq_read, $qual_id); | |
362 $qual_id =~ s/^\+//; # if optional ID is present check if equal to $seq_id in line 1 | |
363 die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id); | |
364 | |
365 # quality, line 4 | |
366 chomp (my $qual = <$Input_Fh>); | |
367 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/); | |
368 die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/); | |
369 die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq); | |
370 push(@fastq_read, $qual); | |
371 | |
372 return \@fastq_read; # return array-ref | |
373 } | |
374 } | |
375 | |
376 | |
377 | |
378 ### Determine file type (FASTA, FASTQ, or TEXT) | |
379 sub get_file_type { | |
380 my ($mode, $line) = @_; # mode either 'ext' or 'stdin' ('stdin' needs first line of input file) | |
381 | |
382 # determine file type via file extension | |
383 if ($mode eq 'ext') { | |
384 if ($Input_File =~ /.+\.(fa|fas|fasta|ffn|fna|frn|fsa)$/) { # use "|fsa)(\.gz)*$" if unzip inside script | |
385 $File_Type = 'fasta'; | |
386 } elsif ($Input_File =~ /.+\.(fastq|fq)$/) { | |
387 $File_Type = 'fastq'; | |
388 } else { | |
389 $File_Type = 'text'; | |
390 } | |
391 | |
392 # determine by looking at first line of file | |
393 } elsif ($mode eq 'stdin') { | |
394 if ($line =~ /^>.*/) { | |
395 $File_Type = 'fasta'; | |
396 } elsif ($line =~ /^@.+/) { | |
397 $File_Type = 'fastq'; | |
398 } else { | |
399 $File_Type = 'text'; | |
400 } | |
401 } | |
402 | |
403 print STDERR "Detected file type: $File_Type\n"; | |
404 return 1; | |
405 } |