comparison COG/bac-genomics-scripts/order_fastx/order_fastx.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 #######
4 # POD #
5 #######
6
7 =pod
8
9 =head1 NAME
10
11 C<order_fastx.pl> - order sequences in FASTA or FASTQ files
12
13 =head1 SYNOPSIS
14
15 C<perl order_fastx.pl -i infile.fasta -l order_id_list.txt
16 E<gt> ordered.fasta>
17
18 =head1 DESCRIPTION
19
20 Order sequence entries in FASTA or FASTQ sequence files according to
21 an ID list with a given order. Beware, the IDs in the order list
22 have to be B<identical> to the entire IDs in the sequence file.
23
24 However, the ">" or "@" ID identifiers of FASTA or FASTQ files,
25 respectively, can be omitted in the ID list.
26
27 The file type is detected automatically. But, you can set the file
28 type manually with option B<-f>. FASTQ format assumes B<four> lines
29 per read, if this is not the case run the FASTQ file through
30 L<C<fastx_fix.pl>|/fastx_fix> or use Heng Li's L<C<seqtk
31 seq>|https://github.com/lh3/seqtk>:
32
33 C<seqtk seq -l 0 infile.fq E<gt> outfile.fq>
34
35 The script can also be used to pull a subset of sequences in the ID
36 list from the sequence file. Probably best to set option flag B<-s>
37 in this case, see L<"Optional options"> below. But, rather use
38 L<C<filter_fastx.pl>|/filter_fastx>.
39
40 =head1 OPTIONS
41
42 =head2 Mandatory options
43
44 =over 20
45
46 =item B<-i>=I<str>, B<-input>=I<str>
47
48 Input FASTA or FASTQ file
49
50 =item B<-l>=I<str>, B<-list>=I<str>
51
52 List with sequence IDs in specified order
53
54 =back
55
56 =head2 Optional options
57
58 =over 20
59
60 =item B<-h>, B<-help>
61
62 Help (perldoc POD)
63
64 =item B<-f>=I<fasta|fastq>, B<-file_type>=I<fasta|fastq>
65
66 Set the file type manually
67
68 =item B<-e>, B<-error_files>
69
70 Write missing IDs in the seq file or the order ID list without an
71 equivalent in the other to error files instead of C<STDERR> (see
72 L<"OUTPUT"> below)
73
74 =item B<-s>, B<-skip_errors>
75
76 Skip missing ID error statements, excludes option B<-e>
77
78 =item B<-v>, B<-version>
79
80 Print version number to C<STDERR>
81
82 =back
83
84 =head1 OUTPUT
85
86 =over 20
87
88 =item C<STDOUT>
89
90 The newly ordered sequences are printed to C<STDOUT>. Redirect or
91 pipe into another tool as needed.
92
93 =item (F<order_ids_missing.txt>)
94
95 If IDs in the order list are missing in the sequence file with
96 option B<-e>
97
98 =item (F<seq_ids_missing.txt>)
99
100 If IDs in the sequence file are missing in the order ID list with
101 option B<-e>
102
103 =back
104
105 =head1 EXAMPLES
106
107 =over
108
109 =item C<perl order_fastx.pl -i infile.fq -l order_id_list.txt -s -f
110 fastq E<gt> ordered.fq>
111
112 =item C<perl order_fastx.pl -i infile.fasta -l order_id_list.txt -e
113 E<gt> ordered.fasta>
114
115 =back
116
117 =head1 VERSION
118
119 0.1 20-11-2014
120
121 =head1 AUTHOR
122
123 Andreas Leimbach aleimba[at]gmx[dot]de
124
125 =head1 LICENSE
126
127 This program is free software: you can redistribute it and/or modify
128 it under the terms of the GNU General Public License as published by
129 the Free Software Foundation; either version 3 (GPLv3) of the
130 License, or (at your option) any later version.
131
132 This program is distributed in the hope that it will be useful, but
133 WITHOUT ANY WARRANTY; without even the implied warranty of
134 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
135 General Public License for more details.
136
137 You should have received a copy of the GNU General Public License
138 along with this program. If not, see L<http://www.gnu.org/licenses/>.
139
140 =cut
141
142
143 ########
144 # MAIN #
145 ########
146
147 use strict;
148 use warnings;
149 use autodie;
150 use Getopt::Long;
151 use Pod::Usage;
152
153 ### Get the options with Getopt::Long
154 my $Seq_File; # sequence file to order sequences in
155 my $Order_List; # order ID list for seq file
156 my $File_Type; # set file type; otherwise detect file type by file extension
157 my $Opt_Error_Files; # print missing IDs not found in order list or seq file to error files instead of STDERR
158 my $Opt_Skip_Errors; # skip missing IDs error statements/files
159 my $VERSION = 0.1;
160 my ($Opt_Version, $Opt_Help);
161 GetOptions ('input=s' => \$Seq_File,
162 'list=s' => \$Order_List,
163 'file_type=s' => \$File_Type,
164 'error_files' => \$Opt_Error_Files,
165 'skip_errors' => \$Opt_Skip_Errors,
166 'version' => \$Opt_Version,
167 'help|?' => \$Opt_Help);
168
169
170
171 ### Run perldoc on POD
172 pod2usage(-verbose => 2) if ($Opt_Help);
173 die "$0 $VERSION\n" if ($Opt_Version);
174 if (!$Seq_File || !$Order_List) {
175 my $warning = "\n### Fatal error: Options '-i' or '-l' or their arguments are missing!\n";
176 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
177 }
178
179
180
181 ### Enforce mandatory or optional options
182 die "\n### Fatal error:\nUnknown file type '$File_Type' given with option '-f'. Please choose from either 'fasta' or 'fastq'!\n" if ($File_Type && $File_Type !~ /(fasta|fastq)/i);
183 warn "\n### Warning:\nIgnoring option flag '-e', because option '-s' set at the same time!\n\n" if ($Opt_Error_Files && $Opt_Skip_Errors);
184
185
186
187 ### Order input FASTA/FASTQ file according to a given list
188 open (my $Order_List_Fh, "<", "$Order_List");
189 open (my $Input_Fh, "<", "$Seq_File"); # pipe from STDIN not working because of 'seek' on filehandle
190 get_file_type() if (!$File_Type); # subroutine to determine file type by file extension
191
192 my %Order_List_IDs; # store order IDs and indicate if found in seq file
193 my %Seq_File_IDs; # store seq file IDs and indicate if present in order list
194
195 my $Next_Fasta_ID; # for multi-line FASTA input files to store next entry header/ID line while parsing in subroutine 'get_fastx_entry'
196 my $Parse_Run = 1; # indicate FIRST parsing cycle through seq file to collect all seq IDs
197
198 while (my $ord_id = <$Order_List_Fh>) {
199 chomp $ord_id;
200 next if ($ord_id =~ /^\s*$/); # skip emtpy lines in order list
201 $ord_id =~ s/^(>|@)//; # remove ">/@" for WHOLE string regex match -> ID in order list can be given with ">/@" or without (will be appended again in print)
202
203 if ($Order_List_IDs{$ord_id}) {
204 die "\n### Fatal error:\n'$ord_id' exists several times in '$Order_List' and IDs should be unique!\n";
205 } else {
206 $Order_List_IDs{$ord_id} = 1; # changes to 2 if ID was found in seq file
207 }
208
209 while (<$Input_Fh>) {
210 if (/^\s*$/) { # skip empty lines in input
211 warn "\n### Warning:\nFASTQ file includes empty lines, which is unusual. Parsing the FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i);
212 next;
213 }
214 chomp;
215
216 # FASTA file
217 if ($File_Type =~ /fasta/i) {
218 $_ = get_fastx_entry($_); # subroutine to read one FASTA sequence entry (seq in multi-line or not), returns anonymous array
219
220 # FASTQ file
221 } elsif ($File_Type =~ /fastq/i) {
222 $_ = get_fastx_entry($_); # subroutine to read one FASTQ read composed of FOUR mandatory lines, returns reference to array
223 }
224
225 if ($Seq_File_IDs{$_->[0]} && $Parse_Run == 1) { # only for first parse cycle, subsequent parsings of course will find the same IDs
226 die "\n### Fatal error:\n'$_->[0]' exists several times in '$Seq_File' and IDs should be unique!\n";
227 } elsif (!$Seq_File_IDs{$_->[0]}) {
228 $Seq_File_IDs{$_->[0]} = 1; # changes to 2 if present in order list
229 }
230
231 if ($ord_id =~ /^$_->[0]$/) { # order ID hit in seq file with the WHOLE string; de-reference array
232 $Order_List_IDs{$ord_id} = 2; # set to ID found
233 $Seq_File_IDs{$_->[0]} = 2;
234
235 print ">$_->[0]\n$_->[1]\n\n" if ($File_Type =~ /fasta/i); # print seq entry to STDOUT
236 print "\@$_->[0]\n$_->[1]\n$_->[2]\n$_->[3]\n" if ($File_Type =~ /fastq/i);
237
238 next if ($Parse_Run == 1); # parse the complete seq file once (skip 'last' below) to collect all seq IDs
239 last; # jump out of seq file 'while'
240 }
241 }
242
243 # rewind seq file for next order list ID
244 $Next_Fasta_ID = '';
245 seek $Input_Fh, 0, 0;
246 $. = 0; # set line number of seq file to 0 (seek doesn't do it automatically)
247 $Parse_Run = 0;
248 }
249 close $Input_Fh;
250 close $Order_List_Fh;
251
252
253
254 ### Print order and seq IDs that were not found in seq file or in order list, resp.
255 if (!$Opt_Skip_Errors) {
256 # order IDs not found in seq file
257 missing_IDs(\%Order_List_IDs, 'order_ids_missing.txt', 'order'); # subroutine to identify and print missing IDs
258
259 # seq file IDs not found in order list
260 missing_IDs(\%Seq_File_IDs, 'seq_ids_missing.txt', 'sequence'); # subroutine
261 }
262
263 exit;
264
265
266 #############
267 #Subroutines#
268 #############
269
270 ### Test for output file existence and give warning to STDERR
271 sub file_exist {
272 my $file = shift;
273 if (-e $file) {
274 warn "\n### Warning:\nThe error file '$file' exists already, the current errors will be appended to the existing file!\n";
275 return 1;
276 }
277 return 0;
278 }
279
280
281
282 ### Get sequence entries from FASTA/Q file
283 sub get_fastx_entry {
284 my $line = shift;
285
286 # possible multi-line seq in FASTA
287 if ($File_Type =~ /fasta/i) {
288 my ($seq, $header);
289 if ($. == 1) { # first line of file
290 die "\n### Fatal error:\nNot a FASTA input file, first line of file should be a FASTA ID/header line and start with a '>':\n$line\n" if ($line !~ /^>/);
291 $header = $line;
292 } elsif ($Next_Fasta_ID) {
293 $header = $Next_Fasta_ID;
294 $seq = $line;
295 }
296 while (<$Input_Fh>) {
297 chomp;
298 $Next_Fasta_ID = $_ if (/^>/); # store ID/header for next seq entry
299 $header =~ s/^>//; # remove '>' for WHOLE string regex match in MAIN
300 return [$header, $seq] if (/^>/); # return anonymous array with current header and seq
301 $seq .= $_; # concatenate multi-line seq
302 }
303 $header =~ s/^>//; # see above
304 return [$header, $seq] if eof;
305
306 # FASTQ: FOUR lines for each FASTQ read (seq-ID, sequence, qual-ID [optional], qual)
307 } elsif ($File_Type =~ /fastq/i) {
308 my @fastq_read;
309
310 # read name/ID, line 1
311 my $seq_id = $line;
312 die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@.+/);
313 $seq_id =~ s/^@//; # remove '@' to make comparable to $qual_id and for WHOLE string regex match in MAIN
314 push(@fastq_read, $seq_id);
315
316 # sequence, line 2
317 chomp (my $seq = <$Input_Fh>);
318 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/);
319 die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i);
320 push(@fastq_read, $seq);
321
322 # optional quality ID, line 3
323 chomp (my $qual_id = <$Input_Fh>);
324 die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/);
325 push(@fastq_read, $qual_id);
326 $qual_id =~ s/^\+//; # if optional ID is present check if equal to $seq_id in line 1
327 die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id);
328
329 # quality, line 4
330 chomp (my $qual = <$Input_Fh>);
331 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/);
332 die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/);
333 die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq);
334 push(@fastq_read, $qual);
335
336 return \@fastq_read; # return array-ref
337 }
338 return 0;
339 }
340
341
342
343 ### Determine file type via file extension (FASTA or FASTQ)
344 sub get_file_type {
345 if ($Seq_File =~ /.+\.(fa|fas|fasta|ffn|fna|frn|fsa)$/) { # use "|fsa)(\.gz)*$" if unzip inside script
346 $File_Type = 'fasta';
347 } elsif ($Seq_File =~ /.+\.(fastq|fq)$/) {
348 $File_Type = 'fastq';
349 }
350
351 die "\n### Fatal error:\nFile type could not be automatically detected. Sure this is a FASTA/Q file? If yes, you can force the file type by setting option '-f' to either 'fasta' or 'fastq'!\n" if (!$File_Type);
352 print STDERR "Detected file type: $File_Type\n";
353 return 1;
354 }
355
356
357
358 ### Identify and print IDs with no hit in order list or seq file
359 sub missing_IDs {
360 my ($hash_ref, $error_file, $mode) = @_;
361
362 my @missed = grep ($hash_ref->{$_} == 1, keys %$hash_ref); # set to 2 if hit, 1 if "only" present
363
364 if (@missed) {
365 file_exist($error_file) if ($Opt_Error_Files); # subroutine
366 open (my $error_fh, ">>", $error_file) if ($Opt_Error_Files);
367
368 print STDERR "\n### Warning:\nSome $mode IDs were not found in '";
369 if ($mode eq 'order') {
370 print STDERR "$Seq_File";
371 } elsif ($mode eq 'sequence') {
372 print STDERR "$Order_List";
373 }
374 print STDERR "', listed ";
375 print STDERR "below:\n" if (!$Opt_Error_Files);
376 print STDERR "in error file '$error_file'!\n" if ($Opt_Error_Files);
377
378 foreach (sort @missed) {
379 if (!$Opt_Error_Files) {
380 print STDERR "$_\t"; # separated by tab
381 } elsif ($Opt_Error_Files) {
382 print $error_fh "$_\n"; # separated by newline
383 }
384 }
385 print STDERR "\n" if (!$Opt_Error_Files); # final newline for STDERR print
386
387 close $error_fh if ($Opt_Error_Files);
388 }
389 return 1;
390 }