3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<order_fastx.pl> - order sequences in FASTA or FASTQ files
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl order_fastx.pl -i infile.fasta -l order_id_list.txt
|
|
16 E<gt> ordered.fasta>
|
|
17
|
|
18 =head1 DESCRIPTION
|
|
19
|
|
20 Order sequence entries in FASTA or FASTQ sequence files according to
|
|
21 an ID list with a given order. Beware, the IDs in the order list
|
|
22 have to be B<identical> to the entire IDs in the sequence file.
|
|
23
|
|
24 However, the ">" or "@" ID identifiers of FASTA or FASTQ files,
|
|
25 respectively, can be omitted in the ID list.
|
|
26
|
|
27 The file type is detected automatically. But, you can set the file
|
|
28 type manually with option B<-f>. FASTQ format assumes B<four> lines
|
|
29 per read, if this is not the case run the FASTQ file through
|
|
30 L<C<fastx_fix.pl>|/fastx_fix> or use Heng Li's L<C<seqtk
|
|
31 seq>|https://github.com/lh3/seqtk>:
|
|
32
|
|
33 C<seqtk seq -l 0 infile.fq E<gt> outfile.fq>
|
|
34
|
|
35 The script can also be used to pull a subset of sequences in the ID
|
|
36 list from the sequence file. Probably best to set option flag B<-s>
|
|
37 in this case, see L<"Optional options"> below. But, rather use
|
|
38 L<C<filter_fastx.pl>|/filter_fastx>.
|
|
39
|
|
40 =head1 OPTIONS
|
|
41
|
|
42 =head2 Mandatory options
|
|
43
|
|
44 =over 20
|
|
45
|
|
46 =item B<-i>=I<str>, B<-input>=I<str>
|
|
47
|
|
48 Input FASTA or FASTQ file
|
|
49
|
|
50 =item B<-l>=I<str>, B<-list>=I<str>
|
|
51
|
|
52 List with sequence IDs in specified order
|
|
53
|
|
54 =back
|
|
55
|
|
56 =head2 Optional options
|
|
57
|
|
58 =over 20
|
|
59
|
|
60 =item B<-h>, B<-help>
|
|
61
|
|
62 Help (perldoc POD)
|
|
63
|
|
64 =item B<-f>=I<fasta|fastq>, B<-file_type>=I<fasta|fastq>
|
|
65
|
|
66 Set the file type manually
|
|
67
|
|
68 =item B<-e>, B<-error_files>
|
|
69
|
|
70 Write missing IDs in the seq file or the order ID list without an
|
|
71 equivalent in the other to error files instead of C<STDERR> (see
|
|
72 L<"OUTPUT"> below)
|
|
73
|
|
74 =item B<-s>, B<-skip_errors>
|
|
75
|
|
76 Skip missing ID error statements, excludes option B<-e>
|
|
77
|
|
78 =item B<-v>, B<-version>
|
|
79
|
|
80 Print version number to C<STDERR>
|
|
81
|
|
82 =back
|
|
83
|
|
84 =head1 OUTPUT
|
|
85
|
|
86 =over 20
|
|
87
|
|
88 =item C<STDOUT>
|
|
89
|
|
90 The newly ordered sequences are printed to C<STDOUT>. Redirect or
|
|
91 pipe into another tool as needed.
|
|
92
|
|
93 =item (F<order_ids_missing.txt>)
|
|
94
|
|
95 If IDs in the order list are missing in the sequence file with
|
|
96 option B<-e>
|
|
97
|
|
98 =item (F<seq_ids_missing.txt>)
|
|
99
|
|
100 If IDs in the sequence file are missing in the order ID list with
|
|
101 option B<-e>
|
|
102
|
|
103 =back
|
|
104
|
|
105 =head1 EXAMPLES
|
|
106
|
|
107 =over
|
|
108
|
|
109 =item C<perl order_fastx.pl -i infile.fq -l order_id_list.txt -s -f
|
|
110 fastq E<gt> ordered.fq>
|
|
111
|
|
112 =item C<perl order_fastx.pl -i infile.fasta -l order_id_list.txt -e
|
|
113 E<gt> ordered.fasta>
|
|
114
|
|
115 =back
|
|
116
|
|
117 =head1 VERSION
|
|
118
|
|
119 0.1 20-11-2014
|
|
120
|
|
121 =head1 AUTHOR
|
|
122
|
|
123 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
124
|
|
125 =head1 LICENSE
|
|
126
|
|
127 This program is free software: you can redistribute it and/or modify
|
|
128 it under the terms of the GNU General Public License as published by
|
|
129 the Free Software Foundation; either version 3 (GPLv3) of the
|
|
130 License, or (at your option) any later version.
|
|
131
|
|
132 This program is distributed in the hope that it will be useful, but
|
|
133 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
134 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
135 General Public License for more details.
|
|
136
|
|
137 You should have received a copy of the GNU General Public License
|
|
138 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
139
|
|
140 =cut
|
|
141
|
|
142
|
|
143 ########
|
|
144 # MAIN #
|
|
145 ########
|
|
146
|
|
147 use strict;
|
|
148 use warnings;
|
|
149 use autodie;
|
|
150 use Getopt::Long;
|
|
151 use Pod::Usage;
|
|
152
|
|
153 ### Get the options with Getopt::Long
|
|
154 my $Seq_File; # sequence file to order sequences in
|
|
155 my $Order_List; # order ID list for seq file
|
|
156 my $File_Type; # set file type; otherwise detect file type by file extension
|
|
157 my $Opt_Error_Files; # print missing IDs not found in order list or seq file to error files instead of STDERR
|
|
158 my $Opt_Skip_Errors; # skip missing IDs error statements/files
|
|
159 my $VERSION = 0.1;
|
|
160 my ($Opt_Version, $Opt_Help);
|
|
161 GetOptions ('input=s' => \$Seq_File,
|
|
162 'list=s' => \$Order_List,
|
|
163 'file_type=s' => \$File_Type,
|
|
164 'error_files' => \$Opt_Error_Files,
|
|
165 'skip_errors' => \$Opt_Skip_Errors,
|
|
166 'version' => \$Opt_Version,
|
|
167 'help|?' => \$Opt_Help);
|
|
168
|
|
169
|
|
170
|
|
171 ### Run perldoc on POD
|
|
172 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
173 die "$0 $VERSION\n" if ($Opt_Version);
|
|
174 if (!$Seq_File || !$Order_List) {
|
|
175 my $warning = "\n### Fatal error: Options '-i' or '-l' or their arguments are missing!\n";
|
|
176 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
177 }
|
|
178
|
|
179
|
|
180
|
|
181 ### Enforce mandatory or optional options
|
|
182 die "\n### Fatal error:\nUnknown file type '$File_Type' given with option '-f'. Please choose from either 'fasta' or 'fastq'!\n" if ($File_Type && $File_Type !~ /(fasta|fastq)/i);
|
|
183 warn "\n### Warning:\nIgnoring option flag '-e', because option '-s' set at the same time!\n\n" if ($Opt_Error_Files && $Opt_Skip_Errors);
|
|
184
|
|
185
|
|
186
|
|
187 ### Order input FASTA/FASTQ file according to a given list
|
|
188 open (my $Order_List_Fh, "<", "$Order_List");
|
|
189 open (my $Input_Fh, "<", "$Seq_File"); # pipe from STDIN not working because of 'seek' on filehandle
|
|
190 get_file_type() if (!$File_Type); # subroutine to determine file type by file extension
|
|
191
|
|
192 my %Order_List_IDs; # store order IDs and indicate if found in seq file
|
|
193 my %Seq_File_IDs; # store seq file IDs and indicate if present in order list
|
|
194
|
|
195 my $Next_Fasta_ID; # for multi-line FASTA input files to store next entry header/ID line while parsing in subroutine 'get_fastx_entry'
|
|
196 my $Parse_Run = 1; # indicate FIRST parsing cycle through seq file to collect all seq IDs
|
|
197
|
|
198 while (my $ord_id = <$Order_List_Fh>) {
|
|
199 chomp $ord_id;
|
|
200 next if ($ord_id =~ /^\s*$/); # skip emtpy lines in order list
|
|
201 $ord_id =~ s/^(>|@)//; # remove ">/@" for WHOLE string regex match -> ID in order list can be given with ">/@" or without (will be appended again in print)
|
|
202
|
|
203 if ($Order_List_IDs{$ord_id}) {
|
|
204 die "\n### Fatal error:\n'$ord_id' exists several times in '$Order_List' and IDs should be unique!\n";
|
|
205 } else {
|
|
206 $Order_List_IDs{$ord_id} = 1; # changes to 2 if ID was found in seq file
|
|
207 }
|
|
208
|
|
209 while (<$Input_Fh>) {
|
|
210 if (/^\s*$/) { # skip empty lines in input
|
|
211 warn "\n### Warning:\nFASTQ file includes empty lines, which is unusual. Parsing the FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i);
|
|
212 next;
|
|
213 }
|
|
214 chomp;
|
|
215
|
|
216 # FASTA file
|
|
217 if ($File_Type =~ /fasta/i) {
|
|
218 $_ = get_fastx_entry($_); # subroutine to read one FASTA sequence entry (seq in multi-line or not), returns anonymous array
|
|
219
|
|
220 # FASTQ file
|
|
221 } elsif ($File_Type =~ /fastq/i) {
|
|
222 $_ = get_fastx_entry($_); # subroutine to read one FASTQ read composed of FOUR mandatory lines, returns reference to array
|
|
223 }
|
|
224
|
|
225 if ($Seq_File_IDs{$_->[0]} && $Parse_Run == 1) { # only for first parse cycle, subsequent parsings of course will find the same IDs
|
|
226 die "\n### Fatal error:\n'$_->[0]' exists several times in '$Seq_File' and IDs should be unique!\n";
|
|
227 } elsif (!$Seq_File_IDs{$_->[0]}) {
|
|
228 $Seq_File_IDs{$_->[0]} = 1; # changes to 2 if present in order list
|
|
229 }
|
|
230
|
|
231 if ($ord_id =~ /^$_->[0]$/) { # order ID hit in seq file with the WHOLE string; de-reference array
|
|
232 $Order_List_IDs{$ord_id} = 2; # set to ID found
|
|
233 $Seq_File_IDs{$_->[0]} = 2;
|
|
234
|
|
235 print ">$_->[0]\n$_->[1]\n\n" if ($File_Type =~ /fasta/i); # print seq entry to STDOUT
|
|
236 print "\@$_->[0]\n$_->[1]\n$_->[2]\n$_->[3]\n" if ($File_Type =~ /fastq/i);
|
|
237
|
|
238 next if ($Parse_Run == 1); # parse the complete seq file once (skip 'last' below) to collect all seq IDs
|
|
239 last; # jump out of seq file 'while'
|
|
240 }
|
|
241 }
|
|
242
|
|
243 # rewind seq file for next order list ID
|
|
244 $Next_Fasta_ID = '';
|
|
245 seek $Input_Fh, 0, 0;
|
|
246 $. = 0; # set line number of seq file to 0 (seek doesn't do it automatically)
|
|
247 $Parse_Run = 0;
|
|
248 }
|
|
249 close $Input_Fh;
|
|
250 close $Order_List_Fh;
|
|
251
|
|
252
|
|
253
|
|
254 ### Print order and seq IDs that were not found in seq file or in order list, resp.
|
|
255 if (!$Opt_Skip_Errors) {
|
|
256 # order IDs not found in seq file
|
|
257 missing_IDs(\%Order_List_IDs, 'order_ids_missing.txt', 'order'); # subroutine to identify and print missing IDs
|
|
258
|
|
259 # seq file IDs not found in order list
|
|
260 missing_IDs(\%Seq_File_IDs, 'seq_ids_missing.txt', 'sequence'); # subroutine
|
|
261 }
|
|
262
|
|
263 exit;
|
|
264
|
|
265
|
|
266 #############
|
|
267 #Subroutines#
|
|
268 #############
|
|
269
|
|
270 ### Test for output file existence and give warning to STDERR
|
|
271 sub file_exist {
|
|
272 my $file = shift;
|
|
273 if (-e $file) {
|
|
274 warn "\n### Warning:\nThe error file '$file' exists already, the current errors will be appended to the existing file!\n";
|
|
275 return 1;
|
|
276 }
|
|
277 return 0;
|
|
278 }
|
|
279
|
|
280
|
|
281
|
|
282 ### Get sequence entries from FASTA/Q file
|
|
283 sub get_fastx_entry {
|
|
284 my $line = shift;
|
|
285
|
|
286 # possible multi-line seq in FASTA
|
|
287 if ($File_Type =~ /fasta/i) {
|
|
288 my ($seq, $header);
|
|
289 if ($. == 1) { # first line of file
|
|
290 die "\n### Fatal error:\nNot a FASTA input file, first line of file should be a FASTA ID/header line and start with a '>':\n$line\n" if ($line !~ /^>/);
|
|
291 $header = $line;
|
|
292 } elsif ($Next_Fasta_ID) {
|
|
293 $header = $Next_Fasta_ID;
|
|
294 $seq = $line;
|
|
295 }
|
|
296 while (<$Input_Fh>) {
|
|
297 chomp;
|
|
298 $Next_Fasta_ID = $_ if (/^>/); # store ID/header for next seq entry
|
|
299 $header =~ s/^>//; # remove '>' for WHOLE string regex match in MAIN
|
|
300 return [$header, $seq] if (/^>/); # return anonymous array with current header and seq
|
|
301 $seq .= $_; # concatenate multi-line seq
|
|
302 }
|
|
303 $header =~ s/^>//; # see above
|
|
304 return [$header, $seq] if eof;
|
|
305
|
|
306 # FASTQ: FOUR lines for each FASTQ read (seq-ID, sequence, qual-ID [optional], qual)
|
|
307 } elsif ($File_Type =~ /fastq/i) {
|
|
308 my @fastq_read;
|
|
309
|
|
310 # read name/ID, line 1
|
|
311 my $seq_id = $line;
|
|
312 die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@.+/);
|
|
313 $seq_id =~ s/^@//; # remove '@' to make comparable to $qual_id and for WHOLE string regex match in MAIN
|
|
314 push(@fastq_read, $seq_id);
|
|
315
|
|
316 # sequence, line 2
|
|
317 chomp (my $seq = <$Input_Fh>);
|
|
318 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/);
|
|
319 die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i);
|
|
320 push(@fastq_read, $seq);
|
|
321
|
|
322 # optional quality ID, line 3
|
|
323 chomp (my $qual_id = <$Input_Fh>);
|
|
324 die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/);
|
|
325 push(@fastq_read, $qual_id);
|
|
326 $qual_id =~ s/^\+//; # if optional ID is present check if equal to $seq_id in line 1
|
|
327 die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id);
|
|
328
|
|
329 # quality, line 4
|
|
330 chomp (my $qual = <$Input_Fh>);
|
|
331 die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/);
|
|
332 die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/);
|
|
333 die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq);
|
|
334 push(@fastq_read, $qual);
|
|
335
|
|
336 return \@fastq_read; # return array-ref
|
|
337 }
|
|
338 return 0;
|
|
339 }
|
|
340
|
|
341
|
|
342
|
|
343 ### Determine file type via file extension (FASTA or FASTQ)
|
|
344 sub get_file_type {
|
|
345 if ($Seq_File =~ /.+\.(fa|fas|fasta|ffn|fna|frn|fsa)$/) { # use "|fsa)(\.gz)*$" if unzip inside script
|
|
346 $File_Type = 'fasta';
|
|
347 } elsif ($Seq_File =~ /.+\.(fastq|fq)$/) {
|
|
348 $File_Type = 'fastq';
|
|
349 }
|
|
350
|
|
351 die "\n### Fatal error:\nFile type could not be automatically detected. Sure this is a FASTA/Q file? If yes, you can force the file type by setting option '-f' to either 'fasta' or 'fastq'!\n" if (!$File_Type);
|
|
352 print STDERR "Detected file type: $File_Type\n";
|
|
353 return 1;
|
|
354 }
|
|
355
|
|
356
|
|
357
|
|
358 ### Identify and print IDs with no hit in order list or seq file
|
|
359 sub missing_IDs {
|
|
360 my ($hash_ref, $error_file, $mode) = @_;
|
|
361
|
|
362 my @missed = grep ($hash_ref->{$_} == 1, keys %$hash_ref); # set to 2 if hit, 1 if "only" present
|
|
363
|
|
364 if (@missed) {
|
|
365 file_exist($error_file) if ($Opt_Error_Files); # subroutine
|
|
366 open (my $error_fh, ">>", $error_file) if ($Opt_Error_Files);
|
|
367
|
|
368 print STDERR "\n### Warning:\nSome $mode IDs were not found in '";
|
|
369 if ($mode eq 'order') {
|
|
370 print STDERR "$Seq_File";
|
|
371 } elsif ($mode eq 'sequence') {
|
|
372 print STDERR "$Order_List";
|
|
373 }
|
|
374 print STDERR "', listed ";
|
|
375 print STDERR "below:\n" if (!$Opt_Error_Files);
|
|
376 print STDERR "in error file '$error_file'!\n" if ($Opt_Error_Files);
|
|
377
|
|
378 foreach (sort @missed) {
|
|
379 if (!$Opt_Error_Files) {
|
|
380 print STDERR "$_\t"; # separated by tab
|
|
381 } elsif ($Opt_Error_Files) {
|
|
382 print $error_fh "$_\n"; # separated by newline
|
|
383 }
|
|
384 }
|
|
385 print STDERR "\n" if (!$Opt_Error_Files); # final newline for STDERR print
|
|
386
|
|
387 close $error_fh if ($Opt_Error_Files);
|
|
388 }
|
|
389 return 1;
|
|
390 }
|