Mercurial > repos > dereeper > pangenome_explorer
view COG/bac-genomics-scripts/order_fastx/order_fastx.pl @ 13:152d7c43478b draft default tip
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 20:07:55 +0000 |
parents | e42d30da7a74 |
children |
line wrap: on
line source
#!/usr/bin/perl ####### # POD # ####### =pod =head1 NAME C<order_fastx.pl> - order sequences in FASTA or FASTQ files =head1 SYNOPSIS C<perl order_fastx.pl -i infile.fasta -l order_id_list.txt E<gt> ordered.fasta> =head1 DESCRIPTION Order sequence entries in FASTA or FASTQ sequence files according to an ID list with a given order. Beware, the IDs in the order list have to be B<identical> to the entire IDs in the sequence file. However, the ">" or "@" ID identifiers of FASTA or FASTQ files, respectively, can be omitted in the ID list. The file type is detected automatically. But, you can set the file type manually with option B<-f>. FASTQ format assumes B<four> lines per read, if this is not the case run the FASTQ file through L<C<fastx_fix.pl>|/fastx_fix> or use Heng Li's L<C<seqtk seq>|https://github.com/lh3/seqtk>: C<seqtk seq -l 0 infile.fq E<gt> outfile.fq> The script can also be used to pull a subset of sequences in the ID list from the sequence file. Probably best to set option flag B<-s> in this case, see L<"Optional options"> below. But, rather use L<C<filter_fastx.pl>|/filter_fastx>. =head1 OPTIONS =head2 Mandatory options =over 20 =item B<-i>=I<str>, B<-input>=I<str> Input FASTA or FASTQ file =item B<-l>=I<str>, B<-list>=I<str> List with sequence IDs in specified order =back =head2 Optional options =over 20 =item B<-h>, B<-help> Help (perldoc POD) =item B<-f>=I<fasta|fastq>, B<-file_type>=I<fasta|fastq> Set the file type manually =item B<-e>, B<-error_files> Write missing IDs in the seq file or the order ID list without an equivalent in the other to error files instead of C<STDERR> (see L<"OUTPUT"> below) =item B<-s>, B<-skip_errors> Skip missing ID error statements, excludes option B<-e> =item B<-v>, B<-version> Print version number to C<STDERR> =back =head1 OUTPUT =over 20 =item C<STDOUT> The newly ordered sequences are printed to C<STDOUT>. Redirect or pipe into another tool as needed. =item (F<order_ids_missing.txt>) If IDs in the order list are missing in the sequence file with option B<-e> =item (F<seq_ids_missing.txt>) If IDs in the sequence file are missing in the order ID list with option B<-e> =back =head1 EXAMPLES =over =item C<perl order_fastx.pl -i infile.fq -l order_id_list.txt -s -f fastq E<gt> ordered.fq> =item C<perl order_fastx.pl -i infile.fasta -l order_id_list.txt -e E<gt> ordered.fasta> =back =head1 VERSION 0.1 20-11-2014 =head1 AUTHOR Andreas Leimbach aleimba[at]gmx[dot]de =head1 LICENSE This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 (GPLv3) of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see L<http://www.gnu.org/licenses/>. =cut ######## # MAIN # ######## use strict; use warnings; use autodie; use Getopt::Long; use Pod::Usage; ### Get the options with Getopt::Long my $Seq_File; # sequence file to order sequences in my $Order_List; # order ID list for seq file my $File_Type; # set file type; otherwise detect file type by file extension my $Opt_Error_Files; # print missing IDs not found in order list or seq file to error files instead of STDERR my $Opt_Skip_Errors; # skip missing IDs error statements/files my $VERSION = 0.1; my ($Opt_Version, $Opt_Help); GetOptions ('input=s' => \$Seq_File, 'list=s' => \$Order_List, 'file_type=s' => \$File_Type, 'error_files' => \$Opt_Error_Files, 'skip_errors' => \$Opt_Skip_Errors, 'version' => \$Opt_Version, 'help|?' => \$Opt_Help); ### Run perldoc on POD pod2usage(-verbose => 2) if ($Opt_Help); die "$0 $VERSION\n" if ($Opt_Version); if (!$Seq_File || !$Order_List) { my $warning = "\n### Fatal error: Options '-i' or '-l' or their arguments are missing!\n"; pod2usage(-verbose => 1, -message => $warning, -exitval => 2); } ### Enforce mandatory or optional options die "\n### Fatal error:\nUnknown file type '$File_Type' given with option '-f'. Please choose from either 'fasta' or 'fastq'!\n" if ($File_Type && $File_Type !~ /(fasta|fastq)/i); warn "\n### Warning:\nIgnoring option flag '-e', because option '-s' set at the same time!\n\n" if ($Opt_Error_Files && $Opt_Skip_Errors); ### Order input FASTA/FASTQ file according to a given list open (my $Order_List_Fh, "<", "$Order_List"); open (my $Input_Fh, "<", "$Seq_File"); # pipe from STDIN not working because of 'seek' on filehandle get_file_type() if (!$File_Type); # subroutine to determine file type by file extension my %Order_List_IDs; # store order IDs and indicate if found in seq file my %Seq_File_IDs; # store seq file IDs and indicate if present in order list my $Next_Fasta_ID; # for multi-line FASTA input files to store next entry header/ID line while parsing in subroutine 'get_fastx_entry' my $Parse_Run = 1; # indicate FIRST parsing cycle through seq file to collect all seq IDs while (my $ord_id = <$Order_List_Fh>) { chomp $ord_id; next if ($ord_id =~ /^\s*$/); # skip emtpy lines in order list $ord_id =~ s/^(>|@)//; # remove ">/@" for WHOLE string regex match -> ID in order list can be given with ">/@" or without (will be appended again in print) if ($Order_List_IDs{$ord_id}) { die "\n### Fatal error:\n'$ord_id' exists several times in '$Order_List' and IDs should be unique!\n"; } else { $Order_List_IDs{$ord_id} = 1; # changes to 2 if ID was found in seq file } while (<$Input_Fh>) { if (/^\s*$/) { # skip empty lines in input warn "\n### Warning:\nFASTQ file includes empty lines, which is unusual. Parsing the FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i); next; } chomp; # FASTA file if ($File_Type =~ /fasta/i) { $_ = get_fastx_entry($_); # subroutine to read one FASTA sequence entry (seq in multi-line or not), returns anonymous array # FASTQ file } elsif ($File_Type =~ /fastq/i) { $_ = get_fastx_entry($_); # subroutine to read one FASTQ read composed of FOUR mandatory lines, returns reference to array } if ($Seq_File_IDs{$_->[0]} && $Parse_Run == 1) { # only for first parse cycle, subsequent parsings of course will find the same IDs die "\n### Fatal error:\n'$_->[0]' exists several times in '$Seq_File' and IDs should be unique!\n"; } elsif (!$Seq_File_IDs{$_->[0]}) { $Seq_File_IDs{$_->[0]} = 1; # changes to 2 if present in order list } if ($ord_id =~ /^$_->[0]$/) { # order ID hit in seq file with the WHOLE string; de-reference array $Order_List_IDs{$ord_id} = 2; # set to ID found $Seq_File_IDs{$_->[0]} = 2; print ">$_->[0]\n$_->[1]\n\n" if ($File_Type =~ /fasta/i); # print seq entry to STDOUT print "\@$_->[0]\n$_->[1]\n$_->[2]\n$_->[3]\n" if ($File_Type =~ /fastq/i); next if ($Parse_Run == 1); # parse the complete seq file once (skip 'last' below) to collect all seq IDs last; # jump out of seq file 'while' } } # rewind seq file for next order list ID $Next_Fasta_ID = ''; seek $Input_Fh, 0, 0; $. = 0; # set line number of seq file to 0 (seek doesn't do it automatically) $Parse_Run = 0; } close $Input_Fh; close $Order_List_Fh; ### Print order and seq IDs that were not found in seq file or in order list, resp. if (!$Opt_Skip_Errors) { # order IDs not found in seq file missing_IDs(\%Order_List_IDs, 'order_ids_missing.txt', 'order'); # subroutine to identify and print missing IDs # seq file IDs not found in order list missing_IDs(\%Seq_File_IDs, 'seq_ids_missing.txt', 'sequence'); # subroutine } exit; ############# #Subroutines# ############# ### Test for output file existence and give warning to STDERR sub file_exist { my $file = shift; if (-e $file) { warn "\n### Warning:\nThe error file '$file' exists already, the current errors will be appended to the existing file!\n"; return 1; } return 0; } ### Get sequence entries from FASTA/Q file sub get_fastx_entry { my $line = shift; # possible multi-line seq in FASTA if ($File_Type =~ /fasta/i) { my ($seq, $header); if ($. == 1) { # first line of file die "\n### Fatal error:\nNot a FASTA input file, first line of file should be a FASTA ID/header line and start with a '>':\n$line\n" if ($line !~ /^>/); $header = $line; } elsif ($Next_Fasta_ID) { $header = $Next_Fasta_ID; $seq = $line; } while (<$Input_Fh>) { chomp; $Next_Fasta_ID = $_ if (/^>/); # store ID/header for next seq entry $header =~ s/^>//; # remove '>' for WHOLE string regex match in MAIN return [$header, $seq] if (/^>/); # return anonymous array with current header and seq $seq .= $_; # concatenate multi-line seq } $header =~ s/^>//; # see above return [$header, $seq] if eof; # FASTQ: FOUR lines for each FASTQ read (seq-ID, sequence, qual-ID [optional], qual) } elsif ($File_Type =~ /fastq/i) { my @fastq_read; # read name/ID, line 1 my $seq_id = $line; die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@.+/); $seq_id =~ s/^@//; # remove '@' to make comparable to $qual_id and for WHOLE string regex match in MAIN push(@fastq_read, $seq_id); # sequence, line 2 chomp (my $seq = <$Input_Fh>); die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/); die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i); push(@fastq_read, $seq); # optional quality ID, line 3 chomp (my $qual_id = <$Input_Fh>); die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/); push(@fastq_read, $qual_id); $qual_id =~ s/^\+//; # if optional ID is present check if equal to $seq_id in line 1 die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id); # quality, line 4 chomp (my $qual = <$Input_Fh>); die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/); die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/); die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq); push(@fastq_read, $qual); return \@fastq_read; # return array-ref } return 0; } ### Determine file type via file extension (FASTA or FASTQ) sub get_file_type { if ($Seq_File =~ /.+\.(fa|fas|fasta|ffn|fna|frn|fsa)$/) { # use "|fsa)(\.gz)*$" if unzip inside script $File_Type = 'fasta'; } elsif ($Seq_File =~ /.+\.(fastq|fq)$/) { $File_Type = 'fastq'; } die "\n### Fatal error:\nFile type could not be automatically detected. Sure this is a FASTA/Q file? If yes, you can force the file type by setting option '-f' to either 'fasta' or 'fastq'!\n" if (!$File_Type); print STDERR "Detected file type: $File_Type\n"; return 1; } ### Identify and print IDs with no hit in order list or seq file sub missing_IDs { my ($hash_ref, $error_file, $mode) = @_; my @missed = grep ($hash_ref->{$_} == 1, keys %$hash_ref); # set to 2 if hit, 1 if "only" present if (@missed) { file_exist($error_file) if ($Opt_Error_Files); # subroutine open (my $error_fh, ">>", $error_file) if ($Opt_Error_Files); print STDERR "\n### Warning:\nSome $mode IDs were not found in '"; if ($mode eq 'order') { print STDERR "$Seq_File"; } elsif ($mode eq 'sequence') { print STDERR "$Order_List"; } print STDERR "', listed "; print STDERR "below:\n" if (!$Opt_Error_Files); print STDERR "in error file '$error_file'!\n" if ($Opt_Error_Files); foreach (sort @missed) { if (!$Opt_Error_Files) { print STDERR "$_\t"; # separated by tab } elsif ($Opt_Error_Files) { print $error_fh "$_\n"; # separated by newline } } print STDERR "\n" if (!$Opt_Error_Files); # final newline for STDERR print close $error_fh if ($Opt_Error_Files); } return 1; }