Mercurial > repos > smarthey > paqmir_postprocess_mirdeep2
view paqmir_postprocess_mirdeep2/extract_seqs_by_ids.pl @ 0:da3c91b40f61 draft default tip
Uploaded
author | smarthey |
---|---|
date | Mon, 18 Mar 2019 09:27:29 -0400 |
parents | |
children |
line wrap: on
line source
#! /usr/bin/perl # # extract_seqs_by_ids.pl # # Copyright 2019 Sylvain Marthey <sylvain.marthey@inra.fr> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see <http://www.gnu.org/licenses/>. use strict; use Getopt::Long; use Pod::Usage; use Bio::Seq; use Bio::SeqIO; my ($help,$fasta_in,$ids_file,$file_out,$inverse,$precursor_shape,@exprs,%ids_seqs,$regular_expression); GetOptions("help|?" => \$help, "fasta_in:s" => \$fasta_in, "ids_file:s" => \$ids_file, "file_out:s" => \$file_out, "inverse:s" => \$inverse, "precursor_shape:s" => \$precursor_shape, "regular_expression:s" => \$regular_expression) or pod2usage(-message=>"Try `$0' for more information.", -exitval => 2,-verbose=> 0); pod2usage(-exitval =>1, -verbose => 2) if ($help); if((!$fasta_in || !-f($fasta_in)) ){ print "--fasta_in parameter missing or is not a valid file path: try extract_seqs_by_ids.pl --help\n"; exit; } if((!$file_out && !-f($file_out)) ){ print "--file_out parameter missing : try extract_seqs_by_ids.pl --help\n"; exit; } if(!$ids_file || !-f($ids_file)){ print "--ds_file parameter missing or is not a valid file path: try extract_seqs_by_ids.pl --help\n"; exit; } if(uc($inverse) eq 'YES' || uc($inverse) eq 'Y'){ $inverse = 1; }else{ $inverse = 0; } if(uc($regular_expression) eq 'YES' || uc($regular_expression) eq 'Y'){ $regular_expression = 1; }else{ $regular_expression = 0; } # read the input ids file open(IDS,$ids_file) or die "impossible d'ouvrir le fichier $ids_file\n"; while(<IDS>){ chomp; push(@exprs,qr/$_$/); $ids_seqs{$_}++; } close IDS; my $first = Bio::SeqIO->new(-file => $fasta_in,'-format'=>'Fasta'); my $out_fasta = Bio::SeqIO->new(-file => ">".$file_out , '-format' => 'Fasta'); # read the input fasta file while (my $seq = $first->next_seq) { my $id = $seq->id(); # print $id."\t"; if(uc($precursor_shape) eq 'YES' || uc($precursor_shape) eq 'Y'){ $id =~ s/([a-zA-Z]{3}-[a-zA-Z]{3}-\w+)-(\d+)$/$1/; # print $id."\n"; } my $found = 0; if($regular_expression){ # seach the ids_file in the identifier foreach my $expr (@exprs){ if($id =~ /$expr/){ $found++; last; } } }else{ if($ids_seqs{$id}){ $found++; } } if($found>0 && !$inverse){ $out_fasta->write_seq($seq); }elsif($found == 0 && $inverse){ $out_fasta->write_seq($seq); } } =head1 NAME extract_seqs_by_ids.pl =head1 SYNOPSIS extract_seqs_by_ids.pl --fasta_in <fasta file> --ids_file <string> --file_out <file_out_path> [--begining <yes|no> --inverse <yes|no>] =head1 OPTIONS --fasta_in fasta file containing sequences we want to extract --ids_file file containing words that will be searched in the id of the sequences we want to extract (one id peer line) --file_out out fasta file --precursor_shape searched ids doesn't contain the location tag (-1, -2, -3, -N) it will be skipped in the sequence_id research --inverse inverse the sequence selection (like grep -v) --regular_expression yes|no =head1 DESCRIPTION This tool will read the fasta file and extract sequences where identifiers contain the searched words. =head1 AUTHORS Sylvain Marthey <sylvain.marthey@inra.fr> =head1 VERSION 1.1 =head1 DATE 01/01/2019 =head1 KEYWORDS fasta, identifier, filtering =head1 EXAMPLE extract_seqs_by_ids.pl --fasta_in mirbase.fa --ids_file bta --file_out mirbase_bta.fa =cut