Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/FilterUnknownsFromFasta.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line source
package Bio::Roary::FilterUnknownsFromFasta;

# ABSTRACT: Take in fasta files, remove sequences with too many unknowns and return a list of the new files

=head1 SYNOPSIS

Take in fasta files, remove sequences with too many unknowns and return a list of the new files

   use Bio::Roary::FilterUnknownsFromFasta;

   my $obj = Bio::Roary::FilterUnknownsFromFasta->new(
     fasta_files => [],
   );
   $obj->filtered_fasta_files();

=cut

use Moose;
use Bio::SeqIO;
use Cwd;
use Bio::Roary::Exceptions;
use File::Basename;

# Input FASTA file paths (required).
has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );

# When true (the default), sequences with too many 'X' characters are dropped.
# When false, every sequence is kept, but the files are still rewritten with
# sequence descriptions removed.
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );

# Largest percentage of a sequence's length that may be 'X' before it is dropped.
has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num', default => 5 );

# Names of the filtered output files, in the same order as fasta_files.
# Built lazily: the files are written on first access.
has 'filtered_fasta_files' => (
    is      => 'ro',
    isa     => 'ArrayRef',
    lazy    => 1,
    builder => '_build_filtered_fasta_files',
);

# Maps each input path to the output file name generated for it.
has 'input_fasta_to_output_fasta' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );

# Builder for filtered_fasta_files: filters every input file in turn.
# Returns an array reference of output file names.
sub _build_filtered_fasta_files {
    my ($self) = @_;

    my @output_file_names;
    for my $fasta_file ( @{ $self->fasta_files } ) {

        # Only the base name is used, so the output name carries no directory
        # part and is resolved relative to the current working directory.
        # NOTE(review): inputs sharing a base name would map to the same
        # output file - confirm callers never pass such a list.
        my ($filename) = fileparse($fasta_file);
        push @output_file_names,
          $self->_filter_fasta_sequences_and_return_new_file( $filename, $fasta_file );
    }
    return \@output_file_names;
}

# Returns 1 if the number of 'X' characters in the sequence exceeds
# maximum_percentage_of_unknowns percent of its length (rounded down),
# otherwise 0.
#   $sequence_obj - object providing length() and seq()
sub _does_sequence_contain_too_many_unknowns {
    my ( $self, $sequence_obj ) = @_;

    my $maximum_number_of_Xs =
      int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );

    # Treat a missing sequence string as empty so that counting cannot warn.
    my $residues = $sequence_obj->seq();
    $residues = '' unless defined $residues;

    # tr/// with an empty replacement list counts matches without modifying the string.
    my $number_of_Xs_found = ( $residues =~ tr/X// );

    return $number_of_Xs_found > $maximum_number_of_Xs ? 1 : 0;
}

# Copies the sequences of $input_file into "<$output_file>.tmp.filtered.fa",
# skipping sequences with too many unknowns when apply_unknowns_filter is set,
# and records the input -> output mapping in input_fasta_to_output_fasta.
#   $output_file - base name used to form the output file name
#   $input_file  - path of the FASTA file to read
# Returns the output file name.
sub _filter_fasta_sequences_and_return_new_file {
    my ( $self, $output_file, $input_file ) = @_;

    my $output_filename = $output_file . '.tmp.filtered.fa';
    my $out_fasta_obj   = Bio::SeqIO->new( -file => ">" . $output_filename, -format => 'Fasta' );
    my $fasta_obj       = Bio::SeqIO->new( -file => $input_file, -format => 'Fasta' );

    $self->input_fasta_to_output_fasta->{$input_file} = $output_filename;

    # Read the flag once; it does not change while a file is being processed.
    my $filter_enabled = $self->apply_unknowns_filter;

    while ( my $seq = $fasta_obj->next_seq() ) {
        next
          if $filter_enabled
          && $self->_does_sequence_contain_too_many_unknowns($seq);

        # strip out extra details put in by fastatranslate
        $seq->description(undef);
        $out_fasta_obj->write_seq($seq);
    }
    return $output_filename;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;