Mercurial > repos > dereeper > roary_plots
comparison Roary/lib/Bio/Roary/FilterUnknownsFromFasta.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c47a5f61bc9f |
---|---|
1 package Bio::Roary::FilterUnknownsFromFasta; | |
2 | |
3 # ABSTRACT: Take in fasta files, remove sequences with too many unknowns and return a list of the new files | |
4 | |
5 =head1 SYNOPSIS | |
6 | |
7 Take in fasta files, remove sequences with too many unknowns and return a list of the new files | |
8 use Bio::Roary::FilterUnknownsFromFasta; | |
9 | |
10 my $obj = Bio::Roary::FilterUnknownsFromFasta->new( | |
11 fasta_files => [], | |
12 ); | |
13 $obj->filtered_fasta_files(); | |
14 | |
15 =cut | |
16 | |
17 use Moose; | |
18 use Bio::SeqIO; | |
19 use Cwd; | |
20 use Bio::Roary::Exceptions; | |
21 use File::Basename; | |
22 | |
# Paths of the FASTA files to process (required).
has 'fasta_files' => (
    is       => 'ro',
    isa      => 'ArrayRef',
    required => 1,
);

# Boolean switch, true by default.
# NOTE(review): presumably enables/disables the unknowns filter - confirm
# against the methods that read it.
has 'apply_unknowns_filter' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 1,
);

# Percentage of a sequence's length that may consist of 'X' residues before
# the sequence counts as having too many unknowns.
has 'maximum_percentage_of_unknowns' => (
    is      => 'ro',
    isa     => 'Num',
    default => 5,
);

# Names of the filtered output files; computed on first access by
# _build_filtered_fasta_files.
has 'filtered_fasta_files' => (
    is      => 'ro',
    isa     => 'ArrayRef',
    lazy    => 1,
    builder => '_build_filtered_fasta_files',
);

# Lookup from each input file path to the output file name written for it;
# starts empty and is filled in while files are being filtered.
has 'input_fasta_to_output_fasta' => (
    is      => 'ro',
    isa     => 'HashRef',
    default => sub { {} },
);
30 | |
# Builder for the 'filtered_fasta_files' attribute.
#
# Runs every path in 'fasta_files' through
# _filter_fasta_sequences_and_return_new_file, passing the file's base name
# (directory part removed) and the full input path.
#
# Returns an array reference holding the values returned by those calls, in
# input order.
sub _build_filtered_fasta_files {
    my ($self) = @_;

    my @filtered_paths;
    foreach my $input_path ( @{ $self->fasta_files } ) {

        # fileparse() returns (name, directories, suffix); only the name is used.
        my ($base_name) = fileparse($input_path);

        push @filtered_paths,
          $self->_filter_fasta_sequences_and_return_new_file( $base_name, $input_path );
    }

    return \@filtered_paths;
}
43 | |
# Decide whether a sequence has more unknown ('X') residues than allowed.
#
# $sequence_obj must provide length() and seq(). The allowed count is
# 'maximum_percentage_of_unknowns' percent of the sequence length, truncated
# to an integer. Only upper-case 'X' characters are counted.
#
# Returns 1 when the number of 'X' residues exceeds the allowed count,
# otherwise 0.
sub _does_sequence_contain_too_many_unknowns {
    my ( $self, $sequence_obj ) = @_;

    my $allowed_unknowns =
      int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );

    # A global match in list context yields one element per 'X' found.
    my @unknown_residues = ( $sequence_obj->seq() =~ /X/g );

    return ( scalar(@unknown_residues) > $allowed_unknowns ) ? 1 : 0;
}
58 | |
59 | |
# Copy the sequences of one FASTA file into a new file, leaving out those
# with too many unknowns.
#
# Parameters:
#   $output_file - name stem for the output; '.tmp.filtered.fa' is appended
#   $input_file  - path of the FASTA file to read
#
# Each sequence read from $input_file is skipped when the
# 'apply_unknowns_filter' attribute is true and
# _does_sequence_contain_too_many_unknowns reports it as over the limit.
# Every other sequence has its description cleared and is written to the
# output file. With the filter switched off all sequences are written (still
# with cleared descriptions). The input path => output name pair is recorded
# in 'input_fasta_to_output_fasta'.
#
# Returns the output file name.
sub _filter_fasta_sequences_and_return_new_file {
    my ( $self, $output_file, $input_file ) = @_;

    my $output_filename = $output_file . '.tmp.filtered.fa';
    my $out_fasta_obj   = Bio::SeqIO->new( -file => ">" . $output_filename, -format => 'Fasta' );
    my $fasta_obj       = Bio::SeqIO->new( -file => $input_file, -format => 'Fasta' );

    $self->input_fasta_to_output_fasta->{$input_file} = $output_filename;

    # Read the switch once; it was previously declared but never consulted,
    # so disabling it had no effect.
    my $filter_enabled = $self->apply_unknowns_filter;

    while ( my $seq = $fasta_obj->next_seq() ) {
        next
          if $filter_enabled
          && $self->_does_sequence_contain_too_many_unknowns($seq);

        # strip out extra details put in by fastatranslate
        $seq->description(undef);
        $out_fasta_obj->write_seq($seq);
    }

    # Close both streams explicitly so the output is flushed to disk before
    # the file name is handed back to the caller.
    $fasta_obj->close();
    $out_fasta_obj->close();

    return $output_filename;
}
81 | |
82 | |
83 | |
84 no Moose; | |
85 __PACKAGE__->meta->make_immutable; | |
86 | |
87 1; | |
88 |