0
|
1 package Bio::Roary::FilterUnknownsFromFasta;
|
|
2
|
|
3 # ABSTRACT: Take in fasta files, remove sequences with too many unknowns and return a list of the new files
|
|
4
|
|
=head1 SYNOPSIS

Take in FASTA files, remove sequences with too many unknowns, and return a list of the new files.

   use Bio::Roary::FilterUnknownsFromFasta;

   my $obj = Bio::Roary::FilterUnknownsFromFasta->new(
     fasta_files => [],
   );
   $obj->filtered_fasta_files();

=cut
|
|
16
|
|
17 use Moose;
|
|
18 use Bio::SeqIO;
|
|
19 use Cwd;
|
|
20 use Bio::Roary::Exceptions;
|
|
21 use File::Basename;
|
|
22
|
|
# Input FASTA file paths; must be supplied to the constructor.
has 'fasta_files' => (
    is       => 'ro',
    isa      => 'ArrayRef',
    required => 1,
);

# Boolean switch, on by default.
# NOTE(review): nothing in this part of the file reads it - confirm where it is consulted.
has 'apply_unknowns_filter' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 1,
);

# Largest share of a sequence, in percent of its length, that may consist of
# 'X' characters before the sequence is dropped.
has 'maximum_percentage_of_unknowns' => (
    is      => 'ro',
    isa     => 'Num',
    default => 5,
);

# Names of the filtered files, built on first access by
# _build_filtered_fasta_files.
has 'filtered_fasta_files' => (
    is      => 'ro',
    isa     => 'ArrayRef',
    lazy    => 1,
    builder => '_build_filtered_fasta_files',
);

# Lookup from each input file to the filtered file generated from it;
# starts out as an empty hash.
has 'input_fasta_to_output_fasta' => (
    is      => 'ro',
    isa     => 'HashRef',
    default => sub { return {}; },
);
|
|
30
|
|
# Builder for the lazy 'filtered_fasta_files' attribute.
#
# For every path in $self->fasta_files, the file-name component is taken from
# File::Basename::fileparse and passed, together with the original path, to
# _filter_fasta_sequences_and_return_new_file.
#
# Returns an array reference with the value returned for each input file, in
# the same order as the inputs.
sub _build_filtered_fasta_files {
    my ($self) = @_;

    my $input_paths = $self->fasta_files;
    my @filtered_paths;

    foreach my $index ( 0 .. $#{$input_paths} ) {
        my $source_path = $input_paths->[$index];

        # fileparse returns (name, directories, suffix); only the name is needed.
        my ($base_name) = fileparse($source_path);

        push @filtered_paths,
          $self->_filter_fasta_sequences_and_return_new_file( $base_name, $source_path );
    }

    return \@filtered_paths;
}
|
|
43
|
|
# Decides whether a sequence carries more unknown residues than permitted.
#
# Arguments:
#   $sequence_obj - object answering length() and seq()
#
# The allowance is maximum_percentage_of_unknowns percent of the sequence
# length, truncated to a whole number. Only upper-case 'X' characters are
# counted as unknowns.
#
# Returns 1 when the count is strictly above the allowance, 0 otherwise.
sub _does_sequence_contain_too_many_unknowns {
    my ( $self, $sequence_obj ) = @_;

    my $allowed_unknowns =
      int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );

    # tr/// with an empty replacement list only counts; the string is untouched.
    my $residues       = $sequence_obj->seq();
    my $unknowns_found = ( $residues =~ tr/X// );

    return $unknowns_found > $allowed_unknowns ? 1 : 0;
}
|
|
58
|
|
59
|
|
# Copies one FASTA file into a new FASTA file, leaving out every sequence
# that _does_sequence_contain_too_many_unknowns rejects.
#
# Arguments:
#   $output_file - prefix of the file to create; '.tmp.filtered.fa' is appended
#   $input_file  - FASTA file to read
#
# Side effects: creates the output file and stores the
# input file => output file pair in $self->input_fasta_to_output_fasta.
#
# Returns the name of the output file.
sub _filter_fasta_sequences_and_return_new_file {
    my ( $self, $output_file, $input_file ) = @_;

    my $filtered_path = $output_file . '.tmp.filtered.fa';

    # Writer first, then reader.
    my $writer = Bio::SeqIO->new( -file => '>' . $filtered_path, -format => 'Fasta' );
    my $reader = Bio::SeqIO->new( -file => $input_file,          -format => 'Fasta' );

    $self->input_fasta_to_output_fasta->{$input_file} = $filtered_path;

  RECORD:
    while ( my $record = $reader->next_seq() ) {
        next RECORD if $self->_does_sequence_contain_too_many_unknowns($record);

        # Drop the extra description details added by fastatranslate.
        $record->description(undef);
        $writer->write_seq($record);
    }

    return $filtered_path;
}
|
|
81
|
|
82
|
|
83
|
|
# Remove the Moose keywords (has, extends, ...) from this package's namespace.
no Moose;

# Finalise the class definition.
__PACKAGE__->meta->make_immutable();

# A module file must end with a true value.
1;
|
|
88
|