annotate Roary/lib/Bio/Roary/FilterUnknownsFromFasta.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
1 package Bio::Roary::FilterUnknownsFromFasta;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
2
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
3 # ABSTRACT: Take in fasta files, remove sequences with too many unknowns and return a list of the new files
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
4
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
5 =head1 SYNOPSIS
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
6
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
7 Take in fasta files, remove sequences with too many unknowns and return a list of the new files
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
8 use Bio::Roary::FilterUnknownsFromFasta;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
9
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
10 my $obj = Bio::Roary::FilterUnknownsFromFasta->new(
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
11 fasta_files => [],
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
12 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
13 $obj->filtered_fasta_files();
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
14
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
15 =cut
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
16
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
17 use Moose;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
18 use Bio::SeqIO;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
19 use Cwd;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
20 use Bio::Roary::Exceptions;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
21 use File::Basename;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
22
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# Input FASTA file paths; must be supplied at construction time.
has 'fasta_files' => (
    is       => 'ro',
    isa      => 'ArrayRef',
    required => 1,
);

# Boolean switch for the unknowns filter; enabled unless set otherwise.
has 'apply_unknowns_filter' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 1,
);

# Percentage of a sequence's length that may consist of unknown residues
# before the sequence is rejected.
has 'maximum_percentage_of_unknowns' => (
    is      => 'ro',
    isa     => 'Num',
    default => 5,
);

# Names of the filtered output files. Built lazily, on first access, by
# _build_filtered_fasta_files.
has 'filtered_fasta_files' => (
    is      => 'ro',
    isa     => 'ArrayRef',
    lazy    => 1,
    builder => '_build_filtered_fasta_files',
);

# Lookup from each input file path to the output file name written for it;
# filled in by _filter_fasta_sequences_and_return_new_file.
has 'input_fasta_to_output_fasta' => (
    is      => 'ro',
    isa     => 'HashRef',
    default => sub { return {} },
);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
30
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# Builder for the lazy 'filtered_fasta_files' attribute.
#
# Runs every path in 'fasta_files' through
# _filter_fasta_sequences_and_return_new_file, passing the path's file name
# (directory part removed) as the stem of the output file and the full path as
# the input to read.
#
# Returns an array reference holding the output file names, in input order.
sub _build_filtered_fasta_files
{
    my ($self) = @_;

    my $input_paths = $self->fasta_files;
    my @filtered_paths;

    for my $input_path ( @{$input_paths} )
    {
        # fileparse returns (filename, directories, suffix); only the file
        # name is needed to name the output file.
        my $base_name = ( fileparse($input_path) )[0];

        push @filtered_paths,
            $self->_filter_fasta_sequences_and_return_new_file( $base_name, $input_path );
    }

    return \@filtered_paths;
}
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
43
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# Decide whether a sequence has more unknown residues than is tolerated.
#
# Parameters:
#   $sequence_obj - object providing length() and seq() (the sequence string).
#
# The allowed number of unknowns is 'maximum_percentage_of_unknowns' percent
# of the sequence length, rounded down. Unknown residues are the character X,
# counted in either case so that lower-case (e.g. soft-masked) sequences are
# filtered in the same way as upper-case ones.
#
# Returns 1 when the number of unknowns exceeds the allowance, 0 otherwise.
sub _does_sequence_contain_too_many_unknowns
{
    my ( $self, $sequence_obj ) = @_;

    # A record with no sequence data cannot contain unknowns; returning here
    # also keeps the count below from operating on an undefined value.
    my $residues = $sequence_obj->seq();
    return 0 unless defined $residues && length $residues;

    my $maximum_number_of_Xs =
        int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );

    # tr/// with an empty replacement list counts characters without
    # modifying the string.
    my $number_of_Xs_found = ( $residues =~ tr/Xx// );

    return $number_of_Xs_found > $maximum_number_of_Xs ? 1 : 0;
}
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
58
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
59
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# Copy one FASTA file to a new file, leaving out sequences that contain too
# many unknowns.
#
# Parameters:
#   $output_file - stem of the output file; '.tmp.filtered.fa' is appended.
#   $input_file  - path of the FASTA file to read.
#
# Side effects:
#   - writes the output file;
#   - records $input_file => output file name in 'input_fasta_to_output_fasta'.
#
# Sequences are skipped only while 'apply_unknowns_filter' is true (its
# default); when it is false every sequence is copied. Each sequence written
# has its description cleared.
#
# Returns the name of the output file.
sub _filter_fasta_sequences_and_return_new_file
{
    my ( $self, $output_file, $input_file ) = @_;

    my $output_filename = $output_file . '.tmp.filtered.fa';
    my $out_fasta_obj = Bio::SeqIO->new( -file => ">" . $output_filename, -format => 'Fasta' );
    my $fasta_obj     = Bio::SeqIO->new( -file => $input_file,            -format => 'Fasta' );

    $self->input_fasta_to_output_fasta->{$input_file} = $output_filename;

    # Read the flag once; it does not change while this file is processed.
    my $filter_enabled = $self->apply_unknowns_filter;

    while ( my $seq = $fasta_obj->next_seq() )
    {
        next if $filter_enabled && $self->_does_sequence_contain_too_many_unknowns($seq);

        # strip out extra details put in by fastatranslate
        $seq->description(undef);
        $out_fasta_obj->write_seq($seq);
    }

    return $output_filename;
}
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
81
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
82
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
83
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
84 no Moose;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
85 __PACKAGE__->meta->make_immutable;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
86
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
87 1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
88