Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/FilterFullClusters.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line source
package Bio::Roary::FilterFullClusters; # ABSTRACT: Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters =head1 SYNOPSIS Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters use Bio::Roary::FilterFullClusters; my $obj = Bio::Roary::FilterFullClusters->new( clusters_filename => $cluster_file, fasta_file => $fasta_file, number_of_input_files => 10, output_file => 'filtered_file' ); $obj->filter_full_clusters_from_fasta(); =cut use Moose; use Bio::SeqIO; with 'Bio::Roary::ClustersRole'; has 'number_of_input_files' => ( is => 'ro', isa => 'Int', required => 1 ); has 'fasta_file' => ( is => 'ro', isa => 'Str', required => 1 ); has 'output_file' => ( is => 'ro', isa => 'Str', required => 1 ); has '_greater_than_or_equal' => ( is => 'ro', isa => 'Bool', default => 0 ); has 'cdhit_input_fasta_file' => ( is => 'ro', isa => 'Str', required => 1 ); has 'cdhit_output_fasta_file' => ( is => 'ro', isa => 'Str', required => 1 ); has 'output_groups_file' => ( is => 'ro', isa => 'Str', required => 1 ); has '_full_cluster_gene_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__full_cluster_gene_names' ); has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' ); has '_output_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio' ); has '_all_full_cluster_genes' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__all_full_cluster_genes' ); sub _build__full_cluster_gene_names { my($self) = @_; my %full_cluster_gene_names ; for my $gene_name (keys %{$self->_clustered_genes}) { if($self->_greater_than_or_equal == 0) { if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} >= ($self->number_of_input_files -1)) { $full_cluster_gene_names{$gene_name}++; } } else { if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} == ($self->number_of_input_files -1)) { $full_cluster_gene_names{$gene_name}++; } } } return \%full_cluster_gene_names; } sub _build__input_seqio { my ($self) = @_; return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' ); } sub _build__output_seqio { my ( $self, $chunk_number ) = @_; return Bio::SeqIO->new( -file => ">".$self->output_file, -format => 'Fasta' ); } sub _build__all_full_cluster_genes { my ($self) = @_; my %full_cluster_genes; for my $gene_name (keys %{$self->_full_cluster_gene_names}) { $full_cluster_genes{$gene_name}++; for my $cluster_gene_name (@{$self->_clustered_genes->{$gene_name}}) { $full_cluster_genes{$cluster_gene_name}++; } } return \%full_cluster_genes; } sub _create_groups_file { my ($self) = @_; open(my $out_fh, '>>', $self->output_groups_file); for my $gene_name (keys %{$self->_full_cluster_gene_names}) { print {$out_fh} $gene_name."\t". join("\t", @{$self->_clustered_genes->{$gene_name}}). "\n"; } close($out_fh); } sub filter_complete_cluster_from_original_fasta { my ($self) = @_; my $input_seq_io = Bio::SeqIO->new( -file => $self->cdhit_input_fasta_file, -format => 'Fasta' ); my $output_seq_io = Bio::SeqIO->new( -file => ">".$self->cdhit_output_fasta_file, -format => 'Fasta' ); while ( my $input_seq = $input_seq_io->next_seq() ) { unless(defined($self->_all_full_cluster_genes->{$input_seq->display_id})) { $output_seq_io->write_seq($input_seq); } } $self->_create_groups_file; return $self; } sub filter_full_clusters_from_fasta { my ($self) = @_; while ( my $input_seq = $self->_input_seqio->next_seq() ) { unless(defined($self->_full_cluster_gene_names->{$input_seq->display_id})) { $self->_output_seqio->write_seq($input_seq); } } return $self; } no Moose; __PACKAGE__->meta->make_immutable; 1;