diff Roary/lib/Bio/Roary/FilterFullClusters.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Roary/lib/Bio/Roary/FilterFullClusters.pm	Fri May 14 20:27:06 2021 +0000
@@ -0,0 +1,144 @@
+package Bio::Roary::FilterFullClusters;
+
+# ABSTRACT: Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
+
+=head1 SYNOPSIS
+
+Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
+   use Bio::Roary::FilterFullClusters;
+   
+   my $obj = Bio::Roary::FilterFullClusters->new(
+       clusters_filename        => $cluster_file,
+       fasta_file           => $fasta_file,
+       number_of_input_files => 10,
+       output_file => 'filtered_file'
+     );
+   $obj->filter_full_clusters_from_fasta();
+
+=cut
+
+use Moose;
+use Bio::SeqIO;
+with 'Bio::Roary::ClustersRole';
+
+has 'number_of_input_files' => ( is => 'ro', isa => 'Int', required => 1 );
+has 'fasta_file'     => ( is => 'ro', isa => 'Str', required => 1 );
+has 'output_file'    => ( is => 'ro', isa => 'Str', required => 1 );
+has '_greater_than_or_equal' =>  ( is => 'ro', isa => 'Bool', default => 0 );
+has 'cdhit_input_fasta_file'    => ( is => 'ro', isa => 'Str', required => 1 );
+has 'cdhit_output_fasta_file'    => ( is => 'ro', isa => 'Str', required => 1 );
+
+has 'output_groups_file' => ( is => 'ro', isa => 'Str', required => 1 );
+
+has '_full_cluster_gene_names'    => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__full_cluster_gene_names' );
+has '_input_seqio'  => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
+has '_output_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio' );
+
+has '_all_full_cluster_genes'    => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__all_full_cluster_genes' );
+
+sub _build__full_cluster_gene_names
+{
+  my($self) = @_;
+  
+  my %full_cluster_gene_names ;
+  
+  for my $gene_name (keys %{$self->_clustered_genes})
+  {
+  
+    if($self->_greater_than_or_equal == 0)
+    {
+      if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} >= ($self->number_of_input_files -1))
+      {
+        $full_cluster_gene_names{$gene_name}++;
+      }
+    }
+    else
+    {
+      if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} == ($self->number_of_input_files -1))
+      {
+        $full_cluster_gene_names{$gene_name}++;
+      }
+    }
+  }
+  
+  return \%full_cluster_gene_names;
+}
+
+sub _build__input_seqio {
+    my ($self) = @_;
+    return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' );
+}
+
+sub _build__output_seqio {
+    my ( $self, $chunk_number ) = @_;
+    return Bio::SeqIO->new( -file => ">".$self->output_file, -format => 'Fasta' );
+}
+
+sub _build__all_full_cluster_genes
+{
+   my ($self) = @_;
+   my %full_cluster_genes;
+   
+   for my $gene_name (keys %{$self->_full_cluster_gene_names})
+   {
+     $full_cluster_genes{$gene_name}++;
+     for my $cluster_gene_name (@{$self->_clustered_genes->{$gene_name}})
+     {
+       $full_cluster_genes{$cluster_gene_name}++;
+     }
+   }
+   return \%full_cluster_genes;
+}
+
+
+sub _create_groups_file
+{
+  my ($self) = @_;
+  open(my $out_fh, '>>', $self->output_groups_file);
+  
+  for my $gene_name (keys %{$self->_full_cluster_gene_names})
+  {
+    print {$out_fh} $gene_name."\t". join("\t", @{$self->_clustered_genes->{$gene_name}}). "\n";
+  }
+  close($out_fh);
+}
+
+
+
+sub filter_complete_cluster_from_original_fasta
+{
+  my ($self) = @_;
+
+  my $input_seq_io  = Bio::SeqIO->new( -file => $self->cdhit_input_fasta_file, -format => 'Fasta' );
+  my $output_seq_io = Bio::SeqIO->new( -file => ">".$self->cdhit_output_fasta_file, -format => 'Fasta' );
+  
+  while ( my $input_seq = $input_seq_io->next_seq() ) 
+  {
+    unless(defined($self->_all_full_cluster_genes->{$input_seq->display_id}))
+    {
+      $output_seq_io->write_seq($input_seq);
+    }
+  }
+  
+  $self->_create_groups_file;
+  return $self;
+}
+
+sub filter_full_clusters_from_fasta
+{
+    my ($self) = @_;
+ 
+    while ( my $input_seq = $self->_input_seqio->next_seq() ) {
+      unless(defined($self->_full_cluster_gene_names->{$input_seq->display_id}))
+      {
+        $self->_output_seqio->write_seq($input_seq);
+      }
+    }
+    return $self;
+}
+
+no Moose;
+__PACKAGE__->meta->make_immutable;
+
+1;
+