diff Roary/lib/Bio/Roary/MergeMultifastaAlignments.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Roary/lib/Bio/Roary/MergeMultifastaAlignments.pm	Fri May 14 20:27:06 2021 +0000
@@ -0,0 +1,121 @@
+package Bio::Roary::MergeMultifastaAlignments;
+
+# ABSTRACT: Merge multifasta alignment files with equal numbers of sequences.
+
+=head1 SYNOPSIS
+
+Merge multifasta alignment files with equal numbers of sequences.So each sequence in each file gets concatenated together.  It is assumed the 
+sequences are in the correct order.
+   use Bio::Roary::MergeMultifastaAlignments;
+   
+   my $obj = Bio::Roary::MergeMultifastaAlignments->new(
+     multifasta_files => [],
+     output_filename  => 'output_merged.aln'
+   );
+   $obj->merge_files;
+
+=cut
+
+use Moose;
+use Bio::SeqIO;
+use Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL;
+
+has 'multifasta_files'       => ( is => 'ro', isa => 'ArrayRef',   required => 1 );
+has 'sample_names'           => ( is => 'ro', isa => 'ArrayRef',   required => 1 );
+has 'sample_names_to_genes'  => ( is => 'rw', isa => 'HashRef',    required => 1 );
+has 'output_filename'        => ( is => 'ro', isa => 'Str',        default  => 'core_alignment.aln' );
+has 'output_header_filename' => ( is => 'ro', isa => 'Str',        default  => 'core_alignment_header.embl' );
+has '_output_seqio_obj'      => ( is => 'ro', isa => 'Bio::SeqIO', lazy     => 1, builder => '_build__output_seqio_obj' );
+has '_gene_lengths'          => ( is => 'rw', isa => 'HashRef',    lazy     => 1, builder => '_build__gene_lengths' );
+has '_gene_to_sequence' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
+has '_sorted_multifasta_files' => ( is => 'rw', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_multifasta_files' );
+
+sub BUILD {
+    my ($self) = @_;
+    $self->_gene_lengths;
+}
+
+sub _input_seq_io_obj {
+    my ( $self, $filename ) = @_;
+    return Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );
+}
+
+sub _build__output_seqio_obj {
+    my ($self) = @_;
+    return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );
+}
+
+sub _build__gene_lengths {
+    my ($self) = @_;
+    my %gene_lengths;
+    for my $filename ( @{ $self->_sorted_multifasta_files } ) {
+        my $seq_io = $self->_input_seq_io_obj($filename);
+        next unless ( defined($seq_io) );
+        while ( my $seq_record = $seq_io->next_seq ) {
+
+            # Save all of the gene sequences to memory, massive speedup but a bit naughty.
+            $self->_gene_to_sequence->{$filename}->{ $seq_record->display_id } = $seq_record->seq;
+            $gene_lengths{$filename} = $seq_record->length() if ( !defined( $gene_lengths{$filename} ) );
+        }
+    }
+
+    return \%gene_lengths;
+}
+
+sub _build__sorted_multifasta_files {
+    my ($self) = @_;
+    my @sorted_gene_files = sort @{ $self->multifasta_files };
+    return \@sorted_gene_files;
+}
+
+sub _sequence_for_sample_from_gene_file {
+    my ( $self, $sample_name, $gene_file ) = @_;
+
+    # loop over this to get the geneIDs
+    for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
+        if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) {
+            return $self->_gene_to_sequence->{$gene_file}->{$gene_id};
+        }
+    }
+    return $self->_padded_string_for_gene_file($gene_file);
+}
+
+sub _padded_string_for_gene_file {
+    my ( $self, $gene_file ) = @_;
+    return '' unless ( defined( $self->_gene_lengths->{$gene_file} ) );
+    return '-' x ( $self->_gene_lengths->{$gene_file} );
+}
+
+sub _create_merged_sequence_for_sample {
+    my ( $self, $sample_name ) = @_;
+    my $merged_sequence = '';
+    for my $gene_file ( @{ $self->_sorted_multifasta_files } ) {
+        $merged_sequence .= $self->_sequence_for_sample_from_gene_file( $sample_name, $gene_file );
+    }
+    return $merged_sequence;
+}
+
+sub merge_files {
+    my ($self) = @_;
+
+    for my $sample_name ( @{ $self->sample_names } ) {
+        my $sequence = $self->_create_merged_sequence_for_sample($sample_name);
+        my $seq_io = Bio::Seq->new( -display_id => $sample_name, -seq => $sequence );
+        $self->_output_seqio_obj->write_seq($seq_io);
+    }
+
+    # Create a header file which gives the coordinates of each gene in the multifasta
+    Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL->new(
+        multifasta_files => $self->_sorted_multifasta_files,
+        gene_lengths     => $self->_gene_lengths,
+        output_filename  => $self->output_header_filename
+    )->create_file();
+
+    return 1;
+}
+
+no Moose;
+	__PACKAGE__->meta->make_immutable;
+
+1;
+