view Roary/lib/Bio/Roary/MergeMultifastaAlignments.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
line wrap: on
line source

package Bio::Roary::MergeMultifastaAlignments;

# ABSTRACT: Merge multifasta alignment files with equal numbers of sequences.

=head1 SYNOPSIS

Merge multifasta alignment files with equal numbers of sequences.So each sequence in each file gets concatenated together.  It is assumed the 
sequences are in the correct order.
   use Bio::Roary::MergeMultifastaAlignments;
   
   my $obj = Bio::Roary::MergeMultifastaAlignments->new(
     multifasta_files => [],
     output_filename  => 'output_merged.aln'
   );
   $obj->merge_files;

=cut

use Moose;
use Bio::SeqIO;
use Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL;

has 'multifasta_files'       => ( is => 'ro', isa => 'ArrayRef',   required => 1 );
has 'sample_names'           => ( is => 'ro', isa => 'ArrayRef',   required => 1 );
has 'sample_names_to_genes'  => ( is => 'rw', isa => 'HashRef',    required => 1 );
has 'output_filename'        => ( is => 'ro', isa => 'Str',        default  => 'core_alignment.aln' );
has 'output_header_filename' => ( is => 'ro', isa => 'Str',        default  => 'core_alignment_header.embl' );
has '_output_seqio_obj'      => ( is => 'ro', isa => 'Bio::SeqIO', lazy     => 1, builder => '_build__output_seqio_obj' );
has '_gene_lengths'          => ( is => 'rw', isa => 'HashRef',    lazy     => 1, builder => '_build__gene_lengths' );
has '_gene_to_sequence' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
has '_sorted_multifasta_files' => ( is => 'rw', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_multifasta_files' );

sub BUILD {
    my ($self) = @_;
    $self->_gene_lengths;
}

sub _input_seq_io_obj {
    my ( $self, $filename ) = @_;
    return Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );
}

sub _build__output_seqio_obj {
    my ($self) = @_;
    return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );
}

sub _build__gene_lengths {
    my ($self) = @_;
    my %gene_lengths;
    for my $filename ( @{ $self->_sorted_multifasta_files } ) {
        my $seq_io = $self->_input_seq_io_obj($filename);
        next unless ( defined($seq_io) );
        while ( my $seq_record = $seq_io->next_seq ) {

            # Save all of the gene sequences to memory, massive speedup but a bit naughty.
            $self->_gene_to_sequence->{$filename}->{ $seq_record->display_id } = $seq_record->seq;
            $gene_lengths{$filename} = $seq_record->length() if ( !defined( $gene_lengths{$filename} ) );
        }
    }

    return \%gene_lengths;
}

sub _build__sorted_multifasta_files {
    my ($self) = @_;
    my @sorted_gene_files = sort @{ $self->multifasta_files };
    return \@sorted_gene_files;
}

sub _sequence_for_sample_from_gene_file {
    my ( $self, $sample_name, $gene_file ) = @_;

    # loop over this to get the geneIDs
    for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
        if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) {
            return $self->_gene_to_sequence->{$gene_file}->{$gene_id};
        }
    }
    return $self->_padded_string_for_gene_file($gene_file);
}

sub _padded_string_for_gene_file {
    my ( $self, $gene_file ) = @_;
    return '' unless ( defined( $self->_gene_lengths->{$gene_file} ) );
    return '-' x ( $self->_gene_lengths->{$gene_file} );
}

sub _create_merged_sequence_for_sample {
    my ( $self, $sample_name ) = @_;
    my $merged_sequence = '';
    for my $gene_file ( @{ $self->_sorted_multifasta_files } ) {
        $merged_sequence .= $self->_sequence_for_sample_from_gene_file( $sample_name, $gene_file );
    }
    return $merged_sequence;
}

sub merge_files {
    my ($self) = @_;

    for my $sample_name ( @{ $self->sample_names } ) {
        my $sequence = $self->_create_merged_sequence_for_sample($sample_name);
        my $seq_io = Bio::Seq->new( -display_id => $sample_name, -seq => $sequence );
        $self->_output_seqio_obj->write_seq($seq_io);
    }

    # Create a header file which gives the coordinates of each gene in the multifasta
    Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL->new(
        multifasta_files => $self->_sorted_multifasta_files,
        gene_lengths     => $self->_gene_lengths,
        output_filename  => $self->output_header_filename
    )->create_file();

    return 1;
}

no Moose;
	__PACKAGE__->meta->make_immutable;

1;