view Roary/lib/Bio/Roary/OrderGenes.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
line wrap: on
line source

package Bio::Roary::OrderGenes;

# ABSTRACT: Take in GFF files and create a matrix of what genes are beside what other genes

=head1 SYNOPSIS

Take in the analyse groups and create a matrix of what genes are beside what other genes
   use Bio::Roary::OrderGenes;
   
   my $obj = Bio::Roary::OrderGenes->new(
     analyse_groups_obj => $analyse_groups_obj,
     gff_files => ['file1.gff','file2.gff']
   );
   $obj->groups_to_contigs;

=cut

use Moose;
use Bio::Roary::Exceptions;
use Bio::Roary::AnalyseGroups;
use Bio::Roary::ContigsToGeneIDsFromGFF;
use Graph;
use Graph::Writer::Dot;
use File::Basename;

has 'gff_files'                => ( is => 'ro', isa => 'ArrayRef',                  required => 1 );
has 'analyse_groups_obj'       => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 );
has 'core_definition'          => ( is => 'ro', isa => 'Num',                       default  => 1.0 );
has 'pan_graph_filename'       => ( is => 'ro', isa => 'Str',                       default  => 'core_accessory_graph.dot' );
has 'accessory_graph_filename' => ( is => 'ro', isa => 'Str',                       default  => 'accessory_graph.dot' );
has 'sample_weights'           => ( is => 'ro', isa => 'Maybe[HashRef]' );
has 'samples_to_clusters'      => ( is => 'ro', isa => 'Maybe[HashRef]' );
has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order' );
has 'groups_to_sample_names' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
has 'group_graphs'            => ( is => 'ro', isa => 'Graph',   lazy => 1, builder => '_build_group_graphs' );
has 'groups_to_contigs'       => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs' );
has '_groups_to_file_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_file_contigs' );
has '_groups'                 => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups' );
has 'number_of_files'         => ( is => 'ro', isa => 'Int',     lazy => 1, builder => '_build_number_of_files' );
has '_groups_qc' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
has '_percentage_of_largest_weak_threshold' => ( is => 'ro', isa => 'Num', default => 0.9 );

sub _build_number_of_files {
    my ($self) = @_;
    return @{ $self->gff_files };
}

sub _build_groups {
    my ($self) = @_;
    my %groups;
    for my $group_name ( @{ $self->analyse_groups_obj->_groups } ) {
        $groups{$group_name}++;
    }
    return \%groups;
}

sub _build__groups_to_file_contigs {
    my ($self) = @_;

    my @overlapping_hypothetical_gene_ids;
    my %samples_to_groups_contigs;

    # Open each GFF file
    for my $filename ( @{ $self->gff_files } ) {
        my @groups_to_contigs;
        my $contigs_to_ids_obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( gff_file => $filename );

        my ( $sample_name, $directories, $suffix ) = fileparse($filename);
        $sample_name =~ s/\.gff//gi;

        # Loop over each contig in the GFF file
        for my $contig_name ( keys %{ $contigs_to_ids_obj->contig_to_ids } ) {
            my @groups_on_contig;

            # loop over each gene in each contig in the GFF file
            for my $gene_id ( @{ $contigs_to_ids_obj->contig_to_ids->{$contig_name} } ) {

                # convert to group name
                my $group_name = $self->analyse_groups_obj->_genes_to_groups->{$gene_id};
                next unless ( defined($group_name) );

                if ( $contigs_to_ids_obj->overlapping_hypothetical_protein_ids->{$gene_id} ) {
                    $self->_groups_qc->{$group_name} =
'Hypothetical protein with no hits to refseq/uniprot/clusters/cdd/tigrfams/pfam overlapping another protein with hits';
                }
                push( @groups_on_contig, $group_name );
            }
            push( @groups_to_contigs, \@groups_on_contig );
        }
        $samples_to_groups_contigs{$sample_name} = \@groups_to_contigs;
    }

    return \%samples_to_groups_contigs;

}

sub _build_group_order {
    my ($self) = @_;
    my %group_order;

    my %groups_to_sample_names;
    for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
        my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};
        for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
            for ( my $i = 1 ; $i < @{$groups_on_contig} ; $i++ ) {
                my $group_from = $groups_on_contig->[ $i - 1 ];
                my $group_to   = $groups_on_contig->[$i];

                if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
                    $group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
                    push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
                }
                else {
                    $group_order{$group_from}{$group_to}++;
                }
            }
            if ( @{$groups_on_contig} == 1 ) {
                my $group_from = $groups_on_contig->[0];
                my $group_to   = $groups_on_contig->[0];
                if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
                    $group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
                    push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
                }
                else {
                    $group_order{$group_from}{$group_to}++;
                }
            }
        }
    }

    $self->groups_to_sample_names( \%groups_to_sample_names );
    return \%group_order;
}

sub _build_group_graphs {
    my ($self) = @_;
    return Graph->new( undirected => 1 );
}

sub _save_graph_to_file {
    my ( $self, $graph, $output_filename ) = @_;
    my $writer = Graph::Writer::Dot->new();
    $writer->write_graph( $graph, $output_filename );
    return 1;
}

sub _add_groups_to_graph {
    my ($self) = @_;

    for my $current_group ( keys %{ $self->group_order() } ) {
        for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
            my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
            $self->group_graphs->add_weighted_edge( $current_group, $group_to, $weight );
        }
    }

}

sub _reorder_connected_components {
    my ( $self, $graph_groups ) = @_;
    my @ordered_graph_groups;
    my @paths_and_weights;

    for my $graph_group ( @{$graph_groups} ) {
        my %groups;
        $groups{$_}++ for ( @{$graph_group} );
        my $edge_sum = 0;

        for my $current_group ( keys %groups ) {
            for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
                next unless defined( $groups{$group_to} );
                $edge_sum += $self->group_order->{$current_group}->{$group_to};
            }
        }

        my %samples_in_graph;
        for my $current_group ( keys %groups ) {
            my $sample_names = $self->groups_to_sample_names->{$current_group};
            if ( defined($sample_names) ) {
                for my $sample_name ( @{$sample_names} ) {
                    $samples_in_graph{$sample_name}++;
                }
            }
        }
        my @sample_names = sort keys %samples_in_graph;

        if ( @{$graph_group} == 1 ) {

            push(
                @paths_and_weights,
                {
                    path           => $graph_group,
                    average_weight => $edge_sum,
                    sample_names   => \@sample_names
                }
            );
        }
        else {
            my $graph = Graph->new( undirected => 1 );
            for my $current_group ( keys %groups ) {
                for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
                    if ( $groups{$group_to} ) {
                        my $weight = 1 / $self->group_order->{$current_group}->{$group_to};
                        $graph->add_weighted_edge( $current_group, $group_to, $weight );
                    }
                }
            }
            my $minimum_spanning_tree = $graph->minimum_spanning_tree;
            my $dfs_obj               = Graph::Traversal::DFS->new($minimum_spanning_tree);
            my @reordered_dfs_groups  = $dfs_obj->dfs;
            push(
                @paths_and_weights,
                {
                    path           => \@reordered_dfs_groups,
                    average_weight => $edge_sum,
                    sample_names   => \@sample_names
                }
            );
        }

    }

    return $self->_order_by_samples_and_weights( \@paths_and_weights );
}

sub _order_by_samples_and_weights {
    my ( $self, $paths_and_weights ) = @_;

    my @ordered_graph_groups;
    if ( !defined( $self->samples_to_clusters ) ) {
        my @ordered_paths_and_weights = sort { $a->{average_weight} <=> $b->{average_weight} } @{$paths_and_weights};
        @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
        return \@ordered_graph_groups;
    }

    # Find the largest cluster in each graph and regroup
    my %largest_cluster_to_paths_and_weights;
    for my $graph_details ( @{$paths_and_weights} ) {
        my %cluster_count;
        for my $sample_name ( @{ $graph_details->{sample_names} } ) {
            if ( defined( $self->samples_to_clusters->{$sample_name} ) ) {
                $cluster_count{ $self->samples_to_clusters->{$sample_name} }++;
            }
        }
        my $largest_cluster = ( sort { $cluster_count{$b} <=> $cluster_count{$a} || $a cmp $b} keys %cluster_count )[0];
        if ( !defined($largest_cluster) ) {
            my @ordered_paths_and_weights = sort { $b->{average_weight} <=> $a->{average_weight} } @{$paths_and_weights};
            @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
            return \@ordered_graph_groups;
        }

        push( @{ $largest_cluster_to_paths_and_weights{$largest_cluster}{graph_details} }, $graph_details );
        $largest_cluster_to_paths_and_weights{$largest_cluster}{largest_cluster_size} += $cluster_count{$largest_cluster};
    }

    # go through each cluster group and order by weight
    my @clustered_ordered_graph_groups;
    for my $cluster_name (
        sort {
            $largest_cluster_to_paths_and_weights{$b}->{largest_cluster_size}
              <=> $largest_cluster_to_paths_and_weights{$a}->{largest_cluster_size}
        } keys %largest_cluster_to_paths_and_weights
      )
    {
		
        my @ordered_paths_and_weights =
          sort { $b->{average_weight} <=> $a->{average_weight} } @{ $largest_cluster_to_paths_and_weights{$cluster_name}->{graph_details} };
        @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;

        for my $graph_group (@ordered_graph_groups) {
            push( @clustered_ordered_graph_groups, $graph_group );
        }
    }
    return \@clustered_ordered_graph_groups;
}

sub _build_groups_to_contigs {
    my ($self) = @_;
    $self->_add_groups_to_graph;

    my %groups_to_contigs;
    my $counter          = 1;
    my $overall_counter  = 1;
    my $counter_filtered = 1;

    # Accessory
    my $accessory_graph  = $self->_create_accessory_graph;
    my @group_graphs     = $accessory_graph->connected_components();
    my $reordered_graphs = $self->_reorder_connected_components( \@group_graphs );

    $self->_save_graph_to_file( $accessory_graph, $self->accessory_graph_filename );

    for my $contig_groups ( @{$reordered_graphs} ) {
        my $order_counter = 1;

        for my $group_name ( @{$contig_groups} ) {
            $groups_to_contigs{$group_name}{accessory_label}           = $counter;
            $groups_to_contigs{$group_name}{accessory_order}           = $order_counter;
            $groups_to_contigs{$group_name}{'accessory_overall_order'} = $overall_counter;
            $order_counter++;
            $overall_counter++;
        }
        $counter++;
    }

    # Core + accessory
    my @group_graphs_all     = $self->group_graphs->connected_components();
    my $reordered_graphs_all = $self->_reorder_connected_components( \@group_graphs_all );
    $self->_save_graph_to_file( $self->group_graphs, $self->pan_graph_filename );

    $overall_counter  = 1;
    $counter          = 1;
    $counter_filtered = 1;
    for my $contig_groups ( @{$reordered_graphs_all} ) {
        my $order_counter = 1;

        for my $group_name ( @{$contig_groups} ) {
            $groups_to_contigs{$group_name}{label}                          = $counter;
            $groups_to_contigs{$group_name}{comment}                        = '';
            $groups_to_contigs{$group_name}{order}                          = $order_counter;
            $groups_to_contigs{$group_name}{'core_accessory_overall_order'} = $overall_counter;

            if ( @{$contig_groups} <= 2 ) {
                $groups_to_contigs{$group_name}{comment} = 'Investigate';
            }
            elsif ( $self->_groups_qc->{$group_name} ) {
                $groups_to_contigs{$group_name}{comment} = $self->_groups_qc->{$group_name};
            }
            else {
                $groups_to_contigs{$group_name}{'core_accessory_overall_order_filtered'} = $counter_filtered;
                $counter_filtered++;
            }
            $order_counter++;
            $overall_counter++;
        }
        $counter++;
    }

    $counter_filtered = 1;
    for my $contig_groups ( @{$reordered_graphs} ) {
        for my $group_name ( @{$contig_groups} ) {
            if (   ( !defined( $groups_to_contigs{$group_name}{comment} ) )
                || ( defined( $groups_to_contigs{$group_name}{comment} ) && $groups_to_contigs{$group_name}{comment} eq '' ) )
            {
                $groups_to_contigs{$group_name}{'accessory_overall_order_filtered'} = $counter_filtered;
                $counter_filtered++;
            }
        }
    }

    return \%groups_to_contigs;
}

sub _create_accessory_graph {
    my ($self) = @_;
    my $graph = Graph->new( undirected => 1 );

    my %core_groups;
    my %group_freq;

    for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
        my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};

        for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
            for my $current_group ( @{$groups_on_contig} ) {
                $group_freq{$current_group}++;
            }
        }
    }

    for my $current_group ( keys %{ $self->group_order() } ) {
        next if ( $group_freq{$current_group} >= ( $self->number_of_files * $self->core_definition ) );
		
        for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
            if ( $group_freq{$group_to} >= ( $self->number_of_files * $self->core_definition ) ) {
                $graph->add_vertex($current_group);
            }
            else {
                my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
                $graph->add_weighted_edge( $current_group, $group_to, $weight );
            }
        }
    }

    return $graph;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;