Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line source
package Bio::Roary::ExtractCoreGenesFromSpreadsheet; # ABSTRACT: Take in a spreadsheet produced by the pipeline and identify the core genes. =head1 SYNOPSIS Take in a spreadsheet produced by the pipeline and identify the core genes. use Bio::Roary::ExtractCoreGenesFromSpreadsheet; my $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( spreadsheet => 'group_statistics.csv', ); $obj->ordered_core_genes(); =cut use Moose; use Text::CSV; use Bio::Roary::GroupStatistics; use POSIX; has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 ); has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV', lazy => 1, builder => '_build__csv_parser' ); has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' ); has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' ); has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 ); has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } ); has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); has '_number_of_isolates' => ( is => 'rw', isa => 'Int' ); has '_gene_column' => ( is => 'rw', isa => 'Int' ); has '_num_isolates_column' => ( is => 'rw', isa => 'Int' ); has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int' ); has '_genome_fragement_column' => ( is => 'rw', isa => 'Int' ); has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int' ); has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' ); sub _build__min_no_isolates_for_core { my ($self) = @_; my $threshold = $self->_number_of_isolates * $self->core_definition; return $threshold; } sub _build__csv_parser { my ($self) = @_; return Text::CSV->new( { binary => 1, always_quote => 1 } ); } sub _build__input_spreadsheet_fh { my ($self) = @_; open( my $fh, $self->spreadsheet ); return $fh; } sub _update_number_of_isolates { my ( $self, $header_row ) = @_; my $number_of_isolates = @{$header_row} - @{ Bio::Roary::GroupStatistics->fixed_headers }; $self->_number_of_isolates($number_of_isolates); } sub _setup_column_mappings { my ( $self, $header_row ) = @_; # current ordering my %columns_of_interest_mappings = ( 'Gene' => 0, 'No. isolates' => 3, 'Avg sequences per isolate' => 5, 'Genome Fragment' => 6, 'Order within Fragment' => 7, 'QC' => 10, ); # Dynamically overwrite the default ordering for ( my $i = 0 ; $i < @{$header_row} ; $i++ ) { for my $col_name (%columns_of_interest_mappings) { if ( $header_row->[$i] eq $col_name ) { $columns_of_interest_mappings{$col_name} = $i; last; } } } $self->_gene_column( $columns_of_interest_mappings{'Gene'} ); $self->_num_isolates_column( $columns_of_interest_mappings{'No. isolates'} ); $self->_avg_sequences_per_isolate_column( $columns_of_interest_mappings{'Avg sequences per isolate'} ); $self->_genome_fragement_column( $columns_of_interest_mappings{'Genome Fragment'} ); $self->_order_within_fragement_column( $columns_of_interest_mappings{'Order within Fragment'} ); $self->_update_number_of_isolates($header_row); # Get the sample_names my @sample_names; for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$header_row} ; $i++ ) { push( @sample_names, $header_row->[$i] ); } $self->sample_names( \@sample_names ); } sub _length_of_fixed_headers { my ($self) = @_; return @{ Bio::Roary::GroupStatistics->fixed_headers() }; } sub _populate_sample_to_gene_lookup_with_row { my ( $self, $row ) = @_; for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$row} ; $i++ ) { if ( defined( $row->[$i] ) && $row->[$i] ne "" ) { my $sample_name = $self->sample_names->[ $i - $self->_length_of_fixed_headers() ]; $self->sample_names_to_genes->{$sample_name}->{ $row->[$i] } = 1; } } return 1; } sub _ordered_core_genes { my ($self) = @_; my %ordered_genes; while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) { next if ( @{$row} < 12 ); # no genes in group next if ( !defined( $row->[ $self->_gene_column ] ) || $row->[ $self->_gene_column ] eq '' ); # no gene name next if ( !defined( $row->[ $self->_avg_sequences_per_isolate_column ] ) || $row->[ $self->_avg_sequences_per_isolate_column ] eq '' ) ; # no average next if ( !defined( $row->[ $self->_genome_fragement_column ] ) || $row->[ $self->_genome_fragement_column ] eq '' ) ; # fragment not defined # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates next if ( $row->[ $self->_num_isolates_column ] < $self->_min_no_isolates_for_core ); if ( $self->allow_paralogs ) { # should never happen next if ( $row->[ $self->_avg_sequences_per_isolate_column ] < 1 ); } else { next if ( $row->[ $self->_avg_sequences_per_isolate_column ] != 1 ); } $ordered_genes{ $row->[ $self->_genome_fragement_column ] }{ $row->[ $self->_order_within_fragement_column ] } = $row->[ $self->_gene_column ]; $self->_populate_sample_to_gene_lookup_with_row($row); } my @ordered_core_genes; for my $fragment_key ( sort { $a <=> $b } keys %ordered_genes ) { for my $order_within_fragement ( sort { $a <=> $b } keys %{ $ordered_genes{$fragment_key} } ) { push( @ordered_core_genes, $ordered_genes{$fragment_key}{$order_within_fragement} ); } } return \@ordered_core_genes; } sub _build_ordered_core_genes { my ($self) = @_; my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ); $self->_setup_column_mappings($header_row); return $self->_ordered_core_genes(); } no Moose; __PACKAGE__->meta->make_immutable; 1;