0
|
1 package Bio::Roary::AssemblyStatistics;
|
|
2
|
|
3 # ABSTRACT: Given a spreadsheet of gene presence and absence calculate some statistics
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 Given a spreadsheet of gene presence and absence calculate some statistics
|
|
8
|
|
9 =cut
|
|
10
|
|
11 use Moose;
|
|
12 use Bio::Roary::ExtractCoreGenesFromSpreadsheet;
|
|
13 use Log::Log4perl qw(:easy);
|
|
14 with 'Bio::Roary::SpreadsheetRole';
|
|
15
|
|
16 has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'assembly_statistics.csv' );
|
|
17 has 'job_runner' => ( is => 'ro', isa => 'Str', default => 'Local' );
|
|
18 has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 );
|
|
19 has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 );
|
|
20 has '_cloud_percentage' => ( is => 'rw', isa => 'Num', default => 0.15 );
|
|
21 has '_shell_percentage' => ( is => 'rw', isa => 'Num', default => 0.95 );
|
|
22 has '_soft_core_percentage' => ( is => 'rw', isa => 'Num', default => 0.99 );
|
|
23 has 'verbose' => ( is => 'ro', isa => 'Bool', default => 0 );
|
|
24 has 'contiguous_window' => ( is => 'ro', isa => 'Int', default => 10 );
|
|
25 has 'ordered_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_genes' );
|
|
26 has '_genes_to_rows' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_rows' );
|
|
27 has 'all_sample_statistics' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_all_sample_statistics' );
|
|
28 has 'sample_names_to_column_index' => ( is => 'rw', isa => 'Maybe[HashRef]' );
|
|
29 has 'summary_output_filename'=> ( is => 'ro', isa => 'Str', default => 'summary_statistics.txt' );
|
|
30 has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger');
|
|
31 has 'gene_category_count' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_gene_category_count' );
|
|
32
|
|
33 sub BUILD {
|
|
34 my ($self) = @_;
|
|
35 $self->_genes_to_rows;
|
|
36 $self->gene_category_count;
|
|
37 }
|
|
38
|
|
39 sub _build_logger
|
|
40 {
|
|
41 my ($self) = @_;
|
|
42 Log::Log4perl->easy_init( $ERROR );
|
|
43 my $logger = get_logger();
|
|
44 return $logger;
|
|
45 }
|
|
46
|
|
47 sub create_summary_output
|
|
48 {
|
|
49 my ($self) = @_;
|
|
50 open(my $fh, '>', $self->summary_output_filename) or Bio::Roary::Exceptions::CouldntWriteToFile->throw(error => "Couldnt write to ".$self->summary_output_filename);
|
|
51
|
|
52 my $core_percentage = $self->core_definition()*100;
|
|
53 my $soft_core_percentage = $self->_soft_core_percentage*100;
|
|
54 my $shell_percentage = $self->_shell_percentage()*100;
|
|
55 my $cloud_percentage = $self->_cloud_percentage()*100;
|
|
56
|
|
57 my $core_genes = ($self->gene_category_count->{core} ? $self->gene_category_count->{core} : 0);
|
|
58 my $soft_core_genes = ($self->gene_category_count->{soft_core} ? $self->gene_category_count->{soft_core} : 0);
|
|
59 my $shell_genes =($self->gene_category_count->{shell} ? $self->gene_category_count->{shell} : 0);
|
|
60 my $cloud_genes = ($self->gene_category_count->{cloud} ? $self->gene_category_count->{cloud} : 0);
|
|
61 my $total_genes = $core_genes + $soft_core_genes + $shell_genes + $cloud_genes ;
|
|
62
|
|
63 $self->logger->warn("Very few core genes detected with the current settings. Try modifying the core definition ( -cd 90 ) and/or
|
|
64 the blast identity (-i 70) parameters. Also try checking for contamination (-qc) and ensure you only have one species.") if($core_genes < 100);
|
|
65
|
|
66 print {$fh} "Core genes\t($core_percentage".'% <= strains <= 100%)'."\t$core_genes\n";
|
|
67 print {$fh} "Soft core genes\t(".$shell_percentage."% <= strains < ".$soft_core_percentage."%)\t$soft_core_genes\n";
|
|
68 print {$fh} "Shell genes\t(".$cloud_percentage."% <= strains < ".$shell_percentage."%)\t$shell_genes\n";
|
|
69 print {$fh} "Cloud genes\t(0% <= strains < ".$cloud_percentage."%)\t$cloud_genes\n";
|
|
70 print {$fh} "Total genes\t(0% <= strains <= 100%)\t$total_genes\n";
|
|
71
|
|
72 close($fh);
|
|
73 return 1;
|
|
74 }
|
|
75
|
|
76 sub _build_gene_category_count {
|
|
77 my ($self) = @_;
|
|
78 my %gene_category_count;
|
|
79 $self->_soft_core_percentage($self->core_definition);
|
|
80
|
|
81 if ( $self->_soft_core_percentage <= $self->_shell_percentage ) {
|
|
82 $self->_shell_percentage( $self->_soft_core_percentage - 0.01 );
|
|
83 }
|
|
84
|
|
85 my $number_of_samples = keys %{ $self->sample_names_to_column_index };
|
|
86 for my $gene_name ( keys %{ $self->_genes_to_rows } ) {
|
|
87 my $isolates_with_gene = 0;
|
|
88
|
|
89 for ( my $i = $self->_num_fixed_headers ; $i < @{ $self->_genes_to_rows->{$gene_name} } ; $i++ ) {
|
|
90 $isolates_with_gene++
|
|
91 if ( defined( $self->_genes_to_rows->{$gene_name}->[$i] ) && $self->_genes_to_rows->{$gene_name}->[$i] ne "" );
|
|
92 }
|
|
93
|
|
94 if ( $isolates_with_gene < $self->_cloud_percentage() * $number_of_samples ) {
|
|
95 $gene_category_count{cloud}++;
|
|
96 }
|
|
97 elsif ( $isolates_with_gene < $self->_shell_percentage() * $number_of_samples ) {
|
|
98 $gene_category_count{shell}++;
|
|
99 }
|
|
100 elsif ( $isolates_with_gene < $self->_soft_core_percentage() * $number_of_samples ) {
|
|
101 $gene_category_count{soft_core}++;
|
|
102 }
|
|
103 else {
|
|
104 $gene_category_count{core}++;
|
|
105 }
|
|
106 }
|
|
107 return \%gene_category_count;
|
|
108 }
|
|
109
|
|
110 sub _build_ordered_genes {
|
|
111 my ($self) = @_;
|
|
112 return Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( spreadsheet => $self->spreadsheet, core_definition => $self->core_definition )
|
|
113 ->ordered_core_genes();
|
|
114 }
|
|
115
|
|
116 sub _build__genes_to_rows {
|
|
117 my ($self) = @_;
|
|
118
|
|
119 my %genes_to_rows;
|
|
120 seek( $self->_input_spreadsheet_fh, 0, 0 );
|
|
121 my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh );
|
|
122 $self->_populate_sample_names_to_column_index($header_row);
|
|
123
|
|
124 while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) {
|
|
125 next if ( !defined( $row->[0] ) || $row->[0] eq "" );
|
|
126 $genes_to_rows{ $row->[0] } = $row;
|
|
127 }
|
|
128
|
|
129 return \%genes_to_rows;
|
|
130 }
|
|
131
|
|
132 sub _populate_sample_names_to_column_index {
|
|
133 my ( $self, $row ) = @_;
|
|
134
|
|
135 my %samples_to_index;
|
|
136 for ( my $i = $self->_num_fixed_headers ; $i < @{$row} ; $i++ ) {
|
|
137 next if ( ( !defined( $row->[$i] ) ) || $row->[$i] eq "" );
|
|
138 $samples_to_index{ $row->[$i] } = $i;
|
|
139 }
|
|
140 $self->sample_names_to_column_index( \%samples_to_index );
|
|
141 }
|
|
142
|
|
143 sub _build_all_sample_statistics {
|
|
144 my ($self) = @_;
|
|
145
|
|
146 my %sample_stats;
|
|
147
|
|
148 # For each sample - loop over genes in order - number of contiguous blocks - max size of contiguous block - n50 - incorrect joins
|
|
149 for my $sample_name ( sort keys %{ $self->sample_names_to_column_index } ) {
|
|
150 $sample_stats{$sample_name} = $self->_sample_statistics($sample_name);
|
|
151 }
|
|
152 return \%sample_stats;
|
|
153 }
|
|
154
|
|
155 sub _sample_statistics {
|
|
156 my ( $self, $sample_name ) = @_;
|
|
157
|
|
158 my $sample_column_index = $self->sample_names_to_column_index->{$sample_name};
|
|
159 my @gene_ids;
|
|
160 for my $gene_name ( @{ $self->ordered_genes } ) {
|
|
161 my $sample_gene_id = $self->_genes_to_rows->{$gene_name}->[$sample_column_index];
|
|
162 next unless ( defined($sample_gene_id) );
|
|
163
|
|
164 if ( $sample_gene_id =~ /_([\d]+)$/ ) {
|
|
165 my $gene_number = $1;
|
|
166 push( @gene_ids, $gene_number );
|
|
167 }
|
|
168 else {
|
|
169 next;
|
|
170 }
|
|
171 }
|
|
172
|
|
173 return $self->_number_of_contiguous_blocks( \@gene_ids );
|
|
174 }
|
|
175
|
|
176 sub _number_of_contiguous_blocks {
|
|
177 my ( $self, $gene_ids ) = @_;
|
|
178
|
|
179 my $current_gene_id = $gene_ids->[0];
|
|
180 my $number_of_blocks = 1;
|
|
181 my $largest_block_size = 0;
|
|
182 my $block_size = 0;
|
|
183 for my $gene_id ( @{$gene_ids} ) {
|
|
184 if ( !( ( $current_gene_id + $self->contiguous_window >= $gene_id ) && ( $current_gene_id - $self->contiguous_window <= $gene_id ) )
|
|
185 )
|
|
186 {
|
|
187 if ( $block_size >= $largest_block_size ) {
|
|
188 $largest_block_size = $block_size;
|
|
189 $block_size = 0;
|
|
190 }
|
|
191 $number_of_blocks++;
|
|
192 }
|
|
193 $current_gene_id = $gene_id;
|
|
194 $block_size++;
|
|
195 }
|
|
196
|
|
197 if ( $block_size > $largest_block_size ) {
|
|
198 $largest_block_size = $block_size;
|
|
199 }
|
|
200 return { num_blocks => $number_of_blocks, largest_block_size => $largest_block_size };
|
|
201 }
|
|
202
|
|
203 no Moose;
|
|
204 __PACKAGE__->meta->make_immutable;
|
|
205
|
|
206 1;
|
|
207
|