Mercurial > repos > dereeper > roary_plots
comparison Roary/lib/Bio/Roary/AssemblyStatistics.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c47a5f61bc9f |
---|---|
1 package Bio::Roary::AssemblyStatistics; | |
2 | |
3 # ABSTRACT: Given a spreadsheet of gene presence and absence calculate some statistics | |
4 | |
5 =head1 SYNOPSIS | |
6 | |
7 Given a spreadsheet of gene presence and absence calculate some statistics | |
8 | |
9 =cut | |
10 | |
11 use Moose; | |
12 use Bio::Roary::ExtractCoreGenesFromSpreadsheet; | |
13 use Log::Log4perl qw(:easy); | |
14 with 'Bio::Roary::SpreadsheetRole'; | |
15 | |
16 has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'assembly_statistics.csv' ); | |
17 has 'job_runner' => ( is => 'ro', isa => 'Str', default => 'Local' ); | |
18 has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 ); | |
19 has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); | |
20 has '_cloud_percentage' => ( is => 'rw', isa => 'Num', default => 0.15 ); | |
21 has '_shell_percentage' => ( is => 'rw', isa => 'Num', default => 0.95 ); | |
22 has '_soft_core_percentage' => ( is => 'rw', isa => 'Num', default => 0.99 ); | |
23 has 'verbose' => ( is => 'ro', isa => 'Bool', default => 0 ); | |
24 has 'contiguous_window' => ( is => 'ro', isa => 'Int', default => 10 ); | |
25 has 'ordered_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_genes' ); | |
26 has '_genes_to_rows' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_rows' ); | |
27 has 'all_sample_statistics' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_all_sample_statistics' ); | |
28 has 'sample_names_to_column_index' => ( is => 'rw', isa => 'Maybe[HashRef]' ); | |
29 has 'summary_output_filename'=> ( is => 'ro', isa => 'Str', default => 'summary_statistics.txt' ); | |
30 has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger'); | |
31 has 'gene_category_count' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_gene_category_count' ); | |
32 | |
33 sub BUILD { | |
34 my ($self) = @_; | |
35 $self->_genes_to_rows; | |
36 $self->gene_category_count; | |
37 } | |
38 | |
39 sub _build_logger | |
40 { | |
41 my ($self) = @_; | |
42 Log::Log4perl->easy_init( $ERROR ); | |
43 my $logger = get_logger(); | |
44 return $logger; | |
45 } | |
46 | |
47 sub create_summary_output | |
48 { | |
49 my ($self) = @_; | |
50 open(my $fh, '>', $self->summary_output_filename) or Bio::Roary::Exceptions::CouldntWriteToFile->throw(error => "Couldnt write to ".$self->summary_output_filename); | |
51 | |
52 my $core_percentage = $self->core_definition()*100; | |
53 my $soft_core_percentage = $self->_soft_core_percentage*100; | |
54 my $shell_percentage = $self->_shell_percentage()*100; | |
55 my $cloud_percentage = $self->_cloud_percentage()*100; | |
56 | |
57 my $core_genes = ($self->gene_category_count->{core} ? $self->gene_category_count->{core} : 0); | |
58 my $soft_core_genes = ($self->gene_category_count->{soft_core} ? $self->gene_category_count->{soft_core} : 0); | |
59 my $shell_genes =($self->gene_category_count->{shell} ? $self->gene_category_count->{shell} : 0); | |
60 my $cloud_genes = ($self->gene_category_count->{cloud} ? $self->gene_category_count->{cloud} : 0); | |
61 my $total_genes = $core_genes + $soft_core_genes + $shell_genes + $cloud_genes ; | |
62 | |
63 $self->logger->warn("Very few core genes detected with the current settings. Try modifying the core definition ( -cd 90 ) and/or | |
64 the blast identity (-i 70) parameters. Also try checking for contamination (-qc) and ensure you only have one species.") if($core_genes < 100); | |
65 | |
66 print {$fh} "Core genes\t($core_percentage".'% <= strains <= 100%)'."\t$core_genes\n"; | |
67 print {$fh} "Soft core genes\t(".$shell_percentage."% <= strains < ".$soft_core_percentage."%)\t$soft_core_genes\n"; | |
68 print {$fh} "Shell genes\t(".$cloud_percentage."% <= strains < ".$shell_percentage."%)\t$shell_genes\n"; | |
69 print {$fh} "Cloud genes\t(0% <= strains < ".$cloud_percentage."%)\t$cloud_genes\n"; | |
70 print {$fh} "Total genes\t(0% <= strains <= 100%)\t$total_genes\n"; | |
71 | |
72 close($fh); | |
73 return 1; | |
74 } | |
75 | |
76 sub _build_gene_category_count { | |
77 my ($self) = @_; | |
78 my %gene_category_count; | |
79 $self->_soft_core_percentage($self->core_definition); | |
80 | |
81 if ( $self->_soft_core_percentage <= $self->_shell_percentage ) { | |
82 $self->_shell_percentage( $self->_soft_core_percentage - 0.01 ); | |
83 } | |
84 | |
85 my $number_of_samples = keys %{ $self->sample_names_to_column_index }; | |
86 for my $gene_name ( keys %{ $self->_genes_to_rows } ) { | |
87 my $isolates_with_gene = 0; | |
88 | |
89 for ( my $i = $self->_num_fixed_headers ; $i < @{ $self->_genes_to_rows->{$gene_name} } ; $i++ ) { | |
90 $isolates_with_gene++ | |
91 if ( defined( $self->_genes_to_rows->{$gene_name}->[$i] ) && $self->_genes_to_rows->{$gene_name}->[$i] ne "" ); | |
92 } | |
93 | |
94 if ( $isolates_with_gene < $self->_cloud_percentage() * $number_of_samples ) { | |
95 $gene_category_count{cloud}++; | |
96 } | |
97 elsif ( $isolates_with_gene < $self->_shell_percentage() * $number_of_samples ) { | |
98 $gene_category_count{shell}++; | |
99 } | |
100 elsif ( $isolates_with_gene < $self->_soft_core_percentage() * $number_of_samples ) { | |
101 $gene_category_count{soft_core}++; | |
102 } | |
103 else { | |
104 $gene_category_count{core}++; | |
105 } | |
106 } | |
107 return \%gene_category_count; | |
108 } | |
109 | |
110 sub _build_ordered_genes { | |
111 my ($self) = @_; | |
112 return Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( spreadsheet => $self->spreadsheet, core_definition => $self->core_definition ) | |
113 ->ordered_core_genes(); | |
114 } | |
115 | |
116 sub _build__genes_to_rows { | |
117 my ($self) = @_; | |
118 | |
119 my %genes_to_rows; | |
120 seek( $self->_input_spreadsheet_fh, 0, 0 ); | |
121 my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ); | |
122 $self->_populate_sample_names_to_column_index($header_row); | |
123 | |
124 while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) { | |
125 next if ( !defined( $row->[0] ) || $row->[0] eq "" ); | |
126 $genes_to_rows{ $row->[0] } = $row; | |
127 } | |
128 | |
129 return \%genes_to_rows; | |
130 } | |
131 | |
132 sub _populate_sample_names_to_column_index { | |
133 my ( $self, $row ) = @_; | |
134 | |
135 my %samples_to_index; | |
136 for ( my $i = $self->_num_fixed_headers ; $i < @{$row} ; $i++ ) { | |
137 next if ( ( !defined( $row->[$i] ) ) || $row->[$i] eq "" ); | |
138 $samples_to_index{ $row->[$i] } = $i; | |
139 } | |
140 $self->sample_names_to_column_index( \%samples_to_index ); | |
141 } | |
142 | |
143 sub _build_all_sample_statistics { | |
144 my ($self) = @_; | |
145 | |
146 my %sample_stats; | |
147 | |
148 # For each sample - loop over genes in order - number of contiguous blocks - max size of contiguous block - n50 - incorrect joins | |
149 for my $sample_name ( sort keys %{ $self->sample_names_to_column_index } ) { | |
150 $sample_stats{$sample_name} = $self->_sample_statistics($sample_name); | |
151 } | |
152 return \%sample_stats; | |
153 } | |
154 | |
155 sub _sample_statistics { | |
156 my ( $self, $sample_name ) = @_; | |
157 | |
158 my $sample_column_index = $self->sample_names_to_column_index->{$sample_name}; | |
159 my @gene_ids; | |
160 for my $gene_name ( @{ $self->ordered_genes } ) { | |
161 my $sample_gene_id = $self->_genes_to_rows->{$gene_name}->[$sample_column_index]; | |
162 next unless ( defined($sample_gene_id) ); | |
163 | |
164 if ( $sample_gene_id =~ /_([\d]+)$/ ) { | |
165 my $gene_number = $1; | |
166 push( @gene_ids, $gene_number ); | |
167 } | |
168 else { | |
169 next; | |
170 } | |
171 } | |
172 | |
173 return $self->_number_of_contiguous_blocks( \@gene_ids ); | |
174 } | |
175 | |
176 sub _number_of_contiguous_blocks { | |
177 my ( $self, $gene_ids ) = @_; | |
178 | |
179 my $current_gene_id = $gene_ids->[0]; | |
180 my $number_of_blocks = 1; | |
181 my $largest_block_size = 0; | |
182 my $block_size = 0; | |
183 for my $gene_id ( @{$gene_ids} ) { | |
184 if ( !( ( $current_gene_id + $self->contiguous_window >= $gene_id ) && ( $current_gene_id - $self->contiguous_window <= $gene_id ) ) | |
185 ) | |
186 { | |
187 if ( $block_size >= $largest_block_size ) { | |
188 $largest_block_size = $block_size; | |
189 $block_size = 0; | |
190 } | |
191 $number_of_blocks++; | |
192 } | |
193 $current_gene_id = $gene_id; | |
194 $block_size++; | |
195 } | |
196 | |
197 if ( $block_size > $largest_block_size ) { | |
198 $largest_block_size = $block_size; | |
199 } | |
200 return { num_blocks => $number_of_blocks, largest_block_size => $largest_block_size }; | |
201 } | |
202 | |
203 no Moose; | |
204 __PACKAGE__->meta->make_immutable; | |
205 | |
206 1; | |
207 |