Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/PostAnalysis.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line source
package Bio::Roary::PostAnalysis; # ABSTRACT: Post analysis of pan genomes =head1 SYNOPSIS Create a pan genome =cut use Moose; use File::Copy; use Bio::Roary::InflateClusters; use Bio::Roary::AnalyseGroups; use Bio::Roary::GroupLabels; use Bio::Roary::AnnotateGroups; use Bio::Roary::GroupStatistics; use Bio::Roary::Output::GroupsMultifastasNucleotide; use Bio::Roary::Output::NumberOfGroups; use Bio::Roary::OrderGenes; use Bio::Roary::Output::EmblGroups; use Bio::Roary::SplitGroups; use Bio::Roary::AccessoryBinaryFasta; use Bio::Roary::External::Fasttree; use Bio::Roary::AccessoryClustering; use Bio::Roary::AssemblyStatistics; use Log::Log4perl qw(:easy); has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 ); has 'input_files' => ( is => 'rw', isa => 'ArrayRef', required => 1 ); has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' ); has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str', default => 'pan_genome.fa' ); has 'output_statistics_filename' => ( is => 'rw', isa => 'Str', default => 'gene_presence_absence.csv' ); has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 ); has 'clusters_filename' => ( is => 'rw', isa => 'Str', required => 1 ); has 'dont_delete_files' => ( is => 'ro', isa => 'Bool', default => 0 ); has 'dont_split_groups' => ( is => 'ro', isa => 'Bool', default => 0 ); has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 1 ); has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 ); has '_output_mcl_filename' => ( is => 'ro', isa => 'Str', default => '_uninflated_mcl_groups' ); has '_output_inflate_unsplit_clusters_filename' => ( is => 'ro', isa => 'Str', default => '_inflated_unsplit_mcl_groups' ); has '_output_inflate_clusters_filename' => ( is => 'ro', isa => 'Str', default => '_inflated_mcl_groups' ); has '_output_group_labels_filename' => ( is => 'ro', isa => 'Str', default => '_labeled_mcl_groups' ); has '_output_combined_filename' => ( is => 'ro', isa => 'Str', default => '_combined_files' ); has '_input_cd_hit_groups_file' => ( is => 'ro', isa => 'Str', default => '_combined_files.groups' ); has 'core_accessory_tab_output_filename' => ( is => 'ro', isa => 'Str', default => 'core_accessory.tab' ); has 'accessory_tab_output_filename' => ( is => 'ro', isa => 'Str', default => 'accessory.tab' ); has 'core_accessory_ordering_key' => ( is => 'ro', isa => 'Str', default => 'core_accessory_overall_order_filtered' ); has 'accessory_ordering_key' => ( is => 'ro', isa => 'Str', default => 'accessory_overall_order_filtered' ); has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 ); has 'pan_genome_reference_filename' => ( is => 'ro', isa => 'Str', default => 'pan_genome_reference.fa' ); has '_inflate_clusters_obj' => ( is => 'ro', isa => 'Bio::Roary::InflateClusters', lazy => 1, builder => '_build__inflate_clusters_obj' ); has '_group_labels_obj' => ( is => 'ro', isa => 'Bio::Roary::GroupLabels', lazy => 1, builder => '_build__group_labels_obj' ); has '_annotate_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', lazy => 1, builder => '_build__annotate_groups_obj' ); has '_analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', lazy => 1, builder => '_build__analyse_groups_obj' ); has '_order_genes_obj' => ( is => 'ro', isa => 'Bio::Roary::OrderGenes', lazy => 1, builder => '_build__order_genes_obj' ); has '_group_statistics_obj' => ( is => 'ro', isa => 'Bio::Roary::GroupStatistics', lazy => 1, builder => '_build__group_statistics_obj' ); has '_number_of_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::Output::NumberOfGroups', lazy => 1, builder => '_build__number_of_groups_obj' ); has '_accessory_binary_fasta' => ( is => 'ro', isa => 'Bio::Roary::AccessoryBinaryFasta', lazy => 1, builder => '_build__accessory_binary_fasta' ); has '_groups_multifastas_nuc_obj' => ( is => 'ro', isa => 'Bio::Roary::Output::GroupsMultifastasNucleotide', lazy => 1, builder => '_build__groups_multifastas_nuc_obj' ); has '_split_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::SplitGroups', lazy => 1, builder => '_build__split_groups_obj' ); has '_accessory_binary_tree' => ( is => 'ro', isa => 'Bio::Roary::External::Fasttree', lazy => 1, builder => '_build__accessory_binary_tree' ); has '_accessory_clustering' => ( is => 'ro', isa => 'Maybe[Bio::Roary::AccessoryClustering]', lazy => 1, builder => '_build__accessory_clustering' ); has '_assembly_statistics' => ( is => 'ro', isa => 'Bio::Roary::AssemblyStatistics', lazy => 1, builder => '_build__assembly_statistics' ); has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' ); sub _build_logger { my ($self) = @_; Log::Log4perl->easy_init( level => $ERROR ); my $logger = get_logger(); return $logger; } sub run { my ($self) = @_; $self->logger->info("Reinflate clusters"); $self->_inflate_clusters_obj->inflate(); $self->logger->info("Split groups with paralogs"); ## SPLIT GROUPS WITH PARALOGS ## if ( $self->dont_split_groups ) { move( $self->_output_inflate_unsplit_clusters_filename, $self->_output_inflate_clusters_filename ); } else { $self->_split_groups_obj->split_groups; } $self->logger->info("Labelling the groups"); $self->_group_labels_obj->add_labels(); $self->logger->info("Transfering the annotation to the groups"); $self->_annotate_groups_obj->reannotate; $self->logger->info("Creating accessory binary gene presence and absence fasta"); $self->_accessory_binary_fasta->create_accessory_binary_fasta; $self->logger->info("Creating accessory binary gene presence and absence tree"); $self->_accessory_binary_tree->run; $self->logger->info("Creating accessory gene presence and absence clusters"); if ( $self->_accessory_clustering ) { $self->_accessory_clustering->sample_weights; } $self->logger->info("Creating the spreadsheet with gene presence and absence"); $self->_group_statistics_obj->create_spreadsheet; $self->_group_statistics_obj->create_rtab; $self->logger->info("Creating summary statistics of the spreadsheet"); $self->_assembly_statistics->create_summary_output; $self->logger->info("Creating tab files for R"); $self->_number_of_groups_obj->create_output_files; system("create_pan_genome_plots.R") unless ( $self->dont_create_rplots == 1 ); $self->logger->info("Create EMBL files"); $self->_create_embl_files; my $clusters_not_exceeded = 1; if ( $self->output_multifasta_files ) { $self->logger->info("Creating files with the nucleotide sequences for every cluster"); $clusters_not_exceeded = $self->_groups_multifastas_nuc_obj->create_files(); } $self->_delete_intermediate_files; if ( $clusters_not_exceeded == 0 && $self->output_multifasta_files ) { $self->logger->error("Exiting early because number of clusters is too high"); exit(); } } sub _build__assembly_statistics { my ($self) = @_; return Bio::Roary::AssemblyStatistics->new( spreadsheet => $self->_group_statistics_obj->output_filename, core_definition => $self->core_definition, logger => $self->logger ); } sub _build__accessory_clustering { my ($self) = @_; if ( ( -e $self->_accessory_binary_fasta->output_filename ) && ( -s $self->_accessory_binary_fasta->output_filename > 5 ) ) { $self->logger->info( $self->_accessory_binary_fasta->output_filename ); return Bio::Roary::AccessoryClustering->new( input_file => $self->_accessory_binary_fasta->output_filename, cpus => $self->cpus, logger => $self->logger ); } else { $self->logger->info("Theres no accessory binary file so skipping accessory binary clustering"); return undef; } } sub _build__accessory_binary_tree { my ($self) = @_; return Bio::Roary::External::Fasttree->new( input_file => $self->_accessory_binary_fasta->output_filename, verbose => $self->verbose, logger => $self->logger ); } sub _build__accessory_binary_fasta { my ($self) = @_; return Bio::Roary::AccessoryBinaryFasta->new( input_files => $self->fasta_files, annotate_groups_obj => $self->_annotate_groups_obj, analyse_groups_obj => $self->_analyse_groups_obj, logger => $self->logger ); } sub _build__split_groups_obj { my ($self) = @_; return Bio::Roary::SplitGroups->new( groupfile => $self->_output_inflate_unsplit_clusters_filename, gff_files => $self->input_files, fasta_files => $self->fasta_files, outfile => $self->_output_inflate_clusters_filename, dont_delete => $self->dont_delete_files, logger => $self->logger ); } sub _build__number_of_groups_obj { my ($self) = @_; return Bio::Roary::Output::NumberOfGroups->new( group_statistics_obj => $self->_group_statistics_obj, groups_to_contigs => $self->_order_genes_obj->groups_to_contigs, annotate_groups_obj => $self->_annotate_groups_obj, core_definition => $self->core_definition, logger => $self->logger ); } sub _build__group_statistics_obj { my ($self) = @_; return Bio::Roary::GroupStatistics->new( output_filename => $self->output_statistics_filename, annotate_groups_obj => $self->_annotate_groups_obj, analyse_groups_obj => $self->_analyse_groups_obj, groups_to_contigs => $self->_order_genes_obj->groups_to_contigs, _verbose => $self->verbose_stats, logger => $self->logger ); } sub _build__order_genes_obj { my ($self) = @_; if ( defined( $self->_accessory_clustering ) ) { return Bio::Roary::OrderGenes->new( analyse_groups_obj => $self->_analyse_groups_obj, gff_files => $self->input_files, core_definition => $self->core_definition, sample_weights => $self->_accessory_clustering->sample_weights, samples_to_clusters => $self->_accessory_clustering->samples_to_clusters, logger => $self->logger ); } else { return Bio::Roary::OrderGenes->new( analyse_groups_obj => $self->_analyse_groups_obj, gff_files => $self->input_files, core_definition => $self->core_definition, logger => $self->logger ); } } sub _build__group_labels_obj { my ($self) = @_; return Bio::Roary::GroupLabels->new( groups_filename => $self->_output_inflate_clusters_filename, output_filename => $self->_output_group_labels_filename, logger => $self->logger ); } sub _build__annotate_groups_obj { my ($self) = @_; return Bio::Roary::AnnotateGroups->new( gff_files => $self->input_files, output_filename => $self->output_filename, groups_filename => $self->_output_group_labels_filename, logger => $self->logger ); } sub _build__analyse_groups_obj { my ($self) = @_; return Bio::Roary::AnalyseGroups->new( fasta_files => $self->fasta_files, groups_filename => $self->output_filename, logger => $self->logger ); } sub _build__inflate_clusters_obj { my ($self) = @_; return Bio::Roary::InflateClusters->new( clusters_filename => $self->clusters_filename, cdhit_groups_filename => $self->_input_cd_hit_groups_file, mcl_filename => $self->_output_mcl_filename, output_file => $self->_output_inflate_unsplit_clusters_filename, logger => $self->logger ); } sub _build__groups_multifastas_nuc_obj { my ($self) = @_; return Bio::Roary::Output::GroupsMultifastasNucleotide->new( output_multifasta_files => $self->output_multifasta_files, gff_files => $self->input_files, annotate_groups => $self->_annotate_groups_obj, group_names => $self->_analyse_groups_obj->_groups, group_limit => $self->group_limit, core_definition => $self->core_definition, dont_delete_files => $self->dont_delete_files, logger => $self->logger ); } sub _create_embl_files { my ($self) = @_; my $core_accessory_tab_obj = Bio::Roary::Output::EmblGroups->new( output_filename => $self->core_accessory_tab_output_filename, annotate_groups_obj => $self->_annotate_groups_obj, analyse_groups_obj => $self->_analyse_groups_obj, ordering_key => $self->core_accessory_ordering_key, groups_to_contigs => $self->_order_genes_obj->groups_to_contigs, logger => $self->logger ); $core_accessory_tab_obj->create_files; my $accessory_tab_obj = Bio::Roary::Output::EmblGroups->new( output_filename => $self->accessory_tab_output_filename, annotate_groups_obj => $self->_annotate_groups_obj, analyse_groups_obj => $self->_analyse_groups_obj, ordering_key => $self->accessory_ordering_key, groups_to_contigs => $self->_order_genes_obj->groups_to_contigs, logger => $self->logger ); $accessory_tab_obj->create_files; } sub _delete_intermediate_files { my ($self) = @_; return if ( $self->dont_delete_files == 1 ); $self->logger->info("Cleaning up files"); for my $fasta_file ( @{ $self->fasta_files } ) { unlink($fasta_file) if ( -e $fasta_file ); } unlink( $self->_output_mcl_filename ); unlink( $self->_output_inflate_clusters_filename ); unlink( $self->_output_group_labels_filename ); unlink( $self->_output_combined_filename ); unlink( $self->clusters_filename ); unlink( $self->clusters_filename . '.clstr' ); unlink( $self->clusters_filename . '.bak.clstr' ); unlink('_gff_files'); unlink('_fasta_files'); unlink('_clustered_filtered.fa'); unlink( $self->_input_cd_hit_groups_file ); unlink('database_masking.asnb'); unlink('_clustered'); unlink('_accessory_clusters'); unlink('_accessory_clusters.clstr'); } no Moose; __PACKAGE__->meta->make_immutable; 1;