Mercurial > repos > dereeper > pangenome_explorer
changeset 14:5a5c9a6b047b draft
Uploaded
line wrap: on
line diff
--- a/PanExplorer.xml Thu May 30 20:07:55 2024 +0000 +++ b/PanExplorer.xml Tue Dec 10 16:20:53 2024 +0000 @@ -1,4 +1,4 @@ -<tool id="PanExplorer" name="PanExplorer" version="1.0"> +<tool id="PanExplorer2" name="PanExplorer2" version="2.0"> <description> Bacterial pan-genome analysis </description> <requirements> <!-- @@ -17,7 +17,14 @@ export PANEX_PATH=${__tool_directory__}; -perl ${__tool_directory__}/Perl/generateConfig.pl '$private_genomes' '$input' config.yaml '$private_genomes_fasta'; + +#if str($mode.mode) == "accessions": + perl ${__tool_directory__}/Perl/generateConfig.pl 'None' '$input' config.yaml 'None'; +#else if str($mode.mode) == "genbanks": + perl ${__tool_directory__}/Perl/generateConfig.pl '$private_genomes' 'None' config.yaml 'None'; +#else if str($mode.mode) == "fasta": + perl ${__tool_directory__}/Perl/generateConfig.pl '$private_genomes' 'None' config.yaml '$private_genomes_fasta'; +#end if cat config.yaml >$logfile; @@ -74,18 +81,40 @@ <inputs> - <param name="input" type="text" multiple="true" label="List of genbank identifiers" help="Coma separated list (ex: CP000235.1,CP001079.1,CP001759.1,CP015994.2)"/> - <param name="private_genomes" type="data" format="zip" label="Zip of genbank or GFF files" optional="true"/> - <param name="private_genomes_fasta" type="data" format="zip" label="Zip of Fasta files" optional="true"/> + <conditional name="mode"> + <param name = "mode" type="select" label="What is your inputs?"> + <option value="accessions">Prokaryote genomes: List of Genbank assembly accessions (GCA)</option> + <option value="genbanks">Prokaryote genomes: Genbank files</option> + <option value="fasta">Eukaryote genomes: FASTA + GFF files</option> + </param> + <when value="accessions"> + <param name="input" type="text" multiple="true" label="List of genbank identifiers" help="Coma separated list (ex: GCA_000007385.1,GCA_000010025.1,GCA_000019585.2)"/> + <param type="select" name="software" label="Choose the pan-genome software"> + <option 
value="roary">Roary</option> + <option value="panacota">PanACoTA</option> + <option value="pggb">PanGenome Graph Builder (PGGB)</option> + </param> + </when> + <when value="genbanks"> + <param name="private_genomes" type="data" format="zip" label="Zip of genbank files" optional="true"/> + <param type="select" name="software" label="Choose the pan-genome software"> + <option value="roary">Roary</option> + <option value="panacota">PanACoTA</option> + <option value="pggb">PanGenome Graph Builder (PGGB)</option> + </param> + </when> + <when value="fasta"> + <param name="private_genomes_fasta" type="data" format="zip" label="Zip of Fasta files" optional="true"/> + <param name="private_genomes" type="data" format="zip" label="Zip of GFF files" optional="true"/> + <param type="select" name="software" label="Choose the pan-genome software"> + <option value="orthofinder">OrthoFinder</option> + <option value="cactus">Minigraph-Cactus</option> + <option value="pggb">PanGenome Graph Builder (PGGB)</option> + </param> + </when> + </conditional> + <param name="min_identity" type="text" value="80" label="Minimum percentage identity for BlastP" /> - <param type="select" name="software" label="Choose the pan-genome software"> - <option value="pgap">PGAP</option> - <option value="roary">Roary</option> - <option value="panacota">PanACoTA</option> - <option value="orthofinder">OrthoFinder</option> - <option value="cactus">Minigraph-Cactus</option> - <option value="pggb">PanGenome Graph Builder (PGGB)</option> - </param> </inputs> <outputs> @@ -110,5 +139,64 @@ <data format="txt" name="logfile" label="Logfile"/> <data format="txt" name="roary_log" label="Roary Logfile"/> </outputs> +<tests> + <test> + <param name="input" value="GCA_000007385.1,GCA_000010025.1,GCA_000019585.2"/> + <param name="min_identity" value="80"/> + <param name="software" value="panacota"/> + <param name="private_genomes" value=""/> + <param name="private_genomes_fasta" value=""/> + <output name="distance_matrix" 
value="Accessory_based_distance_matrix.txt"/> + <output name="fastani" value="ANI.txt"/> + </test> + </tests> + <help> + +PanExplorer +=========== + + PanExplorer workflow is a Snakemake workflow that can be run in the backend of the PanExplorer web application. + + Homepage: https://panexplorer.southgreen.fr/ + + It performs a pan-genome analysis on published and annotated bacterial genomes, using different tools that can be invoked: Roary, PGAP, PanACoTA. + + Pangenome graph builders have recently been implemented in the pipeline: Minigraph-Cactus and PGGB (PanGenome Graph Builder) + + It provides a presence/absence matrix of genes, an UpSetR diagram summarizing the matrix information and a COG assignment summary for each strain. + +Please visit the GitHub page for the PanExplorer workflow at: https://github.com/SouthGreenPlatform/PanExplorer_workflow + + +Inputs +------ + + Inputs can be provided as one of the following: + + * **List of genbank assembly identifiers** comma-separated (ex: GCA_000007385.1,GCA_000010025.1,GCA_000019585.2) + * **Zip of genbank files** They must include the gene annotation and the complete sequence data + * **Zip of FASTA files of genomes + Zip of GFF annotation files**: In order to make the association between sequence and annotation, they must be named with the same basename as follows: genome1.fasta, genome1.gff, myspeciesXXX.fasta, myspeciesXXX.gff... + + + +Outputs +------- + + Among the outputs: + + * **Pangenome presence absence matrix** Pangene presence/absence matrix indicating the PAV (Presence Absence Variation) of clustered genes. + * **PanBased NJ tree** Distance tree based on PAV data + * **Heaps law alpha** Estimating if a pan-genome is open or closed based on a Heaps law model.
+ * **Rarefaction curves** A rarefaction curve is the cumulative number of gene clusters we observe as more and more genomes are being considered + * **ANI** Average Nucleotide Identity between genomes + * **ANI heatmap** image as SVG + * **VCF file** If a pan-genome graph software has been selected, it provides a VCF of variations among all samples. + + + + </help> + <citations> + <citation type="doi">10.1093/bioinformatics/btac504</citation> + </citations> </tool>
--- a/Perl/convertANI.pl Thu May 30 20:07:55 2024 +0000 +++ b/Perl/convertANI.pl Tue Dec 10 16:20:53 2024 +0000 @@ -5,6 +5,7 @@ my $file = $ARGV[0]; my $metadata = $ARGV[1]; + my %strains; open(F,$metadata); while(<F>){ @@ -17,6 +18,7 @@ my %ANIs; my %genomes; +my %genome_names; my $num_line = 0; open(F,$file); <F>; @@ -27,6 +29,11 @@ my @infos = split(/\t/,$line); my $genome = $infos[0]; $genome = $strains{$genome}; + $genome_names{$genome}++; + if ($genome_names{$genome} > 1){ + $genome = $genome . ".". $genome_names{$genome}; + } + $genomes{$num_line} = $genome; for (my $i = 1; $i <= $#infos; $i++){ $ANIs{$i}{$num_line} = $infos[$i]; @@ -36,6 +43,7 @@ close(F); print "Genomes"; + foreach my $i(sort keys(%ANIs)){ print "\t".$genomes{$i}; }
--- a/Perl/generateConfig.pl Thu May 30 20:07:55 2024 +0000 +++ b/Perl/generateConfig.pl Tue Dec 10 16:20:53 2024 +0000 @@ -17,11 +17,14 @@ my @list_ids = split(/,/,$list); my %data = (); -foreach my $id(@list_ids){ - push @{$data{"ids"}}, "$id"; +# case list of accessions +if ($list ne 'None'){ + foreach my $id(@list_ids){ + push @{$data{"ids"}}, "$id"; + } } # case fasta+gff -if ($zip ne "None" && $zip_fasta ne "None"){ +elsif ($zip ne "None" && $zip_fasta ne "None"){ system("rm -rf $zip.genomeszip"); mkdir("$zip.genomeszip"); chdir("$zip.genomeszip");
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R/Heatmaply.R Tue Dec 10 16:20:53 2024 +0000 @@ -0,0 +1,26 @@ +#!/usr/bin/R + +library("optparse") +library(heatmaply) + +option_list = list( + make_option(c("-f", "--file"), type="character", default=NULL, + help="dataset file name", metavar="character"), + make_option(c("-o", "--out"), type="character", default="out.txt", + help="output file name [default= %default]", metavar="character") +); +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +if (is.null(opt$file)){ + print_help(opt_parser) + stop("At least one argument must be supplied (input file).\n", call.=FALSE) +} + +if (is.null(opt$out)){ + print_help(opt_parser) + stop("At least one argument must be supplied (out file).\n", call.=FALSE) +} + +mydata <- read.table(opt$file,sep="\t",fill=TRUE,header=TRUE, row.names = 1) +heatmaply(mydata,file = "heatmaply.html",plot_method="plotly",scale_fill_gradient_fun = ggplot2::scale_fill_gradient2( low = "white" , high = "blue", limits = c(0, 100)))
--- a/Snakemake_files/Snakefile_pggb_heatmap_upset Thu May 30 20:07:55 2024 +0000 +++ b/Snakemake_files/Snakefile_pggb_heatmap_upset Tue Dec 10 16:20:53 2024 +0000 @@ -53,7 +53,7 @@ """ cat outputs/genomes/*fasta >outputs/genomes/all_genomes.fa samtools faidx outputs/genomes/all_genomes.fa - reference=$(head -1 outputs/genomes/strains.txt | awk '{{print $2}}') + reference=$(head -1 outputs/genomes/all_genomes.fa | awk '{{print $2}}') pggb -i outputs/genomes/all_genomes.fa -o outputs/pggb_out -V $reference -m mv outputs/pggb_out/all_genomes.*smooth.final.gfa {output.gfa} mv outputs/pggb_out/all_genomes.*lay.draw.png {output.png1}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ANI.txt Tue Dec 10 16:20:53 2024 +0000 @@ -0,0 +1,4 @@ +Genomes Xanthomonas_oryzae_pv_oryzae_KACC_10331 Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA Xanthomonas_oryzae_pv_oryzae_PXO99A +Xanthomonas_oryzae_pv_oryzae_KACC_10331 100 99.645630 99.449890 +Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA 99.645630 100 99.396011 +Xanthomonas_oryzae_pv_oryzae_PXO99A 99.449890 99.396011 100
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Accessory_based_distance_matrix.txt Tue Dec 10 16:20:53 2024 +0000 @@ -0,0 +1,10 @@ +"row" "col" "value" +"1" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" 0 +"2" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" 1504 +"3" "Xanthomonas_oryzae_pv_oryzae_PXO99A" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" 1994 +"4" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" 1504 +"5" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" 0 +"6" "Xanthomonas_oryzae_pv_oryzae_PXO99A" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" 1516 +"7" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" "Xanthomonas_oryzae_pv_oryzae_PXO99A" 1994 +"8" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" "Xanthomonas_oryzae_pv_oryzae_PXO99A" 1516 +"9" "Xanthomonas_oryzae_pv_oryzae_PXO99A" "Xanthomonas_oryzae_pv_oryzae_PXO99A" 0