changeset 14:5a5c9a6b047b draft

Uploaded
author dereeper
date Tue, 10 Dec 2024 16:20:53 +0000
parents 152d7c43478b
children dbde253606c5
files COG/bac-genomics-scripts/LICENSE COG/bac-genomics-scripts/README.md COG/bac-genomics-scripts/calc_fastq-stats/README.md COG/bac-genomics-scripts/calc_fastq-stats/calc_fastq-stats.pl COG/bac-genomics-scripts/cat_seq/README.md COG/bac-genomics-scripts/cat_seq/cat_seq.pl COG/bac-genomics-scripts/cdd2cog/README.md COG/bac-genomics-scripts/cdd2cog/cdd2cog.pl COG/bac-genomics-scripts/cdd2cog/results/cog_stats.txt COG/bac-genomics-scripts/cdd2cog/results/func_stats.txt COG/bac-genomics-scripts/cdd2cog/results/protein-id_cog.txt COG/bac-genomics-scripts/cdd2cog/results/rps-blast_cog.txt COG/bac-genomics-scripts/cds_extractor/README.md COG/bac-genomics-scripts/cds_extractor/cds_extractor.pl COG/bac-genomics-scripts/ecoli_mlst/ADK.fas COG/bac-genomics-scripts/ecoli_mlst/FUMC.fas COG/bac-genomics-scripts/ecoli_mlst/GYRB.fas COG/bac-genomics-scripts/ecoli_mlst/ICD.fas COG/bac-genomics-scripts/ecoli_mlst/MDH.fas COG/bac-genomics-scripts/ecoli_mlst/PURA.fas COG/bac-genomics-scripts/ecoli_mlst/README.md COG/bac-genomics-scripts/ecoli_mlst/RECA.fas COG/bac-genomics-scripts/ecoli_mlst/ecoli_mlst.pl COG/bac-genomics-scripts/ecoli_mlst/publicSTs.txt COG/bac-genomics-scripts/genomes_feature_table/README.md COG/bac-genomics-scripts/genomes_feature_table/genomes_feature_table.pl COG/bac-genomics-scripts/ncbi_ftp_download/README.md COG/bac-genomics-scripts/ncbi_ftp_download/ncbi_ftp_concat_unpack.pl COG/bac-genomics-scripts/ncbi_ftp_download/ncbi_ftp_download.sh COG/bac-genomics-scripts/order_fastx/README.md COG/bac-genomics-scripts/order_fastx/order_fastx.pl COG/bac-genomics-scripts/po2anno/README.md COG/bac-genomics-scripts/po2anno/po2anno.pl COG/bac-genomics-scripts/po2group_stats/README.md COG/bac-genomics-scripts/po2group_stats/pics/README.md COG/bac-genomics-scripts/po2group_stats/pics/venn_diagram_logics.png COG/bac-genomics-scripts/po2group_stats/pics/venn_diagram_logics.svg COG/bac-genomics-scripts/po2group_stats/po2group_stats.pl 
COG/bac-genomics-scripts/prot_finder/README.md COG/bac-genomics-scripts/prot_finder/binary_group_stats.pl COG/bac-genomics-scripts/prot_finder/prot_binary_matrix.pl COG/bac-genomics-scripts/prot_finder/prot_finder.pl COG/bac-genomics-scripts/prot_finder/prot_finder_pipe.sh COG/bac-genomics-scripts/prot_finder/transpose_matrix.pl COG/bac-genomics-scripts/rename_fasta_id/README.md COG/bac-genomics-scripts/rename_fasta_id/rename_fasta_id.pl COG/bac-genomics-scripts/revcom_seq/README.md COG/bac-genomics-scripts/revcom_seq/revcom_seq.pl COG/bac-genomics-scripts/rod_finder/README.md COG/bac-genomics-scripts/rod_finder/blast_rod_finder.pl COG/bac-genomics-scripts/rod_finder/blast_rod_finder_legacy.sh COG/bac-genomics-scripts/sam_insert-size/README.md COG/bac-genomics-scripts/sam_insert-size/sam_insert-size.pl COG/bac-genomics-scripts/sample_fastx-txt/README.md COG/bac-genomics-scripts/sample_fastx-txt/sample_fastx-txt.pl COG/bac-genomics-scripts/seq_format-converter/README.md COG/bac-genomics-scripts/seq_format-converter/seq_format-converter.pl COG/bac-genomics-scripts/tbl2tab/README.md COG/bac-genomics-scripts/tbl2tab/example.tbl COG/bac-genomics-scripts/tbl2tab/example2.tab COG/bac-genomics-scripts/tbl2tab/tbl2tab.pl COG/bac-genomics-scripts/trunc_seq/README.md COG/bac-genomics-scripts/trunc_seq/trunc_seq.pl LICENSE PanExplorer.xml Perl/Blast_Filter.pl Perl/CalculateCogEnrichment.pl Perl/ConvertOrthofinderMatrix.pl Perl/ConvertPGAPMatrix.pl Perl/ConvertPanacotaMatrix.pl Perl/ConvertPanarooMatrix.pl Perl/ConvertRoaryMatrix.pl Perl/CreateGenePathsFromGFA.pl Perl/DNA_Transcription_Translation.pl Perl/GenerateHeatmapFromPAV.pl Perl/GeneratePAVfromBed.pl Perl/GetCogOfCluster.pl Perl/Heatmaply.pl Perl/Naegleria/assignFastqByITS.pl Perl/Naegleria/calculateFeatureDensitiesFromGFF.pl Perl/Naegleria/generateMauveJson.pl Perl/Naegleria/mergeSNP.pl Perl/ProjectPAVinCircos.pl Perl/bp_genbank2gff3.pl Perl/convertANI.pl Perl/generateConfig.pl Perl/get_data.pl 
Perl/reformatHeatmapSVG.pl Perl/remove_duplicates_in_gff.pl Perl/translate.pl Perl/wget.pl Python/Heatmap.py R/Heatmaply.R R/heatmap.R R/heatmap_ani.R R/heatmaply.R R/micropan_rarefaction.R R/upsetr.R README.md SkewIT/LICENSE SkewIT/README.md SkewIT/data/RefSeq97_Bacteria_GenusSkewIThresholds.txt SkewIT/data/RefSeq97_Bacteria_SkewI_incl.taxonomy.txt SkewIT/data/example_gcskewplot.png SkewIT/src/gcskew.py SkewIT/src/plot_gcskew.py SkewIT/src/skewi.py Snakemake_files/Snakefile_cactus_heatmap_upset Snakemake_files/Snakefile_orthofinder_heatmap_upset Snakemake_files/Snakefile_pggb_heatmap_upset Snakemake_files/Snakefile_wget_PGAP_heatmap_upset_COG Snakemake_files/Snakefile_wget_cactus_heatmap_upset_COG Snakemake_files/Snakefile_wget_cactus_heatmap_upset_COG2 Snakemake_files/Snakefile_wget_orthofinder_heatmap_upset_COG Snakemake_files/Snakefile_wget_panacota_heatmap_upset_COG Snakemake_files/Snakefile_wget_panacota_heatmap_upset_COG.old Snakemake_files/Snakefile_wget_panaroo_heatmap_upset_COG Snakemake_files/Snakefile_wget_pggb_heatmap_upset_COG Snakemake_files/Snakefile_wget_progressivecactus_heatmap_upset_COG Snakemake_files/Snakefile_wget_roary_heatmap_upset_COG circos_templates/circos1.conf config.yaml data/GCA_001518895.1.gb data/GCA_001746615.1.gb data/GCA_003382895.1.gb images/CP000030.1.full.draw.png images/CP000030.1.full.viz.png images/NZ_CP033176.1.full.viz.png images/all_genomes.fa.lay.draw.png images/all_genomes.fa.og.viz_multiqc.png images/dag.svg images/fastani.out.svg images/full.gfa.png images/heatmap.svg.complete.new.svg images/rarefaction_curves.svg images/upsetr.svg panexplorer_sbatch.sh test-data/ANI.txt test-data/Accessory_based_distance_matrix.txt
diffstat 7 files changed, 156 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/PanExplorer.xml	Thu May 30 20:07:55 2024 +0000
+++ b/PanExplorer.xml	Tue Dec 10 16:20:53 2024 +0000
@@ -1,4 +1,4 @@
-<tool id="PanExplorer" name="PanExplorer" version="1.0">
+<tool id="PanExplorer2" name="PanExplorer2" version="2.0">
   <description> Bacterial pan-genome analysis </description>
   <requirements>
   <!--
@@ -17,7 +17,14 @@
 
 export PANEX_PATH=${__tool_directory__};
 
-perl ${__tool_directory__}/Perl/generateConfig.pl '$private_genomes' '$input' config.yaml '$private_genomes_fasta';
+
+#if str($mode.mode) == "accessions":
+	    perl ${__tool_directory__}/Perl/generateConfig.pl 'None' '$input' config.yaml 'None';
+#else if str($mode.mode) == "genbanks":
+	    perl ${__tool_directory__}/Perl/generateConfig.pl '$private_genomes' 'None' config.yaml 'None';
+#else if str($mode.mode) == "fasta":
+            perl ${__tool_directory__}/Perl/generateConfig.pl '$private_genomes' 'None' config.yaml '$private_genomes_fasta';
+#end if
 
 cat config.yaml >$logfile;
 
@@ -74,18 +81,40 @@
 
 
   <inputs>
-  <param name="input" type="text" multiple="true" label="List of genbank identifiers" help="Coma separated list (ex: CP000235.1,CP001079.1,CP001759.1,CP015994.2)"/>
-  <param name="private_genomes" type="data" format="zip" label="Zip of genbank or GFF files" optional="true"/>
-  <param name="private_genomes_fasta" type="data" format="zip" label="Zip of Fasta files" optional="true"/>
+	  <conditional name="mode">
+                <param name = "mode" type="select" label="What is your inputs?">
+                        <option value="accessions">Prokaryote genomes: List of Genbank assembly accessions (GCA)</option>
+                        <option value="genbanks">Prokaryote genomes: Genbank files</option>
+                        <option value="fasta">Eukaryote genomes: FASTA + GFF files</option>
+                </param>
+                <when value="accessions">
+			<param name="input" type="text" multiple="true" label="List of genbank identifiers" help="Coma separated list (ex: GCA_000007385.1,GCA_000010025.1,GCA_000019585.2)"/>
+			<param type="select" name="software" label="Choose the pan-genome software">
+				<option value="roary">Roary</option>
+				<option value="panacota">PanACoTA</option>
+				<option value="pggb">PanGenome Graph Builder (PGGB)</option>
+			</param>
+                </when>
+                <when value="genbanks">
+			<param name="private_genomes" type="data" format="zip" label="Zip of genbank files" optional="true"/>
+			<param type="select" name="software" label="Choose the pan-genome software">
+                                <option value="roary">Roary</option>
+                                <option value="panacota">PanACoTA</option>
+                                <option value="pggb">PanGenome Graph Builder (PGGB)</option>
+                        </param>
+		</when>
+		<when value="fasta">
+			<param name="private_genomes_fasta" type="data" format="zip" label="Zip of Fasta files" optional="true"/>
+			<param name="private_genomes" type="data" format="zip" label="Zip of GFF files" optional="true"/>
+			<param type="select" name="software" label="Choose the pan-genome software">
+				<option value="orthofinder">OrthoFinder</option>
+				<option value="cactus">Minigraph-Cactus</option>
+				<option value="pggb">PanGenome Graph Builder (PGGB)</option>
+			</param>
+                </when>
+	</conditional>
+
   <param name="min_identity" type="text" value="80" label="Minimum percentage identity for BlastP" />
-  <param type="select" name="software" label="Choose the pan-genome software">
-    <option value="pgap">PGAP</option>
-    <option value="roary">Roary</option>
-    <option value="panacota">PanACoTA</option>
-    <option value="orthofinder">OrthoFinder</option>
-    <option value="cactus">Minigraph-Cactus</option>
-    <option value="pggb">PanGenome Graph Builder (PGGB)</option>
-  </param>
  </inputs>
 
  <outputs>
@@ -110,5 +139,64 @@
  <data format="txt" name="logfile" label="Logfile"/>
  <data format="txt" name="roary_log" label="Roary Logfile"/>
 </outputs>
+<tests>
+        <test>
+            <param name="input" value="GCA_000007385.1,GCA_000010025.1,GCA_000019585.2"/>
+            <param name="min_identity" value="80"/>
+            <param name="software" value="panacota"/>
+            <param name="private_genomes" value=""/>
+            <param name="private_genomes_fasta" value=""/>
+            <output name="distance_matrix" value="Accessory_based_distance_matrix.txt"/>
+            <output name="fastani" value="ANI.txt"/>
+        </test>
+ </tests>
+ <help>
+
+PanExplorer
+===========
+
+	 The PanExplorer workflow is a Snakemake workflow that can be run in the backend of the PanExplorer web application.
+
+	 Homepage: https://panexplorer.southgreen.fr/
+
+	 It performs a pan-genome analysis of published, annotated bacterial genomes using one of several tools: Roary, PGAP, PanACoTA.
+
+	 Pan-genome graph builders have recently been added to the pipeline: Minigraph-Cactus and PGGB (PanGenome Graph Builder).
+
+	 It provides a presence/absence matrix of genes, an UpSetR diagram summarizing the matrix information, and a COG assignment summary for each strain.
+	 
+Please visit the GitHub page for the PanExplorer workflow at: https://github.com/SouthGreenPlatform/PanExplorer_workflow
+
+
+Inputs
+------
+
+	 Inputs can be provided as one of the following:
+
+	 * **List of GenBank assembly identifiers**, comma-separated (ex: GCA_000007385.1,GCA_000010025.1,GCA_000019585.2)
+	 * **Zip of GenBank files**: they must include the gene annotation and the complete sequence data
+	 * **Zip of genome FASTA files + Zip of GFF annotation files**: to associate each sequence with its annotation, the files must share the same basename, e.g. genome1.fasta, genome1.gff, myspeciesXXX.fasta, myspeciesXXX.gff...
+
+	
+
+Outputs
+-------
+
+	 Among the outputs:
+
+	 * **Pangenome presence absence matrix** Pangene presence/absence matrix indicating the PAV (Presence Absence Variation) of clustered genes.
+	 * **PanBased NJ tree** Distance tree based on PAV data
+	 * **Heaps law alpha** Estimating if a pan-genome is open or closed based on a Heaps law model.
+	 * **Rarefaction curves** A rarefaction curve is the cumulative number of gene clusters we observe as more and more genomes are being considered
+	 * **ANI** Average Nucleotide Identity between genomes
+	 * **ANI heatmap** image as SVG
+	 * **VCF file** If a pan-genome graph software has been selected, it provides a VCF of variations among all samples.
+
+
+
+    </help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btac504</citation>
+    </citations>
 
 </tool>
--- a/Perl/convertANI.pl	Thu May 30 20:07:55 2024 +0000
+++ b/Perl/convertANI.pl	Tue Dec 10 16:20:53 2024 +0000
@@ -5,6 +5,7 @@
 my $file = $ARGV[0];
 my $metadata = $ARGV[1];
 
+
 my %strains;
 open(F,$metadata);
 while(<F>){
@@ -17,6 +18,7 @@
 
 my %ANIs;
 my %genomes;
+my %genome_names;
 my $num_line = 0;
 open(F,$file);
 <F>;
@@ -27,6 +29,11 @@
 	my @infos = split(/\t/,$line);
 	my $genome = $infos[0];
 	$genome = $strains{$genome};
+	$genome_names{$genome}++;
+	if ($genome_names{$genome} > 1){
+		$genome = $genome . ".". $genome_names{$genome};
+	}
+
 	$genomes{$num_line} = $genome;
 	for (my $i = 1; $i <= $#infos; $i++){
 		$ANIs{$i}{$num_line} = $infos[$i];
@@ -36,6 +43,7 @@
 close(F);
 
 print "Genomes";
+
 foreach my $i(sort keys(%ANIs)){
 	print "\t".$genomes{$i};
 }
--- a/Perl/generateConfig.pl	Thu May 30 20:07:55 2024 +0000
+++ b/Perl/generateConfig.pl	Tue Dec 10 16:20:53 2024 +0000
@@ -17,11 +17,14 @@
 my @list_ids = split(/,/,$list);
 
 my %data = ();
-foreach my $id(@list_ids){
-	push @{$data{"ids"}}, "$id";
+# case list of accessions
+if ($list ne 'None'){
+	foreach my $id(@list_ids){
+		push @{$data{"ids"}}, "$id";
+	}
 }
 # case fasta+gff
-if ($zip ne "None" && $zip_fasta ne "None"){
+elsif ($zip ne "None" && $zip_fasta ne "None"){
 	system("rm -rf $zip.genomeszip");
 	mkdir("$zip.genomeszip");
 	chdir("$zip.genomeszip");
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/R/Heatmaply.R	Tue Dec 10 16:20:53 2024 +0000
@@ -0,0 +1,26 @@
+#!/usr/bin/R
+# Render an interactive heatmap from a tab-separated matrix with heatmaply.
+library("optparse")
+library(heatmaply)
+# Command-line options: -f/--file (input table, no default) and -o/--out (default "out.txt").
+option_list = list(
+  make_option(c("-f", "--file"), type="character", default=NULL,
+              help="dataset file name", metavar="character"),
+  make_option(c("-o", "--out"), type="character", default="out.txt",
+              help="output file name [default= %default]", metavar="character")
+);
+opt_parser = OptionParser(option_list=option_list);
+opt = parse_args(opt_parser);
+# --file has no default, so print the usage text and abort when it is missing.
+if (is.null(opt$file)){
+  print_help(opt_parser)
+  stop("At least one argument must be supplied (input file).\n", call.=FALSE)
+}
+# NOTE(review): --out defaults to "out.txt", so this guard looks unreachable in practice; confirm.
+if (is.null(opt$out)){
+  print_help(opt_parser)
+  stop("At least one argument must be supplied (out file).\n", call.=FALSE)
+}
+# Load the matrix (tab-separated, header row, first column as row names), then draw it on a white-to-blue scale limited to 0..100.
+mydata <- read.table(opt$file,sep="\t",fill=TRUE,header=TRUE, row.names = 1)
+heatmaply(mydata,file = "heatmaply.html",plot_method="plotly",scale_fill_gradient_fun = ggplot2::scale_fill_gradient2( low = "white" , high = "blue", limits = c(0, 100)))  # NOTE(review): output name is hard-coded; opt$out is never used. Confirm whether callers rely on "heatmaply.html".
--- a/Snakemake_files/Snakefile_pggb_heatmap_upset	Thu May 30 20:07:55 2024 +0000
+++ b/Snakemake_files/Snakefile_pggb_heatmap_upset	Tue Dec 10 16:20:53 2024 +0000
@@ -53,7 +53,7 @@
         """
         cat outputs/genomes/*fasta >outputs/genomes/all_genomes.fa
         samtools faidx outputs/genomes/all_genomes.fa
-        reference=$(head -1 outputs/genomes/strains.txt | awk '{{print $2}}')
+        reference=$(head -1 outputs/genomes/all_genomes.fa | awk '{{print $2}}')
         pggb -i outputs/genomes/all_genomes.fa -o outputs/pggb_out -V $reference -m
         mv outputs/pggb_out/all_genomes.*smooth.final.gfa {output.gfa}
         mv outputs/pggb_out/all_genomes.*lay.draw.png {output.png1}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ANI.txt	Tue Dec 10 16:20:53 2024 +0000
@@ -0,0 +1,4 @@
+Genomes	Xanthomonas_oryzae_pv_oryzae_KACC_10331	Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA	Xanthomonas_oryzae_pv_oryzae_PXO99A
+Xanthomonas_oryzae_pv_oryzae_KACC_10331	100	99.645630	99.449890
+Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA	99.645630	100	99.396011
+Xanthomonas_oryzae_pv_oryzae_PXO99A	99.449890	99.396011	100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Accessory_based_distance_matrix.txt	Tue Dec 10 16:20:53 2024 +0000
@@ -0,0 +1,10 @@
+"row" "col" "value"
+"1" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" 0
+"2" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" 1504
+"3" "Xanthomonas_oryzae_pv_oryzae_PXO99A" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" 1994
+"4" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" 1504
+"5" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" 0
+"6" "Xanthomonas_oryzae_pv_oryzae_PXO99A" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" 1516
+"7" "Xanthomonas_oryzae_pv_oryzae_KACC_10331" "Xanthomonas_oryzae_pv_oryzae_PXO99A" 1994
+"8" "Xanthomonas_oryzae_pv_oryzae_MAFF_311018_DNA" "Xanthomonas_oryzae_pv_oryzae_PXO99A" 1516
+"9" "Xanthomonas_oryzae_pv_oryzae_PXO99A" "Xanthomonas_oryzae_pv_oryzae_PXO99A" 0