changeset 0:024ea3c4c29f draft

planemo upload for repository https://github.com/usegalaxy-au/tools-au/tree/master/tools/metawrapmg commit e8f404630d1b01ef5f110369f0cc6eac03d2d2d7
author galaxy-australia
date Mon, 30 Jan 2023 22:28:33 +0000
parents
children e3a64a1d2a6e
files help_text.md macros.xml metawrapmg_binning.xml test-data/mapped_reads.r1.fastq.gz test-data/mapped_reads.r2.fastq.gz test-data/subset.fasta.gz test-data/test02.contigs test-data/test02.stats
diffstat 8 files changed, 288 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/help_text.md	Mon Jan 30 22:28:33 2023 +0000
@@ -0,0 +1,34 @@
+## MetaWRAP
+
+MetaWRAP aims to be an easy-to-use metagenomic wrapper suite that accomplishes the core tasks of metagenomic analysis.
+Additionally, metaWRAP takes bin extraction and analysis to the next level. metaWRAP is meant to be a fast and simple approach before you delve deeper into parameterization of your analysis.
+MetaWRAP can be applied to a variety of environments, including gut, water, and soil microbiomes (see metaWRAP paper for benchmarks).
+
+### MetaWRAP binning module
+
+The metaWRAP::Binning module is meant to be a convenient wrapper around three metagenomic binning tools: MaxBin2, metaBAT2, and CONCOCT.
+First the metagenomic assembly is indexed with bwa-index, and then paired end reads from any number of samples are aligned to it. The alignments are sorted and compressed with samtools, and library insert size statistics are also gathered at the same time (insert size average and standard deviation).
+metaBAT2’s jgi_summarize_bam_contig_depths function is used to generate a contig abundance table, which is then converted into the correct format for each of the three binners to take as input.
+After MaxBin2, metaBAT2, and CONCOCT finish binning the contigs with default settings, the final bins folders are created with formatted bin fasta files.
+CheckM’s lineage_wf function is used to predict essential genes and estimate the completion and contamination of each bin.
+
+### MetaWRAP bin refinement
+
+The metaWRAP::Bin_refinement module utilizes a hybrid approach to take in two or three bin sets that were obtained with different software and produces a consolidated, improved bin set.
+First, binning_refiner is used to create hybridized bins from every possible combination of sets.
+If there were three bin sets: A, B, and C, then the following hybrid sets will be produced with binning_refiner: AB, BC, AC, and ABC.
+CheckM is then run to evaluate the completion and contamination of the bins in each of the 7 bin sets (3 originals, 4 hybridized).
+The bin sets are then iteratively compared to each other, and each pair is consolidated into an improved bin set.
+To do this, the same bin is identified within the two bin sets based on a minimum of 80% overlap in genome length, and the better bin is determined based on which bin has the higher score.
+The scoring function is S=Completion-5*Contamination.
+After all bin sets are incorporated into the consolidated bin collection, a de-replication function removes any duplicate contigs.
+If a contig is present in more than one bin, it is removed from all but the best bin (based on scoring function).
+CheckM is then run on the final bin set and a final report file is generated showing the completion, contamination, and other statistics generated by CheckM for each bin.
+Completion and contamination rank plots are also generated to evaluate the success of the Bin_refinement module, and compare its output to the quality of the original bins.
+
+
+---
+
+MetaWRAP's home page is [bxlab/metaWRAP](https://github.com/bxlab/metaWRAP).
+
+This tool was wrapped by the Galaxy Australia team.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Jan 30 22:28:33 2023 +0000
@@ -0,0 +1,22 @@
+<macros>
+    <!-- Tokens substituted into the tool XML: wrapped version, Galaxy wrapper revision suffix, tool profile. -->
+    <token name="@TOOL_VERSION@">1.3.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">22.05</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">metawrap-mg</requirement>
+        </requirements>
+    </xml>
+    <!-- A doi citation takes the bare DOI (no resolver URL, no surrounding whitespace). -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1186/s40168-018-0541-1</citation>
+        </citations>
+    </xml>
+    <xml name="xrefs">
+        <xrefs>
+            <xref type="bio.tools">metawrap</xref>
+        </xrefs>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metawrapmg_binning.xml	Mon Jan 30 22:28:33 2023 +0000
@@ -0,0 +1,189 @@
+<tool id="metawrapmg_binning" name="MetaWRAP" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
+    <description>metagenome binning pipeline</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="xrefs"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+            ## Derive GALAXY_MEMORY_GB from GALAXY_MEMORY_MB when the latter is set; it is passed to metawrap -m below
+            if [ -n "\$GALAXY_MEMORY_MB" ] ; then
+                GALAXY_MEMORY_GB=\$((GALAXY_MEMORY_MB / 1024)) ;
+            fi ;
+
+            ##################
+            ## SET UP FILES ##
+            ##################
+
+            ## should always be FASTA; the link name carries the dataset extension
+            #set mg_fn = 'metagenome.' + str($metagenome.ext)
+            ln -s '$metagenome' '$mg_fn'
+            &&
+
+            ## Only FASTQ. Separate files for each sample. Metawrap checks for
+            ## files named _1.fastq and _2.fastq.
+            #set input1_fn = 'reads_1.fastq'
+            ln -s '$input_1' '$input1_fn'
+            &&
+
+            #set input2_fn = 'reads_2.fastq'
+            ln -s '$input_2' '$input2_fn'
+            &&
+
+            #####################
+            ## INITIAL BINNING ##
+            #####################
+
+            metawrap binning
+            --metabat2 --maxbin2 --concoct
+            -a '$mg_fn'
+            -m \${GALAXY_MEMORY_GB:-16}
+            -o INITIAL_BINNING
+            -t \${GALAXY_SLOTS:-4}
+            '$input1_fn'
+            '$input2_fn'
+            &&
+
+            ## Check which binning programs produced bins: every non-empty bin directory gets the next switch (-A, -B, -C)
+            bin_dirs=(INITIAL_BINNING/concoct_bins INITIAL_BINNING/maxbin2_bins INITIAL_BINNING/metabat2_bins) &&
+            switches=('-A' '-B' '-C') &&
+            ## i indexes the next unused switch and must advance arithmetically (a plain += would append text: 0, 01, 011)
+            i=0 &&
+            bin_string="" &&
+
+            for dir in "\${bin_dirs[@]}" ; do
+                if find "\${dir}" -mindepth 1 -maxdepth 1 | read; then
+                    bin_string="\${bin_string} \${switches[\$i]} \${dir}" ;
+                    i=\$((i + 1)) ;
+                fi
+            done &&
+
+            ####################
+            ## BIN REFINEMENT ##
+            ####################
+
+            ## The checkm database is included in the conda package.
+            ## Requires metawrap-mg_1.3.0--hdfd78af_1 or later. See
+            ## https://github.com/bioconda/bioconda-recipes/pull/38299.
+
+            metawrap bin_refinement
+            -t \${GALAXY_SLOTS:-4}
+            -m \${GALAXY_MEMORY_GB:-16}
+            -c $binning.c
+            -x $binning.x
+            -o BIN_REFINEMENT
+            ## Only run bin_refinement on bins with contigs (left unquoted so it splits into separate arguments)
+            \${bin_string}
+    ]]></command>
+    <inputs>
+        <param name="metagenome" type="data" format="fasta" label="Metagenome" help="Metagenome co-assembly for binning"/>
+        <param name="input_1" type="data" format="fastqsanger" label="Read 1" help="Original reads that were used for the assembly: read 1."/>
+        <param name="input_2" type="data" format="fastqsanger" label="Read 2" help="Original reads that were used for the assembly: read 2."/>
+        <section name="binning" title="Binning parameters" expanded="false"> <!-- thresholds handed to metawrap bin_refinement as -c and -x -->
+            <param argument="-c" type="integer" min="50" max="100" value="70" label="Percent completion" help="Minimum % completion of bins"/>
+            <param argument="-x" type="integer" min="0" max="100" value="10" label="Percent contamination" help="Maximum % contamination of bins that is acceptable"/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- contigs binned into fasta files: .fa files found inside BIN_REFINEMENT/metawrap_N_N_bins/, one fasta element per matching file -->
+        <collection name="metawrap_bins" type="list" label="MetaWRAP on ${on_string}: bins">
+            <discover_datasets pattern="metawrap_\d+_\d+_bins/(?P&lt;designation&gt;.+)\.fa" format="fasta" directory="BIN_REFINEMENT" recurse="true" match_relative_path="true" visible="false" />
+        </collection>
+        <!-- summary figures: files in BIN_REFINEMENT/figures; element name and datatype both come from the file name -->
+        <collection name="metawrap_figures" type="list" label="MetaWRAP on ${on_string}: summary figures">
+            <discover_datasets pattern="__designation_and_ext__" directory="BIN_REFINEMENT/figures" visible="false" />
+        </collection>
+        <!-- statistics on binning: .stats files in BIN_REFINEMENT, collected as tabular and named by the file stem -->
+        <collection name="metawrap_stats" type="list" label="MetaWRAP on ${on_string}: stat files">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.stats" format="tabular" directory="BIN_REFINEMENT" visible="false" />
+        </collection>
+        <!-- which contig went into which bin: .contigs files in BIN_REFINEMENT, collected as tabular and named by the file stem -->
+        <collection name="metawrap_contigs" type="list" label="MetaWRAP on ${on_string}: contig assignments">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.contigs" format="tabular" directory="BIN_REFINEMENT" visible="false" />
+        </collection>
+    </outputs>
+    <tests>
+        <!-- 01: basic function, run with non-default completion/contamination thresholds -->
+        <test>
+            <param name="metagenome" value="subset.fasta.gz"/>
+            <param name="input_1" value="mapped_reads.r1.fastq.gz"/>
+            <param name="input_2" value="mapped_reads.r2.fastq.gz"/>
+            <param name="binning|c" value="60"/>
+            <param name="binning|x" value="15"/>
+            <!-- the bins collection is the main output, but it's too large to test -->
+            <!-- <output_collection name="metawrap_bins" type="list">
+                <element name="bin.1" file="test02.fa" ftype="fasta"/>
+            </output_collection> -->
+            <output_collection name="metawrap_stats" type="list">
+                <element name="metawrap_60_15_bins" file="test02.stats" ftype="tabular"/>
+            </output_collection>
+            <output_collection name="metawrap_contigs" type="list">
+                <element name="metawrap_60_15_bins" file="test02.contigs" ftype="tabular"/>
+            </output_collection>
+        </test>
+    </tests>
+        <help><![CDATA[
+MetaWRAP
+--------
+
+MetaWRAP aims to be an easy-to-use metagenomic wrapper suite that
+accomplishes the core tasks of metagenomic analysis. Additionally,
+metaWRAP takes bin extraction and analysis to the next level. metaWRAP
+is meant to be a fast and simple approach before you delve deeper into
+parameterization of your analysis. MetaWRAP can be applied to a variety
+of environments, including gut, water, and soil microbiomes (see
+metaWRAP paper for benchmarks).
+
+MetaWRAP binning module
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The metaWRAP::Binning module is meant to be a convenient wrapper around
+three metagenomic binning tools: MaxBin2, metaBAT2, and CONCOCT.
+First the metagenomic assembly is indexed with bwa-index, and then
+paired end reads from any number of samples are aligned to it. The
+alignments are sorted and compressed with samtools, and library insert
+size statistics are also gathered at the same time (insert size average
+and standard deviation). metaBAT2’s jgi_summarize_bam_contig_depths
+function is used to generate a contig abundance table, and it is then
+converted into the correct format for each of the three binners to take
+as input. After MaxBin2, metaBAT2, and CONCOCT finish binning the
+contigs with default settings, the final bins folders are created with
+formatted bin fasta files. CheckM’s lineage_wf function is used to
+predict essential genes and estimate the completion and contamination of
+each bin.
+
+MetaWRAP bin refinement
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The metaWRAP::Bin_refinement module utilizes a hybrid approach to take
+in two or three bin sets that were obtained with different software and
+produces a consolidated, improved bin set. First, binning_refiner is
+used to create hybridized bins from every possible combination of sets.
+If there were three bin sets: A, B, and C, then the following hybrid
+sets will be produced with binning_refiner: AB, BC, AC, and ABC. CheckM
+is then run to evaluate the completion and contamination of the bins in
+each of the 7 bin sets (3 originals, 4 hybridized). The bin sets are
+then iteratively compared to each other, and each pair is consolidated
+into an improved bin set. To do this, the same bin is identified within
+the two bin sets based on a minimum of 80% overlap in genome length, and
+the better bin is determined based on which bin has the higher score.
+The scoring function is S=Completion-5*Contamination. After all bin sets
+are incorporated into the consolidated bin collection, a de-replication
+function removes any duplicate contigs. If a contig is present in more
+than one bin, it is removed from all but the best bin (based on scoring
+function). CheckM is then run on the final bin set and a final report
+file is generated showing the completion, contamination, and other
+statistics generated by CheckM for each bin. Completion and
+contamination rank plots are also generated to evaluate the success of
+the Bin_refinement module, and compare its output to the quality of the
+original bins.
+
+--------------
+
+MetaWRAP’s home page is
+`bxlab/metaWRAP <https://github.com/bxlab/metaWRAP>`__.
+
+This tool was wrapped by the Galaxy Australia team.
+        ]]></help>
+    <expand macro="citations"/>
+</tool>
Binary file test-data/mapped_reads.r1.fastq.gz has changed
Binary file test-data/mapped_reads.r2.fastq.gz has changed
Binary file test-data/subset.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test02.contigs	Mon Jan 30 22:28:33 2023 +0000
@@ -0,0 +1,41 @@
+NODE_2_length_158684_cov_2.789534	bin.1
+NODE_3_length_138621_cov_2.416422	bin.1
+NODE_6_length_106569_cov_3.096156	bin.1
+NODE_7_length_99368_cov_2.860562	bin.1
+NODE_8_length_95669_cov_2.506714	bin.1
+NODE_10_length_88523_cov_2.243252	bin.1
+NODE_11_length_86536_cov_2.926990	bin.1
+NODE_13_length_73331_cov_2.369780	bin.1
+NODE_14_length_72311_cov_2.340345	bin.1
+NODE_15_length_72135_cov_2.745671	bin.1
+NODE_16_length_71859_cov_2.918389	bin.1
+NODE_17_length_70006_cov_2.553159	bin.1
+NODE_24_length_58826_cov_2.290024	bin.1
+NODE_26_length_57188_cov_2.464320	bin.1
+NODE_27_length_54578_cov_2.838857	bin.1
+NODE_30_length_51316_cov_2.828934	bin.1
+NODE_44_length_41143_cov_2.951908	bin.1
+NODE_47_length_40493_cov_2.795440	bin.1
+NODE_49_length_39976_cov_3.111871	bin.1
+NODE_58_length_35924_cov_2.623965	bin.1
+NODE_72_length_33102_cov_2.542954	bin.1
+NODE_89_length_30260_cov_2.967621	bin.1
+NODE_102_length_28495_cov_2.496167	bin.1
+NODE_118_length_26032_cov_2.640605	bin.1
+NODE_119_length_26028_cov_2.951065	bin.1
+NODE_153_length_22539_cov_2.899173	bin.1
+NODE_167_length_21736_cov_2.597805	bin.1
+NODE_229_length_18213_cov_2.462496	bin.1
+NODE_260_length_17127_cov_3.016343	bin.1
+NODE_277_length_16414_cov_2.366465	bin.1
+NODE_370_length_13686_cov_3.065733	bin.1
+NODE_381_length_13339_cov_3.032972	bin.1
+NODE_485_length_11839_cov_2.628564	bin.1
+NODE_502_length_11654_cov_2.455643	bin.1
+NODE_616_length_10584_cov_2.555798	bin.1
+NODE_725_length_9651_cov_2.904023	bin.1
+NODE_1206_length_7144_cov_2.231768	bin.1
+NODE_1409_length_6558_cov_2.842996	bin.1
+NODE_1437_length_6494_cov_3.114769	bin.1
+NODE_1488_length_6399_cov_3.331494	bin.1
+NODE_2109_length_5159_cov_3.299177	bin.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test02.stats	Mon Jan 30 22:28:33 2023 +0000
@@ -0,0 +1,2 @@
+bin	completeness	contamination	GC	lineage	N50	size	binner
+bin.1	93.73	0.335	0.406	Clostridiales	70006	1855509	binsAB