diff instrain_compare.xml @ 0:dff92aac9f75 draft

"planemo upload for repository https://github.com/MrOlm/inStrain commit e6eae71231e551c08aa96afc9f15b8ba87676101"
author iuc
date Wed, 11 Aug 2021 21:11:53 +0000
parents
children 92a7945118a9
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/instrain_compare.xml	Wed Aug 11 21:11:53 2021 +0000
@@ -0,0 +1,212 @@
+<tool id="instrain_compare" name="InStrain Compare" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>Compares multiple inStrain profiles (popANI, coverage_overlap, etc.) </description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="edam_ontology"/>
+    <expand macro="requirements"/>
+    <version_command>inStrain compare --version</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+#if $stb
+    ln -s '$stb' 'stb_file.stb' &&
+#end if
+#if $other.genome
+    ln -s '$other.genome' 'genome_file.stb' &&
+#end if
+#for $i, $s in enumerate($input_is)
+    #if $s
+        input_count=$i    
+        mkdir -p $i-input.IS &&
+        unzip '$s' -d '$i-input.IS/' &&
+    #end if
+#end for
+inStrain compare
+    --input
+    #for $i, $s in enumerate($input_is)
+        #if $s
+            '$i-input.IS'
+        #end if
+    #end for    
+    --output 'output.IS.COMPARE'
+    --processes "\${GALAXY_SLOTS:-6}"
+#if $stb
+    --stb 'stb_file.stb'
+#end if    
+    --min_cov $variant_calling.min_cov
+    --min_freq $variant_calling.min_freq
+    --fdr $variant_calling.fdr
+    $database.database_mode
+    --breadth $database.breadth
+#if $other.scaffolds
+    --scaffolds '$other.scaffolds'
+#end if
+#if $other.genome
+    --genome 'genome_file.stb'
+#end if
+    $other.store_coverage_overlap
+    $other.store_mismatch_locations
+    $other.include_self_comparisons
+    $other.skip_plot_generation
+    --group_length $other.group_length
+    --ani_threshold $genome_clustering.ani_threshold
+    --coverage_treshold $genome_clustering.coverage_treshold
+    --clusterAlg '$genome_clustering.clusterAlg'
+    ]]></command>    
+    <inputs>
+        <param name="input_is" type="data" format="zip" multiple="true" label="inStrain Profile IS folder" help=" The Zip files for the IS profiles outputs you want to compare"/>                    
+        <param argument="--stb" type="data" format="tabular" optional="true" label="Scaffold to bin" help="This can be a file with each line listing a scaffold and a bin name, tab-seperated. This can also be a space-seperated list of .fasta files, with one genome per .fasta file. If nothing is provided, all scaffolds will be treated as belonging to the same genome"/>
+        <section name="variant_calling" title="Variant Calling Options" expanded="true">
+            <param argument="--min_cov" type="integer" value="5" label=" Minimum coverage to call an variant"/>
+            <param argument="--min_freq" type="float" value="0.05" label="Minimum SNP frequency to confirm a SNV" help="Both this AND the FDR snp count cutoff must be true to call a SNP."/>
+            <param argument="--fdr" type="float" value="1e-06" min="0" max="1" help="SNP false discovery rate- based on simulation data with a 0.1 percent error rate (Q30)"/>
+        </section>
+        <section name="database" title="Database Mode Parameters" expanded="true">
+            <param argument="--database_mode" type="boolean" truevalue="--debugdatabase_mode" falsevalue="" checked="false" label="Automatically determine which genomes are present in each Profile and only compare scaffolds from those genomes." help="All profiles must have run Profile with the same .stb"/>
+            <param argument="--breadth" type="float" value="0.5" label="Minimum breadth_minCov required to count a genome present"/>
+        </section>
+        <section name="other" title="Other Options" expanded="true">
+            <param argument="--scaffolds" type="data" format="fasta" optional="true" label="Location to a list of scaffolds to compare. You can also make this a .fasta file and it will load the scaffold names"/>
+            <param argument="--genome" type="data" format="tabular" optional="true" label="Run scaffolds belonging to this single genome only. Must provide an .stb file"/>
+            <param argument="--store_coverage_overlap" type="boolean" truevalue="--store_coverage_overlap" falsevalue="" checked="false" label="Store coverage overlap on an mm level"/>
+            <param argument="--store_mismatch_locations" type="boolean" truevalue="--store_mismatch_locations" falsevalue="" checked="false" label="Store the locations of SNPs"/>
+            <param argument="--include_self_comparisons" type="boolean" truevalue="--include_self_comparisons" falsevalue="" checked="false" label="Compare IS profiles against themself"/>
+            <param argument="--skip_plot_generation" type="boolean" truevalue="--skip_plot_generation" falsevalue="" checked="false" label="Dont create plots at the end of the run"/>
+            <param argument="--group_length" type="integer" value="10000000" label="How many bp to compare simultaneously" help="higher will use more RAM and run more quickly"/>
+        </section>
+        <section name="genome_clustering" title="Genome Clustering Options" expanded="true">
+            <param argument="--ani_threshold" type="float" value="0.99999" label="popANI threshold to cluster genomes at" help="Must provide .stb file to do so"/>
+            <param argument="--coverage_treshold" type="float" value="0.1" label="Minimum percent_genome_compared for a genome comparison to count" help="if below the popANI will be set to 0"/>
+            <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes">
+                <option value="average" selected="true">Average</option>
+                <option value="single">Single</option>
+                <option value="ward">Ward</option>
+                <option value="complete">complete</option>
+                <option value="centroid">centroid</option>
+                <option value="weighted">weighted</option>
+                <option value="median">median</option>
+            </param>
+        </section>
+    </inputs>
+    <outputs>
+        <data name="comparisonsTable" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_comparisonsTable.tsv" label="Comparisons Table: Summarizes the differences between two inStrain profiles on a scaffold by scaffold level" />
+        <data name="pairwise_SNP_locations" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_pairwise_SNP_locations.tsv" label="Pairwise SNP locations: Lists the locations of all differences between profiles." />
+        <data name="genomeWide_compare" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_genomeWide_compare.tsv" label="Genome Wide compare: A genome-level summary of the differences detected by inStrain compare." />
+        <data format="tabular" name="strain_clusters" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_strain_clusters.tsv" label="Strain clusters: Generate strain-level clusters" />
+        <data format="pdf" name="inStrainCompare_dendrograms" from_work_dir="output.IS.COMPARE/figures/output.IS.COMPARE_inStrainCompare_dendrograms.pdf" label="inStrain Compare dendrograms: genomeWide microdiveristy metrics" />       
+    </outputs>
+    <tests>        
+    <test expect_num_outputs="5">
+        <param name="stb" value="N5_271_010G1.maxbin2.stb"/>
+        <param name="input_is" value="N5_271_010G1_scaffold_min1000.fa-vs-N5_271_010G1.IS.zip,N5_271_010G1_scaffold_min1000.fa-vs-N5_271_010G2.IS.zip"/>
+        <section name="variant_calling">
+            <param name="min_cov" value="5"/>
+            <param name="min_freq" value="0.05"/>
+            <param name="fdr" value="1e-06"/>
+        </section>
+        <section name="database">
+            <param name="database_mode" value="false"/>
+            <param name="breadth" value="0.5"/>
+        </section>
+        <section name="other">
+            <param name="store_coverage_overlap" value="false"/>
+            <param name="store_mismatch_locations" value="false"/>
+            <param name="include_self_comparisons" value="false"/>
+            <param name="skip_plot_generation" value="false"/>
+            <param name="group_length" value="10000000"/>
+        </section>
+        <section name="genome_clustering">
+            <param name="ani_threshold" value="0.99999"/>
+            <param name="coverage_treshold" value="0.1"/>
+            <param name="clusterAlg" value="average"/>
+        </section>
+        <output name="comparisonsTable">
+            <assert_contents>
+                <has_text text="N5_271_010G1_scaffold_73"/>
+                <has_n_lines n="168"/>
+                <has_n_columns n="11"/>
+            </assert_contents>
+        </output>
+        <output name="pairwise_SNP_locations">
+            <assert_contents>
+                <has_n_lines n="0"/>
+            </assert_contents>
+        </output>
+        <output name="genomeWide_compare">
+            <assert_contents>
+                <has_text text="name1"/>
+                <has_n_lines n="3"/>
+                <has_n_columns n="10"/>
+            </assert_contents>
+        </output>
+        <output name="strain_clusters">
+            <assert_contents>
+                <has_text text="1_1"/>
+                <has_n_lines n="5"/>
+                <has_n_columns n="3"/>
+            </assert_contents>
+        </output>
+        <output name="inStrainCompare_dendrograms">
+            <assert_contents>
+                <has_size value="384512" delta="10000" />
+            </assert_contents>
+        </output>
+    </test>
+    </tests>
+    <help><![CDATA[
+@HELP_HEADER@
+
+Compare
+=======
+
+is part of the inStrain module that provides the ability to compare multiple inStrain profiles (created by running inStrain profile).
+
+Note
+====
+
+inStrain can only compare inStrain profiles that have been mapped to the same .fasta file
+
+inStrain compare does pairwise comparisons between each input inStrain profile. For each pair, a series of steps are undertaken:
+
+1. All positions in which both IS_profile objects have at least min_cov coverage (5x by default) are identified. This information can be stored in the output by using the flag –store_coverage_overlap, but due to it’s size, it’s not stored by default.
+
+
+2. Each position identified in step 1 is compared to calculate both conANI and popANI. The way that it compares positions is by testing whether the consensus base in sample 1 is detected at all in sample 2 and vice-versa. Detection of an allele in a sample is based on that allele being above the set -min_freq and -fdr. All detected differences between each pair of samples can be reported if the flag –store_mismatch_locations is set.
+
+
+3. The coverage overlap and the average nucleotide identity for each scaffold is reported. For details on how this is done.
+
+
+Inputs
+======
+
+Multiple inStrain profiles IS outputs (zip files), all mapped to the same .fasta file
+
+
+Outputs
+=======
+
+1. comparisonsTable.tsv
+   
+   Summarizes the differences between two inStrain profiles on a scaffold by scaffold level
+
+2. pairwise_SNP_locations.tsv
+
+   Lists the locations of all differences between profiles. Because it’s a big file, this will only be created is you include the flag --store_mismatch_locations in your inStrain compare command.
+
+3. genomeWide_compare.tsv
+
+   A genome-level summary of the differences detected by inStrain compare. Generated by running inStrain genome_wide on the results of inStrain compare
+
+4. strain_clusters.tsv
+
+   The result of clustering the pairwise comparison data provided in genomeWide_compare.tsv to generate strain-level clusters. Performed using hierarchical clustering in the same manner as the program dRep
+
+5. Compare dendrograms (PDF) figure/plot
+   
+   A dendrogram comparing all samples based on popANI and based on shared_bases.
+
+    ]]></help> 
+    <citations>
+        <citation type="doi">10.1101/2020.01.22.915579</citation>
+    </citations>
+</tool>
\ No newline at end of file