<!--
view instrain_compare.xml @ 3:92a7945118a9 draft default tip

planemo upload for repository commit aa7c7a35834f36dad6e72d5f54089a578be60731
author iuc
date Thu, 02 Nov 2023 14:59:48 +0000
parents dff92aac9f75
line wrap: on
line source
-->

<tool id="instrain_compare" name="InStrain Compare" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Compares multiple inStrain profiles (popANI, coverage_overlap, etc.)</description>
    <!-- The @TOOL_VERSION@ / @VERSION_SUFFIX@ / @PROFILE@ tokens and the
         "biotools" / "requirements" macros expanded below must be defined by an
         imported macro file, otherwise the tool cannot be loaded.
         NOTE(review): the file name macros.xml is assumed from the usual
         layout of such wrappers; confirm it matches the file shipped next to
         this tool. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <version_command>inStrain compare --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
## Give the optional scaffold-to-bin and genome files stable names in the job directory.
#if $stb
    ln -s '$stb' 'stb_file.stb' &&
#end if
#if $other.genome
    ln -s '$other.genome' 'genome_file.stb' &&
#end if
## Unpack every zipped inStrain profile into its own numbered folder.
#for $i, $s in enumerate($input_is)
    #if $s
        mkdir -p '${i}-input.IS' &&
        unzip '$s' -d '${i}-input.IS/' &&
    #end if
#end for
inStrain compare
    ## Pass the unpacked profile folders; the folder names must match the ones created above.
    --input
    #for $i, $s in enumerate($input_is)
        #if $s
            '${i}-input.IS'
        #end if
    #end for
    --output 'output.IS.COMPARE'
    --processes "\${GALAXY_SLOTS:-6}"
#if $stb
    --stb 'stb_file.stb'
#end if
    --min_cov $variant_calling.min_cov
    --min_freq $variant_calling.min_freq
    --fdr $variant_calling.fdr
    ## Boolean parameters expand to their flag when checked and to nothing otherwise.
    $database.database_mode
    --breadth $database.breadth
#if $other.scaffolds
    --scaffolds '$other.scaffolds'
#end if
#if $other.genome
    --genome 'genome_file.stb'
#end if
    $other.store_coverage_overlap
    $other.store_mismatch_locations
    $other.include_self_comparisons
    $other.skip_plot_generation
    --group_length $other.group_length
    --ani_threshold $genome_clustering.ani_threshold
    ## "treshold" (sic): spelling is kept exactly as declared on the parameter below;
    ## check `inStrain compare --help` before "correcting" it.
    --coverage_treshold $genome_clustering.coverage_treshold
    --clusterAlg '$genome_clustering.clusterAlg'
    ]]></command>
    <inputs>
        <param name="input_is" type="data" format="zip" multiple="true" label="inStrain Profile IS folder" help="The Zip files for the IS profiles outputs you want to compare"/>
        <param argument="--stb" type="data" format="tabular" optional="true" label="Scaffold to bin" help="This can be a file with each line listing a scaffold and a bin name, tab-separated. This can also be a space-separated list of .fasta files, with one genome per .fasta file. If nothing is provided, all scaffolds will be treated as belonging to the same genome"/>
        <section name="variant_calling" title="Variant Calling Options" expanded="true">
            <param argument="--min_cov" type="integer" value="5" label="Minimum coverage to call a variant"/>
            <param argument="--min_freq" type="float" value="0.05" label="Minimum SNP frequency to confirm a SNV" help="Both this AND the FDR snp count cutoff must be true to call a SNP."/>
            <param argument="--fdr" type="float" value="1e-06" min="0" max="1" label="SNP false discovery rate" help="SNP false discovery rate- based on simulation data with a 0.1 percent error rate (Q30)"/>
        </section>
        <section name="database" title="Database Mode Parameters" expanded="true">
            <!-- truevalue is written verbatim onto the command line, so it has
                 to be the real switch; it previously carried a stray "debug"
                 prefix that does not match the declared argument. -->
            <param argument="--database_mode" type="boolean" truevalue="--database_mode" falsevalue="" checked="false" label="Automatically determine which genomes are present in each Profile and only compare scaffolds from those genomes." help="All profiles must have run Profile with the same .stb"/>
            <param argument="--breadth" type="float" value="0.5" label="Minimum breadth_minCov required to count a genome present"/>
        </section>
        <section name="other" title="Other Options" expanded="true">
            <param argument="--scaffolds" type="data" format="fasta" optional="true" label="Location to a list of scaffolds to compare. You can also make this a .fasta file and it will load the scaffold names"/>
            <param argument="--genome" type="data" format="tabular" optional="true" label="Run scaffolds belonging to this single genome only. Must provide an .stb file"/>
            <param argument="--store_coverage_overlap" type="boolean" truevalue="--store_coverage_overlap" falsevalue="" checked="false" label="Store coverage overlap on an mm level"/>
            <param argument="--store_mismatch_locations" type="boolean" truevalue="--store_mismatch_locations" falsevalue="" checked="false" label="Store the locations of SNPs"/>
            <param argument="--include_self_comparisons" type="boolean" truevalue="--include_self_comparisons" falsevalue="" checked="false" label="Compare IS profiles against themselves"/>
            <param argument="--skip_plot_generation" type="boolean" truevalue="--skip_plot_generation" falsevalue="" checked="false" label="Don't create plots at the end of the run"/>
            <param argument="--group_length" type="integer" value="10000000" label="How many bp to compare simultaneously" help="higher will use more RAM and run more quickly"/>
        </section>
        <section name="genome_clustering" title="Genome Clustering Options" expanded="true">
            <param argument="--ani_threshold" type="float" value="0.99999" label="popANI threshold to cluster genomes at" help="Must provide .stb file to do so"/>
            <param argument="--coverage_treshold" type="float" value="0.1" label="Minimum percent_genome_compared for a genome comparison to count" help="if below the popANI will be set to 0"/>
            <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes">
                <option value="average" selected="true">Average</option>
                <option value="single">Single</option>
                <option value="ward">Ward</option>
                <option value="complete">complete</option>
                <option value="centroid">centroid</option>
                <option value="weighted">weighted</option>
                <option value="median">median</option>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- Every output is collected from the folder given to the output option of the command. -->
        <data name="comparisonsTable" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_comparisonsTable.tsv" label="Comparisons Table: Summarizes the differences between two inStrain profiles on a scaffold by scaffold level"/>
        <data name="pairwise_SNP_locations" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_pairwise_SNP_locations.tsv" label="Pairwise SNP locations: Lists the locations of all differences between profiles."/>
        <data name="genomeWide_compare" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_genomeWide_compare.tsv" label="Genome Wide compare: A genome-level summary of the differences detected by inStrain compare."/>
        <data name="strain_clusters" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_strain_clusters.tsv" label="Strain clusters: Generate strain-level clusters"/>
        <data name="inStrainCompare_dendrograms" format="pdf" from_work_dir="output.IS.COMPARE/figures/output.IS.COMPARE_inStrainCompare_dendrograms.pdf" label="inStrain Compare dendrograms: genomeWide microdiversity metrics"/>
    </outputs>
    <tests>
        <!-- Single run with every option left at its default value; checks
             the shape of the four tables and the approximate size of the PDF. -->
        <test expect_num_outputs="5">
            <param name="stb" value="N5_271_010G1.maxbin2.stb"/>
            <!-- NOTE(review): value="," is a comma-separated list of two empty
                 names, so no test-data file is referenced here. Replace it
                 with the names of the zipped inStrain profiles this test is
                 meant to compare. -->
            <param name="input_is" value=","/>
            <section name="variant_calling">
                <param name="min_cov" value="5"/>
                <param name="min_freq" value="0.05"/>
                <param name="fdr" value="1e-06"/>
            </section>
            <section name="database">
                <param name="database_mode" value="false"/>
                <param name="breadth" value="0.5"/>
            </section>
            <section name="other">
                <param name="store_coverage_overlap" value="false"/>
                <param name="store_mismatch_locations" value="false"/>
                <param name="include_self_comparisons" value="false"/>
                <param name="skip_plot_generation" value="false"/>
                <param name="group_length" value="10000000"/>
            </section>
            <section name="genome_clustering">
                <param name="ani_threshold" value="0.99999"/>
                <param name="coverage_treshold" value="0.1"/>
                <param name="clusterAlg" value="average"/>
            </section>
            <output name="comparisonsTable">
                <assert_contents>
                    <has_text text="N5_271_010G1_scaffold_73"/>
                    <has_n_lines n="168"/>
                    <has_n_columns n="11"/>
                </assert_contents>
            </output>
            <!-- store_mismatch_locations is off in this test, so the file is expected to be empty. -->
            <output name="pairwise_SNP_locations">
                <assert_contents>
                    <has_n_lines n="0"/>
                </assert_contents>
            </output>
            <output name="genomeWide_compare">
                <assert_contents>
                    <has_text text="name1"/>
                    <has_n_lines n="3"/>
                    <has_n_columns n="10"/>
                </assert_contents>
            </output>
            <output name="strain_clusters">
                <assert_contents>
                    <has_text text="1_1"/>
                    <has_n_lines n="5"/>
                    <has_n_columns n="3"/>
                </assert_contents>
            </output>
            <output name="inStrainCompare_dendrograms">
                <assert_contents>
                    <has_size value="384512" delta="10000"/>
                </assert_contents>
            </output>
        </test>
    </tests>


<help><![CDATA[
**What it does**

inStrain compare is part of the inStrain module that provides the ability to compare multiple inStrain profiles (created by running inStrain profile).

inStrain can only compare inStrain profiles that have been mapped to the same .fasta file.

inStrain compare does pairwise comparisons between each input inStrain profile. For each pair, a series of steps are undertaken:

1. All positions in which both IS_profile objects have at least min_cov coverage (5x by default) are identified. This information can be stored in the output by using the flag --store_coverage_overlap, but due to its size, it is not stored by default.

2. Each position identified in step 1 is compared to calculate both conANI and popANI. The way that it compares positions is by testing whether the consensus base in sample 1 is detected at all in sample 2 and vice-versa. Detection of an allele in a sample is based on that allele being above the set --min_freq and --fdr. All detected differences between each pair of samples can be reported if the flag --store_mismatch_locations is set.

3. The coverage overlap and the average nucleotide identity for each scaffold is reported. For details on how this is done, see the inStrain documentation.

**Inputs**

Multiple inStrain profiles IS outputs (zip files), all mapped to the same .fasta file

**Outputs**

1. comparisonsTable.tsv

   Summarizes the differences between two inStrain profiles on a scaffold by scaffold level

2. pairwise_SNP_locations.tsv

   Lists the locations of all differences between profiles. Because it is a big file, this will only be created if you include the flag --store_mismatch_locations in your inStrain compare command.

3. genomeWide_compare.tsv

   A genome-level summary of the differences detected by inStrain compare. Generated by running inStrain genome_wide on the results of inStrain compare

4. strain_clusters.tsv

   The result of clustering the pairwise comparison data provided in genomeWide_compare.tsv to generate strain-level clusters. Performed using hierarchical clustering in the same manner as the program dRep

5. Compare dendrograms (PDF) figure/plot

   A dendrogram comparing all samples based on popANI and based on shared_bases.
]]></help>
    <citations>
        <citation type="doi">10.1101/2020.01.22.915579</citation>
    </citations>
</tool>