comparison instrain_compare.xml @ 0:dff92aac9f75 draft

"planemo upload for repository https://github.com/MrOlm/inStrain commit e6eae71231e551c08aa96afc9f15b8ba87676101"
author iuc
date Wed, 11 Aug 2021 21:11:53 +0000
parents
children 92a7945118a9
comparison
equal deleted inserted replaced
-1:000000000000 0:dff92aac9f75
1 <tool id="instrain_compare" name="InStrain Compare" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>Compares multiple inStrain profiles (popANI, coverage_overlap, etc.) </description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="edam_ontology"/>
7 <expand macro="requirements"/>
8 <version_command>inStrain compare --version</version_command>
9 <command detect_errors="exit_code"><![CDATA[
10 #if $stb
11 ln -s '$stb' 'stb_file.stb' &&
12 #end if
13 #if $other.genome
14 ln -s '$other.genome' 'genome_file.stb' &&
15 #end if
16 #for $i, $s in enumerate($input_is)
17 #if $s
18 ## Unpack each zipped IS profile into its own numbered directory so it can be passed to --input below.
19 mkdir -p '$i-input.IS' &&
20 unzip '$s' -d '$i-input.IS/' &&
21 #end if
22 #end for
23 inStrain compare
24 --input
25 #for $i, $s in enumerate($input_is)
26 #if $s
27 '$i-input.IS'
28 #end if
29 #end for
30 --output 'output.IS.COMPARE'
31 --processes "\${GALAXY_SLOTS:-6}"
32 #if $stb
33 --stb 'stb_file.stb'
34 #end if
35 --min_cov $variant_calling.min_cov
36 --min_freq $variant_calling.min_freq
37 --fdr $variant_calling.fdr
38 $database.database_mode
39 --breadth $database.breadth
40 #if $other.scaffolds
41 --scaffolds '$other.scaffolds'
42 #end if
43 #if $other.genome
44 --genome 'genome_file.stb'
45 #end if
46 $other.store_coverage_overlap
47 $other.store_mismatch_locations
48 $other.include_self_comparisons
49 $other.skip_plot_generation
50 --group_length $other.group_length
51 --ani_threshold $genome_clustering.ani_threshold
52 --coverage_treshold $genome_clustering.coverage_treshold
53 --clusterAlg '$genome_clustering.clusterAlg'
54 ]]></command>
55 <inputs>
56 <param name="input_is" type="data" format="zip" multiple="true" label="inStrain Profile IS folder" help="The Zip files for the IS profiles outputs you want to compare"/>
57 <param argument="--stb" type="data" format="tabular" optional="true" label="Scaffold to bin" help="This can be a file with each line listing a scaffold and a bin name, tab-separated. This can also be a space-separated list of .fasta files, with one genome per .fasta file. If nothing is provided, all scaffolds will be treated as belonging to the same genome"/>
58 <section name="variant_calling" title="Variant Calling Options" expanded="true">
59 <param argument="--min_cov" type="integer" value="5" label="Minimum coverage to call a variant"/>
60 <param argument="--min_freq" type="float" value="0.05" label="Minimum SNP frequency to confirm a SNV" help="Both this AND the FDR snp count cutoff must be true to call a SNP."/>
61 <param argument="--fdr" type="float" value="1e-06" min="0" max="1" help="SNP false discovery rate- based on simulation data with a 0.1 percent error rate (Q30)"/>
62 </section>
63 <section name="database" title="Database Mode Parameters" expanded="true">
64 <param argument="--database_mode" type="boolean" truevalue="--database_mode" falsevalue="" checked="false" label="Automatically determine which genomes are present in each Profile and only compare scaffolds from those genomes." help="All profiles must have run Profile with the same .stb"/>
65 <param argument="--breadth" type="float" value="0.5" label="Minimum breadth_minCov required to count a genome present"/>
66 </section>
67 <section name="other" title="Other Options" expanded="true">
68 <param argument="--scaffolds" type="data" format="fasta" optional="true" label="Location to a list of scaffolds to compare. You can also make this a .fasta file and it will load the scaffold names"/>
69 <param argument="--genome" type="data" format="tabular" optional="true" label="Run scaffolds belonging to this single genome only. Must provide an .stb file"/>
70 <param argument="--store_coverage_overlap" type="boolean" truevalue="--store_coverage_overlap" falsevalue="" checked="false" label="Store coverage overlap on an mm level"/>
71 <param argument="--store_mismatch_locations" type="boolean" truevalue="--store_mismatch_locations" falsevalue="" checked="false" label="Store the locations of SNPs"/>
72 <param argument="--include_self_comparisons" type="boolean" truevalue="--include_self_comparisons" falsevalue="" checked="false" label="Compare IS profiles against themselves"/>
73 <param argument="--skip_plot_generation" type="boolean" truevalue="--skip_plot_generation" falsevalue="" checked="false" label="Don't create plots at the end of the run"/>
74 <param argument="--group_length" type="integer" value="10000000" label="How many bp to compare simultaneously" help="higher will use more RAM and run more quickly"/>
75 </section>
76 <section name="genome_clustering" title="Genome Clustering Options" expanded="true">
77 <param argument="--ani_threshold" type="float" value="0.99999" label="popANI threshold to cluster genomes at" help="Must provide .stb file to do so"/>
78 <param argument="--coverage_treshold" type="float" value="0.1" label="Minimum percent_genome_compared for a genome comparison to count" help="if below the popANI will be set to 0"/>
79 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes">
80 <option value="average" selected="true">Average</option>
81 <option value="single">Single</option>
82 <option value="ward">Ward</option>
83 <option value="complete">Complete</option>
84 <option value="centroid">Centroid</option>
85 <option value="weighted">Weighted</option>
86 <option value="median">Median</option>
87 </param>
88 </section>
89 </inputs>
90 <outputs>
91 <data name="comparisonsTable" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_comparisonsTable.tsv" label="Comparisons Table: Summarizes the differences between two inStrain profiles on a scaffold by scaffold level" />
92 <data name="pairwise_SNP_locations" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_pairwise_SNP_locations.tsv" label="Pairwise SNP locations: Lists the locations of all differences between profiles." />
93 <data name="genomeWide_compare" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_genomeWide_compare.tsv" label="Genome Wide compare: A genome-level summary of the differences detected by inStrain compare." />
94 <data name="strain_clusters" format="tabular" from_work_dir="output.IS.COMPARE/output/output.IS.COMPARE_strain_clusters.tsv" label="Strain clusters: Generate strain-level clusters" />
95 <data name="inStrainCompare_dendrograms" format="pdf" from_work_dir="output.IS.COMPARE/figures/output.IS.COMPARE_inStrainCompare_dendrograms.pdf" label="inStrain Compare dendrograms: genomeWide microdiversity metrics" />
96 </outputs>
97 <tests>
98 <test expect_num_outputs="5"> <!-- Compares two zipped IS profiles with an .stb file; every other parameter is set to the same value as its declared default. -->
99 <param name="stb" value="N5_271_010G1.maxbin2.stb"/>
100 <param name="input_is" value="N5_271_010G1_scaffold_min1000.fa-vs-N5_271_010G1.IS.zip,N5_271_010G1_scaffold_min1000.fa-vs-N5_271_010G2.IS.zip"/>
101 <section name="variant_calling">
102 <param name="min_cov" value="5"/>
103 <param name="min_freq" value="0.05"/>
104 <param name="fdr" value="1e-06"/>
105 </section>
106 <section name="database">
107 <param name="database_mode" value="false"/>
108 <param name="breadth" value="0.5"/>
109 </section>
110 <section name="other">
111 <param name="store_coverage_overlap" value="false"/>
112 <param name="store_mismatch_locations" value="false"/>
113 <param name="include_self_comparisons" value="false"/>
114 <param name="skip_plot_generation" value="false"/>
115 <param name="group_length" value="10000000"/>
116 </section>
117 <section name="genome_clustering">
118 <param name="ani_threshold" value="0.99999"/>
119 <param name="coverage_treshold" value="0.1"/>
120 <param name="clusterAlg" value="average"/>
121 </section>
122 <output name="comparisonsTable">
123 <assert_contents>
124 <has_text text="N5_271_010G1_scaffold_73"/>
125 <has_n_lines n="168"/>
126 <has_n_columns n="11"/>
127 </assert_contents>
128 </output>
129 <output name="pairwise_SNP_locations"> <!-- store_mismatch_locations is false in this test, so this output is expected to be empty. -->
130 <assert_contents>
131 <has_n_lines n="0"/>
132 </assert_contents>
133 </output>
134 <output name="genomeWide_compare">
135 <assert_contents>
136 <has_text text="name1"/>
137 <has_n_lines n="3"/>
138 <has_n_columns n="10"/>
139 </assert_contents>
140 </output>
141 <output name="strain_clusters">
142 <assert_contents>
143 <has_text text="1_1"/>
144 <has_n_lines n="5"/>
145 <has_n_columns n="3"/>
146 </assert_contents>
147 </output>
148 <output name="inStrainCompare_dendrograms"> <!-- The PDF is checked by file size only, with a tolerance of 10000. -->
149 <assert_contents>
150 <has_size value="384512" delta="10000" />
151 </assert_contents>
152 </output>
153 </test>
154 </tests>
155 <help><![CDATA[
156 @HELP_HEADER@
157
158 Compare
159 =======
160
161 inStrain compare is the inStrain module that provides the ability to compare multiple inStrain profiles (created by running inStrain profile).
162
163 Note
164 ====
165
166 inStrain can only compare inStrain profiles that have been mapped to the same .fasta file
167
168 inStrain compare does pairwise comparisons between each input inStrain profile. For each pair, a series of steps are undertaken:
169
170 1. All positions in which both IS_profile objects have at least min_cov coverage (5x by default) are identified. This information can be stored in the output by using the flag --store_coverage_overlap, but due to its size, it is not stored by default.
171
172
173 2. Each position identified in step 1 is compared to calculate both conANI and popANI. Positions are compared by testing whether the consensus base in sample 1 is detected at all in sample 2 and vice versa. Detection of an allele in a sample is based on that allele being above the set --min_freq and --fdr thresholds. All detected differences between each pair of samples can be reported if the flag --store_mismatch_locations is set.
174
175
176 3. The coverage overlap and the average nucleotide identity for each scaffold are reported. For details on how this is done, see the inStrain documentation.
177
178
179 Inputs
180 ======
181
182 Multiple inStrain profiles IS outputs (zip files), all mapped to the same .fasta file
183
184
185 Outputs
186 =======
187
188 1. comparisonsTable.tsv
189
190 Summarizes the differences between two inStrain profiles on a scaffold by scaffold level
191
192 2. pairwise_SNP_locations.tsv
193
194 Lists the locations of all differences between profiles. Because it’s a big file, this will only be created if you include the flag --store_mismatch_locations in your inStrain compare command.
195
196 3. genomeWide_compare.tsv
197
198 A genome-level summary of the differences detected by inStrain compare. Generated by running inStrain genome_wide on the results of inStrain compare
199
200 4. strain_clusters.tsv
201
202 The result of clustering the pairwise comparison data provided in genomeWide_compare.tsv to generate strain-level clusters. Performed using hierarchical clustering in the same manner as the program dRep
203
204 5. Compare dendrograms (PDF) figure/plot
205
206 A dendrogram comparing all samples based on popANI and based on shared_bases.
207
208 ]]></help>
209 <citations>
210 <citation type="doi">10.1101/2020.01.22.915579</citation>
211 </citations>
212 </tool>