Mercurial > repos > iuc > raceid_main
comparison raceid_main.xml @ 0:e01c989c7543 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/raceid commit 39918bfdb08f06862ca395ce58a6f5e4f6dd1a5e
author | iuc |
---|---|
date | Sat, 03 Mar 2018 17:34:16 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e01c989c7543 |
---|---|
1 <tool id="raceid_main" name="RaceID" version="@VERSION@.0"> | |
2 <description>Race ID pipeline for single-cell RNA analysis</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <expand macro="requirements" /> | |
7 | |
8 <command detect_errors="exit_code"><![CDATA[ | |
9 ## Filter -- settings are passed via the rconf_source_filter configfile | |
10 echo "Filtering" && | |
11 Rscript '@SCRIPT_DIR@/raceID_filter.R' '@SCRIPT_DIR@' '$rconf_source_filter' && | |
12 | |
13 ## Kmeans -- settings are passed via the rconf_source_kmeans configfile | |
14 echo "K-means" && | |
15 Rscript '@SCRIPT_DIR@/raceID_kmeans_heatmap.R' '@SCRIPT_DIR@' '$rconf_source_kmeans' && | |
16 ## Report directory for the plots; -p keeps the step from failing if it already exists | |
17 mkdir -p '${out_html.files_path}' && | |
18 mv plot_*.svg '${out_html.files_path}' && | |
19 | |
20 echo ' | |
21 <html><head></head> | |
22 <body> | |
23 <h1>RaceID k-means</h1><br /> | |
24 <h3>Gap statistic</h3> | |
25 <img src="plot_gap.svg" ><br /> | |
26 <h3>Jaccard Similarity</h3> | |
27 <img src="plot_jaccard.svg" ><br /> | |
28 <h3>Silhouette Plot</h3> | |
29 <img src="plot_silhouette.svg" ><br /> | |
30 <h3>Cluster Heatmap</h3> | |
31 <img src="plot_clustheatmap.svg" ><br /> | |
32 ' > '$out_html' && | |
33 | |
34 ## Outlier -- relies on kmeans | |
35 echo "Outlier" && | |
36 Rscript '@SCRIPT_DIR@/raceID_outlierdetect.R' '@SCRIPT_DIR@' '$rconf_source_outlier' && | |
37 | |
38 mv plot_*.svg '${out_html.files_path}' && | |
39 echo ' | |
40 <br/> | |
41 <h1>RaceID Outlier Detection</h1><br /> | |
42 <h3>Background</h3> | |
43 <img src="plot_background.svg" ><br /> | |
44 <h3>Sensitivity</h3> | |
45 <img src="plot_sensitivity.svg" ><br /> | |
46 <h3>Outlier Probability</h3> | |
47 <img src="plot_outlierprobs.svg" ><br /> | |
48 <h3>Final Heatmap</h3> | |
49 <img src="plot_finalheat.svg" ><br /> | |
50 ' >> '$out_html' && | |
51 | |
52 ## tSNE -- relies on kmeans and outlier | |
53 echo "tSNE" && | |
54 Rscript '@SCRIPT_DIR@/raceID_tsne.R' '@SCRIPT_DIR@' '$rconf_source_tsne' && | |
55 | |
56 ## (report directory was already created after the k-means step) | |
57 mv plot_*.svg '${out_html.files_path}' && | |
58 | |
59 echo ' | |
60 <br/> | |
61 <h1>RaceID tSNE</h1><br /> | |
62 <h3>Initial k-means clusters</h3> | |
63 <br /><img src="plot_initial.svg" > | |
64 <h3>Final clusters</h3> | |
65 <br /><img src="plot_final.svg" > | |
66 <h3>Labelled</h3> | |
67 <br /><img src="plot_labels.svg" > | |
68 <h3>Symbols</h3> | |
69 <br /><img src="plot_symbols.svg" > | |
70 ' >> '$out_html' && | |
71 ## Only plot_*.svg files are moved into the report directory, so the image link needs the .svg suffix (assumes the R script names the file plot_GENES.svg -- confirm) | |
72 #if $section_tsne.genexp_select.use_gexpr == "Yes": | |
73 #for $gene_set in $section_tsne.genexp_select.geneset: | |
74 echo "<h3>Expression for: [${gene_set.genes.value}]</h3>" >> '$out_html' && | |
75 echo "<br /><img src=\"plot_${gene_set.genes.value}.svg\" >" >> '$out_html' && | |
76 #end for | |
77 #end if | |
78 echo '</body></html>' >> '$out_html' | |
79 | |
80 ]]></command> | |
81 | |
82 <configfiles> | |
83 <configfile name="rconf_source_filter"> | |
84 count_matrix = '$section_filter.inp_count' | |
85 filtering = as.logical( '$section_filter.filtering.do_filter.value' ) | |
86 output_table = '$out_table_filter' | |
87 output_rdat = '@out_rdat_filter@' | |
88 | |
89 # Defaults | |
90 control_genes_filter=""; | |
91 c_mintotal = 3000; c_minexpr = 5; c_maxexpr = 500; c_minnumber = 1; | |
92 c_downsample = F; c_dsn = 1; c_rseed = 17000; | |
93 | |
94 #if $section_filter.filtering.do_filter.value == "T": | |
95 control_genes_filter = '$section_filter.filtering.remove_nonendog.value' | |
96 #if $section_filter.filtering.default_filtering_select.do_filter_defaults.value == "advanced_options": | |
97 c_mintotal = as.integer( '$section_filter.filtering.default_filtering_select.mintotal' ) | |
98 c_minexpr = as.integer( '$section_filter.filtering.default_filtering_select.minexpr' ) | |
99 c_maxexpr = as.integer( '$section_filter.filtering.default_filtering_select.maxexpr' ) | |
100 c_minnumber = as.integer( '$section_filter.filtering.default_filtering_select.minnumber' ) | |
101 #if $section_filter.filtering.default_filtering_select.dsn: | |
102 c_downsample = T; | |
103 c_dsn = as.integer( '$section_filter.filtering.default_filtering_select.dsn' ) | |
104 #end if | |
105 c_rseed = as.integer( '$section_filter.filtering.default_filtering_select.filter_rseed' ) | |
106 #end if | |
107 #end if | |
108 </configfile> | |
109 <configfile name="rconf_source_kmeans"> | |
110 sc = readRDS( '@inp_rdat_kmeans@' ) | |
111 output_rdat = '@out_rdat_kmeans@' | |
112 c_metric = 'pearson'; c_cln = 0; dogap = T; c_clustnr = 20; bgap = 50; | |
113 semethod = 'Tibs2001SEmax'; sefactor = .25; c_bootnr = 50; c_rseed = 17000; | |
114 | |
115 c_metric = '$section_kmeans.metric' | |
116 c_cln = as.integer( '$section_kmeans.cln' ) | |
117 dogap = as.logical( '$section_kmeans.gapstats.dogap.value' ) | |
118 #if $section_kmeans.gapstats.dogap.value == "T": | |
119 c_clustnr = as.integer( '$section_kmeans.gapstats.clustnr' ) | |
120 bgap = as.integer( '$section_kmeans.gapstats.bgap' ) | |
121 semethod = '$section_kmeans.gapstats.semethod.value' | |
122 sefactor = as.numeric( '$section_kmeans.gapstats.sefactor' ) | |
123 #end if | |
124 c_bootnr = as.integer( '$section_kmeans.bootnr' ) | |
125 c_rseed = as.integer( '$section_kmeans.kmeans_rseed' ) | |
126 | |
127 generate_final_rdata = T | |
128 </configfile> | |
129 <configfile name="rconf_source_outlier"> | |
130 sc = readRDS( '@inp_rdat_outlier@' ) | |
131 output_rdat = '@out_rdat_outlier@' | |
132 output_table= '$out_table_outlier' | |
133 # set defaults (each one is overridden below by the matching Outlier Detection input) | |
134 c_outminc = 5; c_outlg = 2; c_probthr = 1e-3; c_outdistquant = 0.75; | |
135 | |
136 c_outminc = as.integer( '$section_outlier.outminc' ) | |
137 c_outlg = as.integer( '$section_outlier.outlg' ) | |
138 c_probthr = as.numeric( '$section_outlier.probthr' ) | |
139 c_outdistquant = as.numeric( '$section_outlier.outdistquant' ) | |
140 | |
141 generate_final_rdata = T | |
142 </configfile> | |
143 <configfile name="rconf_source_tsne" > | |
144 sc = readRDS( '@inp_rdat_tsne@' ) | |
145 output_rdat = '$out_rdat_tsne' # final output RData | |
146 regex_val = "" | |
147 c_rseed = as.integer( '$section_tsne.tsne_rseed' ) # integer seed, as in the filter and k-means configs | |
148 gene_sets = "" | |
149 #if $section_tsne.genexp_select.use_gexpr == 'Yes': | |
150 gene_sets = '#for $gns in $section_tsne.genexp_select.geneset# $gns.genes.value _split_ #end for#' | |
151 regex_val = '$section_tsne.genexp_select.regex' | |
152 #end if | |
153 final_rdata = T | |
154 </configfile> | |
155 </configfiles> | |
156 <!-- Filter --> | |
157 <inputs> | |
158 <section name="section_filter" title="Filtering and Normalisation" expanded="true" > | |
159 <param name="inp_count" type="data" format="tsv" label="Count matrix" help="A spreadsheet file with the first row indicating cell IDs, and the first column indicating transcript or gene IDs" /> | |
160 <conditional name="filtering" > | |
161 <param name="do_filter" type="select" label="Perform filtering?" > | |
162 <option value="T" selected="true" >Yes</option> | |
163 <option value="F" >No</option> | |
164 </param> | |
165 <when value="F" /> | |
166 <when value="T" > | |
167 <param name="remove_nonendog" type="text" label="Control gene name prefixes" help="If ERCC or other non-endogenous spike-in RNAs are within the data, please specify their prefixes (e.g. 'ERCC, HK00') in order to filter them out. (Leave blank if control genes were not used in the experiment.)" /> | |
168 <conditional name="default_filtering_select" > | |
169 <param name="do_filter_defaults" type="select" label="Parameters" > | |
170 <option value="use_defaults" selected="true" >Use Defaults</option> | |
171 <option value="advanced_options" >Advanced Options</option > | |
172 </param> | |
173 <when value="use_defaults" /> | |
174 <when value="advanced_options" > | |
175 <param name="mintotal" type="integer" value="3000" min="1" label="Minimum total transcripts" help="Discard cells with less than this number of total transcripts before normalisation." /> | |
176 <param name="minexpr" type="integer" value="5" min="1" label="Minimum expressed genes" help="Discard genes that do not express a minimum of this number of transcripts after normalisation."/> | |
177 <param name="maxexpr" type="integer" value="500" min="0" label="Maximum expressed genes" help="Discard genes that express more than this number of transcripts after normalisation. Useful if genes have oversaturated counts derived from UMI data. Set to 0 to disable this step." /> | |
178 | |
179 <param name="minnumber" type="integer" value="1" label="Minimum Cells" help="Discard genes that do not have the minimum expressed transcripts in at least this number of cells" /> | |
180 | |
181 <param name="dsn" type="integer" value="1" min="1" optional="true" label="Downsample counts" help="Average transcripts across this many samples. If this is set to 1, then sampling noise should be comparable across cells. For higher values, the data approximates median normalisation." /> | |
182 <param name="filter_rseed" type="integer" value="17000" min="0" label="Seed value (for reproducibility)" /> | |
183 </when> | |
184 </conditional> | |
185 </when> | |
186 </conditional> | |
187 <param name="filter_table_output" type="boolean" checked="false" label="Generate output table of filtered matrix?" /> | |
188 </section> | |
189 | |
190 <!-- Kmeans --> | |
191 <section name="section_kmeans" title="Clustering (k-means)" expanded="true" > | |
192 <param name="metric" type="select" label="Distance metric"> | |
193 <option value="pearson" selected="true" /> | |
194 <option value="spearman" /> | |
195 <option value="kendall" /> | |
196 <option value="euclidean" /> | |
197 <option value="maximum" /> | |
198 <option value="manhattan" /> | |
199 <option value="canberra" /> | |
200 <option value="binary" /> | |
201 <option value="minkowski" /> | |
202 </param> | |
203 | |
204 <param name="cln" type="integer" value="0" min="0" label="Number of clusters for k-means" help="Leave as zero to automatically determine the number based on gap statistics" /> | |
205 | |
206 <conditional name="gapstats"> | |
207 <param name="dogap" type="select" label="Use gap statistics to determine clusters" > | |
208 <option value="T" selected="true" >Yes</option> | |
209 <option value="F" >No</option> | |
210 </param> | |
211 | |
212 <when value="F" /> | |
213 <when value="T" > | |
214 <param name="clustnr" type="integer" value="2" min="0" label="Maximum number of clusters for the computation of the gap statistic" help="If more major cell types are expected, a higher number than 2 should be chosen." /> | |
215 <param name="bgap" type="integer" value="50" min="1" label="Number of bootstraps to run the gap statistic calculation" /> | |
216 <param name="semethod" type="select" label="Method used for determining first local maximum" > | |
217 <option value="Tibs2001SEmax" selected="true" /> | |
218 <option value="globalmax" /> | |
219 <option value="firstmax" /> | |
220 <option value="firstSEmax" /> | |
221 <option value="globalSEmax" /> | |
222 </param> | |
223 | |
224 <param name="sefactor" type="float" value="0.25" min="0.0001" max="1" label="Fraction of the standard deviation that the local maximum must differ from neighbouring points." /> | |
225 </when> | |
226 </conditional> | |
227 | |
228 <param name="bootnr" type="integer" value="50" min="1" label="Number of bootstraps for clustering" /> | |
229 <param name="kmeans_rseed" type="integer" value="17000" min="1" label="Seed value (for reproducibility)" /> | |
230 </section> | |
231 <!-- Outlier --> | |
232 <section name="section_outlier" title="Outlier Detection" expanded="true" > | |
233 <param name="outminc" type="integer" value="5" min="1" label="Expression cutoff threshold for outlier genes" /> | |
234 <param name="probthr" type="float" value="1e-3" min="1e-8" max="1" label="Probability threshold of observing a given gene expression level in a cell" help="If lower than this cutoff, the cell is considered an outlier for this gene." /> | |
235 <param name="outlg" type="integer" value="2" min="1" label="Minimum number of outlier genes required to flag a cell as an outlier" /> | |
236 <param name="outdistquant" type="select" label="Merge cells into outlier clusters if their similarity exceeds this quantile in a similarity distribution for all cell pairs" > | |
237 <option value="0.25">first (0.25)</option> | |
238 <option value="0.50">second (0.50)</option> | |
239 <option value="0.75" selected="true">third (0.75)</option> <!-- default; matches the c_outdistquant = 0.75 default in rconf_source_outlier --> | |
240 </param> | |
241 </section> | |
242 <section name="section_tsne" title="tSNE plots" expanded="true" > | |
243 <!-- tSNE --> | |
244 <conditional name="genexp_select" > | |
245 <param name="use_gexpr" type="select" label="Highlight the expression of a set of (related) genes over all clusters?" > | |
246 <option value="Yes" /> | |
247 <option value="No" selected="true" /> | |
248 </param> | |
249 <when value="No" /> | |
250 <when value="Yes" > | |
251 <repeat name="geneset" title="Gene sets" > | |
252 <param name="genes" type="text" label="Gene(s) of interest" help="e.g. 'Apoa1__chr9+Apoa1bp__chr6'" > | |
253 <sanitizer invalid_char="" > | |
254 <valid initial="string.letters,string.digits"> | |
255 <add value="+" /><add value="_" /><add value="-" /> | |
256 </valid> | |
257 </sanitizer> | |
258 </param> | |
259 </repeat> | |
260 <param name="regex" type="text" value="" label="Regular expression to apply over cell labels to identify cell types" help="e.g. for barcodes [ cl_1_ACCAG, cl_1_ACGGA, cl_2_TTAC, ... ] can be grouped into [ cl_1, cl_2, ... ] by the expression: '_[ACTG]+', which removes the last '_' and any following characters belonging to A C T or G." > | |
261 <sanitizer invalid_char="" > | |
262 <valid initial="string.printable" /> | |
263 </sanitizer> | |
264 </param> | |
265 </when> | |
266 </conditional> | |
267 <param name="tsne_rseed" type="integer" min="1" value="15555" label="Seed (for reproducibility)" /> | |
268 </section> | |
269 </inputs> | |
270 | |
271 <outputs> | |
272 <!-- Filter --> | |
273 <data name="out_table_filter" format="tabular" label="${tool.name} on ${on_string}: Filter Table" > | |
274 <filter>section_filter['filtering']['do_filter'] == "T"</filter> | |
275 </data> | |
276 <!-- Outlier --> | |
277 <data name="out_table_outlier" format="tabular" label="${tool.name} on ${on_string}: Outliers" /> | |
278 <!-- TSNE --> | |
279 <data name="out_html" format="html" label="${tool.name} on ${on_string}: Web Report" /> | |
280 <data name="out_rdat_tsne" format="rdata" label="${tool.name} on ${on_string}: tSNE RData" /> | |
281 </outputs> | |
282 | |
283 <tests> | |
284 <!-- vanilla run on all but filter --> | |
285 <test> | |
286 <!-- Filter --> | |
287 <param name="inp_count" value="transcript_counts_intestine_sub.tsv" /> | |
288 <!-- These test params are MANDATORY due to the reduced size of the | |
289 input set (due to file size constraints) --> | |
290 <param name="do_filter" value="T" /> | |
291 <param name="do_filter_defaults" value="advanced_options" /> | |
292 <param name="mintotal" value="10" /> | |
293 <param name="minexpr" value="1" /> | |
294 <param name="maxexpr" value="2000" /> | |
295 <!-- Outlier --> | |
296 <!-- ... With reduced minc --> | |
297 <param name="inp_rdat_outlier" value="trans_outlier_in.rds" /> | |
298 <param name="outminc" value="1" /> | |
299 <output name="out_table_outlier" value="out_outlier1.table" /> | |
300 <!-- tSNE --> | |
301 <output name="out_html" value="out_1.html" /> | |
302 <output name="out_rdat_tsne" value="out_tsne1.rdat" /> | |
303 </test> | |
304 <!-- manual gap statistics --> | |
305 <test> | |
306 <!-- Filter --> | |
307 <param name="inp_count" value="transcript_counts_intestine_sub.tsv" /> | |
308 <param name="filter_table_output" value="T" /> | |
309 <!-- See message from previous test .. --> | |
310 <param name="do_filter" value="T" /> | |
311 <param name="do_filter_defaults" value="advanced_options" /> | |
312 <param name="mintotal" value="10" /> | |
313 <param name="minexpr" value="1" /> | |
314 <param name="maxexpr" value="2000" /> | |
315 <output name="out_table_filter" value="out_filter2.table" /> | |
316 <!-- Kmeans --> | |
317 <!-- ... Auto gap with gap params --> | |
318 <param name="inp_rdat_kmeans" value="trans_filter_ds.rds" /> | |
319 <param name="clustnr" value="5" /> | |
320 <param name="bgap" value="10" /> | |
321 <param name="semethod" value="globalSEmax" /> | |
322 <param name="sefactor" value="0.6" /> | |
323 <!-- Outlier --> | |
324 <!-- ... With reduced minc --> | |
325 <param name="inp_rdat_outlier" value="trans_outlier_in.rds" /> | |
326 <param name="outminc" value="1" /> | |
327 <output name="out_table_outlier" value="out_outlier2.table" /> | |
328 <!-- tSNE --> | |
329 <output name="out_html" value="out_2.html" /> | |
330 <output name="out_rdat_tsne" value="out_tsne2.rdat" /> | |
331 </test> | |
332 <!-- complex run --> | |
333 <test> | |
334 <!-- Filter --> | |
335 <param name="inp_count" value="transcript_counts_intestine_sub.tsv" /> | |
336 <param name="do_filter" value="T" /> | |
337 <param name="do_filter_defaults" value="advanced_options" /> | |
338 <param name="mintotal" value="10" /> | |
339 <param name="minexpr" value="1" /> | |
340 <param name="maxexpr" value="2000" /> | |
341 <param name="dsn" value="3" /> | |
342 <output name="out_table_filter" value="out_filter3.table" /> | |
343 <!-- Kmeans --> | |
344 <!-- ... Set k-value, gap statistics left enabled (dogap=T), custom metric and bootstrap replicates --> | |
345 <param name="inp_rdat_kmeans" value="trans_filter_ds.rds" /> | |
346 <param name="metric" value="manhattan" /> | |
347 <param name="cln" value="6" /> | |
348 <param name="dogap" value="T" /> | |
349 <param name="bootnr" value="10" /> | |
350 <!-- Outlier --> | |
351 <!-- ... No R out, other opts--> | |
352 <param name="inp_rdat_outlier" value="trans_outlier_in.rds" /> | |
353 <param name="outminc" value="1" /> | |
354 <param name="probthr" value="1e-5" /> | |
355 <param name="outlg" value="10" /> | |
356 <param name="outdistquant" value="0.50" /> | |
357 <output name="out_table_outlier" value="out_outlier3.table" /> | |
358 <!-- tSNE --> | |
359 <param name="use_gexpr" value="Yes" /> | |
360 <repeat name="geneset"> | |
361 <param name="genes" value="1110007C09Rik__chr13+1110037F02Rik__chr4+1300002K09Rik__chr4" /> | |
362 </repeat> | |
363 <repeat name="geneset"> | |
364 <param name="genes" value="0610010K14Rik__chr11+1500009L16Rik__chr10" /> | |
365 </repeat> | |
366 <param name="regex" value="[^_]+__" /> | |
367 <output name="out_html" value="out_3.html" /> | |
368 <output name="out_rdat_tsne" value="out_tsne3.rdat" /> | |
369 </test> | |
370 </tests> | |
371 | |
372 <help><![CDATA[ | |
373 | |
374 ****** | |
375 RaceID | |
376 ****** | |
377 | |
378 RaceID(v2) pipeline for scRNA, performs: | |
379 * filtering | |
380 * normalisation | |
381 * k-means clustering | |
382 * outlier detection | |
383 | |
384 Generates heatmaps, tSNE plots, and an R object which can be passed into the RaceID DiffGenes tool for expression analysis between different sets of clusters. | |
385 | |
386 **Filtering** | |
387 | |
388 This takes a count matrix/spreadsheet with cellIDs as columns and geneIDs/transcriptIDs as rows, and filters based on standard single-cell RNA pre-processing methods (minimum/maximum transcript expression in a minimum of X number of cells). A filtered matrix is produced as output | |
389 | |
390 **K-means Clustering** | |
391 | |
392 This performs k-means clustering and plots gap statistics, Jaccard similarity, silhouette plots, and a preliminary heatmap. | |
393 | |
394 **Outlier Detection** | |
395 | |
396 This performs outlier detection by calibrating against a background noise model within each cluster, and searching for cells that fall outside of the transcript count distribution for a given gene (modelled as a negative binomial). Cells that are outliers for more than a user-set number of genes are suspected of being outlier cells. | |
397 | |
398 **tSNE plots** | |
399 | |
400 Generates multiple tSNE plots with custom expression highlighting for gene subsets of interest. A tSNE map can be drawn for original clusters (derived via k-means) and final clustering (derived from outliers). Any number of gene subsets of interest can be specified to measure expression within clusters for related marker genes or genes characterising a cell type. | |
401 | |
402 ]]></help> | |
403 <expand macro="citations" /> | |
404 </tool> |