Mercurial > repos > iuc > raceid_main

diff raceid_main.xml @ 0:e01c989c7543 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/raceid commit 39918bfdb08f06862ca395ce58a6f5e4f6dd1a5e
author: iuc
date: Sat, 03 Mar 2018 17:34:16 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/raceid_main.xml	Sat Mar 03 17:34:16 2018 -0500
@@ -0,0 +1,404 @@
+<tool id="raceid_main" name="RaceID" version="@VERSION@.0">
+    <description>Race ID pipeline for single-cell RNA analysis</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+
+    <command detect_errors="exit_code"><![CDATA[    
+    ## Filter
+    echo "Filtering" &&
+    Rscript '@SCRIPT_DIR@/raceID_filter.R' '@SCRIPT_DIR@' '$rconf_source_filter' &&
+
+    ## Kmeans
+    echo "K-means" &&
+    Rscript '@SCRIPT_DIR@/raceID_kmeans_heatmap.R' '@SCRIPT_DIR@' '$rconf_source_kmeans' &&
+
+    mkdir '${out_html.files_path}' &&
+    mv plot_*.svg '${out_html.files_path}' &&
+
+    echo '
+    <html><head></head>
+    <body>
+    <h1>RaceID k-means</title></h1><br />
+    <h3>Gap statistic</h3>
+    <img src="plot_gap.svg" ><br />
+    <h3>Jaccard Similarity</h3>
+    <img src="plot_jaccard.svg" ><br />
+    <h3>Silhouette Plot</h3>
+    <img src="plot_silhouette.svg" ><br />
+    <h3>Cluster Heatmap</h3>
+    <img src="plot_clustheatmap.svg" ><br />
+    ' > '$out_html' &&
+
+    ## Outlier -- relies on kmeans
+    echo "Outlier" &&
+    Rscript '@SCRIPT_DIR@/raceID_outlierdetect.R' '@SCRIPT_DIR@' '$rconf_source_outlier' &&
+
+    mv plot_*.svg '${out_html.files_path}' &&
+    echo '
+    <br/>
+    <h1>RaceID Outlier Detection</h1><br />
+    <h3>Background</h3>
+    <img src="plot_background.svg" ><br />
+    <h3>Sensitivity</h3>
+    <img src="plot_sensitivity.svg" ><br />
+    <h3>Outlier Probability</h3>
+    <img src="plot_outlierprobs.svg" ><br />
+    <h3>Final Heatmap</h3>
+    <img src="plot_finalheat.svg" ><br />
+    ' >> '$out_html' &&
+
+    ## tSNE -- relies on kmeans and outlier
+    echo "tSNE" &&
+    Rscript '@SCRIPT_DIR@/raceID_tsne.R' '@SCRIPT_DIR@' '$rconf_source_tsne' &&
+
+    ##mkdir '${out_html.files_path}' &&
+    mv plot_*.svg '${out_html.files_path}' &&
+
+    echo '
+    <br/>
+    <h1>RaceID tSNE</h1><br />
+    <h3>Initial k-means clusters</h3>
+    <br /><img src="plot_initial.svg" >
+    <h3>Final clusters</h3>
+    <br /><img src="plot_final.svg" >
+    <h3>Labelled</h3>
+    <br /><img src="plot_labels.svg" >
+    <h3>Symbols</h3>
+    <br /><img src="plot_symbols.svg" >
+    ' >> '$out_html' &&
+
+    #if $section_tsne.genexp_select.use_gexpr == "Yes":
+        #for $gene_set in $section_tsne.genexp_select.geneset:
+            echo "<h3>Expression for: [${gene_set.genes.value}]</h3>" >> '$out_html' &&
+            echo "<br /><img src=\"plot_${gene_set.genes.value}\" >" >> '$out_html' &&
+        #end for
+    #end if
+    echo '</body></html>' >> '$out_html'
+
+    ]]></command>
+
+    <configfiles>
+        <configfile name="rconf_source_filter">
+            count_matrix = '$section_filter.inp_count'
+            filtering = as.logical( '$section_filter.filtering.do_filter.value' )
+            output_table = '$out_table_filter'
+            output_rdat = '@out_rdat_filter@'
+
+            # Defaults
+            control_genes_filter="";
+            c_mintotal = 3000; c_minexpr = 5; c_maxexpr = 500; c_minnumber = 1;
+            c_downsample = F; c_dsn = 1; c_rseed = 17000;
+
+            #if $section_filter.filtering.do_filter.value == "T":
+            control_genes_filter = '$section_filter.filtering.remove_nonendog.value'
+                #if $section_filter.filtering.default_filtering_select.do_filter_defaults.value == "advanced_options":
+            c_mintotal = as.integer( '$section_filter.filtering.default_filtering_select.mintotal' )
+            c_minexpr = as.integer( '$section_filter.filtering.default_filtering_select.minexpr' )
+            c_maxexpr = as.integer( '$section_filter.filtering.default_filtering_select.maxexpr' )
+            c_minnumber = as.integer( '$section_filter.filtering.default_filtering_select.minnumber' )
+                    #if $section_filter.filtering.default_filtering_select.dsn:
+            c_downsample = T;
+            c_dsn = as.integer( '$section_filter.filtering.default_filtering_select.dsn' )
+                    #end if
+            c_rseed = as.integer( '$section_filter.filtering.default_filtering_select.filter_rseed' )
+                #end if
+            #end if
+        </configfile>
+        <configfile name="rconf_source_kmeans">
+            sc = readRDS( '@inp_rdat_kmeans@' )
+            output_rdat = '@out_rdat_kmeans@'
+            c_metric = 'pearson'; c_cln = 0; dogap = T; c_clustnr = 20; bgap = 50;
+            semethod = 'Tibs2001SEmax'; sefactor = .25; c_bootnr = 50; c_rseed = 17000;
+
+            c_metric = '$section_kmeans.metric'
+            c_cln = as.integer( '$section_kmeans.cln' )
+            dogap = as.logical( '$section_kmeans.gapstats.dogap.value' )
+                #if $section_kmeans.gapstats.dogap.value == "T":
+            c_clustnr = as.integer( '$section_kmeans.gapstats.clustnr' )
+            bgap = as.integer( '$section_kmeans.gapstats.bgap' )
+            semethod = '$section_kmeans.gapstats.semethod.value'
+            sefactor = as.numeric( '$section_kmeans.gapstats.sefactor' )
+                #end if
+            c_bootnr = as.integer( '$section_kmeans.bootnr' )
+            c_rseed = as.integer( '$section_kmeans.kmeans_rseed' )
+
+            generate_final_rdata = T
+        </configfile>
+        <configfile name="rconf_source_outlier">
+            sc = readRDS( '@inp_rdat_outlier@' )
+            output_rdat = '@out_rdat_outlier@'
+            output_table= '$out_table_outlier'
+            # set defaults
+            c_outminc = 5; c_outlg = 2; c_probthr = 1e-3; c_outdistquant = 0.75;
+
+            c_outminc = as.integer( '$section_outlier.outminc' )
+            c_outlg = as.integer( '$section_outlier.outlg' )
+            c_probthr = as.numeric( '$section_outlier.probthr' )
+            c_outdistquant = as.numeric( '$section_outlier.probthr' )
+
+            generate_final_rdata = T
+        </configfile>
+        <configfile name="rconf_source_tsne" >
+            sc = readRDS( '@inp_rdat_tsne@' )
+            output_rdat = '$out_rdat_tsne'  # final output RData
+            regex_val = ""
+            c_rseed = '$section_tsne.tsne_rseed'
+            gene_sets = ""
+            #if $section_tsne.genexp_select.use_gexpr == 'Yes':
+            gene_sets = '#for $gns in $section_tsne.genexp_select.geneset# $gns.genes.value _split_ #end for#'
+            regex_val = '$section_tsne.genexp_select.regex'
+            #end if
+            final_rdata = T
+        </configfile>
+    </configfiles>
+    <!-- Filter -->
+    <inputs>
+        <section name="section_filter" title="Filtering and Normalisation" expanded="true" >
+            <param name="inp_count" type="data" format="tsv" label="Count matrix" help="A spreadsheet file with the first row indicating cell IDs, and the first column indicating transcript or gene IDs" />
+            <conditional name="filtering" >
+                <param name="do_filter" type="select" label="Perform filtering?"  >
+                    <option value="T" selected="true" >Yes</option>
+                    <option value="F" >No</option>
+                </param>
+                <when value="F" />
+                <when value="T" >
+                    <param name="remove_nonendog" type="text" label="Control gene name prefixes" help="If ERCC or other non-endogenous spike-in RNAs are within the data, please specify their prefixes (e.g. 'ERCC, HK00') in order to filter them out. (Leave blank if control genes were not used in the experiment.)" />
+                    <conditional name="default_filtering_select" >
+                        <param name="do_filter_defaults" type="select" label="Parameters" >
+                            <option value="use_defaults" selected="true" >Use Defaults</option>
+                            <option value="advanced_options" >Advanced Options</option >
+                        </param>
+                        <when value="use_defaults" />
+                        <when value="advanced_options" >
+                            <param name="mintotal" type="integer" value="3000" min="1" label="Minimum total transcripts" help="Discard cells with less than this number of total transcripts before normalisation." />
+                            <param name="minexpr" type="integer" value="5" min="1" label="Minimum expressed genes" help="Discard genes that do not express a minimum of this number of transcripts after normalisation."/>
+                            <param name="maxexpr" type="integer" value="500" min="0" label="Maximum expressed genes" help="Discard genes that express more than this number of transcripts after normalisation. Useful if genes have oversaturated counts derived from UMI data. Set to 0 to disable this step." />
+
+                            <param name="minnumber" type="integer" value="1" label="Minimum Cells" help="Discard genes that do not have the minimum expressed transcripts in at least this number of cells" />
+
+                            <param name="dsn" type="integer" value="1" min="1" optional="true" label="Downsample counts" help="Average transcripts across this many samples. If this is set to 1, then sampling noise should be comparable across cells. For higher values, the data approximates median normalisation." />
+                            <param name="filter_rseed" type="integer" value="17000" min="0" label="Seed value (for reproducibility)" />
+                        </when>
+                    </conditional>
+                </when>
+            </conditional>
+            <param name="filter_table_output" type="boolean" checked="false" label="Generate output table of filtered matrix?" />
+        </section>
+
+        <!-- Kmeans -->
+        <section name="section_kmeans" title="Clustering (k-means)" expanded="true" >
+            <param name="metric" type="select" label="Distance metric">
+                <option value="pearson" selected="true" />
+                <option value="spearman" />
+                <option value="kendall" />
+                <option value="euclidean" />
+                <option value="maximum" />
+                <option value="manhattan" />
+                <option value="canberra" />
+                <option value="binary" />
+                <option value="minkowski" />
+            </param>
+
+            <param name="cln" type="integer" value="0" min="0" label="Number of clusters for k-means" help="Leave as zero to automatically determine the number based on gap statistics" />
+
+            <conditional name="gapstats">
+                <param name="dogap" type="select" label="Use gap statistics to determine clusters" >
+                    <option value="T" selected="true" >Yes</option>
+                    <option value="F" >No</option>
+                </param>
+
+                <when value="F" />
+                <when value="T" >
+                    <param name="clustnr" type="integer" value="2" min="0" label="Maximum number of clusters for the computation of the gap statistic" help="If more major cell types are expected, a higher number than 2 should bde chosen." />
+                    <param name="bgap" type="integer" value="50" min="1" label="Number of bootstraps to run the gap statistic calculation" />
+                    <param name="semethod" type="select" label="Method used for determining first local maximum" >
+                        <option value="Tibs2001SEmax" selected="true" />
+                        <option value="globalmax" />
+                        <option value="firstmax" />
+                        <option value="firstSEmax" />
+                        <option value="globalSEmax" />
+                    </param>
+
+                    <param name="sefactor" type="float" value="0.25" min="0.0001" max="1" label="Fraction of the standard deviation that the local maximum must differ from neighbouring points." />
+                </when>
+            </conditional>
+
+            <param name="bootnr" type="integer" value="50" min="1" label="Number of bootstraps for clustering" />
+            <param name="kmeans_rseed" type="integer" value="17000" min="1" label="Seed value (for reproducibility)" />
+        </section>
+        <!-- Outlier -->
+        <section name="section_outlier" title="Outlier Detection" expanded="true" >
+            <param name="outminc" type="integer" value="5" min="1" label="Expression cutoff threshold for outlier genes"  />
+            <param name="probthr" type="float" value="1e-3" min="1e-8" max="1" label="Probability threshold of observing a given gene expression level in a cell" help="If lower than this cutoff, the cell is considered an outlier for this gene." />
+            <param name="outlg" type="integer" value="2" min="1" label="Minimal number of outlier genes required to flag an outlier cells" />
+            <param name="outdistquant" type="select" label="Merge cells into outlier clusters if their similarity exceeds this quantile in a similarity distribution for all cell pairs" >
+                <option value="0.25">first (0.25)</option>
+                <option value="0.50">second (0.50)</option>
+                <option value="0.75">third (0.75)</option>
+            </param>
+        </section>
+        <section name="section_tsne" title="tSNE plots" expanded="true" >
+            <!-- tSNE -->
+            <conditional name="genexp_select" >
+                <param name="use_gexpr" type="select" label="Highlight the expression of a set of (related) genes over all clusters?" >
+                    <option value="Yes" />
+                    <option value="No" selected="true" />
+                </param>
+                <when value="No" />
+                <when value="Yes" >
+                    <repeat name="geneset" title="Gene sets" >
+                        <param name="genes" type="text" label="Gene(s) of interest" help="e.g. 'Apoa1__chr9+Apoa1bp__chr6'" >
+                            <sanitizer invalid_char="" >
+                                <valid initial="string.letters,string.digits">
+                                    <add value="+" /><add value="_" /><add value="-" />
+                                </valid>
+                            </sanitizer>
+                        </param>
+                    </repeat>
+                    <param name="regex" type="text" value="" label="Regular expression to apply over cell labels to identify cell types" help="e.g. for barcodes [ cl_1_ACCAG, cl_1_ACGGA, cl_2_TTAC, ... ] can be grouped into [ cl_1, cl_2, ... ] by the expression: '_[ACTG]+', which removes the last '_' and any following characters belonging to A C T or G." >
+                        <sanitizer invalid_char="" >
+                            <valid initial="string.printable" />
+                        </sanitizer>
+                    </param>
+                </when>
+            </conditional>
+            <param name="tsne_rseed" type="integer" min="1" value="15555" label="Seed (for reproducibility)" />
+        </section>
+    </inputs>
+
+    <outputs>
+        <!-- Filter -->
+        <data name="out_table_filter" format="tabular" label="${tool.name} on ${on_string}: Filter Table" >
+            <filter>section_filter['filtering']['do_filter'] == "T"</filter>
+        </data>
+        <!-- Outlier -->
+        <data name="out_table_outlier" format="tabular" label="${tool.name} on ${on_string}: Outliers" />
+        <!-- TSNE -->
+        <data name="out_html" format="html" label="${tool.name} on ${on_string}: Web Report" />
+        <data name="out_rdat_tsne" format="rdata" label="${tool.name} on ${on_string}: tSNE RData" />
+    </outputs>
+
+    <tests>
+        <!-- vanilla run on all but filter -->
+        <test>
+            <!-- Filter -->
+            <param name="inp_count" value="transcript_counts_intestine_sub.tsv" />
+            <!-- These test params are MANDATORY due to the reduced size of the
+                 input set (due to file size constraints) -->
+            <param name="do_filter" value="T" />
+            <param name="do_filter_defaults" value="advanced_options" />
+            <param name="mintotal" value="10" />
+            <param name="minexpr" value="1" />
+            <param name="maxexpr" value="2000" />
+            <!-- Outlier -->
+            <!-- ... With reduced minc -->
+            <param name="inp_rdat_outlier" value="trans_outlier_in.rds" />
+            <param name="outminc" value="1" />
+            <output name="out_table_outlier" value="out_outlier1.table" />
+            <!-- tSNE -->
+            <output name="out_html" value="out_1.html" />
+            <output name="out_rdat_tsne" value="out_tsne1.rdat" />
+        </test>
+        <!-- manual gap statistics -->
+        <test>
+            <!-- Filter -->
+            <param name="inp_count" value="transcript_counts_intestine_sub.tsv" />
+            <param name="filter_table_output" value="T" />
+            <!-- See message from previous test .. -->
+            <param name="do_filter" value="T" />
+            <param name="do_filter_defaults" value="advanced_options" />
+            <param name="mintotal" value="10" />
+            <param name="minexpr" value="1" />
+            <param name="maxexpr" value="2000" />
+            <output name="out_table_filter" value="out_filter2.table" />
+            <!-- Kmeans -->
+            <!-- ... Auto gap with gap params -->
+            <param name="inp_rdat_kmeans" value="trans_filter_ds.rds" />
+            <param name="clustnr" value="5" />
+            <param name="bgap" value="10" />
+            <param name="semethod" value="globalSEmax" />
+            <param name="sefactor" value="0.6" />
+            <!-- Outlier -->
+            <!-- ... With reduced minc -->
+            <param name="inp_rdat_outlier" value="trans_outlier_in.rds" />
+            <param name="outminc" value="1" />
+            <output name="out_table_outlier" value="out_outlier2.table" />
+            <!-- tSNE -->
+            <output name="out_html" value="out_2.html" />
+            <output name="out_rdat_tsne" value="out_tsne2.rdat" />
+        </test>
+        <!-- complex run -->
+        <test>
+            <!-- Filter -->
+            <param name="inp_count" value="transcript_counts_intestine_sub.tsv" />
+            <param name="do_filter" value="T" />
+            <param name="do_filter_defaults" value="advanced_options" />
+            <param name="mintotal" value="10" />
+            <param name="minexpr" value="1" />
+            <param name="maxexpr" value="2000" />
+            <param name="dsn" value="3" />
+            <output name="out_table_filter" value="out_filter3.table" />
+            <!-- Kmeans -->
+            <!-- ... Set k-value, no gap, no R obj, metrics and bootrepl. -->
+            <param name="inp_rdat_kmeans" value="trans_filter_ds.rds" />
+            <param name="metric" value="manhattan" />
+            <param name="cln" value="6" />
+            <param name="dogap" value="T" />
+            <param name="bootnr" value="10" />
+            <!-- Outlier -->
+            <!-- ... No R out, other opts-->
+            <param name="inp_rdat_outlier" value="trans_outlier_in.rds" />
+            <param name="outminc" value="1" />
+            <param name="probthr" value="1e-5" />
+            <param name="outlg" value="10" />
+            <param name="outdistquant" value="0.50" />
+            <output name="out_table_outlier" value="out_outlier3.table" />
+            <!-- tSNE -->
+            <param name="use_gexpr" value="Yes" />
+            <repeat name="geneset">
+                <param name="genes" value="1110007C09Rik__chr13+1110037F02Rik__chr4+1300002K09Rik__chr4" />
+            </repeat>
+            <repeat name="geneset">
+                <param name="genes" value="0610010K14Rik__chr11+1500009L16Rik__chr10" />
+            </repeat>
+            <param name="regex" value="[^_]+__" />
+            <output name="out_html" value="out_3.html" />
+            <output name="out_rdat_tsne" value="out_tsne3.rdat" />
+        </test>
+    </tests>
+
+    <help><![CDATA[
+
+******
+RaceID
+******
+
+RaceID(v2) pipeline for scRNA, performs: 
+ * filtering
+ * normalisation
+ * k-means clustering
+ * outlier detection
+
+Generates heatmaps, tSNE plots, and an R object which can be passed into the RaceID DiffGenes tool for expression analysis between different sets of clusters.
+
+**Filtering**
+
+This takes a count matrix/spreadsheet with cellIDs as columns and geneIDs/transcriptIDs as rows, and filters based on standard single-cell RNA pre-processing methods (minimum/maximum transcript expression in a minimum of X number of cells). A filtered matrix is produced as output
+
+**K-means Clustering**
+
+This performs k-means clustering and plots gap statistics, jaccard similarity, silhoutte plots, and preliminary heatmap.
+
+**Outlier Detection**
+
+This performs outlier detection by calibrating against a background noise model within each cluster, and searching for cells that fall outside of the transcript count distribution for that gene (modelled as a negative binomial). Cells that are outliers for more than a user-set amount of genes are suspected as being outlier cells.
+
+**tSNE plots**
+
+Generates multiple tSNE plots with custom expression highlighting for gene subsets of interest. A tSNE map can be drawn for original clusters (derived via k-means) and final clustering (derived from outliers). Any number of genes subsets of interest can be specified to measure expression within clusters for related marker genes or genes characterising a cell type.
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>