comparison mageck_mle.xml @ 0:eab37e8fea75 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mageck commit 71cef018eec5ee7ff7f3853599c027e80e2637fe
author iuc
date Wed, 14 Feb 2018 06:42:36 -0500
parents
children 93f268840b0c
comparison
equal deleted inserted replaced
-1:000000000000 0:eab37e8fea75
1 <?xml version="1.0"?>
2 <tool id="mageck_mle" name="MAGeCK mle" version="@VERSION@" >
3 <description>- perform maximum-likelihood estimation of gene essentiality scores</description>
4 <macros>
5 <import>mageck_macros.xml</import>
6 </macros>
7 <expand macro="requirements" />
8 <expand macro="version" />
9 <command detect_errors="exit_code"><![CDATA[
10 ## Assemble the mageck mle command line from the form values.
11 mageck mle
12
13 -k '$count_table'
14 ## Samples come either from a design-matrix dataset or from labels typed into the form.
15 #if $samples.sample_select == "dmatrix":
16 -d '$samples.design_matrix'
17 #elif $samples.sample_select == "labels":
18 --day0-label '$samples.day0_label'
19 -i '$samples.include_samples'
20 -b '$samples.beta_labels'
21 #end if
22 ## Fixed prefix given to -n; the outputs section collects the result files from the working directory.
23 -n output
24 ## Optional datasets are only passed when one was selected; their paths are quoted for the shell.
25 #if $adv.control_sgrna:
26 --control-sgrna '$adv.control_sgrna'
27 #end if
28
29 #if $adv.cnv_norm:
30 --cnv-norm '$adv.cnv_norm'
31 #end if
32
33 --norm-method $adv.norm_method
34 --genes-varmodeling $adv.genes_var
35 --permutation-round $adv.permutation
36 --adjust-method $adv.adjust_method
37 ## The two column params in the adv section are named the opposite way round from their argument and label: sgrnaeff_score_col is declared for the name (sgRNA ID) column and sgrnaeff_name_col for the score column. Each flag below uses the param whose argument attribute matches it.
38 #if $adv.sgrnaeff_file:
39 --sgrna-efficiency '$adv.sgrnaeff_file'
40 --sgrna-eff-name-column $adv.sgrnaeff_score_col
41 --sgrna-eff-score-column $adv.sgrnaeff_name_col
42 #end if
43 ## Boolean params expand to their flag (truevalue) or to an empty string (falsevalue).
44 $adv.remove_outliers
45 $adv.update_eff
46 --threads \${GALAXY_SLOTS:-1}
47
48 ]]></command>
49 <inputs>
50 <param name="count_table" argument="--count-table" type="data" format="tabular" label="Counts file"
51 help="Provide a tab-separated count table. Each line in the table should include sgRNA name (1st column), target gene (2nd column) and read counts in each sample. See Help below for more information" />
52 <conditional name="samples">
53 <param name="sample_select" type="select" label="Design matrix or sample labels" help="You can choose to either provide a design matrix or specify the samples">
54 <option value="dmatrix">Design matrix</option>
55 <option value="labels">Specify samples</option>
56 </param>
57 <when value="dmatrix">
58 <param name="design_matrix" argument="--design-matrix" type="data" format="tabular" label="Design Matrix file" help="Provide a design matrix, either a file name or a quoted string of the design matrix. For example, 1,1;1,0. The row of the design matrix must match the order of the samples in the count table (if --include-samples is not specified), or the order of the samples by the --include-samples option" />
59 </when>
60 <when value="labels">
61 <param name="include_samples" argument="--include-samples" type="text" label="Sample labels" help="Specify the sample labels if the design matrix is not given by file in the --design-matrix option. Sample labels are separated by comma (,) and must match the labels in the count table" />
62 <param name="day0_label" argument="--day0-label" type="text" optional="true" value="" label="Control sample" help="Specify the control sample label (usually day 0 or plasmid). For every other sample label, the MLE module will treat it as a single condition and generate a corresponding design matrix" />
63 <param name="beta_labels" argument="--beta-labels" type="text" label="Variables" help="Specify the labels of the variables (i.e., beta), if the design matrix is not given by file in the --design-matrix option. Should be separated by commas (,), and the number of labels must equal the number of columns of the design matrix, including baseline labels. Default: beta_0,beta_1,beta_2,..." />
64 </when>
65 </conditional>
66 <!-- Advanced options. NOTE: sgrnaeff_name_col and sgrnaeff_score_col are named the opposite way round from their argument and label (the former selects the score column, the latter the sgRNA ID column); the names are kept so existing references to them keep working. NOTE(review): confirm that the column numbers Galaxy passes for data_column params use the same base (0 or 1) as the help texts below describe. -->
67 <section name="adv" title="Advanced Options">
68 <param name="control_sgrna" argument="--control-sgrna" type="data" format="tabular" optional="true" label="Control sgRNAs file" help="A list of control sgRNAs for normalization and for generating the null distribution of RRA" />
69 <param name="cnv_norm" argument="--cnv-norm" type="data" format="tabular" optional="true" label="CNV profile file" help="A matrix of copy number variation data across cell lines to normalize CNV-biased BetaScores" />
70 <param name="norm_method" argument="--norm-method" type="select" label="Method for normalization" help="If control is specified, the size factor will be estimated using control sgRNAs specified in --control-sgrna option. Default: Median" >
71 <option value="none">None</option>
72 <option value="median" selected="True">Median</option>
73 <option value="total">Total</option>
74 <option value="control">Control</option>
75 </param>
76 <param name="genes_var" argument="--genes-varmodeling" type="integer" value="1000" label="Number of genes for mean-variance modeling" help="Default: 1000" />
77 <param name="permutation" argument="--permutation-round" type="integer" value="10" label="Number of permutations" help="The rounds for permutation. The permutation time is (# genes) * x for x rounds of permutation. Suggested value: 100 (may take longer time). Default: 10" />
78 <param name="remove_outliers" argument="--remove-outliers" type="boolean" truevalue="--remove-outliers" falsevalue="" checked="false" optional="true" label="Try to remove outliers" help="Turning this option on will slow the algorithm" />
79 <param name="adjust_method" argument="--adjust-method" type="select" label="P-Value Adjustment Method" help="Method for sgRNA-level p-value adjustment, including False Discovery Rate (FDR), Holm's method (Holm), or Pounds's method (Pounds). Default: FDR">
80 <option value="fdr" selected="True">FDR</option>
81 <option value="holm">Holm</option>
82 <option value="pounds">Pounds</option>
83 </param>
84 <param name="sgrnaeff_file" argument="--sgrna-efficiency" type="data" format="tabular" optional="true" label="sgRNA efficiency file" help="An optional file of sgRNA efficiency prediction. The efficiency prediction will be used as an initial guess of the probability an sgRNA is efficient. Must contain at least two columns, one containing sgRNA ID, the other containing sgRNA efficiency prediction" />
85 <param name="sgrnaeff_name_col" argument="--sgrna-eff-score-column" type="data_column" data_ref="sgrnaeff_file" value="1" optional="true" label="sgRNA score column" help="The sgRNA efficiency prediction column in sgRNA efficiency prediction file (specified by the --sgrna-efficiency option). Default is 1 (the second column)." />
86 <param name="sgrnaeff_score_col" argument="--sgrna-eff-name-column" type="data_column" data_ref="sgrnaeff_file" value="0" optional="true" label="sgRNA ID column" help="The sgRNA ID column in sgRNA efficiency prediction file (specified by the --sgrna-efficiency option). Default is 0 (the first column)" />
87 <param name="update_eff" argument="--update-efficiency" type="boolean" truevalue="--update-efficiency" falsevalue="" checked="false" optional="true"
88 label="Update efficiency" help="Iteratively update sgRNA efficiency during EM iteration" />
89 <param name="out_log" type="boolean" truevalue="True" falsevalue="" checked="false"
90 label="Output logfile" help="This file includes the logging information during the execution. For count command, it will list some basic statistics of the dataset at the end, including the number of reads, the number of reads mapped to the library, the number of zero-count sgRNAs, etc. Default: No" />
91 </section>
92 </inputs>
93 <outputs> <!-- Result files are collected from the working directory under the fixed prefix that the command passes to -n (output); the log dataset is only created when adv.out_log is enabled -->
94 <data name="gene_summary" format="tabular" from_work_dir="output.gene_summary.txt" label="${tool.name} on ${on_string}: Gene Summary (MLE)" />
95 <data name="sgrna_summary" format="tabular" from_work_dir="output.sgrna_summary.txt" label="${tool.name} on ${on_string}: sgRNA Summary (MLE)" />
96 <data name="log" format="txt" from_work_dir="output.log" label="${tool.name} on ${on_string}: Log (MLE)" >
97 <filter>adv['out_log'] is True</filter>
98 </data>
99 </outputs>
100 <tests>
101 <test><!-- Runs MAGeCK's demo1 count table with a design-matrix file and out_log switched on; the gene summary and the log are compared by size (sim_size), the sgRNA summary with the default comparison -->
102 <param name="count_table" value="demo/demo1/sample.txt" ftype="tabular" />
103 <param name="design_matrix" ftype="tabular" value="in.mle.design_matrix.txt" />
104 <param name="out_log" value="True"/>
105 <output name="gene_summary" file="out.mle.gene_summary.txt" compare="sim_size"/>
106 <output name="sgrna_summary" file="out.mle.sgrna_summary.txt"/>
107 <output name="log" file="out.mle.log.txt" compare="sim_size"/>
108 </test>
109 </tests>
110
111 <help><![CDATA[
112 .. class:: infomark
113
114 **What it does**
115
116 **MAGeCK mle** calculates gene essentiality from CRISPR screens. Compared with the original algorithm in **MAGeCK test**,
117 MAGeCK mle uses a measurement called beta score to call gene essentialities: a positive beta score means a gene is positively selected,
118 and a negative beta score means a gene is negatively selected. It is similar to the term log-fold change in differential expression,
119 and compared with the original robust ranking aggregation (RRA) algorithm, this measurement has the following advantages:
120
121 * It has only one score for one gene, instead of two scores in RRA: one for positive selection, one for negative selection;
122 * It allows a direct comparison across multiple conditions, or even experiments;
123 * It is able to incorporate sgRNA efficiency information.
124
125 -----
126
127 **Inputs**
128
129 **sgRNA count file**
130
131 The sgRNA read count file is passed to the -k parameter of the mle command. The read count file should list the name of each sgRNA and the gene it targets, followed by the read counts in each sample. Items should be separated by a tab ('\t'). A header line is optional. For example, in the study of T. Wang et al. Science 2014, there are 4 CRISPR screening samples, labeled HL60.initial, KBM7.initial, HL60.final and KBM7.final. Here are a few lines of the read count file:
132
133 ============== ======== ================ ================ ============== ==============
134 **sgRNA** **gene** **HL60.initial** **KBM7.initial** **HL60.final** **KBM7.final**
135 -------------- -------- ---------------- ---------------- -------------- --------------
136 A1CF_m52595977 A1CF 213 274 883 175
137 A1CF_m52596017 A1CF 294 412 1554 1891
138 A1CF_m52596056 A1CF 421 368 566 759
139 A1CF_m52603842 A1CF 274 243 314 855
140 A1CF_m52603847 A1CF 0 50 145 266
141 ============== ======== ================ ================ ============== ==============
142
143 **Design matrix file**
144
145 The sample labels can be specified in the tool form above or, alternatively, a `design matrix`_ file can be provided. The design matrix is generally a binary matrix indicating which sample (given in the first column) is affected by which condition (given in the first row). For more on the meaning of the design matrix, check the linked input file format page.
146
147 ============ ======== ==== ====
148 **Samples** baseline HL60 KBM7
149 ------------ -------- ---- ----
150 HL60.initial 1 0 0
151 KBM7.initial 1 0 0
152 HL60.final 1 1 0
153 KBM7.final 1 0 1
154 ============ ======== ==== ====
155
156 The following are the rules for the design matrix file:
157
158 * The design matrix file must include a header line of condition labels
159 * The first column is the sample labels that must match sample labels in read count file
160 * The second column must be a "baseline" column that sets all values to "1"
161 * The element in the design matrix is either "0" or "1"
162 * You must have at least one sample of "initial state" (e.g., day 0 or plasmid) that has only one "1" in the corresponding row. That single "1" must be in the baseline column.
163 * In the design matrix above, there are four samples, two corresponding to the initial states of two cell lines, and two corresponding to the final states of two cell lines. We design two conditions (HL60 and KBM7) that model the cell type-specific effects.
164
165 **Control sgRNA file**
166
167 The optional Control sgRNAs file is used to generate null distribution when calculating the p values. If this option is not specified, MAGeCK generates the null distribution of RRA scores by assuming all of the genes in the library are non-essential, see **More Information** below. This approach is sometimes over-conservative, and you can improve this if you know some genes are not essential. By providing the corresponding sgRNA IDs in this option, MAGeCK will have a better estimation of p values. To use this option, you need to prepare a text file specifying the IDs of control sgRNAs, one line for one sgRNA ID.
168
169 **Outputs**
170
171 If successful, MAGeCK mle will generate two files, the gene_summary file (including gene beta scores), and the sgrna_summary file (including sgRNA efficiency probability predictions).
172
173 **Gene summary file (including beta scores)**
174
175 An example of the gene summary output file is below. This file includes the beta scores in two conditions specified in the design matrix (HL60|beta and KBM7|beta), and the associated statistics. For more information, check the output format specification of the **mageck test** *Gene Summary* file.
176
177 ======== ========= ============= ========== ================ ============ ===================== ================= ============= =========== ================ ============ ===================== =================
178 **Gene** **sgRNA** **HL60|beta** **HL60|z** **HL60|p-value** **HL60|fdr** **HL60|wald-p-value** **HL60|wald-fdr** **KBM7|beta** **KBM7|z** **KBM7|p-value** **KBM7|fdr** **KBM7|wald-p-value** **KBM7|wald-fdr**
179 -------- --------- ------------- ---------- ---------------- ------------ --------------------- ----------------- ------------- ----------- ---------------- ------------ --------------------- -----------------
180 RNF14 10 0.24927 0.72077 0.36256 0.75648 0.47105 0.9999 0.57276 1.6565 0.06468 0.32386 0.097625 0.73193
181 RNF10 10 0.10159 0.29373 0.92087 0.98235 0.76896 0.9999 0.11341 0.32794 0.90145 0.97365 0.74296 0.98421
182 RNF11 10 3.6354 10.513 0.00028 0.021739 7.5197e-26 1.3376e-22 2.5928 7.4925 0.0014898 0.032024 6.7577e-14 1.33e-11
183 ======== ========= ============= ========== ================ ============ ===================== ================= ============= =========== ================ ============ ===================== =================
184
185
186 **sgRNA summary file (including sgRNA efficiency probability predictions)**
187
188 An example of the sgRNA ranking output is as follows:
189
190 ================ ======== ================= =================== ================ ============== ======= =============== =========== ========= ========= =========== ============== =========== =====================
191 **sgrna** **Gene** **control_count** **treatment_count** **control_mean** **treat_mean** **LFC** **control_var** **adj_var** **score** **p.low** **p.high** **p.twosided** **FDR** **high_in_treatment**
192 ---------------- -------- ----------------- ------------------- ---------------- -------------- ------- --------------- ----------- --------- --------- ----------- -------------- ----------- ---------------------
193 INO80B_m74682554 INO80B 0.0/0.0 1220.15/1476.14 0.810860 1348.15 10.70 0.0 19.0767 308.478 1.0 1.11022e-16 2.22044e-16 1.57651e-14 True
194 NHS_p17705966 NHS 1.62172/3.90887 2327.09/1849.95 2.76529 2088.52 9.54 2.61554 68.2450 252.480 1.0 1.11022e-16 2.22044e-16 1.57651e-14 True
195 ================ ======== ================= =================== ================ ============== ======= =============== =========== ========= ========= =========== ============== =========== =====================
196
197 The contents of each column are as follows:
198
199 * **sgrna** sgRNA ID
200 * **Gene** The targeting gene
201 * **control_count** Normalized read counts in control samples
202 * **treatment_count** Normalized read counts in treatment samples
203 * **control_mean** Mean read counts in control samples
204 * **treat_mean** Mean read counts in treatment samples
205 * **LFC** The log fold change of sgRNA
206 * **control_var** The raw variance in control samples
207 * **adj_var** The adjusted variance in control samples
208 * **score** The score of this sgRNA
209 * **p.low** p-value (lower tail)
210 * **p.high** p-value (higher tail)
211 * **p.twosided** p-value (two sided)
212 * **FDR** false discovery rate
213 * **high_in_treatment** Whether the abundance is higher in treatment samples
214
215 -----
216
217 **More Information**
218
219 **Overview of the MAGeCK algorithm**
220
221 Briefly, read counts from different samples are first median-normalized to adjust for the effect of library sizes and read count distributions. Then the variance of read counts is estimated by sharing information across features, and a negative binomial (NB) model is used to test whether sgRNA abundance differs significantly between treatments and controls. This approach is similar to those used for differential RNA-Seq analysis. We rank sgRNAs based on P-values calculated from the NB model, and use a modified robust ranking aggregation (RRA) algorithm named α-RRA to identify positively or negatively selected genes. More specifically, α-RRA assumes that if a gene has no effect on selection, then sgRNAs targeting this gene should be uniformly distributed across the ranked list of all the sgRNAs. α-RRA ranks genes by comparing the skew in rankings to the uniform null model, and prioritizes genes whose sgRNA rankings are consistently higher than expected. α-RRA calculates the statistical significance of the skew by permutation, and a detailed description of the algorithm is presented in the Materials and methods section of the `MAGeCK paper`_. Finally, MAGeCK reports positively and negatively selected pathways by applying α-RRA to the rankings of genes in a pathway.
222
223 For more information on using MAGeCK, see the `MAGeCK website here`_.
224
225 .. _`design matrix`: https://sourceforge.net/p/mageck/wiki/input/#design-matrix-file
226 .. _`MAGeCK paper`: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0554-4
227 .. _`MAGeCK website here`: https://sourceforge.net/p/mageck/wiki/QA/#using-mageck
228
229 ]]></help>
230 <expand macro="citations" />
231 </tool>