comparison gemini_stats.xml @ 5:86d4303cc3ca draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 62ed732cba355e695181924a8ed4cce49ca21c59
author iuc
date Fri, 11 Jan 2019 17:43:48 -0500
parents cdd90678004a
children b92cfa44f5be
comparison
equal deleted inserted replaced
4:cdd90678004a 5:86d4303cc3ca
1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.1"> 1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@">
2 <description>Compute useful variant statistics</description> 2 <description>Compute useful variant statistics</description>
3 <macros> 3 <macros>
4 <import>gemini_macros.xml</import> 4 <import>gemini_macros.xml</import>
5 <token name="@BINARY@">stats</token> 5 <token name="@BINARY@">stats</token>
6 </macros> 6 </macros>
8 <expand macro="stdio" /> 8 <expand macro="stdio" />
9 <expand macro="version_command" /> 9 <expand macro="version_command" />
10 <command> 10 <command>
11 <![CDATA[ 11 <![CDATA[
12 gemini @BINARY@ 12 gemini @BINARY@
13 $stats_type 13 #if str($stats.type) == "gts-stats":
14 #set $multiline_sql_expr = $stats.variants.gt_filter
15 #set $cmdln_param = "--gt-filter"
16 @MULTILN_SQL_EXPR_TO_CMDLN@
14 17
15 #set $multiline_sql_expr = $gt_filter 18 #if str($stats.variants.constraint).strip():
16 #set $cmdln_param = "--gt-filter" 19 #set $multiline_sql_expr = "select * from variants WHERE " + str($stats.variants.constraint)
17 @MULTILN_SQL_EXPR_TO_CMDLN@ 20 #else:
18 21 #set $multiline_sql_expr = "select * from variants"
19 #set $multiline_sql_expr = $summarize 22 #end if
20 #set $cmdln_param = "--summarize" 23 #set $cmdln_param = "--summarize"
21 @MULTILN_SQL_EXPR_TO_CMDLN@ 24 @MULTILN_SQL_EXPR_TO_CMDLN@
22 25 #else:
23 "${ infile }" 26 ${stats.stats_option}
24 > "${ outfile }" 27 #end if
28 '$infile'
29 > '$outfile'
25 ]]> 30 ]]>
26 </command> 31 </command>
27 <inputs> 32 <inputs>
28 <expand macro="infile" /> 33 <expand macro="infile" />
29 34
30 <param name="stats_type" type="select" label="Studying ..." help=""> 35 <conditional name="stats">
31 <option value="--tstv">Compute the transition and transversion ratios for the snps (--tstv)</option> 36 <param name="type" type="select"
32 <option value="--tstv-coding">Compute the transition/transversion ratios for the snps in the coding regions (--tstv-coding)</option> 37 label="Select the type of statistics you are interested in" help="">
33 <option value="--tstv-noncoding">Compute the transition/transversion ratios for the snps in the non-coding regions (--tstv-noncoding)</option> 38 <option value="gts-stats">Genotype counts tabulated by sample (--summarize)</option>
34 <option value="--snp-counts">Compute the type and count of the snps (--snp-counts)</option> 39 <option value="snp-counts">Counts of SNPs by nucleotide change (--snp-counts)</option>
35 <option value="--sfs">Calculate the site frequency spectrum of the variants (--sfs)</option> 40 <option value="tstv-stats">Transition / transversion statistics for the SNPs in the dataset</option>
36 <option value="--mds">Compute the pair-wise genetic distance between each sample (--mds)</option> 41 <option value="aaf">Alternate allele frequency spectrum of all variants (--sfs)</option>
37 <option value="--vars-by-sample">Return the total variants per sample, sum of homozygous and heterozygous variants (--vars-by-sample)</option> 42 <option value="sample-distance">Pair-wise genetic distances between all samples (--mds)</option>
38 <option value="--gts-by-sample">Return the count of each genotype class observed per sample (--gts-by-sample)</option> 43 </param>
39 </param> 44 <when value="snp-counts">
40 45 <param name="stats_option" type="hidden" value="--snp-counts" />
41 <param name="gt_filter" type="text" area="True" size="5x50" label="Restrictions to apply to genotype values" help="(--gt-filer)"> 46 </when>
42 <expand macro="sanitize_query" /> 47 <when value="aaf">
43 </param> 48 <param name="stats_option" type="hidden" value="--sfs" />
44 49 </when>
45 <param name="summarize" type="text" area="True" size="5x50" label="The query to be issued to the database to summarize" help="(--summarize)"> 50 <when value="sample-distance">
46 <expand macro="sanitize_query" /> 51 <param name="stats_option" type="hidden" value="--mds" />
47 </param> 52 </when>
53 <when value="tstv-stats">
54 <param name="stats_option" type="select"
55 label="Calculate Ts/Tv statistics based on"
56 help="Restricting the calculation to coding/noncoding regions will only produce meaningful results with preannotated variants. If you haven't annotated your variants with SnpEff or VEP before loading them into GEMINI, select All SNPs.">
57 <option value="--tstv">All SNPs (--tstv)</option>
58 <option value="--tstv-coding">SNPs in coding regions (--tstv-coding)</option>
59 <option value="--tstv-noncoding">SNPs in non-coding regions (--tstv-noncoding)</option>
60 </param>
61 </when>
62 <when value="gts-stats">
63 <param name="stats_option" type="hidden" value="" />
64 <conditional name="variants">
65 <param name="keep" type="select"
66 label="Compute the genotype counts table based on"
67 help="If you select All variants the genotype counts will be produced using --summarize with the wildcard query &quot;select * from variants&quot;.">
68 <option value="all">All variants</option>
69 <option value="custom">Custom filtered variants</option>
70 </param>
71 <when value="all">
72 <param name="gt_filter" type="hidden" value="" />
73 <param name="constraint" type="hidden" value="" />
74 </when>
75 <when value="custom">
76 <param argument="--gt-filter" name="gt_filter" type="text" area="True" size="5x50"
77 label="Restrictions to apply to genotype values"
78 help="">
79 <expand macro="sanitize_query" />
80 </param>
81 <param name="constraint" type="text" area="True" size="5x50"
82 label="Additional constraints on the variants"
83 help="Enter valid constraints for the WHERE clause of a GEMINI query here. You could use, for example: chrom = 'chr1' or impact_severity = 'HIGH', to include only high-impact variants on chromosome 1 in the counts table.">
84 <expand macro="sanitize_query" />
85 </param>
86 </when>
87 </conditional>
88 </when>
89 </conditional>
48 </inputs> 90 </inputs>
49 <outputs> 91 <outputs>
50 <data name="outfile" format="tabular" /> 92 <data name="outfile" format="tabular" />
51 </outputs> 93 </outputs>
52 <tests> 94 <tests>
53 <test> 95 <test>
54 <!-- test vars-by-sample report --> 96 <!-- test tstv-coding report -->
55 <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" /> 97 <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
56 <param name="stats_type" value="--vars-by-sample" /> 98 <conditional name="stats">
99 <param name="type" value="tstv-stats" />
100 <param name="stats_option" value="--tstv-coding" />
101 </conditional>
57 <output name="outfile"> 102 <output name="outfile">
58 <assert_contents> 103 <assert_contents>
59 <has_line_matching expression="sample&#009;total" /> 104 <!-- since the input file is not annotated
105 no variants will be considered to be in coding regions -->
106 <has_line line="ts&#009;tv&#009;ts/tv" />
107 <has_line line="0&#009;0&#009;0" />
60 </assert_contents> 108 </assert_contents>
61 </output> 109 </output>
62 </test> 110 </test>
63 <test> 111 <test>
64 <!-- test gts-by-sample report --> 112 <!-- test genotype counts by sample (summarize) report -->
65 <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" /> 113 <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" />
66 <param name="stats_type" value="--gts-by-sample" /> 114 <conditional name="stats">
115 <param name="type" value="gts-stats" />
116 <conditional name="variants">
117 <param name="keep" value="all" />
118 </conditional>
119 </conditional>
67 <output name="outfile"> 120 <output name="outfile">
68 <assert_contents> 121 <assert_contents>
69 <has_line_matching expression="sample&#009;num_hom_ref&#009;num_het&#009;num_hom_alt&#009;num_unknown&#009;total" /> 122 <has_line_matching expression="sample&#009;total&#009;num_het&#009;num_hom_alt&#009;num_hom_ref" />
70 </assert_contents> 123 </assert_contents>
71 </output> 124 </output>
72 </test> 125 </test>
73 </tests> 126 </tests>
74 <help><![CDATA[ 127 <help><![CDATA[
75 **What it does** 128 **What it does**
76 129
77 The stats tool computes some useful variant statistics for a GEMINI database. 130 The stats tool computes one of the following useful variant statistics for a GEMINI database:
78 Like computing the transition and transversion ratios for the snps.
79 131
80 **Settings and examples** 132 **Genotype counts tabulated by sample**:
81 133
82 --tstv-coding: 134 This mode uses the ``gemini stats --summarize`` option to produce a table with
83 Compute the transition/transversion ratios for the snps in the coding regions. 135 one row per sample, which tabulates the numbers of sites, for which a given
136 sample shows a:
84 137
85 --tstv-noncoding: 138 - non-reference genotype (*total* column; the sum of the *num_het* and *num_hom_alt* columns next to it)
86 Compute the transition/transversion ratios for the snps in the non-coding regions. 139 - heterozygous genotype (*num_het* column)
140 - homozygous variant genotype (*num_hom_alt* column)
141 - homozygous reference genotype (*num_hom_ref* column)
87 142
88 EXAMPLE Compute the type and count of the snps; --snp-counts:: 143 You can choose to calculate the table based on all variants in your database,
144 or to filter the variants before the calculation using GEMINI genotype filter
145 expressions and/or WHERE clauses of GEMINI queries.
146
147 **Counts of SNPs by nucleotide change**:
148
149 This runs ``gemini stats`` with the ``--snp-counts`` option. The result is a
150 simple table listing the number of occurrences of each observed REF->ALT change
151 in your database, e.g.::
89 152
90 type count 153 type count
91 A->G 2 154 A->G 2
92 C->T 1 155 C->T 1
93 G->A 1 156 G->A 1
94 157
95 EXAMPLE Calculate the site frequency spectrum of the variants; --sfs:: 158 **Transition / transversion statistics**
159
160 This mode uses ``gemini stats`` with the ``--tstv``, ``--tstv-coding``, or
161 ``--tstv-noncoding`` option to compute the transition/transversion ratios for
162 all SNPs, for SNPs in coding, or SNPs in non-coding regions, respectively.
163
164 The result is presented in a 1x3 table listing the number of
165 transitions (*ts* column), transversions (*tv* column) and the ratio of the two
166 (*ts/tv* column), e.g.::
167
168 ts tv ts/tv
169 126 39 3.2307
170
171 **Alternate allele frequency spectrum**
172
173 Runs ``gemini stats --sfs`` to produce binned alternate allele frequency counts
174 in a table like::
96 175
97 aaf count 176 aaf count
98 0.125 2 177 0.125 2
99 0.375 1 178 0.375 1
100 179
101 EXAMPLE Compute the pair-wise genetic distance between each sample; --mds:: 180 **Pairwise genetic distances**
181
182 Runs ``gemini stats --mds`` and tabulates all pairwise genetic distances for the
183 samples in your database. An example could look like this::
102 184
103 sample1 sample2 distance 185 sample1 sample2 distance
104 M10500 M10500 0.0 186 M10500 M10500 0.0
105 M10475 M10478 1.25 187 M10475 M10478 1.25
106 M10500 M10475 2.0 188 M10500 M10475 2.0
107 M10500 M10478 0.5714 189 M10500 M10478 0.5714
108 190
109 EXAMPLE Return a count of the types of genotypes per sample; --gts-by-sample::
110
111 sample num_hom_ref num_het num_hom_alt num_unknown total
112 M10475 4 1 3 1 9
113 M10478 2 2 4 1 9
114
115
116
117 EXAMPLE Return the total variants per sample (sum of homozygous and heterozygous variants); --vars-by-sample::
118
119 sample total
120 M10475 4
121 M10478 6
122
123 **Final solution**
124
125 --summarize:
126 If none of these tools are exactly what you want, you can summarize the variants per sample of an arbitrary query using the –summarize flag.
127
128 EXAMPLE If you wanted to know, for each sample, how many variants are on chromosome 1 that are also in dbSNP;--summarize "select * from variants where in_dbsnp=1 and chrom='chr1'"::
129
130 sample total num_het num_hom_alt
131 M10475 1 1 0
132 M128215 1 1 0
133 M10478 2 2 0
134 M10500 2 1 1
135
136
137 ]]></help> 191 ]]></help>
138 <expand macro="citations"/> 192 <expand macro="citations"/>
139 </tool> 193 </tool>