Mercurial > repos > iuc > gemini_stats
comparison gemini_stats.xml @ 5:86d4303cc3ca draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 62ed732cba355e695181924a8ed4cce49ca21c59
author | iuc |
---|---|
date | Fri, 11 Jan 2019 17:43:48 -0500 |
parents | cdd90678004a |
children | b92cfa44f5be |
comparison
equal
deleted
inserted
replaced
4:cdd90678004a | 5:86d4303cc3ca |
---|---|
1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.1"> | 1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@"> |
2 <description>Compute useful variant statistics</description> | 2 <description>Compute useful variant statistics</description> |
3 <macros> | 3 <macros> |
4 <import>gemini_macros.xml</import> | 4 <import>gemini_macros.xml</import> |
5 <token name="@BINARY@">stats</token> | 5 <token name="@BINARY@">stats</token> |
6 </macros> | 6 </macros> |
8 <expand macro="stdio" /> | 8 <expand macro="stdio" /> |
9 <expand macro="version_command" /> | 9 <expand macro="version_command" /> |
10 <command> | 10 <command> |
11 <![CDATA[ | 11 <![CDATA[ |
12 gemini @BINARY@ | 12 gemini @BINARY@ |
13 $stats_type | 13 #if str($stats.type) == "gts-stats": |
14 #set $multiline_sql_expr = $stats.variants.gt_filter | |
15 #set $cmdln_param = "--gt-filter" | |
16 @MULTILN_SQL_EXPR_TO_CMDLN@ | |
14 | 17 |
15 #set $multiline_sql_expr = $gt_filter | 18 #if str($stats.variants.constraint).strip(): |
16 #set $cmdln_param = "--gt-filter" | 19 #set $multiline_sql_expr = "select * from variants WHERE " + str($stats.variants.constraint) |
17 @MULTILN_SQL_EXPR_TO_CMDLN@ | 20 #else: |
18 | 21 #set $multiline_sql_expr = "select * from variants" |
19 #set $multiline_sql_expr = $summarize | 22 #end if |
20 #set $cmdln_param = "--summarize" | 23 #set $cmdln_param = "--summarize" |
21 @MULTILN_SQL_EXPR_TO_CMDLN@ | 24 @MULTILN_SQL_EXPR_TO_CMDLN@ |
22 | 25 #else: |
23 "${ infile }" | 26 ${stats.stats_option} |
24 > "${ outfile }" | 27 #end if |
28 '$infile' | |
29 > '$outfile' | |
25 ]]> | 30 ]]> |
26 </command> | 31 </command> |
27 <inputs> | 32 <inputs> |
28 <expand macro="infile" /> | 33 <expand macro="infile" /> |
29 | 34 |
30 <param name="stats_type" type="select" label="Studying ..." help=""> | 35 <conditional name="stats"> |
31 <option value="--tstv">Compute the transition and transversion ratios for the snps (--tstv)</option> | 36 <param name="type" type="select" |
32 <option value="--tstv-coding">Compute the transition/transversion ratios for the snps in the coding regions (--tstv-coding)</option> | 37 label="Select the type of statistics you are interested in" help=""> |
33 <option value="--tstv-noncoding">Compute the transition/transversion ratios for the snps in the non-coding regions (--tstv-noncoding)</option> | 38 <option value="gts-stats">Genotype counts tabulated by sample (--summarize)</option> |
34 <option value="--snp-counts">Compute the type and count of the snps (--snp-counts)</option> | 39 <option value="snp-counts">Counts of SNPs by nucleotide change (--snp-counts)</option> |
35 <option value="--sfs">Calculate the site frequency spectrum of the variants (--sfs)</option> | 40 <option value="tstv-stats">Transition / transversion statistics for the SNPs in the dataset</option> |
36 <option value="--mds">Compute the pair-wise genetic distance between each sample (--mds)</option> | 41 <option value="aaf">Alternate allele frequency spectrum of all variants (--sfs)</option> |
37 <option value="--vars-by-sample">Return the total variants per sample, sum of homozygous and heterozygous variants (--vars-by-sample)</option> | 42 <option value="sample-distance">Pair-wise genetic distances between all samples (--mds)</option> |
38 <option value="--gts-by-sample">Return the count of each genotype class observed per sample (--gts-by-sample)</option> | 43 </param> |
39 </param> | 44 <when value="snp-counts"> |
40 | 45 <param name="stats_option" type="hidden" value="--snp-counts" /> |
41 <param name="gt_filter" type="text" area="True" size="5x50" label="Restrictions to apply to genotype values" help="(--gt-filer)"> | 46 </when> |
42 <expand macro="sanitize_query" /> | 47 <when value="aaf"> |
43 </param> | 48 <param name="stats_option" type="hidden" value="--sfs" /> |
44 | 49 </when> |
45 <param name="summarize" type="text" area="True" size="5x50" label="The query to be issued to the database to summarize" help="(--summarize)"> | 50 <when value="sample-distance"> |
46 <expand macro="sanitize_query" /> | 51 <param name="stats_option" type="hidden" value="--mds" /> |
47 </param> | 52 </when> |
53 <when value="tstv-stats"> | |
54 <param name="stats_option" type="select" | |
55 label="Calculate Ts/Tv statistics based on" | |
56 help="Restricting the calculation to coding/noncoding regions will only produce meaningful results with preannotated variants. If you haven't annotated your variants with SnpEff or VEP before loading them into GEMINI, select All SNPs."> | |
57 <option value="--tstv">All SNPs (--tstv)</option> | |
58 <option value="--tstv-coding">SNPs in coding regions (--tstv-coding)</option> | |
59 <option value="--tstv-noncoding">SNPs in non-coding regions (--tstv-noncoding)</option> | |
60 </param> | |
61 </when> | |
62 <when value="gts-stats"> | |
63 <param name="stats_option" type="hidden" value="" /> | |
64 <conditional name="variants"> | |
65 <param name="keep" type="select" | |
66 label="Compute the genotype counts table based on" | |
67 help="If you select All variants the genotype counts will be produced using --summarize with the wildcard query &quot;select * from variants&quot;."> | |
68 <option value="all">All variants</option> | |
69 <option value="custom">Custom filtered variants</option> | |
70 </param> | |
71 <when value="all"> | |
72 <param name="gt_filter" type="hidden" value="" /> | |
73 <param name="constraint" type="hidden" value="" /> | |
74 </when> | |
75 <when value="custom"> | |
76 <param argument="--gt-filter" name="gt_filter" type="text" area="True" size="5x50" | |
77 label="Restrictions to apply to genotype values" | |
78 help=""> | |
79 <expand macro="sanitize_query" /> | |
80 </param> | |
81 <param name="constraint" type="text" area="True" size="5x50" | |
82 label="Additional constraints on the variants" | |
83 help="Enter valid constraints for the WHERE clause of a GEMINI query here. You could use, for example: chrom = 'chr1' or impact_severity = 'HIGH', to include only high-impact variants on chromosome 1 in the counts table."> | |
84 <expand macro="sanitize_query" /> | |
85 </param> | |
86 </when> | |
87 </conditional> | |
88 </when> | |
89 </conditional> | |
48 </inputs> | 90 </inputs> |
49 <outputs> | 91 <outputs> |
50 <data name="outfile" format="tabular" /> | 92 <data name="outfile" format="tabular" /> |
51 </outputs> | 93 </outputs> |
52 <tests> | 94 <tests> |
53 <test> | 95 <test> |
54 <!-- test vars-by-sample report --> | 96 <!-- test tstv-coding report --> |
55 <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" /> | 97 <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" /> |
56 <param name="stats_type" value="--vars-by-sample" /> | 98 <conditional name="stats"> |
99 <param name="type" value="tstv-stats" /> | |
100 <param name="stats_option" value="--tstv-coding" /> | |
101 </conditional> | |
57 <output name="outfile"> | 102 <output name="outfile"> |
58 <assert_contents> | 103 <assert_contents> |
59 <has_line_matching expression="sample	total" /> | 104 <!-- since the input file is not annotated |
105 no variants will be considered to be in coding regions --> | |
106 <has_line line="ts	tv	ts/tv" /> | |
107 <has_line line="0	0	0" /> | |
60 </assert_contents> | 108 </assert_contents> |
61 </output> | 109 </output> |
62 </test> | 110 </test> |
63 <test> | 111 <test> |
64 <!-- test gts-by-sample report --> | 112 <!-- test genotype counts by sample (summarize) report, all variants --> |
65 <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" /> | 113 <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" /> |
66 <param name="stats_type" value="--gts-by-sample" /> | 114 <conditional name="stats"> |
115 <param name="type" value="gts-stats" /> | |
116 <conditional name="variants"> | |
117 <param name="keep" value="all" /> | |
118 </conditional> | |
119 </conditional> | |
67 <output name="outfile"> | 120 <output name="outfile"> |
68 <assert_contents> | 121 <assert_contents> |
69 <has_line_matching expression="sample	num_hom_ref	num_het	num_hom_alt	num_unknown	total" /> | 122 <has_line_matching expression="sample	total	num_het	num_hom_alt	num_hom_ref" /> |
70 </assert_contents> | 123 </assert_contents> |
71 </output> | 124 </output> |
72 </test> | 125 </test> |
73 </tests> | 126 </tests> |
74 <help><![CDATA[ | 127 <help><![CDATA[ |
75 **What it does** | 128 **What it does** |
76 | 129 |
77 The stats tool computes some useful variant statistics for a GEMINI database. | 130 The stats tool computes one of the following useful variant statistics for a GEMINI database: |
78 Like computing the transition and transversion ratios for the snps. | |
79 | 131 |
80 **Settings and examples** | 132 **Genotype counts tabulated by sample**: |
81 | 133 |
82 --tstv-coding: | 134 This mode uses the ``gemini stats --summarize`` option to produce a table with |
83 Compute the transition/transversion ratios for the snps in the coding regions. | 135 one row per sample, which tabulates the numbers of sites, for which a given |
136 sample shows a: | |
84 | 137 |
85 --tstv-noncoding: | 138 - non-reference genotype (*total* column; the sum of the *num_het* and *num_hom_alt* columns next to it) |
86 Compute the transition/transversion ratios for the snps in the non-coding regions. | 139 - heterozygous genotype (*num_het* column) |
140 - homozygous variant genotype (*num_hom_alt* column) | |
141 - homozygous reference genotype (*num_hom_ref* column) | |
87 | 142 |
88 EXAMPLE Compute the type and count of the snps; --snp-counts:: | 143 You can choose to calculate the table based on all variants in your database, |
144 or to filter the variants before the calculation using GEMINI genotype filter | |
145 expressions and/or WHERE clauses of GEMINI queries. | |
146 | |
147 **Counts of SNPs by nucleotide change**: | |
148 | |
149 This runs ``gemini stats`` with the ``--snp-counts`` option. The result is a | |
150 simple table listing the number of occurrences of each observed REF->ALT change | |
151 in your database, e.g.:: | |
89 | 152 |
90 type count | 153 type count |
91 A->G 2 | 154 A->G 2 |
92 C->T 1 | 155 C->T 1 |
93 G->A 1 | 156 G->A 1 |
94 | 157 |
95 EXAMPLE Calculate the site frequency spectrum of the variants; --sfs:: | 158 **Transition / transversion statistics** |
159 | |
160 This mode uses ``gemini stats`` with the ``--tstv``, ``--tstv-coding``, or | |
161 ``--tstv-noncoding`` option to compute the transition/transversion ratios for | |
162 all SNPs, for SNPs in coding, or SNPs in non-coding regions, respectively. | |
163 | |
164 The result is presented in a 1x3 table listing the number of | |
165 transitions (*ts* column), transversions (*tv* column) and the ratio of the two | |
166 (*ts/tv* column), e.g.:: | |
167 | |
168 ts tv ts/tv | |
169 126 39 3.2307 | |
170 | |
171 **Alternate allele frequency spectrum** | |
172 | |
173 Runs ``gemini stats --sfs`` to produce binned alternate allele frequency counts | |
174 in a table like:: | |
96 | 175 |
97 aaf count | 176 aaf count |
98 0.125 2 | 177 0.125 2 |
99 0.375 1 | 178 0.375 1 |
100 | 179 |
101 EXAMPLE Compute the pair-wise genetic distance between each sample; --mds:: | 180 **Pairwise genetic distances** |
181 | |
182 Runs ``gemini stats --mds`` and tabulates all pairwise genetic distances for the | |
183 samples in your database. An example could look like this:: | |
102 | 184 |
103 sample1 sample2 distance | 185 sample1 sample2 distance |
104 M10500 M10500 0.0 | 186 M10500 M10500 0.0 |
105 M10475 M10478 1.25 | 187 M10475 M10478 1.25 |
106 M10500 M10475 2.0 | 188 M10500 M10475 2.0 |
107 M10500 M10478 0.5714 | 189 M10500 M10478 0.5714 |
108 | 190 |
109 EXAMPLE Return a count of the types of genotypes per sample; --gts-by-sample:: | |
110 | |
111 sample num_hom_ref num_het num_hom_alt num_unknown total | |
112 M10475 4 1 3 1 9 | |
113 M10478 2 2 4 1 9 | |
114 | |
115 | |
116 | |
117 EXAMPLE Return the total variants per sample (sum of homozygous and heterozygous variants); --vars-by-sample:: | |
118 | |
119 sample total | |
120 M10475 4 | |
121 M10478 6 | |
122 | |
123 **Final solution** | |
124 | |
125 --summarize: | |
126 If none of these tools are exactly what you want, you can summarize the variants per sample of an arbitrary query using the –summarize flag. | |
127 | |
128 EXAMPLE If you wanted to know, for each sample, how many variants are on chromosome 1 that are also in dbSNP;--summarize "select * from variants where in_dbsnp=1 and chrom='chr1'":: | |
129 | |
130 sample total num_het num_hom_alt | |
131 M10475 1 1 0 | |
132 M128215 1 1 0 | |
133 M10478 2 2 0 | |
134 M10500 2 1 1 | |
135 | |
136 | |
137 ]]></help> | 191 ]]></help> |
138 <expand macro="citations"/> | 192 <expand macro="citations"/> |
139 </tool> | 193 </tool> |