changeset 5:86d4303cc3ca draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 62ed732cba355e695181924a8ed4cce49ca21c59
author iuc
date Fri, 11 Jan 2019 17:43:48 -0500
parents cdd90678004a
children 73b103195e2a
files gemini_macros.xml gemini_stats.xml repository_dependencies.xml test-data/gemini_amend_input.db test-data/gemini_annotate_result.db test-data/gemini_auto_dom_input.db test-data/gemini_auto_rec_input.db test-data/gemini_comphets_input.db test-data/gemini_de_novo_input.db test-data/gemini_is_somatic_result.db test-data/gemini_load_result1.db test-data/gemini_load_result2.db test-data/gemini_versioned_databases.loc test-data/test-cache/gemini-config.yaml test-data/test-cache/gemini/data/clinvar_20160203.tidy.vcf.gz test-data/test-cache/gemini/data/clinvar_20160203.tidy.vcf.gz.tbi test-data/test-cache/gemini/data/clinvar_20170130.tidy.vcf.gz test-data/test-cache/gemini/data/clinvar_20170130.tidy.vcf.gz.tbi test-data/test-cache/gemini/data/dbsnp.b141.20140813.hg19.tidy.vcf.gz test-data/test-cache/gemini/data/dbsnp.b141.20140813.hg19.tidy.vcf.gz.tbi test-data/test-cache/gemini/data/dbsnp.b147.20160601.tidy.vcf.gz test-data/test-cache/gemini/data/dbsnp.b147.20160601.tidy.vcf.gz.tbi test-data/test-cache/gemini/data/gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz test-data/test-cache/gemini/data/gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz.tbi
diffstat 24 files changed, 301 insertions(+), 176 deletions(-) [+]
line wrap: on
line diff
--- a/gemini_macros.xml	Fri Dec 14 12:47:19 2018 -0500
+++ b/gemini_macros.xml	Fri Jan 11 17:43:48 2019 -0500
@@ -1,15 +1,12 @@
 <macros>
     <!-- gemini version to be used -->
-    <token name="@VERSION@">0.18.1</token>
+    <token name="@VERSION@">0.20.1</token>
     <!-- minimal annotation files version required by this version of gemini -->
-    <token name="@DB_VERSION@">181</token>
+    <token name="@DB_VERSION@">200</token>
 
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@VERSION@">gemini</requirement>
-            <requirement type="package" version="0.2.6">tabix</requirement>
-            <!-- for conda useage -->
-            <!--requirement type="package" version="1.3.1">htslib</requirement-->
             <yield />
         </requirements>
     </xml>
@@ -24,9 +21,17 @@
             <exit_code range=":-1" />
             <regex match="Error:" />
             <regex match="Exception:" />
+            <yield />
         </stdio>
     </xml>
 
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1003153</citation>
+            <yield />
+        </citations>
+    </xml>
+
     <xml name="annotation_dir">
         <param name="annotation_databases" type="select" label="Choose a gemini annotation source">
             <options from_data_table="gemini_versioned_databases">
@@ -36,31 +41,36 @@
         </param>
     </xml>
 
-    <xml name="add_header_column">
-        <param name="header" type="boolean" truevalue="--header" falsevalue="" checked="False" 
-            label="Add a header of column names to the output" help="(--header)"/>
-    </xml>
-
-    <xml name="radius">
-        <param name="radius" type="integer" value="3" label="Set filter for Breadth-first search (BFS) in the Protein-Protein Interaction network" help="(-r)" >
-            <validator type="in_range" min="0"/>
+    <xml name="infile">
+        <param name="infile" type="data" format="gemini.sqlite" label="GEMINI database" help="Only files with version @VERSION@ are accepted." >
+            <options options_filter_attribute="metadata.gemini_version" >
+                <filter type="add_value" value="@VERSION@" />
+            </options>
         </param>
     </xml>
-    <xml name="variant_mode">
-        <param name="variant_mode" type="boolean" truevalue="--var" falsevalue="" checked="False" 
-            label="Returns variant info (e.g. impact, biotype) for interacting genes" help="(--var)"/>
+
+    <xml name="add_header_column">
+        <param argument="--header" name="header" type="boolean" truevalue="--header" falsevalue="" checked="True" 
+        label="Add a header of column names to the output" />
     </xml>
 
-    <xml name="column_filter">
+    <xml name="column_filter" token_help="" token_minimalset="variant_id, gene">
         <conditional name="report">
-            <param name="report_selector" type="select" label="Columns to include in the report"
-                help="By default, this tool reports all columns in the variants table. One may choose to report only a subset of the columns.">
-                <option value="all" selected="True">all</option>
-                <option value="column_filter">User given columns</option>
+            <param name="report_selector" type="select"
+            label="Set of columns to include in the variant report table"
+            help="@HELP@">
+                <option value="minimal">Minimal (report only a preconfigured minimal set of columns)</option>
+                <option value="full">Full (report all columns defined in the GEMINI database variants table)</option>
+                <option value="custom">Custom (report user-specified columns)</option>
             </param>
-            <when value="all"/>
-            <when value="column_filter">
-                <param name="columns" type="select" display="checkboxes" multiple="True" label="Choose columns to include in the report" help="(--columns)">
+            <when value="full" />
+            <when value="minimal">
+                <param name="columns" type="hidden" value="@MINIMALSET@" />
+                <param name="extra_cols" type="hidden" value="" />
+            </when>
+            <when value="custom">
+                <param name="columns" type="select" display="checkboxes" multiple="true" optional="true"
+                label="Choose columns to include in the report" help="(--columns)">
                     <option value="gene">gene</option>
                     <option value="chrom">chrom</option>
                     <option value="start">start</option>
@@ -69,27 +79,23 @@
                     <option value="alt">alt</option>
                     <option value="impact">impact</option>
                     <option value="impact_severity">impact_severity</option>
-                    <option value="max_aaf_all">alternative allele frequency</option>
+                    <option value="max_aaf_all">alternative allele frequency (max_aaf_all)</option>
                 </param>
-                <param name="extra_cols" type="text" label="Additional columns." help="Separate by whitespace"/>
+                <param name="extra_cols" type="text"
+                label="Additional columns (comma-separated)"
+                help="Columns must be specified by the exact names they have in the GEMINI database, e.g., is_exonic or num_hom_alt, but, for genotype columns, GEMINI wildcard syntax is supported. The order of columns in the list is maintained in the output.">
+                    <expand macro="sanitize_query" />
+                </param>
             </when>
         </conditional>
     </xml>
 
-    <xml name="filter">
-        <conditional name="filter">
-            <param name="filter_selector" type="select" label="Apply additional constraints"
-                help="By default, this tool will report all variants regardless of their putative functional impact. In order to apply additional constraints on the variants returned, you can this optional filter.">
-                <option value="no">No additional constraints</option>
-                <option value="yes">Apply additional constraints</option>
-            </param>
-            <when value="no"/>
-            <when value="yes">
-                <param name="filter" type="text" label="Contraints in SQL syntax" help="Conditions applied here will become WHERE clauses in the query issued to the GEMINI database. E.g. alt='G' or impact_severity = 'HIGH'. (--filter)">
-                    <expand macro="sanitize_query" />
-                </param>
-            </when>
-        </conditional>
+    <xml name="filter" token_argument="--filter">
+        <param argument="@ARGUMENT@" name="filter" type="text"
+        label="Additional constraints expressed in SQL syntax"
+        help="Constraints defined here will become the WHERE clause of the SQL query issued to the GEMINI database. E.g. alt='G' or impact_severity = 'HIGH'.">
+            <expand macro="sanitize_query" />
+        </param>
     </xml>
 
     <xml name="sanitize_query">
@@ -103,10 +109,90 @@
        </sanitizer>
     </xml>
 
+    <xml name="lenient" token_argument="--lenient" token_truevalue="--lenient" token_help="The exact consequence of this setting depends on the type of inheritance pattern you are looking for (see the tool help below).">
+        <param argument="@ARGUMENT@" name="lenient" type="boolean" truevalue="@TRUEVALUE@" falsevalue="" checked="False"
+        label="Include hits with less convincing inheritance patterns"
+        help= "@HELP@" />
+    </xml>
+
+    <xml name="unaffected">
+        <param argument="--allow-unaffected" name="allow_unaffected" type="boolean" truevalue="--allow-unaffected" falsevalue="" checked="False"
+        label="Report candidates shared by unaffected samples"
+        help="Activating this option will enable the reporting of variants as candidate causative even if they are shared by unaffected samples in the family tree. The default will only report variants that are unique to affected samples."/>
+    </xml>
+
+    <xml name="min_kindreds" token_label="Minimum number of families with a candidate variant for a gene to be reported" token_help="This is the number of families required to have a variant fitting the inheritance model in the same gene in order for the gene and its variants to be reported. For example, we may only be interested in candidates where at least 4 families have a variant (with a fitting inheritance pattern) in that gene.">
+        <param argument="--min-kindreds" name="min_kindreds" type="integer" value="1" min="1"
+        label="@LABEL@"
+        help="@HELP@" />
+    </xml>
+
+    <xml name="insert_constraint" token_max_repeat="1">
+        <repeat name="constraint" title="Additional constraints on variants" default="0" max="@MAX_REPEAT@">
+            <expand macro="filter" />
+            <yield />
+        </repeat>
+    </xml>
+
+    <xml name="overwritable_where_default" token_default_where="">
+        <param name="overwrite_default_filter" type="boolean" checked="false"
+        label="Overwrite the default constraint of this tool"
+        help="By default, this tool restricts its analysis to @DEFAULT_WHERE@ and this constraint is applied on top of any constraint expressed above. With this option here selected, your custom constraint, if given, will overwrite the default instead." />
+    </xml>
+
+    <xml name="gt_filter" token_default_repeat="0" token_min_repeat="0" token_max_repeat="1">
+        <repeat name="filter_by_genotype" title="Genotype filter expression" default="@DEFAULT_REPEAT@" min="@MIN_REPEAT@" max="@MAX_REPEAT@">
+            <param argument="--gt-filter" name="gt_filter" type="text" value="" area="True" size="5x50"
+            label="Restrictions to apply to genotype values" help="">
+                <expand macro="sanitize_query" />
+                <validator type="expression" message="Genotype filter expression cannot be empty">value.strip()</validator>
+            </param>
+            <yield />
+        </repeat>
+    </xml>
+
+    <xml name="sample_filter">
+        <repeat name="filter_by_sample" title="Sample filter expression" default="0" max="1">
+            <param argument="--sample-filter" name="sample_filter" type="text" area="True" size="5x50"
+            label="SQL filter to use to filter the sample table" help="">
+                <expand macro="sanitize_query" />
+                <validator type="expression" message="Sample filter expression cannot be empty">value.strip()</validator>
+            </param>
+            <param argument="--in" name="in" type="select"
+            label="A variant must be in either all, none or any samples passing the sample-query filter"
+            help="">
+                <option value="">Return a variant if it is found in any sample passing the sample filter. (default) </option>
+                <option value="--in all">Return a variant if it is found in ALL samples passing the sample filter. (all)</option>
+                <option value="--in none">Return a variant if it is found in NO sample passing the sample filter. (none)</option>
+                <option value="--in only">Return a variant if it is found in any sample passing the sample filter, and in NO sample NOT passing it. (only)</option>
+                <option value="--in only all">Return a variant if it is found in ALL samples passing the sample filter, and in NO sample NOT passing it. (only all)</option>
+            </param>
+            <expand macro="min_kindreds"
+            label="Minimum number of families in which a variant must pass the sample filter" help=""/>
+            <param argument="--family-wise" name="family_wise" type="boolean" truevalue="--family-wise" falsevalue="" checked="False"
+            label="Apply the sample-filter on a family-wise basis" help="If a variant passes the sample filter in at least the minimum number of families specified above it is retained." />
+        </repeat>
+    </xml>
+
+    <xml name="region_filter">
+        <repeat name="regions" title="Region Filter" default="0" min="0"
+        help="Filter variant sites by their position in the genome. If multiple Region Filters are specified, all variants that fall in ONE of the regions are reported.">
+            <param name="chrom" type="text" label="Chromosome">
+                <validator type="expression" message="A chromosome identifier is required when specifying a region filter">value.strip()</validator>
+            </param>
+            <param name="start" type="text" label="Region Start">
+                <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
+            </param>
+            <param name="stop" type="text" label="Region End">
+                <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
+            </param>
+        </repeat>
+    </xml>
+
     <token name="@PROVIDE_ANNO_DATA@"><![CDATA[
         mkdir gemini &&
-        ln -s "${annotation_databases.fields.path}/gemini/data" gemini/data &&
-        export GEMINI_CONFIG="${annotation_databases.fields.path}" &&
+        ln -s '${annotation_databases.fields.path}/gemini/data' gemini/data &&
+        export GEMINI_CONFIG='${annotation_databases.fields.path}' &&
     ]]></token>
 
     <token name="@MULTILN_SQL_EXPR_TO_CMDLN@">
@@ -119,67 +205,50 @@
         #end if
     </token>
 
-    <token name="@CMDLN_SQL_FILTER_FILTER_OPTION@">
-        #if str($filter.filter_selector) == 'yes' and $filter.filter:
-            --filter '${ str( $filter.filter ) }'
+    <token name="@SET_COLS@">
+        #if str($report.report_selector) == 'full':
+            #set $cols = "*"
+        #else:
+            #if $report.columns and str($report.columns) != '':
+                #set $cols = str($report.columns)
+            #else:
+                #set $cols = ''
+            #end if
+            #if str($report.extra_cols).strip():
+                #if $cols:
+                    #set $cols = $cols + ', ' + str($report.extra_cols).strip()
+                #else:
+                    #set $cols = str($report.extra_cols).strip()
+                #end if
+            #end if
+            #if not $cols:
+                #set $cols = "variant_id, gene"
+            #end if
+        #end if
+    </token>
 
     <token name="@COLUMN_SELECT@">
-        #if $report.report_selector != 'all':
-            --columns "${report.columns}
-            #if str($report.extra_cols).strip()
-                #echo ','+','.join(str($report.extra_cols).split()) 
-            #end if
-            "
+        @SET_COLS@
+        #if $cols != "*"
+            --columns '$cols'
         #end if
     </token>
 
-    <xml name="family">
-        <param name="families" type="text" value="" label="Comma seperated list of families to restrict the analysis to." help="e.g. Family1,Family3 (--families)"/>
-    </xml>
-
-    <xml name="lenient">
-        <param name="lenient" type="boolean" truevalue="--lenient" falsevalue="" checked="False" label="Loosen the restrictions on family structure"/>
-    </xml>
-
-    <xml name="unaffected">
-        <param name="allow_unaffected" type="boolean" truevalue="--allow-unaffected" falsevalue="" checked="False" label="Report candidates that also impact samples labeled as unaffected." help="(--allow-unaffected)"/>
-    </xml>
-
-    <xml name="min_kindreds">
-        <param name="min_kindreds" type="integer" value="1" label="The min. number of kindreds that must have a candidate variant in a gene" help="default: 1 (--min-kindreds)" />
-    </xml>
-
-    <xml name="min_sequence_depth">
-        <param name="d" type="integer" value="0" min="0" label="The minimum aligned sequence depth (genotype DP) required for each sample"
-                help="default: 0 (-d)" />
-    </xml>
-
-    <xml name="min_gq">
-        <param name="min_gq" type="integer" value="0" label="the minimum genotype quality required for each sample in a family" help="default: 0 (--min-gq)">
-            <validator type="in_range" min="0"/>
-        </param>
-    </xml>
-
-    <xml name="gt_pl_max">
-        <param name="gt_pl_max" type="integer" value="-1" min="-1" label="The maximum phred-scaled genotype likelihod (PL) allowed for each sample in a family" help="default: -1 (not set) (--gt-pl-max)" />
-    </xml>
-
-    <xml name="citations">
-        <citations>
-            <citation type="doi">10.1371/journal.pcbi.1003153</citation>
-            <yield />
-        </citations>
-    </xml>
-
-    <xml name="infile">
-        <param name="infile" type="data" format="gemini.sqlite" label="GEMINI database" help="Only files with version @VERSION@ are accepted." >
-            <options options_filter_attribute="metadata.gemini_version" >
-                <filter type="add_value" value="@VERSION@" />
-            </options>
-            <validator type="expression" message="This version of Gemini will only work with Gemini files that are for version @VERSION@.">value is not None and value.metadata.gemini_version == "@VERSION@"</validator>
-        </param>
-    </xml>
-
+    <token name="@PARSE_REGION_ELEMENTS@"><![CDATA[
+        #set $region_elements = []
+        #for $r in $regions:
+            ## The actual chromosome name needs to be single-quoted
+            ## in SQL, so we need to quote the single quotes like the
+            ## sanitize_query macro would if the whole was a parameter.
+            #set $r_elements = ["chrom = '\"'\"'%s'\"'\"'" % str($r.chrom).strip()]
+            #if str($r.start).strip():
+                #silent $r_elements.append("start >= %d" % int($r.start))
+            #end if
+            #if str($r.stop).strip():
+                #silent $r_elements.append("end <= %d" % int($r.stop))
+            #end if
+            #silent $region_elements.append("(%s)" % " AND ".join($r_elements))
+        #end for
+    ]]>
+    </token>
 </macros>
--- a/gemini_stats.xml	Fri Dec 14 12:47:19 2018 -0500
+++ b/gemini_stats.xml	Fri Jan 11 17:43:48 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.1">
+<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@">
     <description>Compute useful variant statistics</description>
     <macros>
         <import>gemini_macros.xml</import>
@@ -10,41 +10,83 @@
     <command>
 <![CDATA[
         gemini @BINARY@
-            $stats_type
-
-            #set $multiline_sql_expr = $gt_filter
-            #set $cmdln_param = "--gt-filter"
-            @MULTILN_SQL_EXPR_TO_CMDLN@
+            #if str($stats.type) == "gts-stats":
+                #set $multiline_sql_expr = $stats.variants.gt_filter
+                #set $cmdln_param = "--gt-filter"
+                @MULTILN_SQL_EXPR_TO_CMDLN@
 
-            #set $multiline_sql_expr = $summarize
-            #set $cmdln_param = "--summarize"
-            @MULTILN_SQL_EXPR_TO_CMDLN@
-
-            "${ infile }"
-            > "${ outfile }"
+                #if str($stats.variants.constraint).strip():
+                    #set $multiline_sql_expr = "select * from variants WHERE " + str($stats.variants.constraint)
+                #else:
+                    #set $multiline_sql_expr = "select * from variants"
+                #end if
+                #set $cmdln_param = "--summarize"
+                @MULTILN_SQL_EXPR_TO_CMDLN@
+            #else:
+                ${stats.stats_option}
+            #end if
+            '$infile'
+            > '$outfile'
 ]]>
     </command>
     <inputs>
         <expand macro="infile" />
 
-        <param name="stats_type" type="select" label="Studying ..." help="">
-            <option value="--tstv">Compute the transition and transversion ratios for the snps (--tstv)</option>
-            <option value="--tstv-coding">Compute the transition/transversion ratios for the snps in the coding regions (--tstv-coding)</option>
-            <option value="--tstv-noncoding">Compute the transition/transversion ratios for the snps in the non-coding regions (--tstv-noncoding)</option>
-            <option value="--snp-counts">Compute the type and count of the snps (--snp-counts)</option>
-            <option value="--sfs">Calculate the site frequency spectrum of the variants (--sfs)</option>
-            <option value="--mds">Compute the pair-wise genetic distance between each sample (--mds)</option>
-            <option value="--vars-by-sample">Return the total variants per sample, sum of homozygous and heterozygous variants (--vars-by-sample)</option>
-            <option value="--gts-by-sample">Return the count of each genotype class observed per sample (--gts-by-sample)</option>
-        </param>
-
-        <param name="gt_filter" type="text" area="True" size="5x50" label="Restrictions to apply to genotype values" help="(--gt-filer)">
-            <expand macro="sanitize_query" />
-        </param>
-
-        <param name="summarize" type="text" area="True" size="5x50" label="The query to be issued to the database to summarize" help="(--summarize)">
-            <expand macro="sanitize_query" />
-        </param>
+        <conditional name="stats">
+            <param name="type" type="select"
+            label="Select the type of statistics you are interested in" help="">
+                <option value="gts-stats">Genotype counts tabulated by sample (--summarize)</option>
+                <option value="snp-counts">Counts of SNPs by nucleotide change (--snp-counts)</option>
+                <option value="tstv-stats">Transition / transversion statistics for the SNPs in the dataset</option>
+                <option value="aaf">Alternate allele frequency spectrum of all variants (--sfs)</option>
+                <option value="sample-distance">Pair-wise genetic distances between all samples (--mds)</option>
+            </param>
+            <when value="snp-counts">
+                <param name="stats_option" type="hidden" value="--snp-counts" />
+            </when>
+            <when value="aaf">
+                <param name="stats_option" type="hidden" value="--sfs" />
+            </when>
+            <when value="sample-distance">
+                <param name="stats_option" type="hidden" value="--mds" />
+            </when>
+            <when value="tstv-stats">
+                <param name="stats_option" type="select"
+                label="Calculate Ts/Tv statistics based on"
+                help="Restricting the calculation to coding/noncoding regions will only produce meaningful results with preannotated variants. If you haven't annotated your variants with SnpEff or VEP before loading them into GEMINI, select All SNPs.">
+                    <option value="--tstv">All SNPs (--tstv)</option>
+                    <option value="--tstv-coding">SNPs in coding regions (--tstv-coding)</option>
+                    <option value="--tstv-noncoding">SNPs in non-coding regions (--tstv-noncoding)</option>
+                </param>
+            </when>
+            <when value="gts-stats">
+                <param name="stats_option" type="hidden" value="" />
+                <conditional name="variants">
+                    <param name="keep" type="select"
+                    label="Compute the genotype counts table based on"
+                    help="If you select All variants the genotype counts will be produced using --summarize with the wildcard query &quot;select * from variants&quot;.">
+                        <option value="all">All variants</option>
+                        <option value="custom">Custom filtered variants</option>
+                    </param>
+                    <when value="all">
+                        <param name="gt_filter" type="hidden" value="" />
+                        <param name="constraint" type="hidden" value="" />
+                    </when>
+                    <when value="custom">
+                        <param argument="--gt-filter" name="gt_filter" type="text" area="True" size="5x50"
+                        label="Restrictions to apply to genotype values"
+                        help="">
+                            <expand macro="sanitize_query" />
+                        </param>
+                        <param name="constraint" type="text" area="True" size="5x50"
+                        label="Additional constraints on the variants"
+                        help="Enter valid constraints for the WHERE clause of a GEMINI query here. You could use, for example: chrom = 'chr1' and impact_severity = 'HIGH', to include only high-impact variants on chromosome 1 in the counts table.">
+                            <expand macro="sanitize_query" />
+                        </param>
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
     </inputs>
     <outputs>
         <data name="outfile" format="tabular" />
@@ -52,21 +94,32 @@
     <tests>
         <test>
             <!-- test vars-by-sample report -->
-            <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" />
-            <param name="stats_type" value="--vars-by-sample" />
+            <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
+            <conditional name="stats">
+                <param name="type" value="tstv-stats" />
+                <param name="stats_option" value="--tstv-coding" />
+            </conditional>
             <output name="outfile">
                 <assert_contents>
-                    <has_line_matching expression="sample&#009;total" />
+                    <!-- since the input file is not annotated
+                    no variants will be considered to be in coding regions -->
+                    <has_line line="ts&#009;tv&#009;ts/tv" />
+                    <has_line line="0&#009;0&#009;0" />
                 </assert_contents>
             </output>
         </test>
         <test>
             <!-- test gts-by-sample report -->
             <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" />
-            <param name="stats_type" value="--gts-by-sample" />
+            <conditional name="stats">
+                <param name="type" value="gts-stats" />
+                <conditional name="variants">
+                    <param name="keep" value="all" />
+                </conditional>
+            </conditional>
             <output name="outfile">
                 <assert_contents>
-                    <has_line_matching expression="sample&#009;num_hom_ref&#009;num_het&#009;num_hom_alt&#009;num_unknown&#009;total" />
+                    <has_line_matching expression="sample&#009;total&#009;num_het&#009;num_hom_alt&#009;num_hom_ref" />
                 </assert_contents>
             </output>
         </test>
@@ -74,31 +127,60 @@
     <help><![CDATA[
 **What it does**
 
-The stats tool computes some useful variant statistics for a GEMINI database.
-Like computing the transition and transversion ratios for the snps.
+The stats tool computes one of the following useful variant statistics for a GEMINI database:
+
+**Genotype counts tabulated by sample**:
 
-**Settings and examples**
+This mode uses the ``gemini stats --summarize`` option to produce a table with
+one row per sample, which tabulates the number of sites at which a given
+sample shows a:
 
---tstv-coding:
- Compute the transition/transversion ratios for the snps in the coding regions.
+- non-reference genotype (*total* column; the sum of the *num_het* and *num_hom_alt* columns next to it)
+- heterozygous genotype (*num_het* column)
+- homozygous variant genotype (*num_hom_alt* column)
+- homozygous reference genotype (*num_hom_ref* column)
 
---tstv-noncoding:
- Compute the transition/transversion ratios for the snps in the non-coding regions.
+You can choose to calculate the table based on all variants in your database,
+or to filter the variants before the calculation using GEMINI genotype filter
+expressions and/or WHERE clauses of GEMINI queries.
 
-EXAMPLE Compute the type and count of the snps; --snp-counts::
+**Counts of SNPs by nucleotide change**:
+
+This runs ``gemini stats`` with the ``--snp-counts`` option. The result is a
+simple table listing the number of occurrences of each observed REF->ALT change
+in your database, e.g.::
 
  type    count
  A->G    2
  C->T    1
  G->A    1
 
-EXAMPLE Calculate the site frequency spectrum of the variants; --sfs::
+**Transition / transversion statistics**
+
+This mode uses ``gemini stats`` with the ``--tstv``, ``--tstv-coding``, or
+``--tstv-noncoding`` option to compute the transition/transversion ratios for
+all SNPs, for SNPs in coding, or SNPs in non-coding regions, respectively.
+
+The result is presented in a 1x3 table listing the number of
+transitions (*ts* column), transversions (*tv* column) and the ratio of the two
+(*ts/tv* column), e.g.::
+
+ ts    tv    ts/tv
+ 126   39    3.2307
+
+**Alternate allele frequency spectrum**
+
+Runs ``gemini stats --sfs`` to produce binned alternate allele frequency counts
+in a table like::
 
  aaf     count
  0.125   2
  0.375   1
 
-EXAMPLE Compute the pair-wise genetic distance between each sample; --mds::
+**Pairwise genetic distances**
+
+Runs ``gemini stats --mds`` and tabulates all pairwise genetic distances between the
+samples in your database. An example could look like this::
 
  sample1  sample2  distance
  M10500   M10500   0.0
@@ -106,34 +188,6 @@
  M10500   M10475   2.0
  M10500   M10478   0.5714
 
-EXAMPLE Return a count of the types of genotypes per sample; --gts-by-sample::
-
- sample   num_hom_ref   num_het   num_hom_alt   num_unknown   total
- M10475   4             1         3             1             9
- M10478   2             2         4             1             9
-
-
-
-EXAMPLE Return the total variants per sample (sum of homozygous and heterozygous variants); --vars-by-sample::
-
- sample  total
- M10475  4
- M10478  6
-
-**Final solution**
-
---summarize:
- If none of these tools are exactly what you want, you can summarize the variants per sample of an arbitrary query using the –summarize flag. 
-
-EXAMPLE If you wanted to know, for each sample, how many variants are on chromosome 1 that are also in dbSNP;--summarize "select * from variants where in_dbsnp=1 and chrom='chr1'":: 
-
- sample   total  num_het  num_hom_alt
- M10475   1      1        0
- M128215  1      1        0
- M10478   2      2        0
- M10500   2      1        1
-
-
     ]]></help>
     <expand macro="citations"/>
 </tool>
--- a/repository_dependencies.xml	Fri Dec 14 12:47:19 2018 -0500
+++ b/repository_dependencies.xml	Fri Jan 11 17:43:48 2019 -0500
@@ -1,4 +1,4 @@
 <?xml version="1.0" ?>
 <repositories description="This requires the GEMINI data manager definition to install all required annotation databases.">
-    <repository changeset_revision="fe5a9a7d95b0" name="data_manager_gemini_database_downloader" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"/>
+    <repository changeset_revision="f57426daa04d" name="data_manager_gemini_database_downloader" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"/>
 </repositories>
\ No newline at end of file
Binary file test-data/gemini_amend_input.db has changed
Binary file test-data/gemini_annotate_result.db has changed
Binary file test-data/gemini_auto_dom_input.db has changed
Binary file test-data/gemini_auto_rec_input.db has changed
Binary file test-data/gemini_comphets_input.db has changed
Binary file test-data/gemini_de_novo_input.db has changed
Binary file test-data/gemini_is_somatic_result.db has changed
Binary file test-data/gemini_load_result1.db has changed
Binary file test-data/gemini_load_result2.db has changed
--- a/test-data/gemini_versioned_databases.loc	Fri Dec 14 12:47:19 2018 -0500
+++ b/test-data/gemini_versioned_databases.loc	Fri Jan 11 17:43:48 2019 -0500
@@ -1,3 +1,3 @@
 ## GEMINI versioned databases
 #DownloadDate	dbkey	DBversion	Description	Path
-1999-01-01	hg19	181	GEMINI annotations (test snapshot)	${__HERE__}/test-cache
+1999-01-01	hg19	200	GEMINI annotations (test snapshot)	${__HERE__}/test-cache
--- a/test-data/test-cache/gemini-config.yaml	Fri Dec 14 12:47:19 2018 -0500
+++ b/test-data/test-cache/gemini-config.yaml	Fri Jan 11 17:43:48 2019 -0500
@@ -2,12 +2,14 @@
 versions:
   ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.tidy.vcf.gz: 4
   ESP6500SI.all.snps_indels.tidy.v2.vcf.gz: 2
-  ExAC.r0.3.sites.vep.tidy.vcf.gz: 3
+  ExAC.r0.3.sites.vep.tidy.vcf.gz: 4
   GRCh37-gms-mappability.vcf.gz: 2
-  clinvar_20160203.tidy.vcf.gz: 5
+  clinvar_20170130.tidy.vcf.gz: 5
   cosmic-v68-GRCh37.tidy.vcf.gz: 3
-  dbsnp.b141.20140813.hg19.tidy.vcf.gz: 4
+  dbsnp.b147.20160601.tidy.vcf.gz: 1
   detailed_gene_table_v75: 2
   geno2mp.variants.tidy.vcf.gz: 1
+  gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz: 2
   hg19.rmsk.bed.gz: 2
   summary_gene_table_v75: 2
+  whole_genome_SNVs.tsv.compressed.gz: 2
Binary file test-data/test-cache/gemini/data/clinvar_20160203.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/clinvar_20160203.tidy.vcf.gz.tbi has changed
Binary file test-data/test-cache/gemini/data/clinvar_20170130.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/clinvar_20170130.tidy.vcf.gz.tbi has changed
Binary file test-data/test-cache/gemini/data/dbsnp.b141.20140813.hg19.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/dbsnp.b141.20140813.hg19.tidy.vcf.gz.tbi has changed
Binary file test-data/test-cache/gemini/data/dbsnp.b147.20160601.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/dbsnp.b147.20160601.tidy.vcf.gz.tbi has changed
Binary file test-data/test-cache/gemini/data/gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz has changed
Binary file test-data/test-cache/gemini/data/gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz.tbi has changed