Mercurial > repos > iuc > gemini_stats
diff gemini_stats.xml @ 5:86d4303cc3ca draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 62ed732cba355e695181924a8ed4cce49ca21c59
author | iuc |
---|---|
date | Fri, 11 Jan 2019 17:43:48 -0500 |
parents | cdd90678004a |
children | b92cfa44f5be |
line wrap: on
line diff
--- a/gemini_stats.xml Fri Dec 14 12:47:19 2018 -0500 +++ b/gemini_stats.xml Fri Jan 11 17:43:48 2019 -0500 @@ -1,4 +1,4 @@ -<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.1"> +<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@"> <description>Compute useful variant statistics</description> <macros> <import>gemini_macros.xml</import> @@ -10,41 +10,83 @@ <command> <![CDATA[ gemini @BINARY@ - $stats_type - - #set $multiline_sql_expr = $gt_filter - #set $cmdln_param = "--gt-filter" - @MULTILN_SQL_EXPR_TO_CMDLN@ + #if str($stats.type) == "gts-stats": + #set $multiline_sql_expr = $stats.variants.gt_filter + #set $cmdln_param = "--gt-filter" + @MULTILN_SQL_EXPR_TO_CMDLN@ - #set $multiline_sql_expr = $summarize - #set $cmdln_param = "--summarize" - @MULTILN_SQL_EXPR_TO_CMDLN@ - - "${ infile }" - > "${ outfile }" + #if str($stats.variants.constraint).strip(): + #set $multiline_sql_expr = "select * from variants WHERE " + str($stats.variants.constraint) + #else: + #set $multiline_sql_expr = "select * from variants" + #end if + #set $cmdln_param = "--summarize" + @MULTILN_SQL_EXPR_TO_CMDLN@ + #else: + ${stats.stats_option} + #end if + '$infile' + > '$outfile' ]]> </command> <inputs> <expand macro="infile" /> - <param name="stats_type" type="select" label="Studying ..." help=""> - <option value="--tstv">Compute the transition and transversion ratios for the snps (--tstv)</option> - <option value="--tstv-coding">Compute the transition/transversion ratios for the snps in the coding regions (--tstv-coding)</option> - <option value="--tstv-noncoding">Compute the transition/transversion ratios for the snps in the non-coding regions (--tstv-noncoding)</option> - <option value="--snp-counts">Compute the type and count of the snps (--snp-counts)</option> - <option value="--sfs">Calculate the site frequency spectrum of the variants (--sfs)</option> - <option value="--mds">Compute the pair-wise genetic distance between each sample (--mds)</option> - <option value="--vars-by-sample">Return the total variants per sample, sum of homozygous and heterozygous variants (--vars-by-sample)</option> - <option value="--gts-by-sample">Return the count of each genotype class observed per sample (--gts-by-sample)</option> - </param> - - <param name="gt_filter" type="text" area="True" size="5x50" label="Restrictions to apply to genotype values" help="(--gt-filer)"> - <expand macro="sanitize_query" /> - </param> - - <param name="summarize" type="text" area="True" size="5x50" label="The query to be issued to the database to summarize" help="(--summarize)"> - <expand macro="sanitize_query" /> - </param> + <conditional name="stats"> + <param name="type" type="select" + label="Select the type of statistics you are interested in" help=""> + <option value="gts-stats">Genotype counts tabulated by sample (--summarize)</option> + <option value="snp-counts">Counts of SNPs by nucleotide change (--snp-counts)</option> + <option value="tstv-stats">Transition / transversion statistics for the SNPs in the dataset</option> + <option value="aaf">Alternate allele frequency spectrum of all variants (--sfs)</option> + <option value="sample-distance">Pair-wise genetic distances between for all samples (--mds)</option> + </param> + <when value="snp-counts"> + <param name="stats_option" type="hidden" value="--snp-counts" /> + </when> + <when value="aaf"> + <param name="stats_option" type="hidden" value="--sfs" /> + </when> + <when value="sample-distance"> + <param name="stats_option" type="hidden" value="--mds" /> + </when> + <when value="tstv-stats"> + <param name="stats_option" type="select" + label="Calculate Ts/Tv statistics based on" + help="Restricting the calculation to coding/noncoding regions will only produce meaningful results with preannotated variants. If you haven't annotated your variants with SnpEff or VEP before loading them into GEMINI, select All SNPs."> + <option value="--tstv">All SNPs (--tstv)</option> + <option value="--tstv-coding">SNPs in coding regions (--tstv-coding)</option> + <option value="--tstv-noncoding">SNPs in non-coding regions (--tstv-noncoding)</option> + </param> + </when> + <when value="gts-stats"> + <param name="stats_option" type="hidden" value="" /> + <conditional name="variants"> + <param name="keep" type="select" + label="Compute the genotype counts table based on" + help="If you select All variants the genotype counts will be produced using --summarize with the wildcard query "select * from variants"."> + <option value="all">All variants</option> + <option value="custom">Custom filtered variants</option> + </param> + <when value="all"> + <param name="gt_filter" type="hidden" value="" /> + <param name="constraint" type="hidden" value="" /> + </when> + <when value="custom"> + <param argument="--gt-filter" name="gt_filter" type="text" area="True" size="5x50" + label="Restrictions to apply to genotype values" + help=""> + <expand macro="sanitize_query" /> + </param> + <param name="constraint" type="text" area="True" size="5x50" + label="Additional constraints on the variants" + help="Enter valid constraints for the WHERE clause of a GEMINI query here. You could use, for example: chrom = 'chr1' or impact_severity = 'HIGH', to include only high-impact variants on chromosome 1 in the counts table."> + <expand macro="sanitize_query" /> + </param> + </when> + </conditional> + </when> + </conditional> </inputs> <outputs> <data name="outfile" format="tabular" /> @@ -52,21 +94,32 @@ <tests> <test> <!-- test vars-by-sample report --> - <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" /> - <param name="stats_type" value="--vars-by-sample" /> + <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" /> + <conditional name="stats"> + <param name="type" value="tstv-stats" /> + <param name="stats_option" value="--tstv-coding" /> + </conditional> <output name="outfile"> <assert_contents> - <has_line_matching expression="sample	total" /> + <!-- since the input file is not annotated + no variants will be considered to be in coding regions --> + <has_line line="ts	tv	ts/tv" /> + <has_line line="0	0	0" /> </assert_contents> </output> </test> <test> <!-- test gts-by-sample report --> <param name="infile" value="gemini_de_novo_input.db" ftype="gemini.sqlite" /> - <param name="stats_type" value="--gts-by-sample" /> + <conditional name="stats"> + <param name="type" value="gts-stats" /> + <conditional name="variants"> + <param name="keep" value="all" /> + </conditional> + </conditional> <output name="outfile"> <assert_contents> - <has_line_matching expression="sample	num_hom_ref	num_het	num_hom_alt	num_unknown	total" /> + <has_line_matching expression="sample	total	num_het	num_hom_alt	num_hom_ref" /> </assert_contents> </output> </test> @@ -74,31 +127,60 @@ <help><![CDATA[ **What it does** -The stats tool computes some useful variant statistics for a GEMINI database. -Like computing the transition and transversion ratios for the snps. +The stats tool computes one of the following useful variant statistics for a GEMINI database: + +**Genotype counts tabulated by sample**: -**Settings and examples** +This mode uses the ``gemini stats --summarize`` option to produce a table with +one row per sample, which tabulates the numbers of sites, for which a given +sample shows a: ---tstv-coding: - Compute the transition/transversion ratios for the snps in the coding regions. +- non-reference genotype (*total* column; the sum of the *num_het* and *num_hom_alt* columns next to it) +- heterozygous genotype (*num_het* column) +- homozygous variant genotype (*num_hom_alt* column) +- homozygous reference genotype (*num_hom_ref* column) ---tstv-noncoding: - Compute the transition/transversion ratios for the snps in the non-coding regions. +You can choose to calculate the table based on all variants in your database, +or to filter the variants before the calculation using GEMINI genotype filter +expressions and/or WHERE clauses of GEMINI queries. -EXAMPLE Compute the type and count of the snps; --snp-counts:: +**Counts of SNPs by nucleotide change**: + +This runs ``gemini stats`` with the ``--snp-count`` option. The result is a +simple table listing the number of occurences of each observed REF->ALT change +in your database, e.g.:: type count A->G 2 C->T 1 G->A 1 -EXAMPLE Calculate the site frequency spectrum of the variants; --sfs:: +**Transition / transversion statistics** + +This mode uses ``gemini stats`` with the ``--tstv``, ``--tstv-coding``, or +``--tstv-noncoding`` option to compute the transition/transversion ratios for +all SNPs, for SNPs in coding, or SNPs in non-coding regions, respectively. + +The result is presented in a 1x3 table listing the number of +transitions (*ts* column), transversions (*tv* column) and the ratio of the two +(*ts/tv* column), e.g.:: + + ts tv ts/tv + 126 39 3.2307 + +**Alternate allele frequency spectrum** + +Runs ``gemini stats --sfs`` to produce binned alternate allele frequency counts +in a table like:: aaf count 0.125 2 0.375 1 -EXAMPLE Compute the pair-wise genetic distance between each sample; --mds:: +**Pairwise genetic distances** + +Runs ``gemini stats --mds`` and tabulates all pairwise genetic distance for the +samples in your database. An example could look like this:: sample1 sample2 distance M10500 M10500 0.0 @@ -106,34 +188,6 @@ M10500 M10475 2.0 M10500 M10478 0.5714 -EXAMPLE Return a count of the types of genotypes per sample; --gts-by-sample:: - - sample num_hom_ref num_het num_hom_alt num_unknown total - M10475 4 1 3 1 9 - M10478 2 2 4 1 9 - - - -EXAMPLE Return the total variants per sample (sum of homozygous and heterozygous variants); --vars-by-sample:: - - sample total - M10475 4 - M10478 6 - -**Final solution** - ---summarize: - If none of these tools are exactly what you want, you can summarize the variants per sample of an arbitrary query using the –summarize flag. - -EXAMPLE If you wanted to know, for each sample, how many variants are on chromosome 1 that are also in dbSNP;--summarize "select * from variants where in_dbsnp=1 and chrom='chr1'":: - - sample total num_het num_hom_alt - M10475 1 1 0 - M128215 1 1 0 - M10478 2 2 0 - M10500 2 1 1 - - ]]></help> <expand macro="citations"/> </tool>