Mercurial > repos > iuc > gemini_query
changeset 5:cd00221d67cb draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 62ed732cba355e695181924a8ed4cce49ca21c59
line wrap: on
line diff
--- a/gemini_macros.xml Fri Dec 14 12:51:59 2018 -0500 +++ b/gemini_macros.xml Fri Jan 11 17:47:02 2019 -0500 @@ -1,15 +1,12 @@ <macros> <!-- gemini version to be used --> - <token name="@VERSION@">0.18.1</token> + <token name="@VERSION@">0.20.1</token> <!-- minimal annotation files version required by this version of gemini --> - <token name="@DB_VERSION@">181</token> + <token name="@DB_VERSION@">200</token> <xml name="requirements"> <requirements> <requirement type="package" version="@VERSION@">gemini</requirement> - <requirement type="package" version="0.2.6">tabix</requirement> - <!-- for conda useage --> - <!--requirement type="package" version="1.3.1">htslib</requirement--> <yield /> </requirements> </xml> @@ -24,9 +21,17 @@ <exit_code range=":-1" /> <regex match="Error:" /> <regex match="Exception:" /> + <yield /> </stdio> </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1003153</citation> + <yield /> + </citations> + </xml> + <xml name="annotation_dir"> <param name="annotation_databases" type="select" label="Choose a gemini annotation source"> <options from_data_table="gemini_versioned_databases"> @@ -36,31 +41,36 @@ </param> </xml> - <xml name="add_header_column"> - <param name="header" type="boolean" truevalue="--header" falsevalue="" checked="False" - label="Add a header of column names to the output" help="(--header)"/> - </xml> - - <xml name="radius"> - <param name="radius" type="integer" value="3" label="Set filter for Breadth-first search (BFS) in the Protein-Protein Interaction network" help="(-r)" > - <validator type="in_range" min="0"/> + <xml name="infile"> + <param name="infile" type="data" format="gemini.sqlite" label="GEMINI database" help="Only files with version @VERSION@ are accepted." 
> + <options options_filter_attribute="metadata.gemini_version" > + <filter type="add_value" value="@VERSION@" /> + </options> </param> </xml> - <xml name="variant_mode"> - <param name="variant_mode" type="boolean" truevalue="--var" falsevalue="" checked="False" - label="Returns variant info (e.g. impact, biotype) for interacting genes" help="(--var)"/> + + <xml name="add_header_column"> + <param argument="--header" name="header" type="boolean" truevalue="--header" falsevalue="" checked="True" + label="Add a header of column names to the output" /> </xml> - <xml name="column_filter"> + <xml name="column_filter" token_help="" token_minimalset="variant_id, gene"> <conditional name="report"> - <param name="report_selector" type="select" label="Columns to include in the report" - help="By default, this tool reports all columns in the variants table. One may choose to report only a subset of the columns."> - <option value="all" selected="True">all</option> - <option value="column_filter">User given columns</option> + <param name="report_selector" type="select" + label="Set of columns to include in the variant report table" + help="@HELP@"> + <option value="minimal">Minimal (report only a preconfigured minimal set of columns)</option> + <option value="full">Full (report all columns defined in the GEMINI database variants table)</option> + <option value="custom">Custom (report user-specified columns)</option> </param> - <when value="all"/> - <when value="column_filter"> - <param name="columns" type="select" display="checkboxes" multiple="True" label="Choose columns to include in the report" help="(--columns)"> + <when value="full" /> + <when value="minimal"> + <param name="columns" type="hidden" value="@MINIMALSET@" /> + <param name="extra_cols" type="hidden" value="" /> + </when> + <when value="custom"> + <param name="columns" type="select" display="checkboxes" multiple="true" optional="true" + label="Choose columns to include in the report" help="(--columns)"> <option 
value="gene">gene</option> <option value="chrom">chrom</option> <option value="start">start</option> @@ -69,27 +79,23 @@ <option value="alt">alt</option> <option value="impact">impact</option> <option value="impact_severity">impact_severity</option> - <option value="max_aaf_all">alternative allele frequency</option> + <option value="max_aaf_all">alternative allele frequency (max_aaf_all)</option> </param> - <param name="extra_cols" type="text" label="Additional columns." help="Separate by whitespace"/> + <param name="extra_cols" type="text" + label="Additional columns (comma-separated)" + help="Column must be specified by the exact name they have in the GEMINI database, e.g., is_exonic or num_hom_alt, but, for genotype columns, GEMINI wildcard syntax is supported. The order of columns in the list is maintained in the output."> + <expand macro="sanitize_query" /> + </param> </when> </conditional> </xml> - <xml name="filter"> - <conditional name="filter"> - <param name="filter_selector" type="select" label="Apply additional constraints" - help="By default, this tool will report all variants regardless of their putative functional impact. In order to apply additional constraints on the variants returned, you can this optional filter."> - <option value="no">No additional constraints</option> - <option value="yes">Apply additional constraints</option> - </param> - <when value="no"/> - <when value="yes"> - <param name="filter" type="text" label="Contraints in SQL syntax" help="Conditions applied here will become WHERE clauses in the query issued to the GEMINI database. E.g. alt='G' or impact_severity = 'HIGH'. (--filter)"> - <expand macro="sanitize_query" /> - </param> - </when> - </conditional> + <xml name="filter" token_argument="--filter"> + <param argument="@ARGUMENT@" name="filter" type="text" + label="Additional constraints expressed in SQL syntax" + help="Constraints defined here will become the WHERE clause of the SQL query issued to the GEMINI database. E.g. 
alt='G' or impact_severity = 'HIGH'."> + <expand macro="sanitize_query" /> + </param> </xml> <xml name="sanitize_query"> @@ -103,10 +109,90 @@ </sanitizer> </xml> + <xml name="lenient" token_argument="--lenient" token_truevalue="--lenient" token_help="The exact consequence of this setting depends on the type of inheritance pattern you are looking for (see the tool help below)."> + <param argument="@ARGUMENT@" name="lenient" type="boolean" truevalue="@TRUEVALUE@" falsevalue="" checked="False" + label="Include hits with less convincing inheritance patterns" + help= "@HELP@" /> + </xml> + + <xml name="unaffected"> + <param argument="--allow-unaffected" name="allow_unaffected" type="boolean" truevalue="--allow-unaffected" falsevalue="" checked="False" + label="Report candidates shared by unaffected samples" + help="Activating this option will enable the reporting of variants as candidate causative even if they are shared by unaffected samples in the family tree. The default will only report variants that are unique to affected samples."/> + </xml> + + <xml name="min_kindreds" token_label="Minimum number of families with a candidate variant for a gene to be reported" token_help="This is the number of families required to have a variant fitting the inheritance model in the same gene in order for the gene and its variants to be reported. 
For example, we may only be interested in candidates where at least 4 families have a variant (with a fitting inheritance pattern) in that gene."> + <param argument="--min-kindreds" name="min_kindreds" type="integer" value="1" min="1" + label="@LABEL@" + help="@HELP@" /> + </xml> + + <xml name="insert_constraint" token_max_repeat="1"> + <repeat name="constraint" title="Additional constraints on variants" default="0" max="@MAX_REPEAT@"> + <expand macro="filter" /> + <yield /> + </repeat> + </xml> + + <xml name="overwritable_where_default" token_default_where=""> + <param name="overwrite_default_filter" type="boolean" checked="false" + label="Overwrite the default constraint of this tool" + help="By default, this tool restricts its analysis to @DEFAULT_WHERE@ and this constraint is applied on top of any constraint expressed above. With this option here selected, your custom constraint, if given, will overwrite the default instead." /> + </xml> + + <xml name="gt_filter" token_default_repeat="0" token_min_repeat="0" token_max_repeat="1"> + <repeat name="filter_by_genotype" title="Genotype filter expression" default="@DEFAULT_REPEAT@" min="@MIN_REPEAT@" max="@MAX_REPEAT@"> + <param argument="--gt-filter" name="gt_filter" type="text" value="" area="True" size="5x50" + label="Restrictions to apply to genotype values" help=""> + <expand macro="sanitize_query" /> + <validator type="expression" message="Genotype filter expression cannot be empty">value.strip()</validator> + </param> + <yield /> + </repeat> + </xml> + + <xml name="sample_filter"> + <repeat name="filter_by_sample" title="Sample filter expression" default="0" max="1"> + <param argument="--sample-filter" name="sample_filter" type="text" area="True" size="5x50" + label="SQL filter to use to filter the sample table" help=""> + <expand macro="sanitize_query" /> + <validator type="expression" message="Sample filter expression cannot be empty">value.strip()</validator> + </param> + <param argument="--in" name="in" 
type="select" + label="A variant must be in either all, none or any samples passing the sample-query filter" + help=""> + <option value="">Return a variant if it is found in any sample passing the sample filter. (default) </option> + <option value="--in all">Return a variant if it is found in ALL samples passing the sample filter. (all)</option> + <option value="--in none">Return a variant if it is found in NO sample passing the sample filter. (none)</option> + <option value="--in only">Return a variant if it is found in any sample passing the sample filter, and in NO sample NOT passing it. (only)</option> + <option value="--in only all">Return a variant if is found in ALL samples passing the sample filter, and in NO sample NOT passing it. (only all)</option> + </param> + <expand macro="min_kindreds" + label="Minimum number of families in which a variant must pass the sample filter" help=""/> + <param argument="--family-wise" name="family_wise" type="boolean" truevalue="--family-wise" falsevalue="" checked="False" + label="Apply the sample-filter on a family-wise basis" help="If a variant passes the sample filter in at least the minimum number of families specified above it is retained." /> + </repeat> + </xml> + + <xml name="region_filter"> + <repeat name="regions" title="Region Filter" default="0" min="0" + help="Filter variant sites by their position in the genome. 
If multiple Region Filters are specified, all variants that fall in ONE of the regions are reported."> + <param name="chrom" type="text" label="Chromosome"> + <validator type="expression" message="A chromosome identifier is required when specifying a region filter">value.strip()</validator> + </param> + <param name="start" type="text" label="Region Start"> + <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator> + </param> + <param name="stop" type="text" label="Region End"> + <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator> + </param> + </repeat> + </xml> + <token name="@PROVIDE_ANNO_DATA@"><![CDATA[ mkdir gemini && - ln -s "${annotation_databases.fields.path}/gemini/data" gemini/data && - export GEMINI_CONFIG="${annotation_databases.fields.path}" && + ln -s '${annotation_databases.fields.path}/gemini/data' gemini/data && + export GEMINI_CONFIG='${annotation_databases.fields.path}' && ]]></token> <token name="@MULTILN_SQL_EXPR_TO_CMDLN@"> @@ -119,67 +205,50 @@ #end if </token> - <token name="@CMDLN_SQL_FILTER_FILTER_OPTION@"> - #if str($filter.filter_selector) == 'yes' and $filter.filter: - --filter '${ str( $filter.filter ) }' + <token name="@SET_COLS@"> + #if str($report.report_selector) == 'full': + #set cols = "*" + #else: + #if $report.columns and str($report.columns) != '': + #set $cols = str($report.columns) + #else + #set $cols = '' + #end if + #if str($report.extra_cols).strip(): + #if $cols: + #set $cols = $cols + ', ' + str($report.extra_cols) + #else: + #set $cols = str($report.extra_cols) + #end if + #end if + #if not $cols: + #set $cols = "variant_id, gene" + #end if #end if </token> <token name="@COLUMN_SELECT@"> - #if $report.report_selector != 'all': - --columns "${report.columns} - #if str($report.extra_cols).strip() - #echo ','+','.join(str($report.extra_cols).split()) - #end if - " + @SET_COLS@ + #if $cols != "*" + --columns 
'$cols' #end if </token> - <xml name="family"> - <param name="families" type="text" value="" label="Comma seperated list of families to restrict the analysis to." help="e.g. Family1,Family3 (--families)"/> - </xml> - - <xml name="lenient"> - <param name="lenient" type="boolean" truevalue="--lenient" falsevalue="" checked="False" label="Loosen the restrictions on family structure"/> - </xml> - - <xml name="unaffected"> - <param name="allow_unaffected" type="boolean" truevalue="--allow-unaffected" falsevalue="" checked="False" label="Report candidates that also impact samples labeled as unaffected." help="(--allow-unaffected)"/> - </xml> - - <xml name="min_kindreds"> - <param name="min_kindreds" type="integer" value="1" label="The min. number of kindreds that must have a candidate variant in a gene" help="default: 1 (--min-kindreds)" /> - </xml> - - <xml name="min_sequence_depth"> - <param name="d" type="integer" value="0" min="0" label="The minimum aligned sequence depth (genotype DP) required for each sample" - help="default: 0 (-d)" /> - </xml> - - <xml name="min_gq"> - <param name="min_gq" type="integer" value="0" label="the minimum genotype quality required for each sample in a family" help="default: 0 (--min-gq)"> - <validator type="in_range" min="0"/> - </param> - </xml> - - <xml name="gt_pl_max"> - <param name="gt_pl_max" type="integer" value="-1" min="-1" label="The maximum phred-scaled genotype likelihod (PL) allowed for each sample in a family" help="default: -1 (not set) (--gt-pl-max)" /> - </xml> - - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1003153</citation> - <yield /> - </citations> - </xml> - - <xml name="infile"> - <param name="infile" type="data" format="gemini.sqlite" label="GEMINI database" help="Only files with version @VERSION@ are accepted." 
> - <options options_filter_attribute="metadata.gemini_version" > - <filter type="add_value" value="@VERSION@" /> - </options> - <validator type="expression" message="This version of Gemini will only work with Gemini files that are for version @VERSION@.">value is not None and value.metadata.gemini_version == "@VERSION@"</validator> - </param> - </xml> - + <token name="@PARSE_REGION_ELEMENTS@"><![CDATA[ + #set $region_elements = [] + #for $r in $regions: + ## The actual chromosome name needs to be single-quoted + ## in SQL, so we need to quote the single quotes like the + ## sanitize_query macro would if the whole was a parameter. + #set $r_elements = ["chrom = '\"'\"'%s'\"'\"'" % str($r.chrom).strip()] + #if str($r.start).strip(): + #silent $r_elements.append("start >= %d" % int($r.start)) + #end if + #if str($r.stop).strip(): + #silent $r_elements.append("end <= %d" % int($r.stop)) + #end if + #silent $region_elements.append("(%s)" % " AND ".join($r_elements)) + #end for + ]]> + </token> </macros>
--- a/gemini_query.xml Fri Dec 14 12:51:59 2018 -0500 +++ b/gemini_query.xml Fri Jan 11 17:47:02 2019 -0500 @@ -1,8 +1,32 @@ -<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@.1"> +<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@"> <description>Querying the GEMINI database</description> <macros> <import>gemini_macros.xml</import> <token name="@BINARY@">query</token> + + <xml name="sorting"> + <param name="order_by" type="text" + label="Sort the output by the following column(s)" + help="" /> + <param name="sort_order" type="select" label="Sort order"> + <option value=" ASC">Ascending</option> + <option value=" DESC">Descending</option> + </param> + </xml> + <xml name="pheno_strat"> + <param name="phenotype" type="text" + label="Phenotype to stratify samples across" + help="Leave blank to stratify across the default phenotype column" /> + </xml> + <xml name="sample_delimiter" token_applied_to="samples"> + <param argument="--sample-delim" name="sample_delim" type="text" value="," + label="Delimiter to use in the list of affected @APPLIED_TO@" + help="" /> + </xml> + <xml name="dgidb_query"> + <param argument="--dgidb" name="dgidb" type="boolean" truevalue="--dgidb" falsevalue="" checked="False" + label="Request drug-gene interaction info from DGIdb" help="" /> + </xml> </macros> <expand macro="requirements" /> <expand macro="stdio" /> @@ -10,91 +34,251 @@ <command> <![CDATA[ gemini @BINARY@ + ${query.oformat.report.header} + ${query.oformat.report.dgidb} - --in "${in}" + #for $i in $query.filter_by_genotype: + #set $multiline_sql_expr = str($i.gt_filter) + #set $cmdln_param = "--gt-filter" + @MULTILN_SQL_EXPR_TO_CMDLN@ + #end for - #set $multiline_sql_expr = $gt_filter - #set $cmdln_param = "--gt-filter" - @MULTILN_SQL_EXPR_TO_CMDLN@ - - #set $multiline_sql_expr = $sample_filter - #set $cmdln_param = "--sample-filter" - @MULTILN_SQL_EXPR_TO_CMDLN@ + #for $i in $query.filter_by_sample: + $i.family_wise + #if int($i.min_kindreds) 
> 0: + --min-kindreds ${i.min_kindreds} + #end if + ${i.in} + #set $multiline_sql_expr = str($i.sample_filter) + #set $cmdln_param = "--sample-filter" + @MULTILN_SQL_EXPR_TO_CMDLN@ + #end for - $show_samples - $show_families - $family_wise - $header - $dgidb - #if $region.strip(): - --region "${region}" + #if str($query.oformat.report.format) == 'with_samples': + #set $sample_delim = str($query.oformat.report.sample_delim) or ',' + --show-samples --sample-delim '$sample_delim' + #elif str($query.oformat.report.format) == 'with_samples_flattened': + --show-samples --format sampledetail + #elif str($query.oformat.report.format) == 'with_families': + #set $sample_delim = str($query.oformat.report.sample_delim) or ',' + --show-families --sample-delim '$sample_delim' + #elif str($query.oformat.report.format) == 'carrier_summary': + --carrier-summary-by-phenotype + #if str($query.oformat.report.phenotype).strip(): + '${query.oformat.report.phenotype}' + #else: + affected + #end if + #else: + --format ${query.oformat.report.format} #end if - #if int($min_kindreds) > 0: - --min-kindreds $min_kindreds + + #if str($query.interface) == 'basic': + ## build the SQL query string from its components + #if str($query.oformat.report.format) in ('vcf', 'tped'): + #set $cols = "*" + #else: + #set $report = $query.oformat.report.report + @SET_COLS@ + #end if + #set $q = "SELECT %s FROM variants" % $cols + #set $where_clause_elements = [] + #if str($query.filter).strip(): + #silent $where_clause_elements.append(str($query.filter).strip()) + #end if + + #set $regions = $query.regions + @PARSE_REGION_ELEMENTS@ + #if $region_elements: + #silent $where_clause_elements.append(" OR ".join($region_elements)) + #end if + #if $where_clause_elements: + #set $q = $q + " WHERE " + " AND ".join($where_clause_elements) + #end if + #if str($query.oformat.report.order_by).strip(): + #set $q = $q + " ORDER BY " + str($query.oformat.report.order_by).strip() + str($query.oformat.report.sort_order) + #end 
if + #else + ## The user entered the SQL query string directly. + #set $q = str($query.q) #end if - ##--format FORMAT Format of output (JSON, TPED or default) # we will take default for the time being - ## --sample-delim STRING The delimiter to be used with the --show-samples option. #set $multiline_sql_expr = $q #set $cmdln_param = "-q" @MULTILN_SQL_EXPR_TO_CMDLN@ - "${ infile }" - > "${ outfile }" + '$infile' + > '$outfile' ]]> </command> - <!-- - ##TODO: - - -carrier-summary-by-phenotype CARRIER_SUMMARY - Output columns of counts of carriers and non-carriers - stratified by the given sample phenotype column--> <inputs> <expand macro="infile" /> - - <param name="q" type="text" area="True" size="5x50" label="The query to be issued to the database" help="(-q)"> - <expand macro="sanitize_query" /> - </param> - <param name="gt_filter" type="text" area="True" size="5x50" label="Restrictions to apply to genotype values" help="(--gt-filer)"> - <expand macro="sanitize_query" /> - </param> - <param name="sample_filter" type="text" area="True" size="5x50" label="SQL filter to use to filter the sample table" help="(--sample-filter)"> - <expand macro="sanitize_query" /> - </param> - - <param name="show_samples" type="boolean" truevalue="--show-samples" falsevalue="" checked="False" - label="Add a column of all sample names with a variant to each variant" help="(--show-samples)"/> - - <param name="show_families" type="boolean" truevalue="--show-families" falsevalue="" checked="False" - label="Add a column listing all of the families with a variant to each variant" help="(--show-families)"/> - - <param name="family_wise" type="boolean" truevalue="--family-wise" falsevalue="" checked="False" - label="Perform the sample-filter on a family-wise basis" help="(--family-wise)"/> - - <expand macro="add_header_column" /> - <expand macro="min_kindreds" /> - - <param name="dgidb" type="boolean" truevalue="--dgidb" falsevalue="" checked="False" - label="Request drug-gene interaction info 
from DGIdb" help="(--dgidb)"/> - - <param name="in" type="select" label="A variant must be in either all, none or any samples passing the sample-query filter" help="(--in)"> - <option value="all">Return a variant if all samples matching the query have the variant. (all)</option> - <option value="none">Return a variant if the variant does not appear in any of the matching samples. (none)</option> - <option value="any">Return all of the variant which are in all of the matching samples and not in any of the non-matching samples. (any)</option> - <option value="only">Return a variant if the variant is only in the matching samples and not in any of the non-matching samples. (only)</option> - </param> - - <param name="region" type="text" value="" label="Restrict query to this region" help="e.g. chr1:10-20 (--region)"/> - - + <conditional name="query"> + <param name="interface" type="select" + label="Build GEMINI query using" + help=""> + <option value="basic">Basic variant query constructor</option> + <option value="advanced">Advanced query constructor</option> + </param> + <when value="basic"> + <expand macro="gt_filter" /> + <expand macro="sample_filter" /> + <expand macro="region_filter" /> + <expand macro="filter" argument="" /> + <section name="oformat" title="Output format options" expanded="true"> + <conditional name="report"> + <param name="format" type="select" + label="Type of report to generate"> + <option value="default">tabular (GEMINI default)</option> + <option value="with_samples">tabular with affected samples</option> + <option value="with_samples_flattened">tabular with affected samples flattened</option> + <option value="with_families">tabular with affected families</option> + <option value="carrier_summary">tabular with carrier summary</option> + <option value="vcf">VCF (simplified)</option> + <option value="json">JSON</option> + <option value="tped">TPED</option> + </param> + <when value="default"> + <expand macro="add_header_column" /> + <expand 
macro="column_filter" + minimalset="chrom, start, end, ref, alt, gene, impact" + help=""/> + <expand macro="dgidb_query" /> + <expand macro="sorting" /> + </when> + <when value="with_samples"> + <expand macro="add_header_column" /> + <expand macro="sample_delimiter" /> + <expand macro="column_filter" + minimalset="chrom, start, end, ref, alt, gene, impact" + help=""/> + <expand macro="dgidb_query" /> + <expand macro="sorting" /> + </when> + <when value="with_samples_flattened"> + <expand macro="add_header_column" /> + <expand macro="column_filter" + minimalset="chrom, start, end, ref, alt, gene, impact" + help=""/> + <param name="dgidb" type="hidden" value="" /> + <expand macro="sorting" /> + </when> + <when value="with_families"> + <expand macro="add_header_column" /> + <expand macro="sample_delimiter" applied_to="families"/> + <expand macro="column_filter" + minimalset="chrom, start, end, ref, alt, gene, impact" + help=""/> + <expand macro="dgidb_query" /> + <expand macro="sorting" /> + </when> + <when value="carrier_summary"> + <expand macro="add_header_column" /> + <expand macro="pheno_strat" /> + <expand macro="column_filter" + minimalset="chrom, start, end, ref, alt, gene, impact" + help=""/> + <expand macro="dgidb_query" /> + <expand macro="sorting" /> + </when> + <when value="vcf"> + <expand macro="add_header_column" /> + <param name="order_by" type="hidden" value="" /> + <param name="dgidb" type="hidden" value="" /> + </when> + <when value="json"> + <param name="header" type="hidden" value="" /> + <expand macro="column_filter" + minimalset="chrom, start, end, ref, alt, gene, impact" + help=""/> + <param name="dgidb" type="hidden" value="" /> + <expand macro="sorting" /> + </when> + <when value="tped"> + <param name="header" type="hidden" value="" /> + <param name="dgidb" type="hidden" value="" /> + <expand macro="sorting" /> + </when> + </conditional> + </section> + </when> + <when value="advanced"> + <param argument="-q" name="q" type="text" area="True" 
size="5x50" + label="The query to be issued to the database" + help="Formulate your query using SQL syntax."> + <expand macro="sanitize_query" /> + <validator type="expression" message="Query cannot be empty">value.strip()</validator> + </param> + <expand macro="gt_filter" /> + <expand macro="sample_filter" /> + <section name="oformat" title="Output format options" expanded="true"> + <conditional name="report"> + <param name="format" type="select" + label="Type of report to generate"> + <option value="default">tabular (GEMINI default)</option> + <option value="with_samples">tabular with affected samples</option> + <option value="with_samples_flattened">tabular with affected samples flattened</option> + <option value="with_families">tabular with affected families</option> + <option value="carrier_summary">tabular with carrier summary</option> + <option value="vcf">VCF (simplified)</option> + <option value="json">JSON</option> + <option value="tped">TPED</option> + </param> + <when value="default"> + <expand macro="add_header_column" /> + <expand macro="dgidb_query" /> + </when> + <when value="with_samples"> + <expand macro="add_header_column" /> + <expand macro="sample_delimiter" /> + <expand macro="dgidb_query" /> + </when> + <when value="with_samples_flattened"> + <expand macro="add_header_column" /> + <param name="dgidb" type="hidden" value="" /> + </when> + <when value="with_families"> + <expand macro="add_header_column" /> + <expand macro="sample_delimiter" /> + <expand macro="dgidb_query" /> + </when> + <when value="carrier_summary"> + <expand macro="pheno_strat" /> + <expand macro="add_header_column" /> + <expand macro="dgidb_query" /> + </when> + <when value="vcf"> + <expand macro="add_header_column" /> + <param name="dgidb" type="hidden" value="" /> + </when> + <when value="json"> + <param name="header" type="hidden" value="" /> + <param name="dgidb" type="hidden" value="" /> + </when> + <when value="tped"> + <param name="header" type="hidden" value="" /> + 
<param name="dgidb" type="hidden" value="" /> + </when> + </conditional> + </section> + </when> + </conditional> </inputs> <outputs> - <data name="outfile" format="tabular" /> + <data name="outfile" format="tabular"> + <change_format> + <when input="query.oformat.report.format" value="json" format="json" /> + <when input="query.oformat.report.format" value="vcf" format="vcf" /> + </change_format> + </data> </outputs> <tests> <test> <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" /> - <param name="q" value="select chrom,start from variants limit 10" /> - <param name="header" value="True" /> + <conditional name="query"> + <param name="interface" value="advanced" /> + <param name="q" value="select chrom,start from variants limit 10" /> + </conditional> <output name="outfile"> <assert_contents> <has_line_matching expression="chrom	start" /> @@ -106,10 +290,116 @@ <![CDATA[ **What it does** -The real power in the GEMINI framework lies in the fact that all of your genetic variants have been stored in a convenient database in the context of a wealth of genome annotations that facilitate variant interpretation. -The expressive power of SQL allows one to pose intricate questions of one’s variation data. This tool offers you an easy way to query your variants! +The real power in the GEMINI framework lies in the fact that all of your +genetic variants have been stored in a convenient database in the context of a +wealth of genome annotations that facilitate variant interpretation. +The expressive power of SQL allows one to pose intricate questions of one’s +variation data. This tool offers you a flexible, yet relatively easy way +to query your variants! + +----- + +*Building your variant query with the Basic variant query constructor* + +This mode tries to break down the complexity of formulating GEMINI queries +into more easily digestable parts. In this mode, the tool also prevents you +from combining options that are incompatible or not meaningful. 
+ +*Genotype filters* + +These are discussed `here +<https://gemini.readthedocs.io/en/latest/content/querying.html#gt-filter-filtering-on-genotypes>`__ +in the GEMINI documentation. + +The tool supports regular genotype filters like:: + + gt.sample1 == HET and gt_depths.sample1 >= 15 + +, which would keep only variants for which sample 1 is a heterozygous carrier +and the genomic position in sample1 is covered by at least 15 sequencing +reads, as well as GEMINI wildcard filters of the general form +*(COLUMN).(SAMPLE_FILTER).(RULE).(RULE_ENFORCEMENT)* like:: + + (gt_types).(phenotype==2).(!=HOM_REF).(all) + +, which keeps only variants for which no phenotypically affected sample is +homozygous for the reference allele. + +*Sample filters* + +Sample filters have the same format as the second component of the genotype +wildcard filters above, so:: + + phenotype == 2 + +would filter for phenotypically affected samples. In this case, however, the +filter determines from which samples variants should be reported, i.e., here, +only variants found in phenotypically affected samples get analyzed. You can +use the ``--in`` filter to adjust the exact meaning of the sample filter. + +*Region filters* + +They let you restrict your analysis to parts of the genome, which can be useful +if you have prior knowledge of the approximate location of a variant of +interest. + +If you specify more than one region filter, they get combined with a logical +*OR*, meaning variants and genes falling in *any* of the regions are reported. -http://gemini.readthedocs.org/en/latest/content/querying.html +*Additional constraints on variants* + +These get translated directly into the WHERE clause of an SQL query and, thus, +have to be expressed in valid SQL syntax. As an example you could use:: + + is_exonic = 1 and impact_severity != 'LOW' + +to indicate that you are only interested in exonic variants that are not of +*LOW* impact severity, *i.e.*, not silent mutations. 
+ +Note that, in SQL syntax, tests for equality use a single ``=``, while genotype +filters (discussed above) follow Python syntax and use ``==`` for the +same purpose. Also note that non-numerical values need to be enclosed in +single-quotes, *e.g.* ``'LOW'``, but numerical values must *NOT* be. + +----- + +*Building your query with the Advanced query constructor* + +For the sake of simplicity, the basic mode of the tool limits your queries to +the variants table of the underlying database. While this still allows many +useful queries to be formulated, it prevents you from joining information from +other tables (in particular, the gene_detailed table) or from querying a different +table directly. + +In advanced mode, you take responsibility for formulating the complete SQL +query in correct syntax, which allows you to do anything you could do with the +command line tool. Beyond querying other tables, this includes changing output +column names, deriving simple statistics on columns using the SQL Min, Max, +Count, Avg and Sum functions, and more. + +The price you pay for this extra flexibility is that you will have to make sure +that any other tool options you set are compatible with the result of your +particular query. For example, most output formats except the tabular default +output of GEMINI are incompatible with non-standard queries. Choosing +non-compatible options can result in them getting ignored silently, but also +in tool errors, or in problems with downstream tools. + +The chapter `Querying the GEMINI database +<http://gemini.readthedocs.org/en/latest/content/querying.html>`__ of the +GEMINI documentation can get you started with formulating your own queries. + +Note that genotype filters and sample filters cannot be expressed as genuine +SQL queries, so even the Advanced query constructor offers them. 
Region +filters and sort order of rows and columns on the other hand can be controlled +through SQL queries, like in this example:: + + SELECT gene, chrom, start, end, ref, alt FROM variants WHERE chrom = 'chr1' + AND start >= 10000000 and end <= 20000000 and is_lof = 1 ORDER BY chrom, + start + +, which would report all loss-of-function variants between 10,000,000 and +20,000,000 on chr1 and report the selected columns sorted on chromosome, then +position. + ]]> </help> <expand macro="citations"/>
--- a/repository_dependencies.xml Fri Dec 14 12:51:59 2018 -0500 +++ b/repository_dependencies.xml Fri Jan 11 17:47:02 2019 -0500 @@ -1,4 +1,4 @@ <?xml version="1.0" ?> <repositories description="This requires the GEMINI data manager definition to install all required annotation databases."> - <repository changeset_revision="fe5a9a7d95b0" name="data_manager_gemini_database_downloader" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"/> + <repository changeset_revision="f57426daa04d" name="data_manager_gemini_database_downloader" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"/> </repositories> \ No newline at end of file
--- a/test-data/gemini_versioned_databases.loc Fri Dec 14 12:51:59 2018 -0500 +++ b/test-data/gemini_versioned_databases.loc Fri Jan 11 17:47:02 2019 -0500 @@ -1,3 +1,3 @@ ## GEMINI versioned databases #DownloadDate dbkey DBversion Description Path -1999-01-01 hg19 181 GEMINI annotations (test snapshot) ${__HERE__}/test-cache +1999-01-01 hg19 200 GEMINI annotations (test snapshot) ${__HERE__}/test-cache
--- a/test-data/test-cache/gemini-config.yaml Fri Dec 14 12:51:59 2018 -0500 +++ b/test-data/test-cache/gemini-config.yaml Fri Jan 11 17:47:02 2019 -0500 @@ -2,12 +2,14 @@ versions: ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.tidy.vcf.gz: 4 ESP6500SI.all.snps_indels.tidy.v2.vcf.gz: 2 - ExAC.r0.3.sites.vep.tidy.vcf.gz: 3 + ExAC.r0.3.sites.vep.tidy.vcf.gz: 4 GRCh37-gms-mappability.vcf.gz: 2 - clinvar_20160203.tidy.vcf.gz: 5 + clinvar_20170130.tidy.vcf.gz: 5 cosmic-v68-GRCh37.tidy.vcf.gz: 3 - dbsnp.b141.20140813.hg19.tidy.vcf.gz: 4 + dbsnp.b147.20160601.tidy.vcf.gz: 1 detailed_gene_table_v75: 2 geno2mp.variants.tidy.vcf.gz: 1 + gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz: 2 hg19.rmsk.bed.gz: 2 summary_gene_table_v75: 2 + whole_genome_SNVs.tsv.compressed.gz: 2