view gmql_queries_composer.xml @ 0:a80c93182db3 draft default tip

planemo upload for repository https://github.com/lu-brn/gmql-galaxy commit 953ee36ceda5814dc9baa03427bc0eb4ee2e93bd-dirty
author geco-team
date Tue, 26 Jun 2018 09:08:06 -0400
parents
children
line wrap: on
line source

<tool id="gmql_queries_composer" name="GMQL Query Composer" version="0.1.1">
    <description>Create, Compile and Run GMQL queries step by step.</description>
    <macros>
        <import>gmql_rest_macros.xml</import>
        <import>gmql_queries_macros.xml</import>
        <import>gmql_operators_select.xml</import>
        <import>gmql_operators_order.xml</import>
        <import>gmql_operators_join.xml</import>
        <import>gmql_operators_map.xml</import>
        <import>gmql_operators_project.xml</import>
        <import>gmql_operators_cover.xml</import>
        <import>gmql_operators_extend.xml</import>
        <import>gmql_operators_group.xml</import>
        <import>gmql_operators_merge_union_diff.xml</import>
        <import>gmql_operators_tests.xml</import>
    </macros>
    <command><![CDATA[
    ## Resolve the action handed to the script through -cmd:
    ##   run     : materialized query, executed on the GMQL system
    ##   compile : materialized query, compiled only
    ##   save    : query is not materialized, it is only written to the query output
    #if $materialize.materialize_result == 'true' :
        #if $materialize.choose_op.op == 'run' :
            #set $action = 'run'
        #else :
            #set $action = 'compile'
        #end if
    #else :
        #set $action = 'save'
    #end if
    ## Only a run is launched from the 'dataset' sub-directory; the result
    ## collections are discovered under dataset/metadata and dataset/samples.
    #if $action == 'run' :
        mkdir -p dataset && cd dataset &&
    #end if
    ## Every interpolated path is single-quoted so that a path containing
    ## whitespace or shell metacharacters stays a single argument.
    python '$__tool_directory__/gmql_queries_composer.py'
        -user='$authToken'
        -cmd='$action'
        -query_params='$query_params'
        -query_output='$query'
    ## When continuing a saved query, pass it as the source to append to.
    ## The parameter is addressed through its conditional, which does not rely
    ## on legacy global access to nested parameters.
    #if $query_create.create == 'no' :
        -query_source='$query_create.query_file'
    #end if
    ## The log output exists only for compile and run.
    #if $action != 'save' :
        -query_log='$log'
    #end if
    ## The refreshed dataset list exists only for run.
    #if $action == 'run' :
        -updated_ds_list='$updated_list'
    #end if
    ]]></command>
    <!-- Input validation hook: validate_variables is loaded from dynamic_utils.py. -->
    <code file="dynamic_utils.py">
        <hook validate_input="validate_variables"/>
    </code>
    <!-- Tool inputs written to params.json; the command passes its path via -query_params. -->
    <configfiles>
        <inputs filename="params.json" name="query_params"/>
    </configfiles>
    <inputs>
        <!-- Dataset of type gmql_user; it is passed to the script as -user. -->
        <param name="authToken" type="data" format="gmql_user" label="Select user"/>
        <!-- Name of the query; it is also used to label the tool outputs.
             The validator message is kept on one line: a wrapped attribute value
             would carry the indentation whitespace into the message shown to the user. -->
        <param name="query_name" type="text" label="Query Name">
            <validator type="regex" message="Only alphanumeric characters and underscore allowed. It must begin with letter or underscore.">[a-zA-Z_]([\w]+)?$</validator>
        </param>
        <!-- Start a new query, or continue from a previously saved gmql_query dataset. -->
        <conditional name="query_create">
            <param name="create" label="Create new query or append to a saved one: " type="select" display="radio">
                <option value="yes">New query</option>
                <option value="no">Continue</option>
            </param>
            <!-- A new query needs no extra input; the empty branch is declared so
                 that every select option has a matching when block. -->
            <when value="yes"/>
            <when value="no">
                <param name="query_file" label="Select local query" type="data" format="gmql_query"/>
            </when>
        </conditional>
        <!-- One repeat entry per GMQL statement. The selected operator decides which
             macro-defined parameter set is expanded for that entry.
             default is aligned with min: an initial count below the minimum is
             contradictory, and at least one operation is always required. -->
        <repeat name="operations" title="GMQL Operations" help="Add a new operation to the execution flow."
                min="1" default="1">
            <conditional name="operation">
                <param name="operator" type="select" label="Operation">
                    <option value="SELECT">SELECT</option>
                    <option value="PROJECT">PROJECT</option>
                    <option value="EXTEND">EXTEND</option>
                    <option value="ORDER">ORDER</option>
                    <option value="GROUP">GROUP</option>
                    <option value="MERGE">MERGE</option>
                    <option value="UNION">UNION</option>
                    <option value="DIFFERENCE">DIFFERENCE</option>
                    <option value="JOIN">JOIN</option>
                    <option value="MAP">MAP</option>
                    <option value="COVER">COVER</option>
                </param>
                <!-- The when blocks follow the same order as the options above. -->
                <when value="SELECT">
                    <expand macro="select" />
                </when>
                <when value="PROJECT">
                    <expand macro="project" />
                </when>
                <when value="EXTEND">
                    <expand macro="extend" />
                </when>
                <when value="ORDER">
                    <expand macro="order" />
                </when>
                <when value="GROUP">
                    <expand macro="group" />
                </when>
                <when value="MERGE">
                    <expand macro="merge" />
                </when>
                <when value="UNION">
                    <expand macro="union" />
                </when>
                <when value="DIFFERENCE">
                    <expand macro="difference" />
                </when>
                <when value="JOIN">
                    <expand macro="join" />
                </when>
                <when value="MAP">
                    <expand macro="map" />
                </when>
                <when value="COVER">
                    <expand macro="cover" />
                </when>
            </conditional>
        </repeat>
        <!-- Chooses between only saving the query (unchecked) and materializing its
             result, which makes the compile and run actions available. -->
        <conditional name="materialize">
            <param name="materialize_result" type="boolean" label="Materialize final result?"
                   help="Compile and Run are available only for materialized queries. Otherwise it will only save this query" />
            <when value="true">
                <param name="file_name" type="text" label="Name of the file into which the dataset DS will be saved"
                       help="The actual GMQL implementation materializes DS into a file with a name in the form [queryname]_[timestamp]_filename">
                    <validator type="regex" message="Only alphanumeric characters and underscore allowed.">[\w]+$</validator>
                </param>
                <conditional name="choose_op">
                    <param name="op" type="select" label="Run the query or Compile only">
                        <option value="run">Run</option>
                        <option value="compile">Compile Only</option>
                    </param>
                    <when value="run">
                        <param name="out_format" type="select" label="Output format">
                            <option value="gdm">TAB Delimited (GDM)</option>
                            <option value="gtf">GTF</option>
                        </param>
                        <param name="import" type="boolean" checked="true" label="Import result automatically into Galaxy?"
                               help="Otherwise it will be possible to import it later using GMQL Import tool." />
                    </when>
                    <!-- Compile only takes no further options; declared so that every
                         select option has a matching when block. -->
                    <when value="compile"/>
                </conditional>
            </when>
            <!-- Nothing else to ask when the result is not materialized. -->
            <when value="false"/>
        </conditional>
    </inputs>
    <outputs>
        <!-- The composed query; produced by every action (save, compile, run). -->
        <data name="query" format="gmql_query" label="${query_name} GMQL query"/>
        <!-- Log; only present when the query is materialized (compile or run). -->
        <data name="log" format="txt" label="${query_name} Log">
            <filter>materialize['materialize_result'] is True</filter>
        </data>
        <!-- Refreshed list of the user's datasets; only present when the query is run.
             The label uses the first whitespace-separated token of the token dataset name.
             The former trailing rstrip('') call stripped nothing and has been removed. -->
        <data name="updated_list" format="gmql_repository" label="${authToken.name.split()[0]} GMQL Datasets">
            <filter>materialize['materialize_result'] is True and materialize['choose_op']['op'] == 'run'</filter>
            <actions>
                <action name="column_names" type="metadata" default="dataset,owner" />
            </actions>
        </data>
        <!-- Result metadata files, collected from dataset/metadata; only present when
             the query is run with automatic import enabled. -->
        <collection name="query_results_m" type="list" label="${query_name} results metadata">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;[\w]+)\.(?P&lt;ext&gt;[^\._]+)?"
                               directory="dataset/metadata"/>
            <filter>materialize['materialize_result'] is True and materialize['choose_op']['op'] == 'run' and materialize['choose_op']['import'] is True</filter>
        </collection>
        <!-- Result sample files, collected from dataset/samples; same conditions as above. -->
        <collection name="query_results_s" type="list" label="${query_name} results">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;[\w]+)\.(?P&lt;ext&gt;[^\._]+)?"
                               directory="dataset/samples"/>
            <filter>materialize['materialize_result'] is True and materialize['choose_op']['op'] == 'run' and materialize['choose_op']['import'] is True</filter>
        </collection>
    </outputs>
    <expand macro="composer_tests" />
    <help><![CDATA[
This tool presents a complete and detailed interface, rich with help sections and descriptions, to accompany the user in the composition of GMQL Queries. For further info about the language, check out the following help sections or the GMQL Documentation.

----

**What it does**

This tool allows composing GMQL queries. Every time a new operation is added, a drop-down menu asks to choose the GMQL Operator, and, depending on the choice, the corresponding parameters are shown.
Parameters are organized in sections so that it is possible to focus on each type of parameter separately. Every operator and parameter is explained through help sections and labels, in order to accompany the user along the query composition.
For further info about the language, its operators and the data model, check out the following info sections or the GMQL Documentation.

Once the query has been composed, it is then possible to send it to the GMQL system for compilation and/or execution.

- **Save Query**: it returns the composed query as gmql_query file.
- **Compile query**: send the query to be compiled on the GMQL system. It returns the compilation log.
- **Run query**: send the query to be run on the GMQL system. The user can choose whether to automatically import the resulting dataset, and the output format. It returns the log generated by the system, an updated list of the user's datasets, and the two collections corresponding to the result dataset's samples and metadata, respectively.

.. class:: warningmark

A gmql_user authentication token is required for every action.

.. class:: warningmark

Compile and Run are available only if a MATERIALIZE operation is included within the query. All datasets defined in a GMQL query are, by default, temporary; the MATERIALIZE operation saves the content of a dataset in a file and registers the saved dataset in the system to make it usable in other GMQL queries.

.. class:: warningmark

Every GMQL query should start with a SELECT statement.

----

.. class:: infomark

**The GenoMetric Query Language (GMQL)**

Developed by the Bioinformatics group at Politecnico di Milano, GMQL
is a high-level, declarative language that allows expressing queries easily
over genomic regions and their metadata, in a way similar to what can be
done with Relational Algebra and Structured Query Language (SQL) over
a relational database. It extends conventional algebraic operations with
bioinformatics domain-specific operations designed for genomics.

.. class:: infomark

**Genomic Data Model**

Abstractions for DNA regions and metadata are provided by the Genomic
Data Model (GDM), which thus provides interoperability across data formats. There are two kinds of information:

**Genomic Regions**

r = <c, a>

c = <chr, left, right, strand>

a = {<p1, v1>, <p2, v2>, ...}

A region is represented by its coordinates c and a set of attributes
a which are property-value pairs. The coordinates attributes chr,
left, right, strand, are of types string, long, long, char, respectively.
Region attributes can be of any type among boolean, char, string,
int, long, double.

**Metadata**

They are all additional information about the given regions; they include data provenance, as well as biological and clinical
data. They are attribute-value pairs, where we assume both attributes and values are of type string.

A GDM **sample** is a set of genomic regions associated with a common identifier and the same metadata information. In Galaxy, this is a single dataset.
A GDM **dataset** is a collection of samples with the same region schema. In Galaxy, this is two collections of datasets (one collection for metadata and one for region data).

.. class:: infomark

**Genomic Distance**

It is defined as the number of nucleotide bases between genomic
regions (aligned to the same reference genome); overlapping regions have
negative distance, while adjacent regions have distance equal to 0.

.. class:: infomark

**GMQL Operators**

A query is a sequence of GMQL operations, which have the following structure:

<variable_output> = <operator>(<parameters>)<variables_input>;

where each variable stands for a GDM dataset. Operators apply to one
or two input variables and construct the result variable. Parameters of
several operators include predicates, which are made of boolean expressions
of simple predicates.

- **Select** : defines a new dataset from an existing dataset by keeping a subset of samples and/or regions from the input dataset that satisfy the given predicates.

- **Project** : creates a new dataset keeping for each sample in the input dataset only those metadata and/or region attributes expressed in the operator parameter list. This allows removing existing attributes or creating new ones.

- **Extend** : for each sample in an input dataset, it builds new metadata attributes, assigns their values as the result of functions calculated on sample region attributes, and adds them to the existing metadata attribute-value pairs of the sample.

- **Order** : is used to order either samples, sample regions, or both, according to a set of metadata and/or region attributes, and/or region coordinates.

- **Group** : performs the grouping of samples of the input dataset based on one specified metadata attribute. For each obtained group, it is possible to request the evaluation of aggregate functions on metadata attributes over the metadata contained in all samples of the group.

- **Merge** : builds a new dataset consisting of a single sample having all the regions of all the input samples, with the same attributes and the union of all the metadata attribute-values of the input samples.

- **Union** : analogously to the UNION operation in relational algebra, integrates samples from different datasets within a single dataset. The union of the two schemas is performed by taking only the schema of the first dataset and removing the region attributes of the second dataset which are not present in the first one.

- **Difference** : produces one sample in the result for each sample of the first operand by keeping its metadata and only those regions (with their attributes and values) which do not intersect with any region in the second operand.

- **Map** : is a binary operation over two datasets, called reference and experiment dataset. MAP computes, for each sample in the experiment dataset, aggregates over the values of the experiment regions that intersect with each reference region; we say that experiment regions are mapped to the reference regions. For each reference sample, the MAP operation produces a matrix-like structure (genomic space), where rows represent each experiment sample, columns are reference regions, and each matrix row is a vector consisting of the aggregates computed during MAP execution.

- **Join** : it acts in two phases: first, new samples are built from pairs of samples, one of the first dataset (anchor) and one of the second one (experiment), where region attributes exist in both input datasets and their values coincide (just as the relational JOIN). After that, a genometric predicate, dealing with distal properties of regions, selects the regions to include in these new samples. The number of generated output samples is the Cartesian product of the number of samples in the anchor and in the experiment dataset (if no joinby clause is specified). Predicates over metadata allow selecting sample pairs with appropriate biological conditions; genometric join predicates allow expressing distal conditions on sample regions.

- **Cover** : takes as input a dataset and returns another dataset with a single sample (if no groupby option is specified) by "collapsing" the input samples and their regions according to the parameters minAcc and maxAcc.

- **Materialize** : saves the content of a dataset in a file and registers the saved dataset in the system to make it usable in other GMQL queries.

  ]]></help>
  <expand macro="citations" />
</tool>