comparison gmql_queries_composer.xml @ 0:a80c93182db3 draft default tip

planemo upload for repository https://github.com/lu-brn/gmql-galaxy commit 953ee36ceda5814dc9baa03427bc0eb4ee2e93bd-dirty
author geco-team
date Tue, 26 Jun 2018 09:08:06 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a80c93182db3
1 <tool id="gmql_queries_composer" name="GMQL Query Composer" version="0.1.1">
2 <description>Create, Compile and Run GMQL queries step by step.</description>
3 <macros>
4 <import>gmql_rest_macros.xml</import>
5 <import>gmql_queries_macros.xml</import>
6 <import>gmql_operators_select.xml</import>
7 <import>gmql_operators_order.xml</import>
8 <import>gmql_operators_join.xml</import>
9 <import>gmql_operators_map.xml</import>
10 <import>gmql_operators_project.xml</import>
11 <import>gmql_operators_cover.xml</import>
12 <import>gmql_operators_extend.xml</import>
13 <import>gmql_operators_group.xml</import>
14 <import>gmql_operators_merge_union_diff.xml</import>
15 <import>gmql_operators_tests.xml</import>
16 </macros>
<!--
  Shell command template (Cheetah). The composer sub-command is chosen from the
  "materialize" inputs:
    * materialize_result true and op == 'run': cmd 'run', executed from inside
      ./dataset (the outputs section discovers its collections under dataset/);
      the log and the updated dataset list are passed as additional outputs.
    * materialize_result true, any other op: cmd 'compile'; the log is passed too.
    * materialize_result false: cmd 'save'; only the query output is passed.
  -query_source is passed only when an existing query is continued
  (query_create.create == 'no').
  Every substituted value is single-quoted so that a path containing whitespace
  or shell metacharacters stays one argument, and query_file is referenced by
  its fully-qualified name inside the query_create conditional rather than
  through the legacy unqualified lookup.
-->
17 <command><![CDATA[
18 #if $materialize.materialize_result == 'true' :
19 #if $materialize.choose_op.op == 'run' :
20 mkdir -p dataset && cd dataset &&
21 python '$__tool_directory__/gmql_queries_composer.py'
22 -user='$authToken'
23 -cmd='run'
24 -query_params='$query_params'
25 -query_output='$query'
26 #if $query_create.create == 'no' :
27 -query_source='$query_create.query_file'
28 #end if
29 -query_log='$log'
30 -updated_ds_list='$updated_list'
31 #else :
32 python '$__tool_directory__/gmql_queries_composer.py'
33 -user='$authToken'
34 -cmd='compile'
35 -query_params='$query_params'
36 -query_output='$query'
37 #if $query_create.create == 'no' :
38 -query_source='$query_create.query_file'
39 #end if
40 -query_log='$log'
41 #end if
42 #else:
43 python '$__tool_directory__/gmql_queries_composer.py'
44 -user='$authToken'
45 -cmd='save'
46 -query_params='$query_params'
47 -query_output='$query'
48 #if $query_create.create == 'no' :
49 -query_source='$query_create.query_file'
50 #end if
51 #end if
52 ]]></command>
<!--
  Registers validate_variables from dynamic_utils.py as the validate_input hook.
  NOTE(review): the Galaxy tool schema documents the code tag as deprecated;
  confirm it is still honoured by the targeted Galaxy release.
-->
53 <code file="dynamic_utils.py">
54 <hook validate_input="validate_variables"/>
55 </code>
<!--
  Declares the params.json config file; the command template refers to it as
  $query_params and hands it to the composer script through -query_params.
  Presumably a JSON dump of the tool inputs, as Galaxy documents for the
  inputs config file; verify against gmql_queries_composer.py.
-->
56 <configfiles>
57 <inputs filename="params.json" name="query_params" />
58 </configfiles>
<!--
  Tool form. In order: the user token dataset, the query name, whether a new
  query is started or a saved one is continued, the list of GMQL operations
  (each one expands the macro of the chosen operator), and the optional
  materialization step with its run/compile choice.
  Every conditional declares a "when" case for each value of its test
  parameter, including the ones that add no inputs.
-->
59 <inputs>
60 <param format="gmql_user" name="authToken" type="data" label="Select user" />
61 <param name="query_name" type="text" label="Query Name" >
62 <validator type="regex" message="Only alphanumeric characters and underscore allowed. It must begin with
63 letter or underscore.">[a-zA-Z_]([\w]+)?$</validator>
64 </param>
65 <conditional name="query_create" >
66 <param name="create" label="Create new query or append to a saved one: " type="select" display="radio" >
67 <option value="yes">New query</option>
68 <option value="no">Continue</option>
69 </param>
<!-- A new query needs no further input. -->
<when value="yes" />
70 <when value="no">
71 <param name="query_file" label="Select local query" type="data" format="gmql_query" />
72 </when>
73 </conditional>
<!-- At least one operation is required, so the form also starts with one (default agrees with min). -->
74 <repeat name="operations" title="GMQL Operations" help="Add a new operation to the execution flow."
75 min="1" default="1">
76 <conditional name="operation">
77 <param name="operator" type="select" label="Operation" >
78 <option value="SELECT">SELECT</option>
79 <option value="PROJECT">PROJECT</option>
80 <option value="EXTEND">EXTEND</option>
81 <option value="ORDER">ORDER</option>
82 <option value="GROUP">GROUP</option>
83 <option value="MERGE">MERGE</option>
84 <option value="UNION">UNION</option>
85 <option value="DIFFERENCE">DIFFERENCE</option>
86 <option value="JOIN">JOIN</option>
87 <option value="MAP">MAP</option>
88 <option value="COVER">COVER</option>
89 </param>
<!-- One case per operator; the parameters come from the imported operator macro files. -->
90 <when value="SELECT">
91 <expand macro="select" />
92 </when>
93 <when value="ORDER">
94 <expand macro="order" />
95 </when>
96 <when value="JOIN">
97 <expand macro="join" />
98 </when>
99 <when value="MAP">
100 <expand macro="map" />
101 </when>
102 <when value="PROJECT">
103 <expand macro="project" />
104 </when>
105 <when value="COVER">
106 <expand macro="cover" />
107 </when>
108 <when value="EXTEND">
109 <expand macro="extend" />
110 </when>
111 <when value="GROUP">
112 <expand macro="group" />
113 </when>
114 <when value="MERGE">
115 <expand macro="merge" />
116 </when>
117 <when value="UNION">
118 <expand macro="union" />
119 </when>
120 <when value="DIFFERENCE">
121 <expand macro="difference" />
122 </when>
123 </conditional>
124 </repeat>
<!-- The command template and the output filters both branch on this conditional. -->
125 <conditional name="materialize">
126 <param name="materialize_result" type="boolean" label="Materialize final result?"
127 help="Compile and Run are available only for materialized queries. Otherwise it will only save this query" />
128 <when value="true">
129 <param name="file_name" type="text" label="Name of the file into which the dataset DS will be saved"
130 help="The actual GMQL implementation materializes DS into a file with a name in the form [queryname]_[timestamp]_filename">
131 <validator type="regex" message="Only alphanumeric characters and underscore allowed.">[\w]+$</validator>
132 </param>
133 <conditional name="choose_op">
134 <param name="op" type="select" label="Run the query or Compile only" >
135 <option value="run">Run</option>
136 <option value="compile">Compile Only</option>
137 </param>
138 <when value="run">
139 <param name="out_format" type="select" label="Output format">
140 <option value="gdm">TAB Delimited (GDM)</option>
141 <option value="gtf">GTF</option>
142 </param>
143 <param name="import" type="boolean" checked="true" label="Import result automatically into Galaxy?"
144 help="Otherwise it will be possible to import it later using GMQL Import tool." />
145 </when>
<!-- Compile-only takes no extra options. -->
<when value="compile" />
146 </conditional>
147 </when>
<!-- Without materialization there is nothing else to configure. -->
<when value="false" />
148 </conditional>
149 </inputs>
<!--
  Tool outputs. The query file is always declared; the remaining outputs are
  filtered on the "materialize" inputs:
    * log: materialized queries only;
    * updated_list: materialized queries that are run;
    * query_results_m / query_results_s: run queries with automatic import
      enabled, discovered under dataset/metadata and dataset/samples.
-->
150 <outputs>
151 <data name="query" format="gmql_query" label="${query_name} GMQL query" />
152 <data name="log" format="txt" label="${query_name} Log">
153 <filter>materialize['materialize_result'] is True</filter>
154 </data>
155 <data name="updated_list" format="gmql_repository" label="${authToken.name.split()[0].rstrip('')} GMQL Datasets">
156 <filter>materialize['materialize_result'] is True and materialize['choose_op']['op'] == 'run'</filter>
157 <actions>
158 <action type="metadata" name="column_names" default="dataset,owner" />
159 </actions>
160 </data>
161 <collection name="query_results_m" type="list" label="${query_name} results metadata">
162 <discover_datasets directory="dataset/metadata"
163 pattern="(?P&lt;identifier_0&gt;[\w]+)\.(?P&lt;ext&gt;[^\._]+)?" />
164 <filter>materialize['materialize_result'] is True and materialize['choose_op']['op'] == 'run' and materialize['choose_op']['import'] is True</filter>
165 </collection>
166 <collection name="query_results_s" type="list" label="${query_name} results">
167 <discover_datasets directory="dataset/samples"
168 pattern="(?P&lt;identifier_0&gt;[\w]+)\.(?P&lt;ext&gt;[^\._]+)?" />
169 <filter>materialize['materialize_result'] is True and materialize['choose_op']['op'] == 'run' and materialize['choose_op']['import'] is True</filter>
170 </collection>
171 </outputs>
172 <expand macro="composer_tests" />
173 <help><![CDATA[
174 This tool presents a complete and detailed interface, rich with help sections and descriptions, to accompany the user in the composition of GMQL Queries. For further info about the language, check out the following help sections or the GMQL Documentation.
175
176 ----
177
178 **What it does**
179
180 This tool allows composing GMQL queries. Every time a new operation is added, a drop-down menu asks to choose the GMQL operator and, depending on the choice, the corresponding parameters are shown.
181 Parameters are organized in sections so that it is possible to focus on each type of parameter separately. Every operator and parameter is explained through help sections and labels, in order to accompany the user along the query composition.
182 For further info about the language, its operators and the data model, check out the following info sections or the GMQL Documentation.
183
184 Once the query has been composed, it is then possible to send it to the GMQL system for compilation and/or execution.
185
186 - **Save Query**: it returns the composed query as gmql_query file.
187 - **Compile query**: send the query to be compiled on the GMQL system. It returns the compilation log.
188 - **Run query**: send the query to be run on the GMQL system. The user can choose the output format and whether to automatically import the resulting dataset. It returns the log generated by the system, an updated list of the user's datasets, and the two collections corresponding to the samples and metadata of the result dataset, respectively.
189
190 .. class:: warningmark
191
192 A gmql_user authentication token is required for every action.
193
194 .. class:: warningmark
195
196 Compile and Run are available only if a MATERIALIZE operation is included within the query. All datasets defined in a GMQL query are, by default, temporary; the MATERIALIZE operation saves the content of a dataset in a file and registers the saved dataset in the system to make it usable in other GMQL queries.
197
198 .. class:: warningmark
199
200 Every GMQL query should start with a SELECT statement.
201
202 ----
203
204 .. class:: infomark
205
206 **The GenoMetric Query Language (GMQL)**
207
208 Developed by the Bioinformatics group at Politecnico di Milano, GMQL
209 is a high-level, declarative language that allows expressing queries easily
210 over genomic regions and their metadata, in a way similar to what can be
211 done with Relational Algebra and Structured Query Language (SQL) over
212 a relational database. It extends conventional algebraic operations with
213 bioinformatics domain-specific operations designed for genomics.
214
215 .. class:: infomark
216
217 **Genomic Data Model**
218
219 Abstractions for DNA regions and metadata are provided by the Genomic
220 Data Model (GDM), which thus provides interoperability across data formats. There are two kinds of information:
221
222 **Genomic Regions**
223
224 r = <c, a>
225
226 c = <chr, left, right, strand>
227
228 a = {<p1, v1>, <p2, v2>, ...}
229
230 A region is represented by its coordinates c and a set of attributes
231 a which are property-value pairs. The coordinates attributes chr,
232 left, right, strand, are of types string, long, long, char, respectively.
233 Region attributes can be of any type among boolean, char, string,
234 int, long, double.
235
236 **Metadata**
237
238 They are all additional information about the given regions; they include data provenance, as well as biological and clinical
239 data. They are attribute-value pairs, where we assume both attributes and values are of type string.
240
241 A GDM **sample** is a set of genomic regions that share a common identifier and the same metadata information. In Galaxy, this is a single dataset.
242 A GDM **dataset** is a collection of samples with the same region schema. In Galaxy, this corresponds to two collections of datasets (one collection for metadata and one for region data).
243
244 .. class:: infomark
245
246 **Genomic Distance**
247
248 It is defined as the number of nucleotide bases between genomic
249 regions (aligned to the same reference genome); overlapping regions have
250 negative distance, while adjacent regions have distance equal to 0.
251
252 .. class:: infomark
253
254 **GMQL Operators**
255
256 A query is a sequence of GMQL operations, which have the following structure:
257
258 <variable_output> = <operator>(<parameters>)<variables_input>;
259
260 where each variable stands for a GDM dataset. Operators apply to one
261 or two input variables and construct the result variable. Parameters of
262 several operators include predicates, which are made of boolean expressions
263 of simple predicates.
264
265 - **Select** : defines a new dataset from an existing dataset by keeping a subset of samples and/or regions from the input dataset that satisfy the given predicates.
266
267 - **Project** : creates a new dataset keeping for each sample in the input dataset only those metadata and/or region attributes expressed in the operator parameter list. This allows removing existing attributes or creating new ones.
268
269 - **Extend** : for each sample in an input dataset, it builds new metadata attributes, assigns their values as the result of functions calculated on sample region attributes, and adds them to the existing metadata attribute-value pairs of the sample.
270
271 - **Order** : is used to order either samples, sample regions, or both, according to a set of metadata and/or region attributes, and/or region coordinates.
272
273 - **Group** : performs the grouping of samples of the input dataset based on one specified metadata attribute. For each obtained group, it is possible to request the evaluation of aggregate functions on metadata attributes over the metadata contained in all samples of the group.
274
275 - **Merge** : builds a new dataset consisting of a single sample having all the regions of all the input samples, with the same attributes and the union of all the metadata attribute-values of the input samples.
276
277 - **Union** : analogously to the UNION operation in relational algebra, integrates samples from different datasets within a single dataset. The union of the two schemas is performed by taking only the schema of the first dataset and removing the region attributes of the second dataset which are not present in the first one.
278
279 - **Difference** : produces one sample in the result for each sample of the first operand by keeping its metadata and only those regions (with their attributes and values) which do not intersect with any region in the second operand.
280
281 - **Map** : is a binary operation over two datasets, called reference and experiment dataset. MAP computes, for each sample in the experiment dataset, aggregates over the values of the experiment regions that intersect with each reference region; we say that experiment regions are mapped to the reference regions. For each reference sample, the MAP operation produces a matrix-like structure (genomic space), where rows represent each experiment sample, columns are reference regions, and each matrix row is a vector consisting of the aggregates computed during MAP execution.
282
283 - **Join** : it acts in two phases: first, new samples are built from pairs of samples, one of the first dataset (anchor) and one of the second one (experiment), where region attributes exist in both input datasets and their values coincide (just as the relational JOIN). After that, a genometric predicate, dealing with distal properties of regions, selects the regions to include in these new samples. The number of generated output samples is the Cartesian product of the number of samples in the anchor and in the experiment dataset (if no joinby clause is specified). Predicates over metadata allow selecting sample pairs with appropriate biological conditions; genometric join predicates allow expressing distal conditions on sample regions.
284
285 - **Cover** : takes as input a dataset and returns another dataset with a single sample (if no groupby option is specified) by "collapsing" the input samples and their regions according to the parameters minAcc and maxAcc.
286
287 - **Materialize** : saves the content of a dataset in a file and registers the saved dataset in the system to make it usable in other GMQL queries.
288
289 ]]></help>
290 <expand macro="citations" />
291 </tool>