comparison prot-scriber.xml @ 0:c840a1c77a0a draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/prot-scriber commit 8b58f3f03d6430689d228029bb2eb46c16cfff23
author iuc
date Tue, 10 May 2022 13:18:05 +0000
parents
children 1e9a43cbf524
comparison
equal deleted inserted replaced
-1:000000000000 0:c840a1c77a0a
1 <tool id="prot_scriber" name="prot-scriber" version="@TOOL_VERSION@" profile="21.05">
2 <description>Protein annotation of short human readable descriptions</description>
3 <macros>
4 <token name="@TOOL_VERSION@">0.1.1</token>
5 </macros>
6 <requirements>
7 <requirement type="package" version="@TOOL_VERSION@">prot-scriber</requirement>
8 </requirements>
9 <stdio>
10 <regex match="panicked" level="fatal" source="stderr" />
11 </stdio>
12 <command>
13 <![CDATA['prot-scriber' ## assembles one prot-scriber call: input tables, optional tuning flags, family options, output
14 #if str($input_config.input_config_selector) == "basic"
15 #for $sst in $input_config.seq_sim_table
16 -s '$sst' ## basic mode: one -s per selected table, no per-table options
17 #end for
18 #else if str($input_config.input_config_selector) == "advanced"
19 #for $ssr in $input_config.advanced_input_repeat
20 -s '$ssr.seq_sim_table' ## advanced mode: each repeat entry carries its own table and optional per-table flags
21 #if $ssr.header
22 -e '$ssr.header'
23 #end if
24 #if $ssr.field_separator
25 -p '$ssr.field_separator'
26 #end if
27 #if $ssr.blacklist_regexs
28 -b '$ssr.blacklist_regexs'
29 #end if
30 #if $ssr.capture_replace_pairs
31 -c '$ssr.capture_replace_pairs'
32 #end if
33 #if $ssr.filter_regexs
34 -l '$ssr.filter_regexs'
35 #end if
36 #end for
37 #if $input_config.expert_options.non_informative_words_regexs
38 -w '$input_config.expert_options.non_informative_words_regexs' ## expert options are given once per run, not per table
39 #end if
40 #if $input_config.expert_options.description_split_regex
41 -r "$input_config.expert_options.description_split_regex" ## NOTE(review): double quoted, presumably because the documented default regex contains a single quote
42 #end if
43 #if $input_config.expert_options.center_inverse_word_information_content_at_quantile
44 -q $input_config.expert_options.center_inverse_word_information_content_at_quantile
45 #end if
46 #end if
47 #if $seq_family.seq_families
48 -f '$seq_family.seq_families' ## family parameters live in the seq_family section and must be addressed through it
49 #end if
50 #if $seq_family.annotate_non_family_queries
51 -a
52 #end if
53 #if $seq_family.seq_family_gene_ids_separator
54 -g "$seq_family.seq_family_gene_ids_separator"
55 #end if
56 #if $seq_family.seq_family_id_genes_separator
57 -i '$seq_family.seq_family_id_genes_separator'
58 #end if
59 -o '$output'
60 ]]>
61 </command>
62 <inputs>
63 <conditional name="input_config"> <!-- Switches between a plain multi-file input ("basic") and per-table options plus expert options ("advanced") -->
64 <param type="select" name="input_config_selector" label="Choose input configuration options">
65 <option value="basic" selected="true">Basic</option>
66 <option value="advanced">Advanced</option>
67 </param>
68 <when value="basic"> <!-- basic: every selected table is passed as its own -s argument -->
69 <param type="data" multiple="true" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search results in tabular format (-s)" help="Files in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them.
70 Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." />
71 </when>
72 <when value="advanced"> <!-- advanced: one repeat entry per table with its own optional flags, followed by run-wide expert options -->
73 <repeat name="advanced_input_repeat" title="Sequence similarity table" min="1" default="1">
74 <param type="data" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search result in tabular format (-s)" help="File in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them.
75 Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." />
76 <param type="text" optional="true" name="field_separator" argument="-p" label="Field separator (-p)" help="Field-Separator of the (-s) sequence similarity table. The default value is the 'TAB' character. Set to 'default' to use the hard coded default">
77 <sanitizer>
78 <valid initial="default">
79 <add preset="string.printable" />
80 </valid>
81 </sanitizer>
82 </param>
83 <param type="text" optional="true" name="header" argument="-e" label="Header of the sequence similarity tables (-e)" help="Header of the (-s) sequence similarity table. Separated by space (' ') the names of the
84 columns in order of appearance in the respective table. Required and default columns are 'qacc sacc stitle'. Set to 'default' to use the hard coded default" />
85 <param type="data" optional="true" name="blacklist_regexs" argument="-b" format="tabular" label="Blacklist Regexs (-b)" help="A file with regular expressions, one per line. Any match to any of these
86 regular expressions causes sequence similarity search result descriptions ('stitle' in Blast terminology) to be discarded from the prot-scriber annotation process. Set to 'default' to use the hard coded default" />
87 <param type="data" optional="true" name="capture_replace_pairs" argument="-c" format="tabular" label="Capture replace pairs (-c)" help="A file with pairs of lines. Within each pair the first line is a regular expressions
88 defining one or more capture groups. The second line of a pair is the string used to replace the match in the regular expression with. Set to 'default' to use the hard coded default" />
89 <param type="data" optional="true" name="filter_regexs" argument="-l" format="tabular" label="Filter regexs (-l)" help="A file with regular expressions, one per line. Any match to any of these
90 regular expressions causes the matched sub-string to be deleted, i.e. filtered out. Set to 'default' to use the hard coded default" />
91 </repeat>
92 <section title="Expert options" name="expert_options"> <!-- emitted once per run by the command template, after all per-table flags -->
93 <param type="data" optional="true" name="non_informative_words_regexs" argument="-w" format="tabular" label="Non informative words regexs (-w)" help="A file in which regular expressions (regexs) are stored, one per line. These
94 regexs are used to recognize non-informative words, which will only receive a minimum score in the prot-scriber process that generates human readable description." />
95 <param type="text" optional="true" name="description_split_regex" argument="-r" label="Description split regex (-r)" help="A regular expression to be used to split descriptions (`stitle` in Blast
96 terminology) into words. Default is '([~_\-/|\;,':.\s]+)'.">
97 <sanitizer>
98 <valid initial="default">
99 <add preset="string.printable" />
100 </valid>
101 </sanitizer>
102 </param>
103 <param type="float" optional="true" name="center_inverse_word_information_content_at_quantile" argument="-q" label="Center inverse word-information-content at quantile (-q)" help="The quantile (percentile) to be subtracted from calculated inverse word information
104 content to center these values. Value between 0 and 1, or literal 50 to center at the mean." /> <!-- float rather than integer so that fractional quantiles such as 0.25 can be entered -->
105 </section>
106 </when>
107 </conditional>
108 <section title="Sequence family annotation" name="seq_family"> <!-- optional gene family mode parameters (-f, -a, -g, -i) -->
109 <param type="data" optional="true" name="seq_families" argument="-f" format="tabular" label="Families of biological sequences (-f)" help="A file in which families of biological sequences are stored, one family per line. Each
110 line must have format 'fam_name TAB gene1,gene2,gene3'. Make sure no gene appears in
111 more than one family." />
112 <param type="boolean" optional="true" name="annotate_non_family_queries" argument="-a" label="Annotate non family query sequences (-a)" help="Set this to true to also annotate query sequences that are not members of a sequence family." />
113 <param type="text" optional="true" name="seq_family_gene_ids_separator" argument="-g" label="Sequence family file gene-id separator (-g)" help=" A regular expression used to split the list of gene_identifiers in the
114 argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'.">
115 <sanitizer> <!-- widens the accepted characters to everything printable so regex metacharacters reach the command line -->
116 <valid initial="default">
117 <add preset="string.printable" />
118 </valid>
119 </sanitizer>
120 </param>
121 <param type="text" optional="true" name="seq_family_id_genes_separator" argument="-i" label="Sequence family file family - gene-id separator (-i)" help="A string used as separator in the argument --seq-families (-f) gene families file. This
122 string separates the gene_family_identifier (name) from the gene_identifier list that family comprises. Default is 'TAB'.">
123 <sanitizer>
124 <valid initial="default">
125 <add preset="string.printable" />
126 </valid>
127 </sanitizer>
128 </param>
129 </section>
130 </inputs>
131 <outputs> <!-- single tabular dataset; the command template hands its path to prot-scriber via -o -->
132 <data format="tabular" name="output" />
133 </outputs>
134 <tests> <!-- all three tests compare against the same expected file, with sorting enabled -->
135 <test> <!-- basic mode: both Blast tables supplied through the single multi-file input -->
136 <param name="input_config_selector" value="basic"/>
137 <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" />
138 <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" />
139 <output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
140 </test>
141 <test> <!-- advanced mode: one repeat entry per table, each with an explicit separator and header -->
142 <param name="input_config_selector" value="advanced" />
143 <repeat name="advanced_input_repeat">
144 <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" />
145 <param name="field_separator" value="default" />
146 <param name="header" value="qacc sacc stitle" />
147 </repeat>
148 <repeat name="advanced_input_repeat">
149 <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" />
150 <param name="field_separator" value="default" />
151 <param name="header" value="qacc sacc stitle" />
152 </repeat>
153 <output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
154 </test>
155 <test> <!-- advanced mode: a blacklist file per table plus the split regex and quantile expert options -->
156 <param name="input_config_selector" value="advanced" />
157 <repeat name="advanced_input_repeat">
158 <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" />
159 <param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" />
160 </repeat>
161 <repeat name="advanced_input_repeat">
162 <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" />
163 <param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" />
164 </repeat>
165 <param name="description_split_regex" value="([~_\-/|;,':.\s]+)" />
166 <param name="center_inverse_word_information_content_at_quantile" value="50" />
167 <output name="output" file="8_Proteins_prot-scriber.out" sort="true" />
168 </test>
169 </tests>
170 <help>
171 <![CDATA[
172
173 **What it does**
174
175 prot-scriber_ assigns short human readable descriptions (HRD) to query biological sequences using reference candidate descriptions.
176 In this, prot-scriber consumes sequence similarity search (Blast or Diamond or similar) results in tabular format.
177 A customized lexical analysis is carried out on the descriptions of these Blast Hits and a resulting HRD is assigned to the query sequences.
178 For more information, examples and how to use the prot-scriber commandline tool refer to the prot-scriber README_ and MANUAL_.
179
180 .. _prot-scriber: http://github.com/usadellab/prot-scriber
181 .. _README: https://github.com/usadellab/prot-scriber/blob/master/README.md
182 .. _MANUAL: https://github.com/usadellab/prot-scriber/blob/master/README.md#manual
183
184 ----
185
186 **Input**
187
188 The input file is one or multiple tabular output(s) of a sequence similarity search (Blast, Diamond or similar).
189 Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond). The input is done via the -s parameter::
190
191 -s, --seq-sim-table
192 File in which to find sequence similarity search results in tabular format (SSST). Use
193 e.g. Blast or Diamond to produce them. Required columns are: 'qacc sacc stitle' (Blast)
194 or 'qseqid sseqid stitle' (Diamond). If the required columns, or more, appear in different order than
195 shown here you must use the --header (-e) argument. If any of the input SSSTs uses a
196 different field-separator than the '<TAB>' character, you must provide the --field-
197 separator (-p) argument. You can provide multiple SSSTs for your query proteins whose information
198 will be combined and evaluated by the tool.
199
200 **Input parameters**
201
202 prot-scriber gives the user the opportunity to fine tune parameters for the provided input tables.
203 To do so turn on the *input configuration* switch. Those are optional, as the tool also provides sensible defaults.
204 In case you decide to customize your inputs using below parameters, be advised that prot-scriber expects the
205 customized parameter for all input tables - the number of tables and e.g. *--header* parameters have to match.
206 You can set the values to 'default' if you want to use the default value for a given input table::
207
208 -e, --header
209 Header of the --seq-sim-table (-s) arg. Separated by space (' ') the names of the
210 columns in order of appearance in the respective table. Required and default columns are
211 'qacc sacc stitle'. Note that this option only understands Blast terminology, i.e. even
212 if you ran Diamond, please provide 'qacc' instead of 'qseqid' and 'sacc' instead of
213 'sseqid'. Luckily 'stitle' is 'stitle' in Diamond, too. You can have additional columns
214 that will be ignored, as long as the required columns appear in the correct order.
215 Consider this example: 'qacc sacc evalue bitscore stitle'. Set to 'default' to use the hard coded default.
216
217 -p, --field-separator
218 Field-Separator of the --seq-sim-table (-s) arg. The default value is the '<TAB>' character.
219 Consider this example: ','. You can provide 'default' to use the hard coded default (TAB).
220
221 -b, --blacklist-regexs (Expert option)
222 A file with regular expressions, one per line. Any match to any of these
223 regular expressions causes sequence similarity search result descriptions ('stitle' in
224 Blast terminology) to be discarded from the prot-scriber annotation process. Set to 'default' to use the hard
225 coded default. An example file can be downloaded here:
226 https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/blacklist_stitle_regexs.txt.
227
228 -l, --filter-regexs (Expert option)
229 A file with regular expressions, one per line. Any match to any of these
230 regular expressions causes the matched sub-string to be deleted, i.e. filtered out.
231 Filtering is used to process descriptions ('stitle' in Blast terminology) and prepare
232 the descriptions for the prot-scriber annotation process. In case of UniProt sequence
233 similarity search results (Blast result tables), this removes the Blast Hit identifier
234 (`sacc`) from the description (`stitle`) and also removes the taxonomic information
235 starting with e.g. 'OS=' at the end of the `stitle` strings. Set to 'default' to use
236 the hard coded default. An example file can be downloaded here:
237 https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/filter_stitle_regexs.txt.
238
239 -c, --capture-replace-pairs (Expert option)
240 A file with pairs of lines. Within each pair the first line is a regular expressions
241 defining one or more capture groups. The second line of a pair is the
242 string used to replace the match in the regular expression with. This means the second
243 line contains the capture groups. These pairs are used to further filter
244 the sequence similarity search result descriptions ('stitle' in Blast terminology). In
245 contrast to the --filter-regexs (-l) matches are not deleted, but replaced with the
246 second line of the pair. Filtering is used to process descriptions ('stitle' in Blast
247 terminology) and prepare the descriptions for the prot-scriber annotation process.
248 Set to 'default' to use the hard coded default. An example file can be downloaded here:
249 https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/capture_replace_pairs.txt.
250
251 ----
252
253 **Gene family annotation**
254
255 prot-scriber can also apply the same methodology to produce HRDs for sets of biological sequences, i.e. gene families::
256
257 -f, --seq-families
258 A file in which families of biological sequences are stored, one family per line. Each
259 line must have format 'fam-name TAB gene1,gene2,gene3'. Make sure no gene appears in
260 more than one family.
261
262 -g, --seq-family-gene-ids-separator
263 A regular expression used to split the list of gene-identifiers in the
264 argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'.
265
266 -a, --annotate-non-family-queries
267 Use this option only in combination with --seq-families (-f), i.e. when prot-scriber is
268 used to generate human readable descriptions for gene families. If in that context this
269 flag is given, queries for which there are sequence similarity search (Blast) results
270 but that are NOT member of a sequence family will receive an annotation (human readable
271 description) in the output file, too. Default value of this setting is 'OFF' (false).
272
273 ----
274
275 **Expert options**
276
277 Some additional optional configuration. Only use when you know what you are doing::
278
279 -w, --non-informative-words-regexs
280 A file in which regular expressions (regexs) are stored, one per line. These
281 regexs are used to recognize non-informative words, which will only receive a minimum
282 score in the prot-scriber process that generates human readable description. There is a
283 default list hard-coded into prot-scriber. An example file can be downloaded here:
284 https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/non_informative_words_regexs.txt.
285
286 -r, --description-split-regex
287 A regular expression to be used to split descriptions (`stitle` in Blast
288 terminology) into words. Default is '([~_\-/|\;,':.\s]+)'.
289
290 -q, --center-inverse-word-information-content-at-quantile
291 The quantile (percentile) to be subtracted from calculated inverse word information
292 content to center these values. Consequently, this must be a value between zero and one
293 or literal 50, which is interpreted as mean instead of a quantile. Default is 50,
294 implying centering at the mean.
295
296 ----
297
298 **Output**
299
300 prot-scriber outputs a single tab-separated text file with the annotated sequences or gene-families, depending on how you ran the program, one result per line::
301
302 Annotee-Identifier Human-Readable-Description
303 Soltu.DM.02G020600.1 arath strubbelig receptor family
304 Soltu.DM.S001650.1 germin member
305 Soltu.DM.03G011280.1 increased dna methylation
306 ...
307
308 ]]>
309 </help>
310 </tool>