Mercurial > repos > iuc > prot_scriber
diff prot-scriber.xml @ 0:c840a1c77a0a draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/prot-scriber commit 8b58f3f03d6430689d228029bb2eb46c16cfff23
author | iuc |
---|---|
date | Tue, 10 May 2022 13:18:05 +0000 |
parents | |
children | 1e9a43cbf524 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/prot-scriber.xml Tue May 10 13:18:05 2022 +0000 @@ -0,0 +1,310 @@ +<tool id="prot_scriber" name="prot-scriber" version="@TOOL_VERSION@" profile="21.05"> + <description>Protein annotation of short human readable descriptions</description> + <macros> + <token name="@TOOL_VERSION@">0.1.1</token> + </macros> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">prot-scriber</requirement> + </requirements> + <stdio> + <regex match="panicked" level="fatal" source="stderr" /> + </stdio> + <command> + <![CDATA['prot-scriber' + #if str($input_config.input_config_selector) == "basic" + #for $sst in $input_config.seq_sim_table + -s '$sst' + #end for + #else if str($input_config.input_config_selector) == "advanced" + #for $ssr in $input_config.advanced_input_repeat + -s '$ssr.seq_sim_table' + #if $ssr.header + -e '$ssr.header' + #end if + #if $ssr.field_separator + -p '$ssr.field_separator' + #end if + #if $ssr.blacklist_regexs + -b '$ssr.blacklist_regexs' + #end if + #if $ssr.capture_replace_pairs + -c '$ssr.capture_replace_pairs' + #end if + #if $ssr.filter_regexs + -l '$ssr.filter_regexs' + #end if + #end for + #if $input_config.expert_options.non_informative_words_regexs + -w '$input_config.expert_options.non_informative_words_regexs' + #end if + #if $input_config.expert_options.description_split_regex + -r "$input_config.expert_options.description_split_regex" + #end if + #if $input_config.expert_options.center_inverse_word_information_content_at_quantile + -q $input_config.expert_options.center_inverse_word_information_content_at_quantile + #end if + #end if + #if $seq_family.seq_families + -f '$seq_families' + #end if + #if $seq_family.annotate_non_family_queries + -a + #end if + #if $seq_family.seq_family_gene_ids_separator + -g "$seq_family_gene_ids_separator" + #end if + #if $seq_family.seq_family_id_genes_separator + -i '$seq_family_id_genes_separator' + #end if + -o '$output' + ]]> + </command> + <inputs> + <conditional name="input_config"> + <param type="select" name="input_config_selector" label="Choose input configuration options"> + <option value="basic" selected="true">Basic</option> + <option value="advanced">Advanced</option> + </param> + <when value="basic"> + <param type="data" multiple="true" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search results in tabular format (-s)" help="Files in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them. + Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." /> + </when> + <when value="advanced"> + <repeat name="advanced_input_repeat" title="Sequence similarity table" min="1" default="1"> + <param type="data" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search result in tabular format (-s)" help="File in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them. + Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)." /> + <param type="text" optional="true" name="field_separator" argument="-p" label="Field separator (-p)" help="Field-Separator of the (-s) sequence similarity table. The default value is the 'TAB' character. Set to 'default' to use the hard coded default"> + <sanitizer> + <valid initial="default"> + <add preset="string.printable" /> + </valid> + </sanitizer> + </param> + <param type="text" optional="true" name="header" argument="-e" label="Header of the sequence similarity tables (-e)" help="Header of the (-s) sequence similarity table. Separated by space (' ') the names of the + in order of appearance in the respective table. Required and default columns are 'qacc sacc stitle'. Set to 'default' to use the hard coded default" /> + <param type="data" optional="true" name="blacklist_regexs" argument="-b" format="tabular" label="Blacklist Regexs (-b)" help="A file with regular expressions, one per line. Any match to any of these + regular expressions causes sequence similarity search result descriptions ('stitle' in Blast terminology) to be discarded from the prot-scriber annotation process. Set to 'default' to use the hard coded default" /> + <param type="data" optional="true" name="capture_replace_pairs" argument="-c" format="tabular" label="Capture replace pairs (-c)" help="A file with pairs of lines. Within each pair the first line is a regular expressions + defining one or more capture groups. The second line of a pair is the string used to replace the match in the regular expression with. Set to 'default' to use the hard coded default" /> + <param type="data" optional="true" name="filter_regexs" argument="-l" format="tabular" label="Filter regexs (-l)" help="A file with regular expressions, one per line. Any match to any of these + regular expressions causes the matched sub-string to be deleted, i.e. filtered out. Set to 'default' to use the hard coded default" /> + </repeat> + <section title="Expert options" name="expert_options"> + <param type="data" optional="true" name="non_informative_words_regexs" argument="-w" format="tabular" label="Non informative words regexs (-w)" help="A file in which regular expressions (regexs) are stored, one per line. These + regexs are used to recognize non-informative words, which will only receive a minimum score in the prot-scriber process that generates human readable description." /> + <param type="text" optional="true" name="description_split_regex" argument="-r" label="Description split regex (-r)" help="A regular expression to be used to split descriptions (`stitle` in Blast + terminology) into words. Default is '([~_\-/|\;,':.\s]+)'."> + <sanitizer> + <valid initial="default"> + <add preset="string.printable" /> + </valid> + </sanitizer> + </param> + <param type="integer" optional="true" name="center_inverse_word_information_content_at_quantile" argument="-q" label="Center inverse word-information-content at quantile (-q)" help="The quantile (percentile) to be subtracted from calculated inverse word information + content to center these values. Value between 0 and 1." /> + </section> + </when> + </conditional> + <section title="Sequence family annotation" name="seq_family"> + <param type="data" optional="true" name="seq_families" argument="-f" format="tabular" label="Families of biological sequences (-f)" help="A file in which families of biological sequences are stored, one family per line. Each + line must have format 'fam_name TAB gene1,gene2,gene3'. Make sure no gene appears in + more than one family." /> + <param type="boolean" optional="true" name="annotate_non_family_queries" argument="-a" label="Annotate non family query sequences (-a)" help="Set this to true to also annotate sequences are not member of a sequence family." /> + <param type="text" optional="true" name="seq_family_gene_ids_separator" argument="-g" label="Sequence family file gene-id separator (-g)" help=" A regular expression used to split the list of gene_identifiers in the + argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'."> + <sanitizer> + <valid initial="default"> + <add preset="string.printable" /> + </valid> + </sanitizer> + </param> + <param type="text" optional="true" name="seq_family_id_genes_separator" argument="-i" label="Sequence family file family - gene-id separator (-i)" help="A string used as separator in the argument --seq-families (-f) gene families file. This + string separates the gene_family_identifier (name) from the gene_identifier list that family comprises. Default is 'TAB'."> + <sanitizer> + <valid initial="default"> + <add preset="string.printable" /> + </valid> + </sanitizer> + </param> + </section> + </inputs> + <outputs> + <data format="tabular" name="output" /> + </outputs> + <tests> + <test> + <param name="input_config_selector" value="basic"/> + <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" /> + <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" /> + <output name="output" file="8_Proteins_prot-scriber.out" sort="true" /> + </test> + <test> + <param name="input_config_selector" value="advanced" /> + <repeat name="advanced_input_repeat"> + <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" /> + <param name="field_separator" value="default" /> + <param name="header" value="qacc sacc stitle" /> + </repeat> + <repeat name="advanced_input_repeat"> + <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" /> + <param name="field_separator" value="default" /> + <param name="header" value="qacc sacc stitle" /> + </repeat> + <output name="output" file="8_Proteins_prot-scriber.out" sort="true" /> + </test> + <test> + <param name="input_config_selector" value="advanced" /> + <repeat name="advanced_input_repeat"> + <param name="seq_sim_table" value="8_Proteins_vs_Swissprot_blastp.txt" /> + <param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" /> + </repeat> + <repeat name="advanced_input_repeat"> + <param name="seq_sim_table" value="8_Proteins_vs_Trembl_blastp.txt" /> + <param name="blacklist_regexs" value="blacklist_stitle_regexs.txt" /> + </repeat> + <param name="description_split_regex" value="([~_\-/|;,':.\s]+)" /> + <param name="center_inverse_word_information_content_at_quantile" value="50" /> + <output name="output" file="8_Proteins_prot-scriber.out" sort="true" /> + </test> + </tests> + <help> + <![CDATA[ + +**What it does** + +prot-scriber_ assigns short human readable descriptions (HRD) to query biological sequences using reference candidate descriptions. +In this, prot-scriber consumes sequence similarity search (Blast or Diamond or similar) results in tabular format. +customized lexical analysis is carried out on the descriptions of these Blast Hits and a resulting HRD is assigned to the query sequences. +For more information, examples and how to use the prot-scriber commandline tool refer to the prot-scriber README_ and MANUAL_. + +.. _prot-scriber: http://github.com/usadellab/prot-scriber +.. _README: https://github.com/usadellab/prot-scriber/blob/master/README.md +.. _MANUAL: https://github.com/usadellab/prot-scriber/blob/master/README.md#manual + +---- + +**Input** + +The input file is one or multiple tabular output(s) of a sequence similarity search (Blast, Diamon or similar). +Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond). The input is done via the -s parameter:: + + -s, --seq-sim-table + File in which to find sequence similarity search results in tabular format (SSST). Use + e.g. Blast or Diamond to produce them. Required columns are: 'qacc sacc stitle' (Blast) + or 'qseqid sseqid stitle' (Diamond). If the required columns, or more, appear in different order than + shown here you must use the --header (-e) argument. If any of the input SSSTs uses a + different field-separator than the '<TAB>' character, you must provide the --field- + separator (-p) argument. You can provide multiple SSSTs for your query proteins whose information + will be combined and evaluated by the tool. + +**Input parameters** + +prot-scriber gives the user the opportunity to fine tune parameters for the provided input tables. +To do so turn on the *input configuration* switch. Those are optional, as the tool also provides sensible defaults. +In case you decide to customize your inputs using below parameters, be advised that prot-scriber expects the +customized parameter for all input tables - the number of tables and e.g. *--header* parameters have to match. +You can set the values to 'default' if you want to use the default value for a given input table:: + + -e, --header + Header of the --seq-sim-table (-s) arg. Separated by space (' ') the names of the + columns in order of appearance in the respective table. Required and default columns are + 'qacc sacc stitle'. Note that this option only understands Blast terminology, i.e. even + if you ran Diamond, please provide 'qacc' instead of 'qseqid' and 'sacc' instead of + 'sseqid'. Luckily 'stitle' is 'stitle' in Diamond, too. You can have additional columns + that will be ignored, as long as the required columns appear in the correct order. + Consider this example: 'qacc sacc evalue bitscore stitle'. Set to 'default' to use the hard coded default. + + -p, --field-separator + Field-Separator of the --seq-sim-table (-s) arg. The default value is the '<TAB>' character. + Consider this example: ','. You can provide 'default' to use the hard coded default (TAB). + + -b, --blacklist-regexs (Expert option) + A file with regular expressions, one per line. Any match to any of these + regular expressions causes sequence similarity search result descriptions ('stitle' in + Blast terminology) to be discarded from the prot-scriber annotation process. Set to 'default' to use the hard + coded default. An example file can be downloaded here: + https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/blacklist_stitle_regexs.txt. + + -l, --filter-regexs (Expert option) + A file with regular expressions, one per line. Any match to any of these + regular expressions causes the matched sub-string to be deleted, i.e. filtered out. + Filtering is used to process descriptions ('stitle' in Blast terminology) and prepare + the descriptions for the prot-scriber annotation process. In case of UniProt sequence + similarity search results (Blast result tables), this removes the Blast Hit identifier + (`sacc`) from the description (`stitle`) and also removes the taxonomic information + starting with e.g. 'OS=' at the end of the `stitle` strings. Set to 'default' to use + hard coded default. Anexample file can be downloaded here: + https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/filter_stitle_regexs.txt. + + -c, --capture-replace-pairs (Expert option) + A file with pairs of lines. Within each pair the first line is a regular expressions + defining one or more capture groups. The second line of a pair is the + string used to replace the match in the regular expression with. This means the second + line contains the capture groups. These pairs are used to further filter + the sequence similarity search result descriptions ('stitle' in Blast terminology). In + contrast to the --filter-regex (-l) matches are not deleted, but replaced with the + second line of the pair. Filtering is used to process descriptions ('stitle' in Blast + terminology) and prepare the descriptions for the prot-scriber annotation process. + Set to 'default' to use the hard coded default. An example file can be downloaded here: + https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/capture_replace_pairs.txt. + +---- + +**Gene family annotation** + +prot-scriber can also apply the same methodology to produce HRDs for sets of biological sequences, i.e. gene families:: + + -f, --seq-families + A file in which families of biological sequences are stored, one family per line. Each + line must have format 'fam-name TAB gene1,gene2,gene3'. Make sure no gene appears in + more than one family. + + -g, --seq-family-gene-ids-separator + A regular expression used to split the list of gene-identifiers in the + argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'. + + -a, --annotate-non-family-queries + Use this option only in combination with --seq-families (-f), i.e. when prot-scriber is + used to generate human readable descriptions for gene families. If in that context this + flag is given, queries for which there are sequence similarity search (Blast) results + but that are NOT member of a sequence family will receive an annotation (human readable + description) in the output file, too. Default value of this setting is 'OFF' (false). + +---- + +**Expert options** + +Some additional optional configuration. Only use when you know what you are doing:: + + -w, --non-informative-words-regexs + A file in which regular expressions (regexs) are stored, one per line. These + regexs are used to recognize non-informative words, which will only receive a minimun + score in the prot-scriber process that generates human readable description. There is a + default list hard-coded into prot-scriber. An example file can be downloaded here: + https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/non_informative_words_regexs.txt. + + -r, --description-split-regex + A regular expression to be used to split descriptions (`stitle` in Blast + terminology) into words. Default is '([~_\-/|\;,':.\s]+)'. + + -q, --center-inverse-word-information-content-at-quantile + The quantile (percentile) to be subtracted from calculated inverse word information + content to center these values. Consequently, this must be a value between zero and one + or literal 50, which is interpreted as mean instead of a quantile. Default is 50, + implying centering at the mean. + +---- + +**Output** + +prot-scriber outputs a single tab-separated text file with the annotated sequences or gene-families, depending on how you ran the program, one result per line:: + + Annotee-Identifier Human-Readable-Description + Soltu.DM.02G020600.1 arath strubbelig receptor family + Soltu.DM.S001650.1 germin member + Soltu.DM.03G011280.1 increased dna methylation + ... + + ]]> + </help> +</tool>