<!-- Mercurial > repos > iuc > prot_scriber -->

<tool id="prot_scriber" name="prot-scriber" version="@TOOL_VERSION@" profile="21.05">
    <description>Protein annotation of short human readable descriptions</description>
    <!-- @TOOL_VERSION@ is substituted into the tool version attribute and the package requirement below. -->
    <macros>
        <token name="@TOOL_VERSION@">0.1.5</token>
    </macros>
    <!-- The prot-scriber package is pinned to the same version as the wrapper. -->
    <requirements>
        <requirement version="@TOOL_VERSION@" type="package">prot-scriber</requirement>
    </requirements>
    <!-- Any stderr output matching "panicked" is treated as a fatal error. -->
    <stdio>
        <regex source="stderr" level="fatal" match="panicked"/>
    </stdio>
    <!--
        Cheetah template that assembles the prot-scriber command line.
        Basic mode passes every selected table via -s. Advanced mode emits one
        -s/-e/-p/-b/-c/-l group per repeat entry (the literal 'default' is
        passed for any value left empty), followed by the expert options.
        Parameters of the "seq_family" section are always referenced through
        their section prefix.
    -->
    <command>
    <![CDATA['prot-scriber'
    #if str($input_config.input_config_selector) == "basic"
      #for $sst in $input_config.seq_sim_table
        -s '$sst'
      #end for
    #else if str($input_config.input_config_selector) == "advanced"
      #for $ssr in $input_config.advanced_input_repeat
        -s '$ssr.seq_sim_table'
        #if $ssr.header
          -e '$ssr.header'
        #else
          -e 'default'
        #end if
        #if $ssr.field_separator
          -p '$ssr.field_separator'
        #else
          -p 'default'
        #end if
        #if $ssr.blacklist_regexs
          -b '$ssr.blacklist_regexs'
        #else
          -b 'default'
        #end if
        #if $ssr.capture_replace_pairs
          -c '$ssr.capture_replace_pairs'
        #else
          -c 'default'
        #end if
        #if $ssr.filter_regexs
          -l '$ssr.filter_regexs'
        #else
          -l 'default'
        #end if
      #end for
      #if $input_config.expert_options.non_informative_words_regexs
        -w '$input_config.expert_options.non_informative_words_regexs'
      #end if
      #if $input_config.expert_options.description_split_regex
        -r '$input_config.expert_options.description_split_regex'
      #end if
      ## Compare against the empty string so that a quantile of 0 is still passed on.
      #if str($input_config.expert_options.center_inverse_word_information_content_at_quantile) != ''
        -q $input_config.expert_options.center_inverse_word_information_content_at_quantile
      #end if
      #if $input_config.expert_options.polish_capture_replace_pairs
        -d '$input_config.expert_options.polish_capture_replace_pairs'
      #end if
    #end if
    ## Section parameters are referenced with the seq_family prefix, both in the test and in the emitted value.
    #if $seq_family.seq_families
      -f '$seq_family.seq_families'
    #end if
    #if $seq_family.annotate_non_family_queries
      -a
    #end if
    #if $seq_family.seq_family_gene_ids_separator
      -g '$seq_family.seq_family_gene_ids_separator'
    #end if
    #if $seq_family.seq_family_id_genes_separator
      -i '$seq_family.seq_family_id_genes_separator'
    #end if
    #if $exclude_not_annotated_queries
        -x
    #end if
    -o '$output'
    ]]>
    </command>
    <!--
        Tool form.
        "input_config" switches between a basic mode (one multi-select of
        similarity tables) and an advanced mode (one repeat entry per table with
        per-table settings, plus an "expert_options" section).
        "seq_family" groups the sequence family options; the -x switch is a
        top-level parameter.
    -->
    <inputs>
        <conditional name="input_config">
            <param name="input_config_selector" type="select" label="Choose input configuration options">
                <option value="basic" selected="true">Basic</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="basic">
                <param name="seq_sim_table" argument="-s" type="data" format="tabular" multiple="true"
                       label="Sequence similarity search results in tabular format (-s)"
                       help="Files in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them. Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)."/>
            </when>
            <when value="advanced">
                <repeat name="advanced_input_repeat" title="Sequence similarity table" min="1" default="1">
                    <param name="seq_sim_table" argument="-s" type="data" format="tabular"
                           label="Sequence similarity search result in tabular format (-s)"
                           help="File in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them. Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond)."/>
                    <param name="field_separator" argument="-p" type="text" optional="true"
                           label="Field separator (-p)"
                           help="Field-Separator of the (-s) sequence similarity table. The default value is the 'TAB' character. Set to 'default' to use the hard coded default">
                        <sanitizer>
                            <valid initial="default">
                                <add preset="string.printable"/>
                            </valid>
                        </sanitizer>
                    </param>
                    <param name="header" argument="-e" type="text" optional="true"
                           label="Header of the sequence similarity tables (-e)"
                           help="Header of the (-s) sequence similarity table. Separated by space (' ') the names of the columns in order of appearance in the respective table. Required and default columns are 'qacc sacc stitle'. Set to 'default' to use the hard coded default"/>
                    <!-- File inputs: when left empty the command template passes the literal 'default'. -->
                    <param name="blacklist_regexs" argument="-b" type="data" format="txt" optional="true"
                           label="Blacklist Regexs (-b)"
                           help="A file with regular expressions, one per line. Any match to any of these regular expressions causes sequence similarity search result descriptions ('stitle' in Blast terminology) to be discarded from the prot-scriber annotation process. Leave empty to use the hard coded default."/>
                    <param name="capture_replace_pairs" argument="-c" type="data" format="txt" optional="true"
                           label="Capture replace pairs (-c)"
                           help="A file with pairs of lines. Within each pair the first line is a regular expression defining one or more capture groups. The second line of a pair is the string used to replace the match in the regular expression with. Leave empty to use the hard coded default."/>
                    <param name="filter_regexs" argument="-l" type="data" format="txt" optional="true"
                           label="Filter regexs (-l)"
                           help="A file with regular expressions, one per line. Any match to any of these regular expressions causes the matched sub-string to be deleted, i.e. filtered out. Leave empty to use the hard coded default."/>
                </repeat>
                <section title="Expert options" name="expert_options">
                    <param name="non_informative_words_regexs" argument="-w" type="data" format="txt" optional="true"
                           label="Non informative words regexs (-w)"
                           help="A file in which regular expressions (regexs) are stored, one per line. These regexs are used to recognize non-informative words, which will only receive a minimum score in the prot-scriber process that generates human readable description."/>
                    <param name="description_split_regex" argument="-r" type="text" optional="true"
                           label="Description split regex (-r)"
                           help="A regular expression to be used to split descriptions (`stitle` in Blast terminology) into words. Default is '([~_\-/|\;,':.\s]+)'.">
                        <sanitizer>
                            <valid initial="default">
                                <add preset="string.printable"/>
                            </valid>
                        </sanitizer>
                    </param>
                    <!-- Float rather than integer: the documented range is 0 to 1, with 50 as the special "mean" value. -->
                    <param name="center_inverse_word_information_content_at_quantile" argument="-q" type="float" optional="true"
                           label="Center inverse word-information-content at quantile (-q)"
                           help="The quantile (percentile) to be subtracted from calculated inverse word information content to center these values. Value between 0 and 1, or the literal 50, which is interpreted as the mean (default)."/>
                    <param name="polish_capture_replace_pairs" argument="-d" type="data" format="txt" optional="true"
                           label="Polishing capture replace pairs (-d)"
                           help="A file with pairs of lines. Defines pairs of regex / replace pairs for post polishing of annotation results. Provide an empty file to suppress polishing; leave unset to use the default polishing."/>
                </section>
            </when>
        </conditional>
        <section title="Sequence family annotation" name="seq_family">
            <param name="seq_families" argument="-f" type="data" format="tabular" optional="true"
                   label="Families of biological sequences (-f)"
                   help="A file in which families of biological sequences are stored, one family per line. Each line must have format 'fam_name TAB gene1,gene2,gene3'. Make sure no gene appears in more than one family."/>
            <param name="annotate_non_family_queries" argument="-a" type="boolean" optional="true"
                   label="Annotate non family query sequences (-a)"
                   help="Set this to true to also annotate sequences that are not members of a sequence family."/>
            <param name="seq_family_gene_ids_separator" argument="-g" type="text" optional="true"
                   label="Sequence family file gene-id separator (-g)"
                   help="A regular expression used to split the list of gene_identifiers in the argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'.">
                <sanitizer>
                    <valid initial="default">
                        <add preset="string.printable"/>
                    </valid>
                </sanitizer>
            </param>
            <param name="seq_family_id_genes_separator" argument="-i" type="text" optional="true"
                   label="Sequence family file family - gene-id separator (-i)"
                   help="A string used as separator in the argument --seq-families (-f) gene families file. This string separates the gene_family_identifier (name) from the gene_identifier list that family comprises. Default is 'TAB'.">
                <sanitizer>
                    <valid initial="default">
                        <add preset="string.printable"/>
                    </valid>
                </sanitizer>
            </param>
        </section>
        <param name="exclude_not_annotated_queries" argument="-x" type="boolean" optional="true"
               label="Exclude not annotated query sequences (-x)"
               help="Use this option to exclude results from the output table that could not be annotated."/>
    </inputs>
    <!-- Single tabular dataset; the command template writes to it via -o. -->
    <outputs>
        <data name="output" format="tabular"/>
    </outputs>
    <!-- All three tests compare against the same expected output file, with sorting enabled. -->
    <tests>
        <!-- Basic mode: two similarity tables passed through the multi-select. -->
        <test>
            <param value="basic" name="input_config_selector"/>
            <param value="8_Proteins_vs_Swissprot_blastp.txt,8_Proteins_vs_Trembl_blastp.txt" name="seq_sim_table"/>
            <output file="8_Proteins_prot-scriber.out" name="output" sort="true"/>
        </test>
        <!-- Advanced mode: one repeat entry per table, each with an explicit field separator and header. -->
        <test>
            <param value="advanced" name="input_config_selector"/>
            <repeat name="advanced_input_repeat">
                <param value="8_Proteins_vs_Swissprot_blastp.txt" name="seq_sim_table"/>
                <param value="default" name="field_separator"/>
                <param value="qacc sacc stitle" name="header"/>
            </repeat>
            <repeat name="advanced_input_repeat">
                <param value="8_Proteins_vs_Trembl_blastp.txt" name="seq_sim_table"/>
                <param value="default" name="field_separator"/>
                <param value="qacc sacc stitle" name="header"/>
            </repeat>
            <output file="8_Proteins_prot-scriber.out" name="output" sort="true"/>
        </test>
        <!-- Advanced mode: a blacklist file per table plus the expert options -r and -q. -->
        <test>
            <param value="advanced" name="input_config_selector"/>
            <repeat name="advanced_input_repeat">
                <param value="8_Proteins_vs_Swissprot_blastp.txt" name="seq_sim_table"/>
                <param value="blacklist_stitle_regexs.txt" name="blacklist_regexs"/>
            </repeat>
            <repeat name="advanced_input_repeat">
                <param value="8_Proteins_vs_Trembl_blastp.txt" name="seq_sim_table"/>
                <param value="blacklist_stitle_regexs.txt" name="blacklist_regexs"/>
            </repeat>
            <param value="([~_\-/|;,'\'':.\s]+)" name="description_split_regex"/>
            <param value="50" name="center_inverse_word_information_content_at_quantile"/>
            <output file="8_Proteins_prot-scriber.out" name="output" sort="true"/>
        </test>
    </tests>
    <help>
    <![CDATA[

**What it does**

prot-scriber_ assigns short human readable descriptions (HRD) to query biological sequences using reference candidate descriptions.
In this, prot-scriber consumes sequence similarity search (Blast or Diamond or similar) results in tabular format.
A customized lexical analysis is carried out on the descriptions of these Blast Hits and a resulting HRD is assigned to the query sequences.
For more information, examples and how to use the prot-scriber commandline tool refer to the prot-scriber README_ and MANUAL_.

.. _prot-scriber: http://github.com/usadellab/prot-scriber
.. _README: https://github.com/usadellab/prot-scriber/blob/master/README.md
.. _MANUAL: https://github.com/usadellab/prot-scriber/blob/master/README.md#manual

----

**Input**

The input file is one or multiple tabular output(s) of a sequence similarity search (Blast, Diamond or similar).
Required columns are: 'qacc sacc stitle' (Blast) or 'qseqid sseqid stitle' (Diamond). The input is done via the -s parameter::

    -s, --seq-sim-table
            File in which to find sequence similarity search results in tabular format (SSST). Use
            e.g. Blast or Diamond to produce them. Required columns are: 'qacc sacc stitle' (Blast)
            or 'qseqid sseqid stitle' (Diamond). If the required columns, or more, appear in different order than
            shown here you must use the --header (-e) argument. If any of the input SSSTs uses a
            different field-separator than the '<TAB>' character, you must provide the --field-
            separator (-p) argument. You can provide multiple SSSTs for your query proteins whose information
            will be combined and evaluated by the tool.

**Input parameters**

prot-scriber gives the user the opportunity to fine tune parameters for the provided input tables.
To do so turn on the *input configuration* switch. Those are optional, as the tool also provides sensible defaults.
In case you decide to customize your inputs using below parameters, be advised that prot-scriber expects the
customized parameter for all input tables - the number of tables and e.g. *--header* parameters have to match.
You can set the values to 'default' if you want to use the default value for a given input table::

    -e, --header
            Header of the --seq-sim-table (-s) arg. Separated by space (' ') the names of the
            columns in order of appearance in the respective table. Required and default columns are
            'qacc sacc stitle'. Note that this option only understands Blast terminology, i.e. even
            if you ran Diamond, please provide 'qacc' instead of 'qseqid' and 'sacc' instead of
            'sseqid'. Luckily 'stitle' is 'stitle' in Diamond, too. You can have additional columns
            that will be ignored, as long as the required columns appear in the correct order.
            Consider this example: 'qacc sacc evalue bitscore stitle'. Set to 'default' to use the hard coded default.

    -p, --field-separator
            Field-Separator of the --seq-sim-table (-s) arg. The default value is the '<TAB>' character.
            Consider this example: ','. You can provide 'default' to use the hard coded default (TAB).

    -b, --blacklist-regexs (Expert option)
            A file with regular expressions, one per line. Any match to any of these
            regular expressions causes sequence similarity search result descriptions ('stitle' in
            Blast terminology) to be discarded from the prot-scriber annotation process. Set to 'default' to use the hard
            coded default. An example file can be downloaded here:
            https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/blacklist_stitle_regexs.txt.

    -l, --filter-regexs (Expert option)
            A file with regular expressions, one per line. Any match to any of these
            regular expressions causes the matched sub-string to be deleted, i.e. filtered out.
            Filtering is used to process descriptions ('stitle' in Blast terminology) and prepare
            the descriptions for the prot-scriber annotation process. In case of UniProt sequence
            similarity search results (Blast result tables), this removes the Blast Hit identifier
            (`sacc`) from the description (`stitle`) and also removes the taxonomic information
            starting with e.g. 'OS=' at the end of the `stitle` strings. Set to 'default' to use
            the hard coded default. An example file can be downloaded here:
            https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/filter_stitle_regexs.txt.

    -c, --capture-replace-pairs (Expert option)
            A file with pairs of lines. Within each pair the first line is a regular expressions
            defining one or more capture groups. The second line of a pair is the
            string used to replace the match in the regular expression with. This means the second
            line contains the capture groups. These pairs are used to further filter
            the sequence similarity search result descriptions ('stitle' in Blast terminology). In
            contrast to the --filter-regex (-l) matches are not deleted, but replaced with the
            second line of the pair. Filtering is used to process descriptions ('stitle' in Blast
            terminology) and prepare the descriptions for the prot-scriber annotation process.
            Set to 'default' to use the hard coded default. An example file can be downloaded here:
            https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/capture_replace_pairs.txt.

----

**Gene family annotation**

prot-scriber can also apply the same methodology to produce HRDs for sets of biological sequences, i.e. gene families::

    -f, --seq-families
            A file in which families of biological sequences are stored, one family per line. Each
            line must have format 'fam-name TAB gene1,gene2,gene3'. Make sure no gene appears in
            more than one family.

    -g, --seq-family-gene-ids-separator
            A regular expression used to split the list of gene-identifiers in the
            argument --seq-families (-f) gene families file. Default is '(\s*,\s*|\s+)'.

    -a, --annotate-non-family-queries
            Use this option only in combination with --seq-families (-f), i.e. when prot-scriber is
            used to generate human readable descriptions for gene families. If in that context this
            flag is given, queries for which there are sequence similarity search (Blast) results
            but that are NOT member of a sequence family will receive an annotation (human readable
            description) in the output file, too. Default value of this setting is 'OFF' (false).

----

**Expert options**

Some additional optional configuration. Only use when you know what you are doing::

    -w, --non-informative-words-regexs
            A file in which regular expressions (regexs) are stored, one per line. These
            regexs are used to recognize non-informative words, which will only receive a minimum
            score in the prot-scriber process that generates human readable description. There is a
            default list hard-coded into prot-scriber. An example file can be downloaded here:
            https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/non_informative_words_regexs.txt.

    -r, --description-split-regex
            A regular expression to be used to split descriptions (`stitle` in Blast
            terminology) into words. Default is '([~_\-/|\;,':.\s]+)'.

    -q, --center-inverse-word-information-content-at-quantile
            The quantile (percentile) to be subtracted from calculated inverse word information
            content to center these values. Consequently, this must be a value between zero and one
            or literal 50, which is interpreted as mean instead of a quantile. Default is 50,
            implying centering at the mean.

    -d, --polish-capture-replace-pairs
            The last step of the process generating human readable descriptions (HRDs) for the
            queries (proteins or sequence families) is to 'polish' the selected HRDs. Polishing is
            done by iterative application of regular expressions (fancy-regex) and replace
            instructions (capture-replace-pairs). If you do not want to use the default polishing
            capture replace pairs specify a file in which pairs of lines are given. Of each pair the
            first line hold a regular expression (fancy-regex syntax) and the second the replacement
            instructions providing access to capture groups. Set to 'none' or provide an empty file,
            if you want to suppress polishing. If you want to have a template file for your custom
            polishing capture-replace-pairs please refer to
            https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/polish_capture_replace_pairs.txt

----

**Output**

prot-scriber outputs a single tab-separated text file with the annotated sequences or gene-families, depending on how you ran the program, one result per line::

    Annotee-Identifier	Human-Readable-Description
    Soltu.DM.02G020600.1	arath strubbelig receptor family
    Soltu.DM.S001650.1	germin member
    Soltu.DM.03G011280.1	increased dna methylation
    ...

If you want to suppress results from the output table that could not be annotated, i.e. 'unknown protein' or 'unknown sequence family' respectively, use the '-x' parameter::

    -x, --exclude-not-annotated-queries
            Exclude results from the output table that could not be annotated, i.e. 'unknown
            protein' or 'unknown sequence family', respectively.

    ]]>
  </help>
    <citations>
        <!-- BibTeX entry citing the prot-scriber GitHub repository; the entry text is left untouched. -->
        <citation type="bibtex">
            @misc{githubprot-scriber,
            author = {Asis Hallab},
            year = {2024},
            title = {prot-scriber},
            publisher = {Github},
            journal = {Github repository},
            url = {https://github.com/usadellab/prot-scriber},
        }</citation>
    </citations>
</tool>
<!--
author	iuc
date	Sat, 18 May 2024 20:36:38 +0000
parents	4d4df9779b7b
children
-->