view proteinortho.xml @ 9:6140163233a5 draft default tip

planemo upload for repository https://gitlab.com/paulklemm_PHD/proteinortho commit e151cf96893602bf011c27a2d91df1ef594b774d
author iuc
date Fri, 13 Dec 2024 10:19:09 +0000
parents c5dd4f86d981
children
line wrap: on
line source

<tool id="proteinortho" name="Proteinortho" version="@TOOL_VERSION@+galaxy@WRAPPER_VERSION@" profile="@PROFILE@">
    <description>detects orthologous proteins/genes within different species</description>
    <macros>
        <import>proteinortho_macros.xml</import>
        <!-- Test macro: assertions for the "proteinortho" (orthology groups) output.
             Tokens: @NLINES@ is the expected line count, @NLINES_DELTA@ the allowed
             deviation (defaults to 0). Seven columns are expected: the three summary
             columns plus one column for each of the four test fasta files. -->
        <xml name="test_output_proteinortho" tokens="nlines" token_nlines_delta="0">
            <output name="proteinortho">
                <metadata name="column_names" value="species,genes,alg.-conn.,L.fasta,C.fasta,E.fasta,M.fasta"/>
                <assert_contents>
                    <has_n_columns n="7"/>
                    <has_n_lines n="@NLINES@" delta="@NLINES_DELTA@"/>
                    <has_line_matching expression="# Species\tGenes\tAlg\.-Conn\.\t.*"/>
                    <has_line_matching expression="[0-9]+\t[0-9]+\t.*"/>
                    <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+.*"/>
                </assert_contents>
            </output>
        </xml>
        <!-- Test macro: assertions for the "blastgraph" (RBH graph) output.
             Tokens: @NLINES@ is the expected line count, @NLINES_DELTA@ the allowed
             deviation (defaults to 0). Checks the "#"-prefixed header lines and that
             at least one data line pairs two sequence ids of the test species. -->
        <xml name="test_output_blastgraph" tokens="nlines" token_nlines_delta="0">
            <output name="blastgraph">
                <metadata name="column_names" value="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
                <assert_contents>
                    <has_n_columns n="6" comment="#"/>
                    <has_n_lines n="@NLINES@" delta="@NLINES_DELTA@"/>
                    <has_line_matching expression="# file_a\tfile_b"/>
                    <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba"/>
                    <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/>
                    <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/>
                </assert_contents>
            </output>
        </xml>
        <!-- Test macro: assertions for the "proteinorthograph" (orthology pairs) output.
             Tokens: @NLINES@ / @NLINES_DELTA@ as in the other test macros;
             @NCOLUMNS@ is the expected column count (default 6) and @ADD_COLUMNS@ is
             appended to the expected column names (default empty). The synteny test
             passes ncolumns="8" and add_columns=",same_strand,simscore". -->
        <xml name="test_output_proteinorthograph" tokens="nlines" token_nlines_delta="0" token_add_columns="" token_ncolumns="6">
            <output name="proteinorthograph">
                <metadata name="column_names" value="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba@ADD_COLUMNS@"/>
                <assert_contents>
                    <has_n_columns n="@NCOLUMNS@" comment="#"/>
                    <has_n_lines n="@NLINES@" delta="@NLINES_DELTA@"/>
                    <has_line_matching expression="# file_a\tfile_b"/>
                    <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba(\tsame_strand\tsimscore)?"/>
                    <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/>
                    <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/>
                </assert_contents>
            </output>
        </xml>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Proteinortho uses the input file names as species names in its output,
        ## so every dataset is symlinked into the working directory under its
        ## element identifier. Characters outside [A-Za-z0-9_.-] (and other word
        ## characters) are replaced by "_" to obtain safe file names.
        #import re
        #for $f in $input_files
            ln -sf '$f' '${re.sub('[^\w\-_.]', '_', f.element_identifier)}' &&
        #end for
        #if $synteny.synteny_options == "specified":
            #for $f in $synteny.input_files_syn
                ln -sf '$f' '${re.sub('[^\w\-_.]', '_', f.element_identifier)}' &&
            #end for
        #end if
        proteinortho
            --project=result
            --cpus="\${GALAXY_SLOTS:-4}"
            #if $more_options.selfblast:
                $more_options.selfblast
            #end if
            #if $more_options.singles:
                $more_options.singles
            #end if
            #if $more_options.core:
                $more_options.core
            #end if
            --p=$p
            --e=$more_options.evalue
            --conn=$conn
            ## The numeric thresholds are compared as strings against "" so that
            ## an explicit 0 (a legal value) is still passed to proteinortho
            ## instead of silently falling back to the tool's built-in default.
            #if str($more_options.cov) != "":
                --cov=$more_options.cov
            #end if
            #if str($sim) != "":
                ## the form takes a percentage, proteinortho expects a fraction
                --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$sim/100}"`
            #end if
            #if str($more_options.identity) != "":
                --identity=$more_options.identity
            #end if
            #if $more_options.isoform != "no":
                --isoform=$more_options.isoform
            #end if
            #if $synteny.synteny_options == "specified":
                --synteny
                --dups=$synteny.dups
                --cs=$synteny.cs
                --alpha=$synteny.alpha
            #end if
            #for $f in $input_files
                '${re.sub('[^\w\-_.]', '_', f.element_identifier)}'
            #end for
            #if $synteny.synteny_options == "specified":
                #for $f in $synteny.input_files_syn
                    '${re.sub('[^\w\-_.]', '_', f.element_identifier)}'
                #end for
            #end if
            ## filter terminal control sequences ("<any char>[<digits>m|G|K") out of stderr
            2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2)
        ## Rename the mode-specific result files to the names the outputs collect.
        #if $more_options.selfblast:
            &&
            mv result.blast-graph_clean result.blast-graph
        #end if
        #if $synteny.synteny_options == "specified":
            &&
            mv result.poff-graph result.proteinortho-graph &&
            mv result.poff.tsv result.proteinortho.tsv &&
            mv result.poff.html result.proteinortho.html
        #end if
    ]]></command>
    <inputs>
        <!-- Main parameters: input sequences, search algorithm and the two key thresholds -->
        <param name="input_files" format="fasta" type="data" multiple="true" min="2" label="Select the input fasta files (at least 2)" help="The input fasta files. At least 2 are needed!"/>
        <param argument="--p" type="select" label="Similarity comparison algorithm" help="In the first step of proteinortho an all-versus-all reciprocal best hit graph is built from the input files (using this algorithm).">
            <option value="diamond" selected="true">diamond (aminoacid sequences)</option>
            <option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option>
            <option value="blastp">NCBI-BLASTP+ (protein sequences)</option>
            <option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option>
            <option value="mmseqsp">MMseqs2 (aminoacid sequences)</option>
            <option value="mmseqsn">MMseqs2 (nucleotide sequences)</option>
            <option value="lastp">Last (aminoacid sequences)</option>
            <option value="lastn">Last (nucleotide sequences)</option>
            <option value="blatp">BLAT (aminoacid sequences)</option>
            <option value="blatn">BLAT (nucleotide sequences)</option>
        </param>
        <!-- Entered as a percentage; the command converts it to the fraction proteinortho expects -->
        <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal reciprocal similarity in %" help="This and --evalue are the main parameters for the generation of the reciprocal best hit graph. 100 = only the best reciprocal hits are reported, 0 = all possible reciprocal blast matches (within the E-value cutoff) are reported."/>
        <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. With larger values more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/>
        <section name="more_options" title="Additional Options" expanded="False">
            <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values result in more false positives (connections between proteins)."/>
            <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/>
            <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/>
            <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs (not compatible with synteny)"/>
            <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit"/>
            <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the initial connected component." help="Overrules the --conn threshold."/>
            <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format.">
                <option value="no" selected="true">Don't use isoform information</option>
                <option value="ncbi">ncbi style (..._additional.fasta)</option>
                <option value="uniprot">uniprot style (...isoform of...)</option>
                <option value="trinity">trinity style (...i4)</option>
            </param>
        </section>
        <!-- Synteny (POFF) mode: needs one GFF file per fasta file plus its own tuning parameters -->
        <conditional name="synteny">
            <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015. (Not compatible with selfblast)">
                <option value="no" selected="true">no</option>
                <option value="specified">yes</option>
            </param>
            <when value="no"/>
            <when value="specified">
                <param argument="--dups" type="integer" value="0" min="0" max="100" label="Number of reiterations for adjacencies heuristic, to determine duplicated regions"/>
                <param argument="--cs" type="integer" value="3" min="0" max="100" label="Size of a maximum common substring (MCS) for adjacency matches"/>
                <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Weight of adjacencies vs. sequence similarity" help="alpha[FF-adj score] + (1−alpha)[BLAST score]"/>
                <param name="input_files_syn" type="data" format="gff" multiple="true" min="2" label="Select the GFF3 files matching the input fasta files" help="The GFF3 files need matching names with the input fasta files. If you provide mybacteria123.faa or mybacteria123.fasta ... then you need to provide mybacteria123.gff here accordingly. The attributes column (#9) must contain the attribute Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective (protein) identifier in the FASTA input. For example see https://gitlab.com/paulklemm_PHD/proteinortho/-/blob/master/test/C.gff"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Reciprocal best hit graph, collected from result.blast-graph in the working directory -->
        <data name="blastgraph" format="tabular" label="${tool.name} on ${on_string}: RBH graph" from_work_dir="result.blast-graph">
            <actions>
                <action name="column_names" type="metadata"
                    default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
            </actions>
        </data>
        <!-- Orthology groups: three fixed columns followed by one column per input dataset.
             NOTE(review): the column names use the raw element identifiers, whereas the command
             symlinks the inputs under sanitized identifiers; confirm that the two always agree. -->
        <data name="proteinortho" format="tabular" label="${tool.name} on ${on_string}: orthology-groups" from_work_dir="result.proteinortho.tsv">
            <actions>
                <action name="column_names" type="metadata"
                    default="species,genes,alg.-conn.,${','.join([ f.element_identifier for f in $input_files ])}"/>
            </actions>
        </data>
        <!-- Orthology pairs: the column names gain same_strand and simscore when synteny is enabled -->
        <data name="proteinorthograph" format="tabular" label="${tool.name} on ${on_string}: orthology-pairs" from_work_dir="result.proteinortho-graph">
            <actions>
                <conditional name="synteny.synteny_options">
                    <when value="no">
                        <action name="column_names" type="metadata" default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
                    </when>
                    <when value="specified">
                        <action name="column_names" type="metadata" default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba,same_strand,simscore"/>
                    </when>
                </conditional>
            </actions>
        </data>
    </outputs>
    <tests>
        <test expect_num_outputs="3"> <!-- default parameters with diamond as search algorithm -->
            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
            <param name="p" value="diamond"/>
            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
            <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
            <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
            <assert_command>
                <has_text text="--p=diamond"/>
            </assert_command>
        </test>
        <test expect_num_outputs="3"> <!-- default parameters with MMseqs2 (aminoacid) as search algorithm -->
            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
            <param name="p" value="mmseqsp"/>
            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
            <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
            <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
            <assert_command>
                <has_text text="--p=mmseqsp"/>
            </assert_command>
        </test>
        <test expect_num_outputs="3"> <!-- non-default conn and sim plus cov, identity, singles and core from the "Additional Options" section -->
            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
            <param name="p" value="diamond"/>
            <param name="conn" value="1"/>
            <param name="sim" value="42"/>
            <section name="more_options">
                <param name="cov" value="42"/>
                <param name="identity" value="42"/>
                <param name="singles" value="true"/>
                <param name="core" value="true"/>
            </section>
            <expand macro="test_output_proteinortho" nlines="151" nlines_delta="50"/>
            <expand macro="test_output_blastgraph" nlines="1403" nlines_delta="300"/>
            <expand macro="test_output_proteinorthograph" nlines="239" nlines_delta="150"/>
            <assert_command>
                <has_text text="--p=diamond"/>
            </assert_command>
        </test>
        <test expect_num_outputs="3"> <!-- synteny (POFF): GFF files are supplied inside the synteny conditional they belong to -->
            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
            <param name="p" value="diamond"/>
            <conditional name="synteny">
                <param name="synteny_options" value="specified"/>
                <param name="input_files_syn" value="L.gff,C.gff,E.gff,M.gff"/>
            </conditional>
            <expand macro="test_output_proteinortho" nlines="38" nlines_delta="20"/>
            <expand macro="test_output_blastgraph" nlines="300" nlines_delta="150"/>
            <!-- synteny adds the same_strand and simscore columns to the orthology pairs output -->
            <expand macro="test_output_proteinorthograph" nlines="119" nlines_delta="10" ncolumns="8" add_columns=",same_strand,simscore"/>
            <assert_command>
                <has_text text="--p=diamond"/>
                <has_text text="--synteny"/>
            </assert_command>
        </test>
        <test expect_num_outputs="3"> <!-- default parameters with NCBI-BLASTP+ as search algorithm -->
            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
            <param name="p" value="blastp"/>
            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
            <expand macro="test_output_blastgraph" nlines="155" nlines_delta="50"/>
            <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="50"/>
            <assert_command>
                <has_text text="--p=blastp"/>
            </assert_command>
        </test>
        <test expect_num_outputs="3"> <!-- default parameters with the auto-detected NCBI-BLAST flavour -->
            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
            <param name="p" value="autoblast"/>
            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
            <expand macro="test_output_blastgraph" nlines="157" nlines_delta="50"/>
            <expand macro="test_output_proteinorthograph" nlines="136" nlines_delta="50"/>
            <assert_command>
                <has_text text="--p=autoblast"/>
            </assert_command>
        </test>
        <test expect_num_outputs="3"> <!-- default parameters with Last (aminoacid) as search algorithm -->
            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
            <param name="p" value="lastp"/>
            <expand macro="test_output_proteinortho" nlines="34" nlines_delta="20"/>
            <expand macro="test_output_blastgraph" nlines="148" nlines_delta="50"/>
            <expand macro="test_output_proteinorthograph" nlines="134" nlines_delta="50"/>
            <assert_command>
                <has_text text="--p=lastp"/>
            </assert_command>
        </test>
        <test expect_num_outputs="3"> <!-- default parameters with BLAT (aminoacid) as search algorithm -->
            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
            <param name="p" value="blatp"/>
            <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
            <expand macro="test_output_blastgraph" nlines="56" nlines_delta="50"/>
            <expand macro="test_output_proteinorthograph" nlines="56" nlines_delta="50"/>
            <assert_command>
                <has_text text="--p=blatp"/>
            </assert_command>
        </test>
    </tests>
    <help><![CDATA[Proteinortho with POFF - An orthology detection tool

**What it does**

Proteinortho is a tool to detect orthologous proteins/genes within different species (at least 2). 

  | It compares similarities of given gene/protein sequences and clusters them to find significant groups.
  | The algorithm was designed to handle large-scale data and can be applied to hundreds of species at once.
  | Details can be found in (doi:10.1186/1471-2105-12-124 and doi:10.3389/fbinf.2023.1322477).
  | To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (details see doi:10.1371/journal.pone.0105015), is already built in Proteinortho. 

----

**Proteinortho in a nutshell**

----

* **(i) Build adaptive reciprocal best hit graph (RBH)**

      | Using the blast algorithm (diamond,blast,blat,...) all input sequences are compared against each other.
      | If two proteins find each other with respect to multiple criteria like minimal evalue, and similarity compared to the best hit, ... then an edge is drawn between the two proteins.
      | The result of this step is written to the RBH output.

* **(ii) Cluster the RBH**

      | A spectral clustering algorithm is used to remove weak connections, reducing false positives.
      | The connected components from this process are output as orthology groups or pairs.

----

**Proteinortho output files**

----

* **RBH**

      | The result of the (i) step, the reciprocal best hit graph. 
      | The first two comment lines announce the 2 species being compared (# ecoli.faa   human.faa) and the median values (evalue_ab,bitscore_ab,evalue_ba,bitscore_ba).
      | Following these header lines, each line corresponds to a reciprocal best hit of 2 proteins/genes (columns 1 and 2) of the announced species. The output format is shown below.
      | *seqidA*,*seqidB* = the 2 ids/names of the proteins involved 
      | *evalue_ab* = evalue with seqidA as query and seqidB as part of the database 
      | *bitscore_ab* = bitscore with seqidA as query ...
      | *evalue_ba* = evalue with seqidB as query ...
      | *bitscore_ba* = bitscore with seqidB as query ...

.. csv-table::
    
    seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba    
    # ecoli.faa,human.faa
    # 1.91e-112,357.5,1.825e-113,360
    L_10,C_10;test,4.32e-151,447,4.30e-151,446
    L_11,C_11,1.17e-68,209,3.00e-69,210
    L_14,C_14,3.64e-139,422,1.19e-142,431
    L_15,C_15,3.51e-100,303,2.12e-102,308
    L_16,C_16,3.75e-49,157,7.06e-50,159
    L_17,C_17,2.96e-195,578,5.50e-196,579

----

* **orthology-groups**

      | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups.
      | Every line corresponds to an orthology group. 
      | The first 3 columns characterize the general properties of that group: number of species, number of proteins (genes), and algebraic connectivity. The higher the algebraic connectivity, the more edges there are and the better the group is connected to itself.
      | Then a column for each species follows containing the proteins of these species. 
      | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity.
      | The '*' represents that this species does not contribute to the group.

.. csv-table::
    
    Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,mouse.faa
    5,5,0.715,C_10,C_10;test,E_10,L_10,M_10
    4,6,0.115,*,C_12,E_315,L_313,M_313
    4,5,0.167,*,C_63,E_19,L_19,M_19
    4,4,0.816,*,C_64,E_18,L_18,M_18

----

      | The first group is comprised of 5 proteins of 5 species: 'C_10' of ecoli.faa, 'C_10;test' of human.faa, 'E_10' of snail.faa, 'L_10' of wale.faa, and 'M_10' of mouse.faa.
      | The alg.-conn. (algebraic connectivity) of 0.715 indicates the connectivity of this group, the higher the more edges are connecting these 5 proteins (at most there can be 10 and at least there need to be 4).
      | The second group contains 6 proteins distributed over 4 species. The star indicates the species where no protein was found (in this case ecoli.faa).

.. csv-table::
    
    seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba    
    # ecoli.faa,human.faa
    # 1.91e-112,357.5,1.825e-113,360
    L_10,C_10;test,4.32e-151,447,4.30e-151,446
    L_11,C_11,1.17e-68,209,3.00e-69,210
    L_14,C_14,3.64e-139,422,1.19e-142,431
    L_15,C_15,3.51e-100,303,2.12e-102,308
    L_16,C_16,3.75e-49,157,7.06e-50,159
    L_17,C_17,2.96e-195,578,5.50e-196,579

----

* **orthology-pairs**

      | Similar to orthology groups, but each edge is printed individually.
      | The output is formatted the same as the RBH graph.
      | For example, extracting all hits of the second group of the example orthology-group output ('4,6,0.115,*,C_12,E_315,L_313,M_313') using grep (-E, regular expression="(C_12|E_315|L_313|M_313).*(C_12|E_315|L_313|M_313)", input file=proteinortho-graph) would reveal all edges of this group:

.. csv-table::

    seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba    
    M_313,C_12,1.18e-115,407,6.12e-116,407
    C_12,E_315,4.50e-127,445,4.09e-127,445
    L_313,M_313,0.00e+00,1368,0.00e+00,1368
    L_313,C_12,3.76e-114,402,1.94e-114,402

----

    | Especially L_313 and M_313 are very similar, probably identical.
    | The group contains 4 edges out of the 6 possible edges for a group of 4 proteins. The missing edges are M_313-E_315 as well as L_313-E_315. This means that E_315 is only connected to the other 3 proteins via C_12 and thus could be considered as a weak link in the group.

**Proteinortho-Tools for downstream analysis**

* `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10).
* `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other.

More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho

]]>
    </help>
    <citations>
        <!-- Proteinortho publications cited in the help text -->
        <citation type="doi">10.3389/fbinf.2023.1322477</citation>
        <citation type="doi">10.1186/1471-2105-12-124</citation>
        <!-- PoFF, the synteny extension cited in the help text -->
        <citation type="doi">10.1371/journal.pone.0105015</citation>
    </citations>
</tool>