Mercurial > repos > iuc > proteinortho

diff proteinortho.xml @ 5:5532c0e5d4a6 draft
planemo upload for repository https://gitlab.com/paulklemm_PHD/proteinortho commit b4d8b8da2a259973c9ad90e4b9d1a3e22ae4348f
author: iuc
date: Fri, 16 Jun 2023 20:52:41 +0000
parents: a8addd4fb60a
children: 10112d9127af
--- a/proteinortho.xml	Tue Nov 22 16:49:50 2022 +0000
+++ b/proteinortho.xml	Fri Jun 16 20:52:41 2023 +0000
@@ -2,24 +2,37 @@
     <description>detects orthologous proteins/genes within different species</description>
     <macros>
         <import>proteinortho_macros.xml</import>
-        <xml name="test_outputs">
+        <xml name="test_output_proteinortho" tokens="nlines">
             <output name="proteinortho">
+                <metadata name="column_names" value="species,genes,alg.-conn.,L.fasta,C.fasta,E.fasta,M.fasta"/>
                 <assert_contents>
+                    <has_n_columns n="7"/>
+                    <has_n_lines n="@NLINES@"/>
                     <has_line_matching expression="# Species\tGenes\tAlg\.-Conn\.\t.*"/>
                     <has_line_matching expression="[0-9]+\t[0-9]+\t.*"/>
                     <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+.*"/>
                 </assert_contents>
             </output>
+        </xml>
+        <xml name="test_output_blastgraph" tokens="nlines">
             <output name="blastgraph">
+                <metadata name="column_names" value="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
                 <assert_contents>
+                    <has_n_columns n="6" comment="#"/>
+                    <has_n_lines n="@NLINES@"/>
                     <has_line_matching expression="# file_a\tfile_b"/>
                     <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba"/>
                     <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/>
                     <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/>
                 </assert_contents>
             </output>
+        </xml>
+        <xml name="test_output_proteinorthograph" tokens="nlines" token_nlines_delta="0" token_add_columns="" token_ncolumns="6">
             <output name="proteinorthograph">
+                <metadata name="column_names" value="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba@ADD_COLUMNS@"/>
                 <assert_contents>
+                    <has_n_columns n="@NCOLUMNS@" comment="#"/>
+                    <has_n_lines n="@NLINES@" delta="@NLINES_DELTA@"/>
                     <has_line_matching expression="# file_a\tfile_b"/>
                     <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba(\tsame_strand\tsimscore)?"/>
                     <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/>
@@ -44,21 +57,23 @@
         proteinortho 
             --project=result
             --cpus="\${GALAXY_SLOTS:-4}"
-            --ram="\${GALAXY_MEMORY_MB:-16000}"
             #if $more_options.selfblast:
                 $more_options.selfblast
             #end if
             #if $more_options.singles:
                 $more_options.singles
             #end if
+            #if $more_options.core:
+                $more_options.core
+            #end if
             --p=$p
-            --e=$evalue
+            --e=$more_options.evalue
             --conn=$conn
             #if $more_options.cov:
                 --cov=$more_options.cov
             #end if
-            #if $more_options.sim:
-                --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$more_options.sim/100}"`
+            #if $sim:
+                --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$sim/100}"`
             #end if
             #if $more_options.identity:
                 --identity=$more_options.identity
@@ -100,15 +115,16 @@
             <option value="blatp">BLAT (aminoacid sequences)</option>
             <option value="blatn">BLAT (nucleotide sequences)</option>
         </param>
-        <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="This is the main parameter for the generation of the reciprocal best hit graph. Larger values results in more false positives (connections between proteins)."/>
-        <param argument="--conn" type="float" value="0.1" min="0." max="10." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values then more splits are done, resulting in more and smaller clusters."/>
+        <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal reciprocal similarity in %" help="This and --evalue are the main parameters for the generation of the reciprocal best hit graph. 100 = only the best reciprocal hits are reported, 0 = all possible reciprocal blast matches (within the E-value cutoff) are reported."/>
+        <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. With larger values more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/>
         <section name="more_options" title="Additional Options" expanded="False">
+            <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values result in more false positives (connections between proteins)."/>
             <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/>
-            <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal sequence similarity in %"/>
             <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/>
             <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/>
             <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/>
-            <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is build using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format.">
+            <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the initial connected component." help="Overrules the --conn threshold."/>
+            <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format.">
                 <option value="no" selected="true">Don't use isoform information</option>
                 <option value="ncbi">ncbi style (..._additional.fasta)</option>
                 <option value="uniprot">uniprot style (...isoform of...)</option>
@@ -116,7 +132,7 @@
             </param>
         </section>
         <conditional name="synteny">
-            <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015.">
+            <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015.">
                 <option value="no" selected="true">no</option>
                 <option value="specified">yes</option>
             </param>
@@ -124,75 +140,117 @@
             <when value="specified">
                 <param argument="--dups" type="integer" value="0" min="0" max="100" label="Number of reiterations for adjacencies heuristic, to determine duplicated regions"/>
                 <param argument="--cs" type="integer" value="3" min="0" max="100" label="Size of a maximum common substring (MCS) for adjacency matches"/>
-                <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Minimal percent identity of best blast hits"/>
-                <param name="input_files_syn" type="data" format="gff" multiple="true" min="2" label="Select the GFF3 files matching the input fasta files" help="The GFF3 files need matching names with the input fasta files. If you provide mybacteria123.faa or mybacteria123.fasta ... then you need to provide mybacteria123.gff here accoringly. The attributes column (#9) must contain the attribute Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective (protein) identifier in the FASTA input. For example see https://gitlab.com/paulklemm_PHD/proteinortho/-/blob/master/test/C.gff"/> 
+                <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Weight of adjacencies vs. sequence similarity" help="alpha[FF-adj score] + (1−alpha)[BLAST score]"/>
+                <param name="input_files_syn" type="data" format="gff" multiple="true" min="2" label="Select the GFF3 files matching the input fasta files" help="The GFF3 files need matching names with the input fasta files. If you provide mybacteria123.faa or mybacteria123.fasta ... then you need to provide mybacteria123.gff here accordingly. The attributes column (#9) must contain the attribute Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective (protein) identifier in the FASTA input. For example see https://gitlab.com/paulklemm_PHD/proteinortho/-/blob/master/test/C.gff"/> 
             </when>
         </conditional>
     </inputs>
     <outputs>
-        <data name="blastgraph" format="tabular" label="${tool.name} on ${on_string}: RBH graph" from_work_dir="result.blast-graph"/>
-        <data name="proteinortho" format="tabular" label="${tool.name} on ${on_string}: orthology-groups" from_work_dir="result.proteinortho.tsv"/>
-        <data name="proteinorthograph" format="tabular" label="${tool.name} on ${on_string}: orthology-pairs" from_work_dir="result.proteinortho-graph"/>
+        <data name="blastgraph" format="tabular" label="${tool.name} on ${on_string}: RBH graph" from_work_dir="result.blast-graph">
+            <actions>
+                <action name="column_names" type="metadata"
+                    default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
+            </actions>
+        </data>
+        <data name="proteinortho" format="tabular" label="${tool.name} on ${on_string}: orthology-groups" from_work_dir="result.proteinortho.tsv">
+            <actions>
+                <action name="column_names" type="metadata"
+                    default="species,genes,alg.-conn.,${','.join([ f.element_identifier for f in $input_files ])}"/>
+            </actions>
+        </data>
+        <data name="proteinorthograph" format="tabular" label="${tool.name} on ${on_string}: orthology-pairs" from_work_dir="result.proteinortho-graph">
+            <actions>
+                <conditional name="synteny.synteny_options">
+                    <when value="no">
+                        <action name="column_names" type="metadata" default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
+                    </when>
+                    <when value="specified">
+                        <action name="column_names" type="metadata" default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba,same_strand,simscore"/>
+                    </when>
+                </conditional>
+            </actions>
+        </data>
     </outputs>
     <tests>
         <test expect_num_outputs="3"> <!-- test normal -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
-            <expand macro="test_outputs"/>
+            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
+            <param name="p" value="diamond"/>
+            <expand macro="test_output_proteinortho" nlines="34"/>
+            <expand macro="test_output_blastgraph" nlines="157"/>
+            <expand macro="test_output_proteinorthograph" nlines="134"/>
             <assert_command>
                 <has_text text="--p=diamond"/>
             </assert_command>
         </test>
         <test expect_num_outputs="3"> <!-- various parameter -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
-            <param name="evalue" value="1"/>
+            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
+            <param name="p" value="diamond"/>
             <param name="conn" value="1"/>
-            <param name="cov" value="42"/>
             <param name="sim" value="42"/>
-            <param name="identity" value="42"/>
-            <param name="selfblast" value="true"/>
-            <param name="singles" value="true"/>
-            <expand macro="test_outputs"/>
+            <section name="more_options">
+                <param name="cov" value="42"/>
+                <param name="identity" value="42"/>
+                <param name="selfblast" value="true"/>
+                <param name="singles" value="true"/>
+                <param name="core" value="true"/>
+            </section>
+            <expand macro="test_output_proteinortho" nlines="177"/>
+            <expand macro="test_output_blastgraph" nlines="2720"/>
+            <expand macro="test_output_proteinorthograph" nlines="384"/>
             <assert_command>
                 <has_text text="--p=diamond"/>
             </assert_command>
         </test>
         <test expect_num_outputs="3"> <!-- synteny -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
-            <param name="input_files_syn" value="L.gff,C.gff,C2.gff,E.gff,M.gff"/>
-            <param name="synteny_options" value="specified"/>
-            <expand macro="test_outputs"/>
+            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
+            <param name="input_files_syn" value="L.gff,C.gff,E.gff,M.gff"/>
+            <param name="p" value="diamond"/>
+            <conditional name="synteny">
+                <param name="synteny_options" value="specified"/>
+            </conditional>
+            <expand macro="test_output_proteinortho" nlines="38"/>
+            <expand macro="test_output_blastgraph" nlines="157"/>
+            <expand macro="test_output_proteinorthograph" nlines="119" nlines_delta="10" ncolumns="8" add_columns=",same_strand,simscore"/>
             <assert_command>
                 <has_text text="--p=diamond"/>
             </assert_command>
         </test>
         <test expect_num_outputs="3"> <!-- blast -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
+            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
             <param name="p" value="blastp"/>
-            <expand macro="test_outputs"/>
+            <expand macro="test_output_proteinortho" nlines="32"/>
+            <expand macro="test_output_blastgraph" nlines="158"/>
+            <expand macro="test_output_proteinorthograph" nlines="142"/>
             <assert_command>
                 <has_text text="--p=blastp"/>
             </assert_command>
         </test>
         <test expect_num_outputs="3"> <!-- auto blast -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
+            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
             <param name="p" value="autoblast"/>
-            <expand macro="test_outputs"/>
+            <expand macro="test_output_proteinortho" nlines="32"/>
+            <expand macro="test_output_blastgraph" nlines="158"/>
+            <expand macro="test_output_proteinorthograph" nlines="142"/>
             <assert_command>
                 <has_text text="--p=autoblast"/>
             </assert_command>
         </test>
         <test expect_num_outputs="3"> <!-- last -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
+            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
             <param name="p" value="lastp"/>
-            <expand macro="test_outputs"/>
+            <expand macro="test_output_proteinortho" nlines="34"/>
+            <expand macro="test_output_blastgraph" nlines="148"/>
+            <expand macro="test_output_proteinorthograph" nlines="133"/>
             <assert_command>
                 <has_text text="--p=lastp"/>
             </assert_command>
         </test>
         <test expect_num_outputs="3"> <!-- blat -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
+            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
             <param name="p" value="blastp"/>
-            <expand macro="test_outputs"/>
+            <expand macro="test_output_proteinortho" nlines="32"/>
+            <expand macro="test_output_blastgraph" nlines="158"/>
+            <expand macro="test_output_proteinorthograph" nlines="142"/>
             <assert_command>
                 <has_text text="--p=blastp"/>
             </assert_command>
@@ -205,9 +263,9 @@
 Proteinortho is a tool to detect orthologous proteins/genes within different species (at least 2). 
 
   | It compares similarities of given gene/protein sequences and clusters them to find significant groups.
-  | The algorithm was designed to handle large-scale data and can be applied to hundreds of species at one.
+  | The algorithm was designed to handle large-scale data and can be applied to hundreds of species at once.
   | Details can be found in (doi:10.1186/1471-2105-12-124).
-  | To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (details see doi:10.1371/journal.pone.0105015), is already build in Proteinortho. 
+  | To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (for details see doi:10.1371/journal.pone.0105015), is already built into Proteinortho. 
 
 ----
 
@@ -218,13 +276,13 @@
 * **(i) Build adaptive reciprocal best hit graph (RBH)**
 
       | Using the blast algorithm (diamond,blast,blat,...) all input sequences are compared against each other.
-      | If two proteins find each other with respect to multiple criteria like minimal evalue, similarity compared to the best hit, ... then a edge is drawn between the two proteins.
+      | If two proteins find each other with respect to multiple criteria (minimal evalue, similarity compared to the best hit, ...), then an edge is drawn between the two proteins.
       | The result of this step is outputted to RBH
 
 * **(ii) Cluster the RBH**
 
       | Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits.
-      | The resulting connected components are outputted in orthology-groups / -PAIRS 
+      | The resulting connected components are outputted in orthology-groups / -pairs 
 
 ----
 
@@ -235,35 +293,49 @@
 * **RBH**
 
       | The result of the (i) step, the reciprocal best hit graph. 
-      | First a comment line announces 2 species (# ecoli.faa   human.faa), then each line corresponds to a reciprocal best hit between 2 proteins/genes of the announced species. The output format is shown below.
+      | First, two comment lines announce the 2 species (# ecoli.faa   human.faa) and the median values (evalue_ab,bitscore_ab,evalue_ba,bitscore_ba). 
+      | Following these header lines, each line corresponds to a reciprocal best hit of 2 proteins/genes (columns 1 and 2) of the announced species. The output format is shown below.
       | *seqidA*,*seqidB* = the 2 ids/names of the proteins involved 
       | *evalue_ab* = evalue with seqidA as query and seqidB as part of the database 
       | *bitscore_ab* = bitscore with seqidA as query ...
       | *evalue_ba* = evalue with seqidB as query ...
-      | ...
 
 .. csv-table::
     
     seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba    
+    # ecoli.faa,human.faa
+    # 1.91e-112,357.5,1.825e-113,360
+    L_10,C_10;test,4.32e-151,447,4.30e-151,446
+    L_11,C_11,1.17e-68,209,3.00e-69,210
+    L_14,C_14,3.64e-139,422,1.19e-142,431
+    L_15,C_15,3.51e-100,303,2.12e-102,308
+    L_16,C_16,3.75e-49,157,7.06e-50,159
+    L_17,C_17,2.96e-195,578,5.50e-196,579
 
 ----
 
 * **orthology-groups**
 
       | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups.
-      | Every line corresponds to an orthology group of proteins/genes. 
-      | The first 3 columns characterize general properties of that group: number of proteins, species and the algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general. 
-      | Then a column for each species follows containing the proteins of that species. If a species contributes with more than one protein to a group of orthologs, then they are ordered by connectivity.
+      | Every line corresponds to an orthology group. 
+      | The first 3 columns characterize the general properties of that group: number of species, number of proteins/genes, and algebraic connectivity. The higher the algebraic connectivity, the more edges there are and the better the group is connected in general. 
+      | Then a column for each species follows, containing the proteins of that species. 
+      | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity.
+      | The '*' represents that this species does not contribute to the group.
 
 .. csv-table::
     
-    Species,Genes,Alg.-Conn.   
+    Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,ebola.faa
+    5,5,0.715,C_10,C_10;test,E_10,L_10,M_10
+    4,6,0.115,*,C_12,E_315,L_313,M_313
+    4,5,0.167,*,C_63,E_19,L_19,M_19
+    4,4,0.816,*,C_64,E_18,L_18,M_18
 
 ----
 
 * **orthology-pairs**
 
-      | The same as orthology-groups but every edge is printed one-by-one here. The output is formatted the same as the RBH graph:
+      | The same as orthology-groups but every edge is printed one-by-one instead of the whole group. The output is formatted the same as the RBH graph:
 
 .. csv-table::
     
@@ -273,11 +345,17 @@
 
 **Proteinortho-Tools for downstream analysis**
 
-* `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use a orthology-groups file.
+* `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset of it (e.g. filtered by Species>10).
 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other.
 
 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho
+
+**Citations:**
+
+- Lechner, Marcus, et al. "Proteinortho: detection of (co-) orthologs in large-scale analysis." BMC bioinformatics 12.1 (2011): 1-9. (10.1186/1471-2105-12-124) 
+- Lechner, Marcus, et al. "Orthology detection combining clustering and synteny for very large datasets." PLoS one 9.8 (2014): e105015. (10.1371/journal.pone.0105015)
+
 ]]>
     </help>
-    <expand macro="citations"/>
+    <expand macro="citations" /> <!-- TODO: citations are not working on usegalaxy, therefore they are added manually to the help text above. -->
 </tool>
author	iuc
date	Fri, 16 Jun 2023 20:52:41 +0000
parents	a8addd4fb60a
children	10112d9127af