proteinortho: proteinortho.xml comparison

comparison proteinortho.xml @ 9:6140163233a5 draft default tip

planemo upload for repository https://gitlab.com/paulklemm_PHD/proteinortho commit e151cf96893602bf011c27a2d91df1ef594b774d

author	iuc
date	Fri, 13 Dec 2024 10:19:09 +0000
parents	c5dd4f86d981
children

comparison

equal deleted inserted replaced

-:54fb02338510
+:6140163233a5
 #end for#
 #end if
 2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2)
 #if $more_options.selfblast:
 &&
-mv result.blast-graph_clean result.blast-graph;
+mv result.blast-graph_clean result.blast-graph
 #end if
 #if $synteny.synteny_options == "specified":
 &&
 mv result.poff-graph result.proteinortho-graph &&
 mv result.poff.tsv result.proteinortho.tsv &&
-mv result.poff.html result.proteinortho.html ;
+mv result.poff.html result.proteinortho.html
 #end if
 ]]></command>
 <inputs>
 <param name="input_files" format="fasta" type="data" multiple="true" min="2" label="Select the input fasta files (>2)" help="The input fasta files. At least 2 are needed!"/>
 <param argument="--p" type="select" label="Similarity comparision algorithm" help="In the first step of proteinortho an all-versus-all reciprocal best hit graph is build from the input files (using this algorithm).">
 <option value="diamond" selected="true">diamond (aminoacid sequences)</option>
 <option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option>
 <option value="blastp">NCBI-BLASTP+ (protein sequences)</option>
 <option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option>
+<option value="mmseqsp">MMseqs2 (aminoacid sequences)</option>
+<option value="mmseqsn">MMseqs2 (nucleotide sequences)</option>
 <option value="lastp">Last (aminoacid sequences)</option>
 <option value="lastn">Last (nucleotide sequences)</option>
 <option value="blatp">BLAT (aminoacid sequences)</option>
 <option value="blatn">BLAT (nucleotide sequences)</option>
 </param>
 <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values than more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/>
 <section name="more_options" title="Additional Options" expanded="False">
 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values results in more false positives (connections between proteins)."/>
 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/>
 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/>
-<param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/>
+<param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs (not compatible with synteny) "/>
 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/>
 <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the inital connected component." help="Overrules the -conn threshold."/>
 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format.">
 <option value="no" selected="true">Don't use isoform information</option>
 <option value="ncbi">ncbi style (..._additional.fasta)</option>
 <option value="uniprot">uniprot style (...isoform of...)</option>
 <option value="trinity">trinity style (...i4)</option>
 </param>
 </section>
 <conditional name="synteny">
-<param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015.">
+<param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015. (Not compatible with selfblast)">
 <option value="no" selected="true">no</option>
 <option value="specified">yes</option>
 </param>
 <when value="no"/>
 <when value="specified">
 </conditional>
 </actions>
 </data>
 </outputs>
 <tests>
-<test expect_num_outputs="3"> <!-- test normal -->
+<test expect_num_outputs="3"> <!-- test normal / default params -->
 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
 <param name="p" value="diamond"/>
 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
 <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
 <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
 <assert_command>
 <has_text text="--p=diamond"/>
+</assert_command>
+</test>
+<test expect_num_outputs="3"> <!-- test normal mmseqs -->
+<param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
+<param name="p" value="mmseqsp"/>
+<expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
+<expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
+<expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
+<assert_command>
+<has_text text="--p=mmseqsp"/>
 </assert_command>
 </test>
 <test expect_num_outputs="3"> <!-- various parameter -->
 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
 <param name="p" value="diamond"/>
 <has_text text="--p=lastp"/>
 </assert_command>
 </test>
 <test expect_num_outputs="3"> <!-- blat -->
 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
-<param name="p" value="blastp"/>
+<param name="p" value="blatp"/>
 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
-<expand macro="test_output_blastgraph" nlines="156" nlines_delta="50"/>
+<expand macro="test_output_blastgraph" nlines="56" nlines_delta="50"/>
-<expand macro="test_output_proteinorthograph" nlines="136" nlines_delta="50"/>
+<expand macro="test_output_proteinorthograph" nlines="56" nlines_delta="50"/>
 <assert_command>
-<has_text text="--p=blastp"/>
+<has_text text="--p=blatp"/>
 </assert_command>
 </test>
 </tests>
 <help><![CDATA[Proteinortho with POFF - An orthology detection tool
 | If two proteins find each other with respect to multiple criteria like minimal evalue, and similarity compared to the best hit, ... then an edge is drawn between the two proteins.
 | The result of this step is outputted to RBH
 * **(ii) Cluster the RBH**
-| Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits.
+| A spectral clustering algorithm is used to remove weak connections, reducing false positives.
-| The resulting connected components are outputted in orthology-groups / -pairs
+| The connected components from this process are output as orthology groups or pairs.
 ----
 **Proteinortho output files**
 * **orthology-groups**
 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups.
 | Every line corresponds to an orthology group.
-| The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general.
+| The first 3 columns characterize the general properties of that group: number of species, number of proteins, and algebraic connectivity. The higher the algebraic connectivity, the more edges there are and the better the group is connected to itself.
 | Then a column for each species follows containing the proteins of these species.
 | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity.
 | The '*' represents that this species does not contribute to the group.
 .. csv-table::
-Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,ebola.faa
+Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,mouse.faa
 5,5,0.715,C_10,C_10;test,E_10,L_10,M_10
 4,6,0.115,*,C_12,E_315,L_313,M_313
 4,5,0.167,*,C_63,E_19,L_19,M_19
 4,4,0.816,*,C_64,E_18,L_18,M_18
 ----
-* **orthology-pairs**
+| The first group is comprised of 5 proteins of 5 species: 'C_10' of ecoli.faa, 'C_10;test' of human.faa, 'E_10' of snail.faa, 'L_10' of wale.faa, and 'M_10' of mouse.faa.
+| The alg.-conn. (algebraic connectivity) of 0.715 indicates the connectivity of this group, the higher the more edges are connecting these 5 proteins (at most there can be 10 and at least there need to be 4).
-| The same as orthology-groups but every edge is printed one-by-one instead of the whole group. The output is formatted the same as the RBH graph:
+| The second group contains 6 proteins distributed over 4 species. The star indicates the species where no protein was found (in this case ecoli.faa).
 .. csv-table::
 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
+# ecoli.faa,human.faa
-----
+# 1.91e-112,357.5,1.825e-113,360
+L_10,C_10;test,4.32e-151,447,4.30e-151,446
+L_11,C_11,1.17e-68,209,3.00e-69,210
+L_14,C_14,3.64e-139,422,1.19e-142,431
+L_15,C_15,3.51e-100,303,2.12e-102,308
+L_16,C_16,3.75e-49,157,7.06e-50,159
+L_17,C_17,2.96e-195,578,5.50e-196,579
+----
+* **orthology-pairs**
+| Similar to orthology groups, but each edge is printed individually.
+| The output is formatted the same as the RBH graph.
+| For example, extracting all hits of the second group of the example orthology-group output ('4,6,0.115,*,C_12,E_315,L_313,M_313') using grep (-E, regular expression="(C_12|E_315|L_313|M_313).*(C_12|E_315|L_313|M_313)", input file=proteinortho-graph) would reveal all edges of this group:
+.. csv-table::
+seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
+M_313,C_12,1.18e-115,407,6.12e-116,407
+C_12,E_315,4.50e-127,445,4.09e-127,445
+L_313,M_313,0.00e+00,1368,0.00e+00,1368
+L_313,C_12,3.76e-114,402,1.94e-114,402
+----
+| Especially L_313 and M_313 are very similar, probably identical.
+| The group contains 4 edges out of the 6 possible edges for a group of 4 proteins. The missing edges are M_313-E_315 as well as L_313-E_315. This means that E_315 is only connected to the other 3 proteins via C_12 and thus could be considered as a weak link in the group.
 **Proteinortho-Tools for downstream analysis**
 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10).
 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other.
 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho
-**Citations:**
 ]]>
 </help>
-<expand macro="citations" /> <!--- TODO: citations are not working in usegalxy, therefore they are added manually at the above. -->
+<citations>
+<citation type="doi">10.3389/fbinf.2023.1322477</citation>
+<citation type="doi">10.1186/1471-2105-12-124</citation>
+<citation type="doi">10.1371/journal.pone.0105015</citation>
+</citations>
 </tool>

Mercurial > repos > iuc > proteinortho

comparison proteinortho.xml @ 9:6140163233a5 draft default tip