comparison proteinortho.xml @ 5:5532c0e5d4a6 draft

planemo upload for repository https://gitlab.com/paulklemm_PHD/proteinortho commit b4d8b8da2a259973c9ad90e4b9d1a3e22ae4348f
author iuc
date Fri, 16 Jun 2023 20:52:41 +0000
parents a8addd4fb60a
children 10112d9127af
comparison
equal deleted inserted replaced
4:85c411546123 5:5532c0e5d4a6
1 <tool id="proteinortho" name="Proteinortho" version="@TOOL_VERSION@+galaxy@WRAPPER_VERSION@" profile="@PROFILE@"> 1 <tool id="proteinortho" name="Proteinortho" version="@TOOL_VERSION@+galaxy@WRAPPER_VERSION@" profile="@PROFILE@">
2 <description>detects orthologous proteins/genes within different species</description> 2 <description>detects orthologous proteins/genes within different species</description>
3 <macros> 3 <macros>
4 <import>proteinortho_macros.xml</import> 4 <import>proteinortho_macros.xml</import>
5 <xml name="test_outputs"> 5 <xml name="test_output_proteinortho" tokens="nlines">
6 <output name="proteinortho"> 6 <output name="proteinortho">
7 <metadata name="column_names" value="species,genes,alg.-conn.,L.fasta,C.fasta,E.fasta,M.fasta"/>
7 <assert_contents> 8 <assert_contents>
9 <has_n_columns n="7"/>
10 <has_n_lines n="@NLINES@"/>
8 <has_line_matching expression="# Species\tGenes\tAlg\.-Conn\.\t.*"/> 11 <has_line_matching expression="# Species\tGenes\tAlg\.-Conn\.\t.*"/>
9 <has_line_matching expression="[0-9]+\t[0-9]+\t.*"/> 12 <has_line_matching expression="[0-9]+\t[0-9]+\t.*"/>
10 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+.*"/> 13 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+.*"/>
11 </assert_contents> 14 </assert_contents>
12 </output> 15 </output>
16 </xml>
17 <xml name="test_output_blastgraph" tokens="nlines">
13 <output name="blastgraph"> 18 <output name="blastgraph">
19 <metadata name="column_names" value="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
14 <assert_contents> 20 <assert_contents>
21 <has_n_columns n="6" comment="#"/>
22 <has_n_lines n="@NLINES@"/>
15 <has_line_matching expression="# file_a\tfile_b"/> 23 <has_line_matching expression="# file_a\tfile_b"/>
16 <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba"/> 24 <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba"/>
17 <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/> 25 <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/>
18 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/> 26 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/>
19 </assert_contents> 27 </assert_contents>
20 </output> 28 </output>
29 </xml>
30 <xml name="test_output_proteinorthograph" tokens="nlines" token_nlines_delta="0" token_add_columns="" token_ncolumns="6">
21 <output name="proteinorthograph"> 31 <output name="proteinorthograph">
32 <metadata name="column_names" value="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba@ADD_COLUMNS@"/>
22 <assert_contents> 33 <assert_contents>
34 <has_n_columns n="@NCOLUMNS@" comment="#"/>
35 <has_n_lines n="@NLINES@" delta="@NLINES_DELTA@"/>
23 <has_line_matching expression="# file_a\tfile_b"/> 36 <has_line_matching expression="# file_a\tfile_b"/>
24 <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba(\tsame_strand\tsimscore)?"/> 37 <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba(\tsame_strand\tsimscore)?"/>
25 <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/> 38 <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/>
26 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/> 39 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/>
27 </assert_contents> 40 </assert_contents>
42 #end for# 55 #end for#
43 #end if 56 #end if
44 proteinortho 57 proteinortho
45 --project=result 58 --project=result
46 --cpus="\${GALAXY_SLOTS:-4}" 59 --cpus="\${GALAXY_SLOTS:-4}"
47 --ram="\${GALAXY_MEMORY_MB:-16000}"
48 #if $more_options.selfblast: 60 #if $more_options.selfblast:
49 $more_options.selfblast 61 $more_options.selfblast
50 #end if 62 #end if
51 #if $more_options.singles: 63 #if $more_options.singles:
52 $more_options.singles 64 $more_options.singles
53 #end if 65 #end if
66 #if $more_options.core:
67 $more_options.core
68 #end if
54 --p=$p 69 --p=$p
55 --e=$evalue 70 --e=$more_options.evalue
56 --conn=$conn 71 --conn=$conn
57 #if $more_options.cov: 72 #if $more_options.cov:
58 --cov=$more_options.cov 73 --cov=$more_options.cov
59 #end if 74 #end if
60 #if $more_options.sim: 75 #if $sim:
61 --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$more_options.sim/100}"` 76 --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$sim/100}"`
62 #end if 77 #end if
63 #if $more_options.identity: 78 #if $more_options.identity:
64 --cov=$more_options.identity 79 --cov=$more_options.identity
65 #end if 80 #end if
66 #if $more_options.isoform != "no": 81 #if $more_options.isoform != "no":
98 <option value="lastp">Last (aminoacid sequences)</option> 113 <option value="lastp">Last (aminoacid sequences)</option>
99 <option value="lastn">Last (nucleotide sequences)</option> 114 <option value="lastn">Last (nucleotide sequences)</option>
100 <option value="blatp">BLAT (aminoacid sequences)</option> 115 <option value="blatp">BLAT (aminoacid sequences)</option>
101 <option value="blatn">BLAT (nucleotide sequences)</option> 116 <option value="blatn">BLAT (nucleotide sequences)</option>
102 </param> 117 </param>
103 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="This is the main parameter for the generation of the reciprocal best hit graph. Larger values results in more false positives (connections between proteins)."/> 118 <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal reciprocal similarity in %" help="This and --evalue are the main parameters for the generation of the reciprocal best hit graph. 100 = only the best reciprocal hits are reported, 0 = all possible reciprocal blast matches (within the E-value cutoff) are reported."/>
104 <param argument="--conn" type="float" value="0.1" min="0." max="10." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values then more splits are done, resulting in more and smaller clusters."/> 119 <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. With larger values more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/>
105 <section name="more_options" title="Additional Options" expanded="False"> 120 <section name="more_options" title="Additional Options" expanded="False">
121 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values result in more false positives (connections between proteins)."/>
106 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/> 122 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/>
107 <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal sequence similarity in %"/>
108 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/> 123 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/>
109 <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/> 124 <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/>
110 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/> 125 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/>
111 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is build using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format."> 126 <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the initial connected component." help="Overrules the -conn threshold."/>
127 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format.">
112 <option value="no" selected="true">Don't use isoform information</option> 128 <option value="no" selected="true">Don't use isoform information</option>
113 <option value="ncbi">ncbi style (..._additional.fasta)</option> 129 <option value="ncbi">ncbi style (..._additional.fasta)</option>
114 <option value="uniprot">uniprot style (...isoform of...)</option> 130 <option value="uniprot">uniprot style (...isoform of...)</option>
115 <option value="trinity">trinity style (...i4)</option> 131 <option value="trinity">trinity style (...i4)</option>
116 </param> 132 </param>
117 </section> 133 </section>
118 <conditional name="synteny"> 134 <conditional name="synteny">
119 <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015."> 135 <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015.">
120 <option value="no" selected="true">no</option> 136 <option value="no" selected="true">no</option>
121 <option value="specified">yes</option> 137 <option value="specified">yes</option>
122 </param> 138 </param>
123 <when value="no"/> 139 <when value="no"/>
124 <when value="specified"> 140 <when value="specified">
125 <param argument="--dups" type="integer" value="0" min="0" max="100" label="Number of reiterations for adjacencies heuristic, to determine duplicated regions"/> 141 <param argument="--dups" type="integer" value="0" min="0" max="100" label="Number of reiterations for adjacencies heuristic, to determine duplicated regions"/>
126 <param argument="--cs" type="integer" value="3" min="0" max="100" label="Size of a maximum common substring (MCS) for adjacency matches"/> 142 <param argument="--cs" type="integer" value="3" min="0" max="100" label="Size of a maximum common substring (MCS) for adjacency matches"/>
127 <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Minimal percent identity of best blast hits"/> 143 <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Weight of adjacencies vs. sequence similarity" help="alpha[FF-adj score] + (1−alpha)[BLAST score]"/>
128 <param name="input_files_syn" type="data" format="gff" multiple="true" min="2" label="Select the GFF3 files matching the input fasta files" help="The GFF3 files need matching names with the input fasta files. If you provide mybacteria123.faa or mybacteria123.fasta ... then you need to provide mybacteria123.gff here accoringly. The attributes column (#9) must contain the attribute Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective (protein) identifier in the FASTA input. For example see https://gitlab.com/paulklemm_PHD/proteinortho/-/blob/master/test/C.gff"/> 144 <param name="input_files_syn" type="data" format="gff" multiple="true" min="2" label="Select the GFF3 files matching the input fasta files" help="The GFF3 files need matching names with the input fasta files. If you provide mybacteria123.faa or mybacteria123.fasta ... then you need to provide mybacteria123.gff here accordingly. The attributes column (#9) must contain the attribute Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective (protein) identifier in the FASTA input. For example see https://gitlab.com/paulklemm_PHD/proteinortho/-/blob/master/test/C.gff"/>
129 </when> 145 </when>
130 </conditional> 146 </conditional>
131 </inputs> 147 </inputs>
132 <outputs> 148 <outputs>
133 <data name="blastgraph" format="tabular" label="${tool.name} on ${on_string}: RBH graph" from_work_dir="result.blast-graph"/> 149 <data name="blastgraph" format="tabular" label="${tool.name} on ${on_string}: RBH graph" from_work_dir="result.blast-graph">
134 <data name="proteinortho" format="tabular" label="${tool.name} on ${on_string}: orthology-groups" from_work_dir="result.proteinortho.tsv"/> 150 <actions>
135 <data name="proteinorthograph" format="tabular" label="${tool.name} on ${on_string}: orthology-pairs" from_work_dir="result.proteinortho-graph"/> 151 <action name="column_names" type="metadata"
152 default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
153 </actions>
154 </data>
155 <data name="proteinortho" format="tabular" label="${tool.name} on ${on_string}: orthology-groups" from_work_dir="result.proteinortho.tsv">
156 <actions>
157 <action name="column_names" type="metadata"
158 default="species,genes,alg.-conn.,${','.join([ f.element_identifier for f in $input_files ])}"/>
159 </actions>
160 </data>
161 <data name="proteinorthograph" format="tabular" label="${tool.name} on ${on_string}: orthology-pairs" from_work_dir="result.proteinortho-graph">
162 <actions>
163 <conditional name="synteny.synteny_options">
164 <when value="no">
165 <action name="column_names" type="metadata" default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
166 </when>
167 <when value="specified">
168 <action name="column_names" type="metadata" default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba,same_strand,simscore"/>
169 </when>
170 </conditional>
171 </actions>
172 </data>
136 </outputs> 173 </outputs>
137 <tests> 174 <tests>
138 <test expect_num_outputs="3"> <!-- test normal --> 175 <test expect_num_outputs="3"> <!-- test normal -->
139 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> 176 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
140 <expand macro="test_outputs"/> 177 <param name="p" value="diamond"/>
178 <expand macro="test_output_proteinortho" nlines="34"/>
179 <expand macro="test_output_blastgraph" nlines="157"/>
180 <expand macro="test_output_proteinorthograph" nlines="134"/>
141 <assert_command> 181 <assert_command>
142 <has_text text="--p=diamond"/> 182 <has_text text="--p=diamond"/>
143 </assert_command> 183 </assert_command>
144 </test> 184 </test>
145 <test expect_num_outputs="3"> <!-- various parameter --> 185 <test expect_num_outputs="3"> <!-- various parameter -->
146 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> 186 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
147 <param name="evalue" value="1"/> 187 <param name="p" value="diamond"/>
148 <param name="conn" value="1"/> 188 <param name="conn" value="1"/>
149 <param name="cov" value="42"/>
150 <param name="sim" value="42"/> 189 <param name="sim" value="42"/>
151 <param name="identity" value="42"/> 190 <section name="more_options">
152 <param name="selfblast" value="true"/> 191 <param name="cov" value="42"/>
153 <param name="singles" value="true"/> 192 <param name="identity" value="42"/>
154 <expand macro="test_outputs"/> 193 <param name="selfblast" value="true"/>
194 <param name="singles" value="true"/>
195 <param name="core" value="true"/>
196 </section>
197 <expand macro="test_output_proteinortho" nlines="177"/>
198 <expand macro="test_output_blastgraph" nlines="2720"/>
199 <expand macro="test_output_proteinorthograph" nlines="384"/>
155 <assert_command> 200 <assert_command>
156 <has_text text="--p=diamond"/> 201 <has_text text="--p=diamond"/>
157 </assert_command> 202 </assert_command>
158 </test> 203 </test>
159 <test expect_num_outputs="3"> <!-- synteny --> 204 <test expect_num_outputs="3"> <!-- synteny -->
160 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> 205 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
161 <param name="input_files_syn" value="L.gff,C.gff,C2.gff,E.gff,M.gff"/> 206 <param name="input_files_syn" value="L.gff,C.gff,E.gff,M.gff"/>
162 <param name="synteny_options" value="specified"/> 207 <param name="p" value="diamond"/>
163 <expand macro="test_outputs"/> 208 <conditional name="synteny">
209 <param name="synteny_options" value="specified"/>
210 </conditional>
211 <expand macro="test_output_proteinortho" nlines="38"/>
212 <expand macro="test_output_blastgraph" nlines="157"/>
213 <expand macro="test_output_proteinorthograph" nlines="119" nlines_delta="10" ncolumns="8" add_columns=",same_strand,simscore"/>
164 <assert_command> 214 <assert_command>
165 <has_text text="--p=diamond"/> 215 <has_text text="--p=diamond"/>
166 </assert_command> 216 </assert_command>
167 </test> 217 </test>
168 <test expect_num_outputs="3"> <!-- blast --> 218 <test expect_num_outputs="3"> <!-- blast -->
169 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> 219 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
170 <param name="p" value="blastp"/> 220 <param name="p" value="blastp"/>
171 <expand macro="test_outputs"/> 221 <expand macro="test_output_proteinortho" nlines="32"/>
222 <expand macro="test_output_blastgraph" nlines="158"/>
223 <expand macro="test_output_proteinorthograph" nlines="142"/>
172 <assert_command> 224 <assert_command>
173 <has_text text="--p=blastp"/> 225 <has_text text="--p=blastp"/>
174 </assert_command> 226 </assert_command>
175 </test> 227 </test>
176 <test expect_num_outputs="3"> <!-- auto blast --> 228 <test expect_num_outputs="3"> <!-- auto blast -->
177 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> 229 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
178 <param name="p" value="autoblast"/> 230 <param name="p" value="autoblast"/>
179 <expand macro="test_outputs"/> 231 <expand macro="test_output_proteinortho" nlines="32"/>
232 <expand macro="test_output_blastgraph" nlines="158"/>
233 <expand macro="test_output_proteinorthograph" nlines="142"/>
180 <assert_command> 234 <assert_command>
181 <has_text text="--p=autoblast"/> 235 <has_text text="--p=autoblast"/>
182 </assert_command> 236 </assert_command>
183 </test> 237 </test>
184 <test expect_num_outputs="3"> <!-- last --> 238 <test expect_num_outputs="3"> <!-- last -->
185 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> 239 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
186 <param name="p" value="lastp"/> 240 <param name="p" value="lastp"/>
187 <expand macro="test_outputs"/> 241 <expand macro="test_output_proteinortho" nlines="34"/>
242 <expand macro="test_output_blastgraph" nlines="148"/>
243 <expand macro="test_output_proteinorthograph" nlines="133"/>
188 <assert_command> 244 <assert_command>
189 <has_text text="--p=lastp"/> 245 <has_text text="--p=lastp"/>
190 </assert_command> 246 </assert_command>
191 </test> 247 </test>
192 <test expect_num_outputs="3"> <!-- blat --> 248 <test expect_num_outputs="3"> <!-- blat -->
193 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> 249 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
194 <param name="p" value="blastp"/> 250 <param name="p" value="blastp"/>
195 <expand macro="test_outputs"/> 251 <expand macro="test_output_proteinortho" nlines="32"/>
252 <expand macro="test_output_blastgraph" nlines="158"/>
253 <expand macro="test_output_proteinorthograph" nlines="142"/>
196 <assert_command> 254 <assert_command>
197 <has_text text="--p=blastp"/> 255 <has_text text="--p=blastp"/>
198 </assert_command> 256 </assert_command>
199 </test> 257 </test>
200 </tests> 258 </tests>
203 **What it does** 261 **What it does**
204 262
205 Proteinortho is a tool to detect orthologous proteins/genes within different species (at least 2). 263 Proteinortho is a tool to detect orthologous proteins/genes within different species (at least 2).
206 264
207 | It compares similarities of given gene/protein sequences and clusters them to find significant groups. 265 | It compares similarities of given gene/protein sequences and clusters them to find significant groups.
208 | The algorithm was designed to handle large-scale data and can be applied to hundreds of species at one. 266 | The algorithm was designed to handle large-scale data and can be applied to hundreds of species at once.
209 | Details can be found in (doi:10.1186/1471-2105-12-124). 267 | Details can be found in (doi:10.1186/1471-2105-12-124).
210 | To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (details see doi:10.1371/journal.pone.0105015), is already build in Proteinortho. 268 | To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (details see doi:10.1371/journal.pone.0105015), is already built in Proteinortho.
211 269
212 ---- 270 ----
213 271
214 **Proteinortho in a nutshell** 272 **Proteinortho in a nutshell**
215 273
216 ---- 274 ----
217 275
218 * **(i) Build adaptive reciprocal best hit graph (RBH)** 276 * **(i) Build adaptive reciprocal best hit graph (RBH)**
219 277
220 | Using the blast algorithm (diamond,blast,blat,...) all input sequences are compared against each other. 278 | Using the blast algorithm (diamond,blast,blat,...) all input sequences are compared against each other.
221 | If two proteins find each other with respect to multiple criteria like minimal evalue, similarity compared to the best hit, ... then a edge is drawn between the two proteins. 279 | If two proteins find each other with respect to multiple criteria like minimal evalue, and similarity compared to the best hit, ... then an edge is drawn between the two proteins.
222 | The result of this step is outputted to RBH 280 | The result of this step is outputted to RBH
223 281
224 * **(ii) Cluster the RBH** 282 * **(ii) Cluster the RBH**
225 283
226 | Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits. 284 | Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits.
227 | The resulting connected components are outputted in orthology-groups / -PAIRS 285 | The resulting connected components are outputted in orthology-groups / -pairs
228 286
229 ---- 287 ----
230 288
231 **Proteinortho output files** 289 **Proteinortho output files**
232 290
233 ---- 291 ----
234 292
235 * **RBH** 293 * **RBH**
236 294
237 | The result of the (i) step, the reciprocal best hit graph. 295 | The result of the (i) step, the reciprocal best hit graph.
238 | First a comment line announces 2 species (# ecoli.faa human.faa), then each line corresponds to a reciprocal best hit between 2 proteins/genes of the announced species. The output format is shown below. 296 | First, two comment lines announce the 2 species (# ecoli.faa human.faa) as well as the median values (evalue_ab,bitscore_ab,evalue_ba,bitscore_ba).
297 | Following these header lines, each line corresponds to a reciprocal best hit of 2 proteins/genes (columns 1 and 2) of the announced species. The output format is shown below.
239 | *seqidA*,*seqidB* = the 2 ids/names of the proteins involved 298 | *seqidA*,*seqidB* = the 2 ids/names of the proteins involved
240 | *evalue_ab* = evalue with seqidA as query and seqidB as part of the database 299 | *evalue_ab* = evalue with seqidA as query and seqidB as part of the database
241 | *bitscore_ab* = bitscore with seqidA as query ... 300 | *bitscore_ab* = bitscore with seqidA as query ...
242 | *evalue_ba* = evalue with seqidB as query ... 301 | *evalue_ba* = evalue with seqidB as query ...
243 | ...
244 302
245 .. csv-table:: 303 .. csv-table::
246 304
247 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba 305 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
306 # ecoli.faa,human.faa
307 # 1.91e-112,357.5,1.825e-113,360
308 L_10,C_10;test,4.32e-151,447,4.30e-151,446
309 L_11,C_11,1.17e-68,209,3.00e-69,210
310 L_14,C_14,3.64e-139,422,1.19e-142,431
311 L_15,C_15,3.51e-100,303,2.12e-102,308
312 L_16,C_16,3.75e-49,157,7.06e-50,159
313 L_17,C_17,2.96e-195,578,5.50e-196,579
248 314
249 ---- 315 ----
250 316
251 * **orthology-groups** 317 * **orthology-groups**
252 318
253 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups. 319 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups.
254 | Every line corresponds to an orthology group of proteins/genes. 320 | Every line corresponds to an orthology group.
255 | The first 3 columns characterize general properties of that group: number of proteins, species and the algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general. 321 | The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general.
256 | Then a column for each species follows containing the proteins of that species. If a species contributes with more than one protein to a group of orthologs, then they are ordered by connectivity. 322 | Then a column for each species follows containing the proteins of that species.
323 | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity.
324 | The '*' represents that this species does not contribute to the group.
257 325
258 .. csv-table:: 326 .. csv-table::
259 327
260 Species,Genes,Alg.-Conn. 328 Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,ebola.faa
329 5,5,0.715,C_10,C_10;test,E_10,L_10,M_10
330 4,6,0.115,*,C_12,E_315,L_313,M_313
331 4,5,0.167,*,C_63,E_19,L_19,M_19
332 4,4,0.816,*,C_64,E_18,L_18,M_18
261 333
262 ---- 334 ----
263 335
264 * **orthology-pairs** 336 * **orthology-pairs**
265 337
266 | The same as orthology-groups but every edge is printed one-by-one here. The output is formatted the same as the RBH graph: 338 | The same as orthology-groups but every edge is printed one-by-one instead of the whole group. The output is formatted the same as the RBH graph:
267 339
268 .. csv-table:: 340 .. csv-table::
269 341
270 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba 342 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
271 343
272 ---- 344 ----
273 345
274 **Proteinortho-Tools for downstream analysis** 346 **Proteinortho-Tools for downstream analysis**
275 347
276 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use a orthology-groups file. 348 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10).
277 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other. 349 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other.
278 350
279 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho 351 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho
352
353 **Citations:**
354
355 - Lechner, Marcus, et al. "Proteinortho: detection of (co-) orthologs in large-scale analysis." BMC bioinformatics 12.1 (2011): 1-9. (10.1186/1471-2105-12-124)
356 - Lechner, Marcus, et al. "Orthology detection combining clustering and synteny for very large datasets." PLoS one 9.8 (2014): e105015. (10.1371/journal.pone.0105015)
357
280 ]]> 358 ]]>
281 </help> 359 </help>
282 <expand macro="citations"/> 360 <expand macro="citations" /> <!-- TODO: citations are not working on usegalaxy, therefore they are added manually in the help text above. -->
283 </tool> 361 </tool>