comparison proteinortho.xml @ 9:6140163233a5 draft default tip

planemo upload for repository https://gitlab.com/paulklemm_PHD/proteinortho commit e151cf96893602bf011c27a2d91df1ef594b774d
author iuc
date Fri, 13 Dec 2024 10:19:09 +0000
parents c5dd4f86d981
children
comparison
equal deleted inserted replaced
8:54fb02338510 9:6140163233a5
97 #end for# 97 #end for#
98 #end if 98 #end if
99 2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2) 99 2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2)
100 #if $more_options.selfblast: 100 #if $more_options.selfblast:
101 && 101 &&
102 mv result.blast-graph_clean result.blast-graph; 102 mv result.blast-graph_clean result.blast-graph
103 #end if 103 #end if
104 #if $synteny.synteny_options == "specified": 104 #if $synteny.synteny_options == "specified":
105 && 105 &&
106 mv result.poff-graph result.proteinortho-graph && 106 mv result.poff-graph result.proteinortho-graph &&
107 mv result.poff.tsv result.proteinortho.tsv && 107 mv result.poff.tsv result.proteinortho.tsv &&
108 mv result.poff.html result.proteinortho.html ; 108 mv result.poff.html result.proteinortho.html
109 #end if 109 #end if
110 ]]></command> 110 ]]></command>
111 <inputs> 111 <inputs>
112 <param name="input_files" format="fasta" type="data" multiple="true" min="2" label="Select the input fasta files (>2)" help="The input fasta files. At least 2 are needed!"/> 112 <param name="input_files" format="fasta" type="data" multiple="true" min="2" label="Select the input fasta files (>2)" help="The input fasta files. At least 2 are needed!"/>
113 <param argument="--p" type="select" label="Similarity comparision algorithm" help="In the first step of proteinortho an all-versus-all reciprocal best hit graph is build from the input files (using this algorithm)."> 113 <param argument="--p" type="select" label="Similarity comparision algorithm" help="In the first step of proteinortho an all-versus-all reciprocal best hit graph is build from the input files (using this algorithm).">
114 <option value="diamond" selected="true">diamond (aminoacid sequences)</option> 114 <option value="diamond" selected="true">diamond (aminoacid sequences)</option>
115 <option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option> 115 <option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option>
116 <option value="blastp">NCBI-BLASTP+ (protein sequences)</option> 116 <option value="blastp">NCBI-BLASTP+ (protein sequences)</option>
117 <option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option> 117 <option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option>
118 <option value="mmseqsp">MMseqs2 (aminoacid sequences)</option>
119 <option value="mmseqsn">MMseqs2 (nucleotide sequences)</option>
118 <option value="lastp">Last (aminoacid sequences)</option> 120 <option value="lastp">Last (aminoacid sequences)</option>
119 <option value="lastn">Last (nucleotide sequences)</option> 121 <option value="lastn">Last (nucleotide sequences)</option>
120 <option value="blatp">BLAT (aminoacid sequences)</option> 122 <option value="blatp">BLAT (aminoacid sequences)</option>
121 <option value="blatn">BLAT (nucleotide sequences)</option> 123 <option value="blatn">BLAT (nucleotide sequences)</option>
122 </param> 124 </param>
124 <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values than more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/> 126 <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values than more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/>
125 <section name="more_options" title="Additional Options" expanded="False"> 127 <section name="more_options" title="Additional Options" expanded="False">
126 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values results in more false positives (connections between proteins)."/> 128 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values results in more false positives (connections between proteins)."/>
127 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/> 129 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/>
128 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/> 130 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/>
129 <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/> 131 <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs (not compatible with synteny) "/>
130 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/> 132 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/>
131 <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the inital connected component." help="Overrules the -conn threshold."/> 133 <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the inital connected component." help="Overrules the -conn threshold."/>
132 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format."> 134 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format.">
133 <option value="no" selected="true">Don't use isoform information</option> 135 <option value="no" selected="true">Don't use isoform information</option>
134 <option value="ncbi">ncbi style (..._additional.fasta)</option> 136 <option value="ncbi">ncbi style (..._additional.fasta)</option>
135 <option value="uniprot">uniprot style (...isoform of...)</option> 137 <option value="uniprot">uniprot style (...isoform of...)</option>
136 <option value="trinity">trinity style (...i4)</option> 138 <option value="trinity">trinity style (...i4)</option>
137 </param> 139 </param>
138 </section> 140 </section>
139 <conditional name="synteny"> 141 <conditional name="synteny">
140 <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015."> 142 <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015. (Not compatible with selfblast)">
141 <option value="no" selected="true">no</option> 143 <option value="no" selected="true">no</option>
142 <option value="specified">yes</option> 144 <option value="specified">yes</option>
143 </param> 145 </param>
144 <when value="no"/> 146 <when value="no"/>
145 <when value="specified"> 147 <when value="specified">
175 </conditional> 177 </conditional>
176 </actions> 178 </actions>
177 </data> 179 </data>
178 </outputs> 180 </outputs>
179 <tests> 181 <tests>
180 <test expect_num_outputs="3"> <!-- test normal --> 182 <test expect_num_outputs="3"> <!-- test normal / default params -->
181 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> 183 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
182 <param name="p" value="diamond"/> 184 <param name="p" value="diamond"/>
183 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/> 185 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
184 <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/> 186 <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
185 <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/> 187 <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
186 <assert_command> 188 <assert_command>
187 <has_text text="--p=diamond"/> 189 <has_text text="--p=diamond"/>
190 </assert_command>
191 </test>
192 <test expect_num_outputs="3"> <!-- test normal mmseqs -->
193 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
194 <param name="p" value="mmseqsp"/>
195 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
196 <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
197 <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
198 <assert_command>
199 <has_text text="--p=mmseqsp"/>
188 </assert_command> 200 </assert_command>
189 </test> 201 </test>
190 <test expect_num_outputs="3"> <!-- various parameter --> 202 <test expect_num_outputs="3"> <!-- various parameter -->
191 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> 203 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
192 <param name="p" value="diamond"/> 204 <param name="p" value="diamond"/>
249 <has_text text="--p=lastp"/> 261 <has_text text="--p=lastp"/>
250 </assert_command> 262 </assert_command>
251 </test> 263 </test>
252 <test expect_num_outputs="3"> <!-- blat --> 264 <test expect_num_outputs="3"> <!-- blat -->
253 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> 265 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
254 <param name="p" value="blastp"/> 266 <param name="p" value="blatp"/>
255 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/> 267 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
256 <expand macro="test_output_blastgraph" nlines="156" nlines_delta="50"/> 268 <expand macro="test_output_blastgraph" nlines="56" nlines_delta="50"/>
257 <expand macro="test_output_proteinorthograph" nlines="136" nlines_delta="50"/> 269 <expand macro="test_output_proteinorthograph" nlines="56" nlines_delta="50"/>
258 <assert_command> 270 <assert_command>
259 <has_text text="--p=blastp"/> 271 <has_text text="--p=blatp"/>
260 </assert_command> 272 </assert_command>
261 </test> 273 </test>
262 </tests> 274 </tests>
263 <help><![CDATA[Proteinortho with POFF - An orthology detection tool 275 <help><![CDATA[Proteinortho with POFF - An orthology detection tool
264 276
283 | If two proteins find each other with respect to multiple criteria like minimal evalue, and similarity compared to the best hit, ... then an edge is drawn between the two proteins. 295 | If two proteins find each other with respect to multiple criteria like minimal evalue, and similarity compared to the best hit, ... then an edge is drawn between the two proteins.
284 | The result of this step is outputted to RBH 296 | The result of this step is outputted to RBH
285 297
286 * **(ii) Cluster the RBH** 298 * **(ii) Cluster the RBH**
287 299
288 | Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits. 300 | A spectral clustering algorithm is used to remove weak connections, reducing false positives.
289 | The resulting connected components are outputted in orthology-groups / -pairs 301 | The connected components from this process are output as orthology groups or pairs.
290 302
291 ---- 303 ----
292 304
293 **Proteinortho output files** 305 **Proteinortho output files**
294 306
320 332
321 * **orthology-groups** 333 * **orthology-groups**
322 334
323 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups. 335 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups.
324 | Every line corresponds to an orthology group. 336 | Every line corresponds to an orthology group.
325 | The first 3 columns characterize the general properties of that group: number of species, number of proteins, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general. 337 | The first 3 columns characterize the general properties of that group: number of species, number of proteins, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself.
326 | Then a column for each species follows containing the proteins of these species. 338 | Then a column for each species follows containing the proteins of these species.
327 | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity. 339 | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity.
328 | The '*' represents that this species does not contribute to the group. 340 | The '*' represents that this species does not contribute to the group.
329 341
330 .. csv-table:: 342 .. csv-table::
331 343
332 Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,ebola.faa 344 Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,mouse.faa
333 5,5,0.715,C_10,C_10;test,E_10,L_10,M_10 345 5,5,0.715,C_10,C_10;test,E_10,L_10,M_10
334 4,6,0.115,*,C_12,E_315,L_313,M_313 346 4,6,0.115,*,C_12,E_315,L_313,M_313
335 4,5,0.167,*,C_63,E_19,L_19,M_19 347 4,5,0.167,*,C_63,E_19,L_19,M_19
336 4,4,0.816,*,C_64,E_18,L_18,M_18 348 4,4,0.816,*,C_64,E_18,L_18,M_18
337 349
338 ---- 350 ----
339 351
340 * **orthology-pairs** 352 | The first group is comprised of 5 proteins of 5 species: 'C_10' of ecoli.faa, 'C_10;test' of human.faa, 'E_10' of snail.faa, 'L_10' of wale.faa, and 'M_10' of mouse.faa.
341 353 | The alg.-conn. (algebraic connectivity) of 0.715 indicates the connectivity of this group, the higher the more edges are connecting these 5 proteins (at most there can be 10 and at least there need to be 4).
342 | The same as orthology-groups but every edge is printed one-by-one instead of the whole group. The output is formatted the same as the RBH graph: 354 | The second group contains 6 proteins distributed over 4 species. The star indicates the species where no protein was found (in this case ecoli.faa).
343 355
344 .. csv-table:: 356 .. csv-table::
345 357
346 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba 358 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
347 359 # ecoli.faa,human.faa
348 ---- 360 # 1.91e-112,357.5,1.825e-113,360
361 L_10,C_10;test,4.32e-151,447,4.30e-151,446
362 L_11,C_11,1.17e-68,209,3.00e-69,210
363 L_14,C_14,3.64e-139,422,1.19e-142,431
364 L_15,C_15,3.51e-100,303,2.12e-102,308
365 L_16,C_16,3.75e-49,157,7.06e-50,159
366 L_17,C_17,2.96e-195,578,5.50e-196,579
367
368 ----
369
370 * **orthology-pairs**
371
372 | Similar to orthology groups, but each edge is printed individually.
373 | The output is formatted the same as the RBH graph.
374 | For example, extracting all hits of the second group of the example orthology-group output ('4,6,0.115,*,C_12,E_315,L_313,M_313') using grep (-E, regular expression="(C_12|E_315|L_313|M_313).*(C_12|E_315|L_313|M_313)", input file=proteinortho-graph) would reveal all edges of this group:
375
376 .. csv-table::
377
378 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
379 M_313,C_12,1.18e-115,407,6.12e-116,407
380 C_12,E_315,4.50e-127,445,4.09e-127,445
381 L_313,M_313,0.00e+00,1368,0.00e+00,1368
382 L_313,C_12,3.76e-114,402,1.94e-114,402
383
384 ----
385
386 | Especially L_313 and M_313 are very similar, probably identical.
387 | The group contains 4 edges out of the 6 possible edges for a group of 4 proteins. The missing edges are M_313-E_315 as well as L_313-E_315. This means that E_315 is only connected to the other 3 proteins via C_12 and thus could be considered as a weak link in the group.
349 388
350 **Proteinortho-Tools for downstream analysis** 389 **Proteinortho-Tools for downstream analysis**
351 390
352 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10). 391 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10).
353 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other. 392 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other.
354 393
355 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho 394 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho
356 395
357 **Citations:**
358
359 ]]> 396 ]]>
360 </help> 397 </help>
361 <expand macro="citations" /> <!--- TODO: citations are not working in usegalxy, therefore they are added manually at the above. --> 398 <citations>
399 <citation type="doi">10.3389/fbinf.2023.1322477</citation>
400 <citation type="doi">10.1186/1471-2105-12-124</citation>
401 <citation type="doi">10.1371/journal.pone.0105015</citation>
402 </citations>
362 </tool> 403 </tool>