Mercurial > repos > iuc > proteinortho
comparison proteinortho.xml @ 9:6140163233a5 draft default tip
planemo upload for repository https://gitlab.com/paulklemm_PHD/proteinortho commit e151cf96893602bf011c27a2d91df1ef594b774d
author | iuc |
---|---|
date | Fri, 13 Dec 2024 10:19:09 +0000 |
parents | c5dd4f86d981 |
children |
comparison
equal
deleted
inserted
replaced
8:54fb02338510 | 9:6140163233a5 |
---|---|
97 #end for# | 97 #end for# |
98 #end if | 98 #end if |
99 2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2) | 99 2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2) |
100 #if $more_options.selfblast: | 100 #if $more_options.selfblast: |
101 && | 101 && |
102 mv result.blast-graph_clean result.blast-graph; | 102 mv result.blast-graph_clean result.blast-graph |
103 #end if | 103 #end if |
104 #if $synteny.synteny_options == "specified": | 104 #if $synteny.synteny_options == "specified": |
105 && | 105 && |
106 mv result.poff-graph result.proteinortho-graph && | 106 mv result.poff-graph result.proteinortho-graph && |
107 mv result.poff.tsv result.proteinortho.tsv && | 107 mv result.poff.tsv result.proteinortho.tsv && |
108 mv result.poff.html result.proteinortho.html ; | 108 mv result.poff.html result.proteinortho.html |
109 #end if | 109 #end if |
110 ]]></command> | 110 ]]></command> |
111 <inputs> | 111 <inputs> |
112 <param name="input_files" format="fasta" type="data" multiple="true" min="2" label="Select the input fasta files (>2)" help="The input fasta files. At least 2 are needed!"/> | 112 <param name="input_files" format="fasta" type="data" multiple="true" min="2" label="Select the input fasta files (>2)" help="The input fasta files. At least 2 are needed!"/> |
113 <param argument="--p" type="select" label="Similarity comparision algorithm" help="In the first step of proteinortho an all-versus-all reciprocal best hit graph is build from the input files (using this algorithm)."> | 113 <param argument="--p" type="select" label="Similarity comparision algorithm" help="In the first step of proteinortho an all-versus-all reciprocal best hit graph is build from the input files (using this algorithm)."> |
114 <option value="diamond" selected="true">diamond (aminoacid sequences)</option> | 114 <option value="diamond" selected="true">diamond (aminoacid sequences)</option> |
115 <option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option> | 115 <option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option> |
116 <option value="blastp">NCBI-BLASTP+ (protein sequences)</option> | 116 <option value="blastp">NCBI-BLASTP+ (protein sequences)</option> |
117 <option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option> | 117 <option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option> |
118 <option value="mmseqsp">MMseqs2 (aminoacid sequences)</option> | |
119 <option value="mmseqsn">MMseqs2 (nucleotide sequences)</option> | |
118 <option value="lastp">Last (aminoacid sequences)</option> | 120 <option value="lastp">Last (aminoacid sequences)</option> |
119 <option value="lastn">Last (nucleotide sequences)</option> | 121 <option value="lastn">Last (nucleotide sequences)</option> |
120 <option value="blatp">BLAT (aminoacid sequences)</option> | 122 <option value="blatp">BLAT (aminoacid sequences)</option> |
121 <option value="blatn">BLAT (nucleotide sequences)</option> | 123 <option value="blatn">BLAT (nucleotide sequences)</option> |
122 </param> | 124 </param> |
124 <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values than more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/> | 126 <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values than more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/> |
125 <section name="more_options" title="Additional Options" expanded="False"> | 127 <section name="more_options" title="Additional Options" expanded="False"> |
126 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values results in more false positives (connections between proteins)."/> | 128 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values results in more false positives (connections between proteins)."/> |
127 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/> | 129 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/> |
128 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/> | 130 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/> |
129 <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/> | 131 <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs (not compatible with synteny) "/> |
130 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/> | 132 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/> |
131 <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the inital connected component." help="Overrules the -conn threshold."/> | 133 <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the inital connected component." help="Overrules the -conn threshold."/> |
132 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format."> | 134 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format."> |
133 <option value="no" selected="true">Don't use isoform information</option> | 135 <option value="no" selected="true">Don't use isoform information</option> |
134 <option value="ncbi">ncbi style (..._additional.fasta)</option> | 136 <option value="ncbi">ncbi style (..._additional.fasta)</option> |
135 <option value="uniprot">uniprot style (...isoform of...)</option> | 137 <option value="uniprot">uniprot style (...isoform of...)</option> |
136 <option value="trinity">trinity style (...i4)</option> | 138 <option value="trinity">trinity style (...i4)</option> |
137 </param> | 139 </param> |
138 </section> | 140 </section> |
139 <conditional name="synteny"> | 141 <conditional name="synteny"> |
140 <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015."> | 142 <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015. (Not compatible with selfblast)"> |
141 <option value="no" selected="true">no</option> | 143 <option value="no" selected="true">no</option> |
142 <option value="specified">yes</option> | 144 <option value="specified">yes</option> |
143 </param> | 145 </param> |
144 <when value="no"/> | 146 <when value="no"/> |
145 <when value="specified"> | 147 <when value="specified"> |
175 </conditional> | 177 </conditional> |
176 </actions> | 178 </actions> |
177 </data> | 179 </data> |
178 </outputs> | 180 </outputs> |
179 <tests> | 181 <tests> |
180 <test expect_num_outputs="3"> <!-- test normal --> | 182 <test expect_num_outputs="3"> <!-- test normal / default params --> |
181 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> | 183 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
182 <param name="p" value="diamond"/> | 184 <param name="p" value="diamond"/> |
183 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/> | 185 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/> |
184 <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/> | 186 <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/> |
185 <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/> | 187 <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/> |
186 <assert_command> | 188 <assert_command> |
187 <has_text text="--p=diamond"/> | 189 <has_text text="--p=diamond"/> |
190 </assert_command> | |
191 </test> | |
192 <test expect_num_outputs="3"> <!-- test normal mmseqs --> | |
193 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> | |
194 <param name="p" value="mmseqsp"/> | |
195 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/> | |
196 <expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/> | |
197 <expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/> | |
198 <assert_command> | |
199 <has_text text="--p=mmseqsp"/> | |
188 </assert_command> | 200 </assert_command> |
189 </test> | 201 </test> |
190 <test expect_num_outputs="3"> <!-- various parameter --> | 202 <test expect_num_outputs="3"> <!-- various parameter --> |
191 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> | 203 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
192 <param name="p" value="diamond"/> | 204 <param name="p" value="diamond"/> |
249 <has_text text="--p=lastp"/> | 261 <has_text text="--p=lastp"/> |
250 </assert_command> | 262 </assert_command> |
251 </test> | 263 </test> |
252 <test expect_num_outputs="3"> <!-- blat --> | 264 <test expect_num_outputs="3"> <!-- blat --> |
253 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> | 265 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
254 <param name="p" value="blastp"/> | 266 <param name="p" value="blatp"/> |
255 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/> | 267 <expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/> |
256 <expand macro="test_output_blastgraph" nlines="156" nlines_delta="50"/> | 268 <expand macro="test_output_blastgraph" nlines="56" nlines_delta="50"/> |
257 <expand macro="test_output_proteinorthograph" nlines="136" nlines_delta="50"/> | 269 <expand macro="test_output_proteinorthograph" nlines="56" nlines_delta="50"/> |
258 <assert_command> | 270 <assert_command> |
259 <has_text text="--p=blastp"/> | 271 <has_text text="--p=blatp"/> |
260 </assert_command> | 272 </assert_command> |
261 </test> | 273 </test> |
262 </tests> | 274 </tests> |
263 <help><![CDATA[Proteinortho with POFF - An orthology detection tool | 275 <help><![CDATA[Proteinortho with POFF - An orthology detection tool |
264 | 276 |
283 | If two proteins find each other with respect to multiple criteria like minimal evalue, and similarity compared to the best hit, ... then an edge is drawn between the two proteins. | 295 | If two proteins find each other with respect to multiple criteria like minimal evalue, and similarity compared to the best hit, ... then an edge is drawn between the two proteins. |
284 | The result of this step is outputted to RBH | 296 | The result of this step is outputted to RBH |
285 | 297 |
286 * **(ii) Cluster the RBH** | 298 * **(ii) Cluster the RBH** |
287 | 299 |
288 | Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits. | 300 | A spectral clustering algorithm is used to remove weak connections, reducing false positives. |
289 | The resulting connected components are outputted in orthology-groups / -pairs | 301 | The connected components from this process are output as orthology groups or pairs. |
290 | 302 |
291 ---- | 303 ---- |
292 | 304 |
293 **Proteinortho output files** | 305 **Proteinortho output files** |
294 | 306 |
320 | 332 |
321 * **orthology-groups** | 333 * **orthology-groups** |
322 | 334 |
323 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups. | 335 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups. |
324 | Every line corresponds to an orthology group. | 336 | Every line corresponds to an orthology group. |
325 | The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general. | 337 | The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself. |
326 | Then a column for each species follows containing the proteins of these species. | 338 | Then a column for each species follows containing the proteins of these species. |
327 | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity. | 339 | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity. |
328 | The '*' represents that this species does not contribute to the group. | 340 | The '*' represents that this species does not contribute to the group. |
329 | 341 |
330 .. csv-table:: | 342 .. csv-table:: |
331 | 343 |
332 Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,ebola.faa | 344 Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,mouse.faa |
333 5,5,0.715,C_10,C_10;test,E_10,L_10,M_10 | 345 5,5,0.715,C_10,C_10;test,E_10,L_10,M_10 |
334 4,6,0.115,*,C_12,E_315,L_313,M_313 | 346 4,6,0.115,*,C_12,E_315,L_313,M_313 |
335 4,5,0.167,*,C_63,E_19,L_19,M_19 | 347 4,5,0.167,*,C_63,E_19,L_19,M_19 |
336 4,4,0.816,*,C_64,E_18,L_18,M_18 | 348 4,4,0.816,*,C_64,E_18,L_18,M_18 |
337 | 349 |
338 ---- | 350 ---- |
339 | 351 |
340 * **orthology-pairs** | 352 | The first group is comprised of 5 proteins of 5 species: 'C_10' of ecoli.faa, 'C_10;test' of human.faa, 'E_10' of snail.faa, 'L_10' of wale.faa, and 'M_10' of mouse.faa. |
341 | 353 | The alg.-conn. (algebraic connectivity) of 0.715 indicates the connectivity of this group, the higher the more edges are connecting these 5 proteins (at most there can be 10 and at least there need to be 4). |
342 | The same as orthology-groups but every edge is printed one-by-one instead of the whole group. The output is formatted the same as the RBH graph: | 354 | The second group contains 6 proteins distributed over 4 species. The star indicates the species where no protein was found (in this case ecoli.faa). |
343 | 355 |
344 .. csv-table:: | 356 .. csv-table:: |
345 | 357 |
346 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba | 358 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba |
347 | 359 # ecoli.faa,human.faa |
348 ---- | 360 # 1.91e-112,357.5,1.825e-113,360 |
361 L_10,C_10;test,4.32e-151,447,4.30e-151,446 | |
362 L_11,C_11,1.17e-68,209,3.00e-69,210 | |
363 L_14,C_14,3.64e-139,422,1.19e-142,431 | |
364 L_15,C_15,3.51e-100,303,2.12e-102,308 | |
365 L_16,C_16,3.75e-49,157,7.06e-50,159 | |
366 L_17,C_17,2.96e-195,578,5.50e-196,579 | |
367 | |
368 ---- | |
369 | |
370 * **orthology-pairs** | |
371 | |
372 | Similar to orthology groups, but each edge is printed individually. | |
373 | The output is formatted the same as the RBH graph. | |
374 | For example extracting all hits of the second group of the example orthology-group output ('4,6,0.115,*,C_12,E_315,L_313,M_313') using grep (-E, regular expression="(C_12|E_315|L_313|M_313).*(C_12|E_315|L_313|M_313)", input file=proteinortho-graph) would reveal all edges of this group: | |
375 | |
376 .. csv-table:: | |
377 | |
378 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba | |
379 M_313,C_12,1.18e-115,407,6.12e-116,407 | |
380 C_12,E_315,4.50e-127,445,4.09e-127,445 | |
381 L_313,M_313,0.00e+00,1368,0.00e+00,1368 | |
382 L_313,C_12,3.76e-114,402,1.94e-114,402 | |
383 | |
384 ---- | |
385 | |
386 | Especially L_313 and M_313 are very similar, probably identical. | |
387 | The group contains 4 edges out of the 6 possible edges for a group of 4 proteins. The missing edges are M_313-E_315 as well as L_313-E_315. This means that E_315 is only connected to the other 3 proteins via C_12 and thus could be considered as a weak link in the group. | |
349 | 388 |
350 **Proteinortho-Tools for downstream analysis** | 389 **Proteinortho-Tools for downstream analysis** |
351 | 390 |
352 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10). | 391 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10). |
353 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other. | 392 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other. |
354 | 393 |
355 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho | 394 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho |
356 | 395 |
357 **Citations:** | |
358 | |
359 ]]> | 396 ]]> |
360 </help> | 397 </help> |
361 <expand macro="citations" /> <!--- TODO: citations are not working in usegalxy, therefore they are added manually at the above. --> | 398 <citations> |
399 <citation type="doi">10.3389/fbinf.2023.1322477</citation> | |
400 <citation type="doi">10.1186/1471-2105-12-124</citation> | |
401 <citation type="doi">10.1371/journal.pone.0105015</citation> | |
402 </citations> | |
362 </tool> | 403 </tool> |