Mercurial > repos > iuc > proteinortho
comparison proteinortho.xml @ 5:5532c0e5d4a6 draft
planemo upload for repository https://gitlab.com/paulklemm_PHD/proteinortho commit b4d8b8da2a259973c9ad90e4b9d1a3e22ae4348f
author | iuc |
---|---|
date | Fri, 16 Jun 2023 20:52:41 +0000 |
parents | a8addd4fb60a |
children | 10112d9127af |
comparison
equal
deleted
inserted
replaced
4:85c411546123 | 5:5532c0e5d4a6 |
---|---|
1 <tool id="proteinortho" name="Proteinortho" version="@TOOL_VERSION@+galaxy@WRAPPER_VERSION@" profile="@PROFILE@"> | 1 <tool id="proteinortho" name="Proteinortho" version="@TOOL_VERSION@+galaxy@WRAPPER_VERSION@" profile="@PROFILE@"> |
2 <description>detects orthologous proteins/genes within different species</description> | 2 <description>detects orthologous proteins/genes within different species</description> |
3 <macros> | 3 <macros> |
4 <import>proteinortho_macros.xml</import> | 4 <import>proteinortho_macros.xml</import> |
5 <xml name="test_outputs"> | 5 <xml name="test_output_proteinortho" tokens="nlines"> |
6 <output name="proteinortho"> | 6 <output name="proteinortho"> |
7 <metadata name="column_names" value="species,genes,alg.-conn.,L.fasta,C.fasta,E.fasta,M.fasta"/> | |
7 <assert_contents> | 8 <assert_contents> |
9 <has_n_columns n="7"/> | |
10 <has_n_lines n="@NLINES@"/> | |
8 <has_line_matching expression="# Species\tGenes\tAlg\.-Conn\.\t.*"/> | 11 <has_line_matching expression="# Species\tGenes\tAlg\.-Conn\.\t.*"/> |
9 <has_line_matching expression="[0-9]+\t[0-9]+\t.*"/> | 12 <has_line_matching expression="[0-9]+\t[0-9]+\t.*"/> |
10 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+.*"/> | 13 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+.*"/> |
11 </assert_contents> | 14 </assert_contents> |
12 </output> | 15 </output> |
16 </xml> | |
17 <xml name="test_output_blastgraph" tokens="nlines"> | |
13 <output name="blastgraph"> | 18 <output name="blastgraph"> |
19 <metadata name="column_names" value="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/> | |
14 <assert_contents> | 20 <assert_contents> |
21 <has_n_columns n="6" comment="#"/> | |
22 <has_n_lines n="@NLINES@"/> | |
15 <has_line_matching expression="# file_a\tfile_b"/> | 23 <has_line_matching expression="# file_a\tfile_b"/> |
16 <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba"/> | 24 <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba"/> |
17 <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/> | 25 <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/> |
18 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/> | 26 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/> |
19 </assert_contents> | 27 </assert_contents> |
20 </output> | 28 </output> |
29 </xml> | |
30 <xml name="test_output_proteinorthograph" tokens="nlines" token_nlines_delta="0" token_add_columns="" token_ncolumns="6"> | |
21 <output name="proteinorthograph"> | 31 <output name="proteinorthograph"> |
32 <metadata name="column_names" value="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba@ADD_COLUMNS@"/> | |
22 <assert_contents> | 33 <assert_contents> |
34 <has_n_columns n="@NCOLUMNS@" comment="#"/> | |
35 <has_n_lines n="@NLINES@" delta="@NLINES_DELTA@"/> | |
23 <has_line_matching expression="# file_a\tfile_b"/> | 36 <has_line_matching expression="# file_a\tfile_b"/> |
24 <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba(\tsame_strand\tsimscore)?"/> | 37 <has_line_matching expression="# a\tb\tevalue_ab\tbitscore_ab\tevalue_ba\tbitscore_ba(\tsame_strand\tsimscore)?"/> |
25 <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/> | 38 <has_line_matching expression="# (C|C2|E|L|M)\.fasta\t(C|C2|E|L|M)\.fasta"/> |
26 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/> | 39 <has_line_matching expression=".*(C|C2|E|L|M)_[0-9]+\t(C|C2|E|L|M)_[0-9]+.*"/> |
27 </assert_contents> | 40 </assert_contents> |
42 #end for# | 55 #end for# |
43 #end if | 56 #end if |
44 proteinortho | 57 proteinortho |
45 --project=result | 58 --project=result |
46 --cpus="\${GALAXY_SLOTS:-4}" | 59 --cpus="\${GALAXY_SLOTS:-4}" |
47 --ram="\${GALAXY_MEMORY_MB:-16000}" | |
48 #if $more_options.selfblast: | 60 #if $more_options.selfblast: |
49 $more_options.selfblast | 61 $more_options.selfblast |
50 #end if | 62 #end if |
51 #if $more_options.singles: | 63 #if $more_options.singles: |
52 $more_options.singles | 64 $more_options.singles |
53 #end if | 65 #end if |
66 #if $more_options.core: | |
67 $more_options.core | |
68 #end if | |
54 --p=$p | 69 --p=$p |
55 --e=$evalue | 70 --e=$more_options.evalue |
56 --conn=$conn | 71 --conn=$conn |
57 #if $more_options.cov: | 72 #if $more_options.cov: |
58 --cov=$more_options.cov | 73 --cov=$more_options.cov |
59 #end if | 74 #end if |
60 #if $more_options.sim: | 75 #if $sim: |
61 --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$more_options.sim/100}"` | 76 --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$sim/100}"` |
62 #end if | 77 #end if |
63 #if $more_options.identity: | 78 #if $more_options.identity: |
64 --cov=$more_options.identity | 79 --cov=$more_options.identity |
65 #end if | 80 #end if |
66 #if $more_options.isoform != "no": | 81 #if $more_options.isoform != "no": |
98 <option value="lastp">Last (aminoacid sequences)</option> | 113 <option value="lastp">Last (aminoacid sequences)</option> |
99 <option value="lastn">Last (nucleotide sequences)</option> | 114 <option value="lastn">Last (nucleotide sequences)</option> |
100 <option value="blatp">BLAT (aminoacid sequences)</option> | 115 <option value="blatp">BLAT (aminoacid sequences)</option> |
101 <option value="blatn">BLAT (nucleotide sequences)</option> | 116 <option value="blatn">BLAT (nucleotide sequences)</option> |
102 </param> | 117 </param> |
103 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="This is the main parameter for the generation of the reciprocal best hit graph. Larger values results in more false positives (connections between proteins)."/> | 118 <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal reciprocal similarity in %" help="This and --evalue are the main parameters for the generation of the reciprocal best hit graph. 100 = only the best reciprocal hits are reported, 0 = all possible reciprocal blast matches (within the E-value cutoff) are reported."/> |
104 <param argument="--conn" type="float" value="0.1" min="0." max="10." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values then more splits are done, resulting in more and smaller clusters."/> | 119 <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. With larger values, more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/> |
105 <section name="more_options" title="Additional Options" expanded="False"> | 120 <section name="more_options" title="Additional Options" expanded="False"> |
121 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values results in more false positives (connections between proteins)."/> | |
106 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/> | 122 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/> |
107 <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal sequence similarity in %"/> | |
108 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/> | 123 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/> |
109 <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/> | 124 <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/> |
110 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/> | 125 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/> |
111 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is build using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format."> | 126 <param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the initial connected component." help="Overrules the --conn threshold."/> |
127 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format."> | |
112 <option value="no" selected="true">Don't use isoform information</option> | 128 <option value="no" selected="true">Don't use isoform information</option> |
113 <option value="ncbi">ncbi style (..._additional.fasta)</option> | 129 <option value="ncbi">ncbi style (..._additional.fasta)</option> |
114 <option value="uniprot">uniprot style (...isoform of...)</option> | 130 <option value="uniprot">uniprot style (...isoform of...)</option> |
115 <option value="trinity">trinity style (...i4)</option> | 131 <option value="trinity">trinity style (...i4)</option> |
116 </param> | 132 </param> |
117 </section> | 133 </section> |
118 <conditional name="synteny"> | 134 <conditional name="synteny"> |
119 <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015."> | 135 <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015."> |
120 <option value="no" selected="true">no</option> | 136 <option value="no" selected="true">no</option> |
121 <option value="specified">yes</option> | 137 <option value="specified">yes</option> |
122 </param> | 138 </param> |
123 <when value="no"/> | 139 <when value="no"/> |
124 <when value="specified"> | 140 <when value="specified"> |
125 <param argument="--dups" type="integer" value="0" min="0" max="100" label="Number of reiterations for adjacencies heuristic, to determine duplicated regions"/> | 141 <param argument="--dups" type="integer" value="0" min="0" max="100" label="Number of reiterations for adjacencies heuristic, to determine duplicated regions"/> |
126 <param argument="--cs" type="integer" value="3" min="0" max="100" label="Size of a maximum common substring (MCS) for adjacency matches"/> | 142 <param argument="--cs" type="integer" value="3" min="0" max="100" label="Size of a maximum common substring (MCS) for adjacency matches"/> |
127 <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Minimal percent identity of best blast hits"/> | 143 <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Weight of adjacencies vs. sequence similarity" help="alpha[FF-adj score] + (1−alpha)[BLAST score]"/> |
128 <param name="input_files_syn" type="data" format="gff" multiple="true" min="2" label="Select the GFF3 files matching the input fasta files" help="The GFF3 files need matching names with the input fasta files. If you provide mybacteria123.faa or mybacteria123.fasta ... then you need to provide mybacteria123.gff here accoringly. The attributes column (#9) must contain the attribute Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective (protein) identifier in the FASTA input. For example see https://gitlab.com/paulklemm_PHD/proteinortho/-/blob/master/test/C.gff"/> | 144 <param name="input_files_syn" type="data" format="gff" multiple="true" min="2" label="Select the GFF3 files matching the input fasta files" help="The GFF3 files need matching names with the input fasta files. If you provide mybacteria123.faa or mybacteria123.fasta ... then you need to provide mybacteria123.gff here accordingly. The attributes column (#9) must contain the attribute Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective (protein) identifier in the FASTA input. For example see https://gitlab.com/paulklemm_PHD/proteinortho/-/blob/master/test/C.gff"/> |
129 </when> | 145 </when> |
130 </conditional> | 146 </conditional> |
131 </inputs> | 147 </inputs> |
132 <outputs> | 148 <outputs> |
133 <data name="blastgraph" format="tabular" label="${tool.name} on ${on_string}: RBH graph" from_work_dir="result.blast-graph"/> | 149 <data name="blastgraph" format="tabular" label="${tool.name} on ${on_string}: RBH graph" from_work_dir="result.blast-graph"> |
134 <data name="proteinortho" format="tabular" label="${tool.name} on ${on_string}: orthology-groups" from_work_dir="result.proteinortho.tsv"/> | 150 <actions> |
135 <data name="proteinorthograph" format="tabular" label="${tool.name} on ${on_string}: orthology-pairs" from_work_dir="result.proteinortho-graph"/> | 151 <action name="column_names" type="metadata" |
152 default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/> | |
153 </actions> | |
154 </data> | |
155 <data name="proteinortho" format="tabular" label="${tool.name} on ${on_string}: orthology-groups" from_work_dir="result.proteinortho.tsv"> | |
156 <actions> | |
157 <action name="column_names" type="metadata" | |
158 default="species,genes,alg.-conn.,${','.join([ f.element_identifier for f in $input_files ])}"/> | |
159 </actions> | |
160 </data> | |
161 <data name="proteinorthograph" format="tabular" label="${tool.name} on ${on_string}: orthology-pairs" from_work_dir="result.proteinortho-graph"> | |
162 <actions> | |
163 <conditional name="synteny.synteny_options"> | |
164 <when value="no"> | |
165 <action name="column_names" type="metadata" default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/> | |
166 </when> | |
167 <when value="specified"> | |
168 <action name="column_names" type="metadata" default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba,same_strand,simscore"/> | |
169 </when> | |
170 </conditional> | |
171 </actions> | |
172 </data> | |
136 </outputs> | 173 </outputs> |
137 <tests> | 174 <tests> |
138 <test expect_num_outputs="3"> <!-- test normal --> | 175 <test expect_num_outputs="3"> <!-- test normal --> |
139 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | 176 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
140 <expand macro="test_outputs"/> | 177 <param name="p" value="diamond"/> |
178 <expand macro="test_output_proteinortho" nlines="34"/> | |
179 <expand macro="test_output_blastgraph" nlines="157"/> | |
180 <expand macro="test_output_proteinorthograph" nlines="134"/> | |
141 <assert_command> | 181 <assert_command> |
142 <has_text text="--p=diamond"/> | 182 <has_text text="--p=diamond"/> |
143 </assert_command> | 183 </assert_command> |
144 </test> | 184 </test> |
145 <test expect_num_outputs="3"> <!-- various parameter --> | 185 <test expect_num_outputs="3"> <!-- various parameter --> |
146 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | 186 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
147 <param name="evalue" value="1"/> | 187 <param name="p" value="diamond"/> |
148 <param name="conn" value="1"/> | 188 <param name="conn" value="1"/> |
149 <param name="cov" value="42"/> | |
150 <param name="sim" value="42"/> | 189 <param name="sim" value="42"/> |
151 <param name="identity" value="42"/> | 190 <section name="more_options"> |
152 <param name="selfblast" value="true"/> | 191 <param name="cov" value="42"/> |
153 <param name="singles" value="true"/> | 192 <param name="identity" value="42"/> |
154 <expand macro="test_outputs"/> | 193 <param name="selfblast" value="true"/> |
194 <param name="singles" value="true"/> | |
195 <param name="core" value="true"/> | |
196 </section> | |
197 <expand macro="test_output_proteinortho" nlines="177"/> | |
198 <expand macro="test_output_blastgraph" nlines="2720"/> | |
199 <expand macro="test_output_proteinorthograph" nlines="384"/> | |
155 <assert_command> | 200 <assert_command> |
156 <has_text text="--p=diamond"/> | 201 <has_text text="--p=diamond"/> |
157 </assert_command> | 202 </assert_command> |
158 </test> | 203 </test> |
159 <test expect_num_outputs="3"> <!-- synteny --> | 204 <test expect_num_outputs="3"> <!-- synteny --> |
160 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | 205 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
161 <param name="input_files_syn" value="L.gff,C.gff,C2.gff,E.gff,M.gff"/> | 206 <param name="input_files_syn" value="L.gff,C.gff,E.gff,M.gff"/> |
162 <param name="synteny_options" value="specified"/> | 207 <param name="p" value="diamond"/> |
163 <expand macro="test_outputs"/> | 208 <conditional name="synteny"> |
209 <param name="synteny_options" value="specified"/> | |
210 </conditional> | |
211 <expand macro="test_output_proteinortho" nlines="38"/> | |
212 <expand macro="test_output_blastgraph" nlines="157"/> | |
213 <expand macro="test_output_proteinorthograph" nlines="119" nlines_delta="10" ncolumns="8" add_columns=",same_strand,simscore"/> | |
164 <assert_command> | 214 <assert_command> |
165 <has_text text="--p=diamond"/> | 215 <has_text text="--p=diamond"/> |
166 </assert_command> | 216 </assert_command> |
167 </test> | 217 </test> |
168 <test expect_num_outputs="3"> <!-- blast --> | 218 <test expect_num_outputs="3"> <!-- blast --> |
169 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | 219 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
170 <param name="p" value="blastp"/> | 220 <param name="p" value="blastp"/> |
171 <expand macro="test_outputs"/> | 221 <expand macro="test_output_proteinortho" nlines="32"/> |
222 <expand macro="test_output_blastgraph" nlines="158"/> | |
223 <expand macro="test_output_proteinorthograph" nlines="142"/> | |
172 <assert_command> | 224 <assert_command> |
173 <has_text text="--p=blastp"/> | 225 <has_text text="--p=blastp"/> |
174 </assert_command> | 226 </assert_command> |
175 </test> | 227 </test> |
176 <test expect_num_outputs="3"> <!-- auto blast --> | 228 <test expect_num_outputs="3"> <!-- auto blast --> |
177 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | 229 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
178 <param name="p" value="autoblast"/> | 230 <param name="p" value="autoblast"/> |
179 <expand macro="test_outputs"/> | 231 <expand macro="test_output_proteinortho" nlines="32"/> |
232 <expand macro="test_output_blastgraph" nlines="158"/> | |
233 <expand macro="test_output_proteinorthograph" nlines="142"/> | |
180 <assert_command> | 234 <assert_command> |
181 <has_text text="--p=autoblast"/> | 235 <has_text text="--p=autoblast"/> |
182 </assert_command> | 236 </assert_command> |
183 </test> | 237 </test> |
184 <test expect_num_outputs="3"> <!-- last --> | 238 <test expect_num_outputs="3"> <!-- last --> |
185 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | 239 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
186 <param name="p" value="lastp"/> | 240 <param name="p" value="lastp"/> |
187 <expand macro="test_outputs"/> | 241 <expand macro="test_output_proteinortho" nlines="34"/> |
242 <expand macro="test_output_blastgraph" nlines="148"/> | |
243 <expand macro="test_output_proteinorthograph" nlines="133"/> | |
188 <assert_command> | 244 <assert_command> |
189 <has_text text="--p=lastp"/> | 245 <has_text text="--p=lastp"/> |
190 </assert_command> | 246 </assert_command> |
191 </test> | 247 </test> |
192 <test expect_num_outputs="3"> <!-- blat --> | 248 <test expect_num_outputs="3"> <!-- blat --> |
193 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | 249 <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/> |
194 <param name="p" value="blastp"/> | 250 <param name="p" value="blastp"/> |
195 <expand macro="test_outputs"/> | 251 <expand macro="test_output_proteinortho" nlines="32"/> |
252 <expand macro="test_output_blastgraph" nlines="158"/> | |
253 <expand macro="test_output_proteinorthograph" nlines="142"/> | |
196 <assert_command> | 254 <assert_command> |
197 <has_text text="--p=blastp"/> | 255 <has_text text="--p=blastp"/> |
198 </assert_command> | 256 </assert_command> |
199 </test> | 257 </test> |
200 </tests> | 258 </tests> |
203 **What it does** | 261 **What it does** |
204 | 262 |
205 Proteinortho is a tool to detect orthologous proteins/genes within different species (at least 2). | 263 Proteinortho is a tool to detect orthologous proteins/genes within different species (at least 2). |
206 | 264 |
207 | It compares similarities of given gene/protein sequences and clusters them to find significant groups. | 265 | It compares similarities of given gene/protein sequences and clusters them to find significant groups. |
208 | The algorithm was designed to handle large-scale data and can be applied to hundreds of species at one. | 266 | The algorithm was designed to handle large-scale data and can be applied to hundreds of species at once. |
209 | Details can be found in (doi:10.1186/1471-2105-12-124). | 267 | Details can be found in (doi:10.1186/1471-2105-12-124). |
210 | To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (details see doi:10.1371/journal.pone.0105015), is already build in Proteinortho. | 268 | To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (details see doi:10.1371/journal.pone.0105015), is already built in Proteinortho. |
211 | 269 |
212 ---- | 270 ---- |
213 | 271 |
214 **Proteinortho in a nutshell** | 272 **Proteinortho in a nutshell** |
215 | 273 |
216 ---- | 274 ---- |
217 | 275 |
218 * **(i) Build adaptive reciprocal best hit graph (RBH)** | 276 * **(i) Build adaptive reciprocal best hit graph (RBH)** |
219 | 277 |
220 | Using the blast algorithm (diamond,blast,blat,...) all input sequences are compared against each other. | 278 | Using the blast algorithm (diamond,blast,blat,...) all input sequences are compared against each other. |
221 | If two proteins find each other with respect to multiple criteria like minimal evalue, similarity compared to the best hit, ... then a edge is drawn between the two proteins. | 279 | If two proteins find each other with respect to multiple criteria like minimal evalue, and similarity compared to the best hit, ... then an edge is drawn between the two proteins. |
222 | The result of this step is outputted to RBH | 280 | The result of this step is outputted to RBH |
223 | 281 |
224 * **(ii) Cluster the RBH** | 282 * **(ii) Cluster the RBH** |
225 | 283 |
226 | Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits. | 284 | Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits. |
227 | The resulting connected components are outputted in orthology-groups / -PAIRS | 285 | The resulting connected components are outputted in orthology-groups / -pairs |
228 | 286 |
229 ---- | 287 ---- |
230 | 288 |
231 **Proteinortho output files** | 289 **Proteinortho output files** |
232 | 290 |
233 ---- | 291 ---- |
234 | 292 |
235 * **RBH** | 293 * **RBH** |
236 | 294 |
237 | The result of the (i) step, the reciprocal best hit graph. | 295 | The result of the (i) step, the reciprocal best hit graph. |
238 | First a comment line announces 2 species (# ecoli.faa human.faa), then each line corresponds to a reciprocal best hit between 2 proteins/genes of the announced species. The output format is shown below. | 296 | The first two comment lines announce the 2 species (# ecoli.faa human.faa) as well as the median values (evalue_ab,bitscore_ab,evalue_ba,bitscore_ba). |
297 | Following these header lines, each line corresponds to a reciprocal best hit of 2 proteins/genes (columns 1 and 2) of the announced species. The output format is shown below. | |
239 | *seqidA*,*seqidB* = the 2 ids/names of the proteins involved | 298 | *seqidA*,*seqidB* = the 2 ids/names of the proteins involved |
240 | *evalue_ab* = evalue with seqidA as query and seqidB as part of the database | 299 | *evalue_ab* = evalue with seqidA as query and seqidB as part of the database |
241 | *bitscore_ab* = bitscore with seqidA as query ... | 300 | *bitscore_ab* = bitscore with seqidA as query ... |
242 | *evalue_ba* = evalue with seqidB as query ... | 301 | *evalue_ba* = evalue with seqidB as query ... |
243 | ... | |
244 | 302 |
245 .. csv-table:: | 303 .. csv-table:: |
246 | 304 |
247 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba | 305 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba |
306 # ecoli.faa,human.faa | |
307 # 1.91e-112,357.5,1.825e-113,360 | |
308 L_10,C_10;test,4.32e-151,447,4.30e-151,446 | |
309 L_11,C_11,1.17e-68,209,3.00e-69,210 | |
310 L_14,C_14,3.64e-139,422,1.19e-142,431 | |
311 L_15,C_15,3.51e-100,303,2.12e-102,308 | |
312 L_16,C_16,3.75e-49,157,7.06e-50,159 | |
313 L_17,C_17,2.96e-195,578,5.50e-196,579 | |
248 | 314 |
249 ---- | 315 ---- |
250 | 316 |
251 * **orthology-groups** | 317 * **orthology-groups** |
252 | 318 |
253 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups. | 319 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups. |
254 | Every line corresponds to an orthology group of proteins/genes. | 320 | Every line corresponds to an orthology group. |
255 | The first 3 columns characterize general properties of that group: number of proteins, species and the algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general. | 321 | The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general. |
256 | Then a column for each species follows containing the proteins of that species. If a species contributes with more than one protein to a group of orthologs, then they are ordered by connectivity. | 322 | Then a column for each species follows containing the proteins of these species. |
323 | If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity. | |
324 | The '*' represents that this species does not contribute to the group. | |
257 | 325 |
258 .. csv-table:: | 326 .. csv-table:: |
259 | 327 |
260 Species,Genes,Alg.-Conn. | 328 Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,ebola.faa |
329 5,5,0.715,C_10,C_10;test,E_10,L_10,M_10 | |
330 4,6,0.115,*,C_12,E_315,L_313,M_313 | |
331 4,5,0.167,*,C_63,E_19,L_19,M_19 | |
332 4,4,0.816,*,C_64,E_18,L_18,M_18 | |
261 | 333 |
262 ---- | 334 ---- |
263 | 335 |
264 * **orthology-pairs** | 336 * **orthology-pairs** |
265 | 337 |
266 | The same as orthology-groups but every edge is printed one-by-one here. The output is formatted the same as the RBH graph: | 338 | The same as orthology-groups but every edge is printed one-by-one instead of the whole group. The output is formatted the same as the RBH graph: |
267 | 339 |
268 .. csv-table:: | 340 .. csv-table:: |
269 | 341 |
270 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba | 342 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba |
271 | 343 |
272 ---- | 344 ---- |
273 | 345 |
274 **Proteinortho-Tools for downstream analysis** | 346 **Proteinortho-Tools for downstream analysis** |
275 | 347 |
276 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file. | 348 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10). |
277 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other. | 349 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other. |
278 | 350 |
279 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho | 351 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho |
352 | |
353 **Citations:** | |
354 | |
355 - Lechner, Marcus, et al. "Proteinortho: detection of (co-) orthologs in large-scale analysis." BMC bioinformatics 12.1 (2011): 1-9. (10.1186/1471-2105-12-124) | |
356 - Lechner, Marcus, et al. "Orthology detection combining clustering and synteny for very large datasets." PLoS one 9.8 (2014): e105015. (10.1371/journal.pone.0105015) | |
357 | |
280 ]]> | 358 ]]> |
281 </help> | 359 </help> |
282 <expand macro="citations"/> | 360 <expand macro="citations" /> <!-- TODO: citations are not working in usegalaxy, therefore they are also added manually in the help section above. --> |
283 </tool> | 361 </tool> |