3
|
1 <tool id="ffp_phylogeny" name="Feature Frequency Profile Phylogeny" version="0.1.04">
|
0
|
2 <description>An alignment free comparison tool for phylogenetic analysis and text comparison</description>
|
|
3 <requirements>
|
|
4 <requirement type="package" version="0.3.19_d4382db015acec0e5cc43d6c1ac80ae12cb7e6b3">ffp-phylogeny</requirement>
|
|
5 </requirements>
|
|
6
|
|
7 <macros>
|
|
8 <token name="@BINARY@">./ffp_phylogeny.py</token>
|
|
9 <import>ffp_macros.xml</import>
|
|
10 </macros>
|
|
11 <expand macro="requirements" />
|
|
12 <command interpreter="python"><![CDATA[
|
|
13 ffp_phylogeny.py
|
|
14 #for $i in $sequence.filesin
|
|
15 "$i" ## full file paths
|
|
16 #end for
|
|
17 -x "
|
|
18 #for $i in $sequence.filesin
|
|
19 $i.name, ## original file names
|
|
20 #end for
|
|
21 "
|
|
22 -t "$(sequence.file_type.split('-')[0])"
|
|
23 -l "$length"
|
|
24 -o "$info"
|
1
|
25 ##if $normalize
|
0
|
26 ## -n
|
|
27 ##end if
|
1
|
28 #if $sequence.file_type != 'text'
|
|
29 #if $sequence.file_type == 'amino-multi' or $sequence.file_type == 'nucleotide-multi'
|
0
|
30 -m
|
|
31 #end if
|
1
|
32 #if $sequence.groupings
|
|
33 #pass
|
|
34 #else
|
0
|
35 -d
|
|
36 #end if
|
1
|
37 #if $metric
|
0
|
38 -M "$metric"
|
|
39 #end if
|
1
|
40 #if $similarity
|
0
|
41 -s
|
|
42 #end if
|
1
|
43 #if $abbreviate
|
0
|
44 -a
|
|
45 #end if
|
|
46 #end if
|
3
|
47 #if $phylogeny.phylo_type == 'filt'
|
0
|
48 -f "$phylogeny.filt.filter_type"
|
|
49 -L "$phylogeny.filt.lower"
|
|
50 -U "$phylogeny.filt.upper"
|
|
51 #end if
|
1
|
52 #if $tree
|
0
|
53 -T
|
|
54 #end if
|
|
55 ##ffpjsd -n FLOAT , --normval=FLOAT
|
|
56 ## For option -e, --euclid, change the n-norm distance (Default is n=2) to any other value where n > 1
|
|
57
|
|
58 ]]></command>
|
|
59 <expand macro="stdio" />
|
|
60 <inputs>
|
|
61
|
|
62 <!-- Either amino acid or nucleotide input -->
|
|
63 <!-- Ideally we could determine from file content or suffix what type it is -->
|
|
64
|
|
65 <param name="length" type="integer" min="1" max="25" label="l-mer length" value="6" help="String of valid characters of this length will be counted. Synonyms: feature, k-mer, n-gram, k-tuple" size="2"/>
|
|
66 <!--
|
|
67 <param name="normalize" label="Normalize counts into relative frequency" type="boolean" checked="true" help="" />
|
|
68 -->
|
|
69 <conditional name="sequence">
|
1
|
70 <param type="select" name="file_type" label="File type" help="Note: For phylogeny display, at least three profiles are required.">
|
|
71 <option value="amino">Amino Acids, one profile per file</option>
|
|
72 <option value="amino-multi">Amino Acids, one profile per fasta sequence in file</option>
|
|
73 <option value="nucleotide">Nucleic acids, one profile per file</option>
|
|
74 <option value="nucleotide-multi">Nucleic acids, one profile per fasta sequence in file</option>
|
0
|
75 <option value="text">Text, single file</option>
|
|
76 </param>
|
|
77
|
|
78 <when value="amino"><!-- ffpaa -->
|
|
79 <param name="filesin" type="data" label="Select input file(s)" format="fasta" multiple="true" />
|
1
|
80 <param name="groupings" label="Enable amino acid grouping" type="boolean" checked="true" help="Counts amino acids in groups rather than individually (usually advantageous, see below)." />
|
0
|
81 </when>
|
|
82
|
|
83 <when value="amino-multi">
|
|
84 <param name="filesin" type="data" label="Select input file(s)" format="fasta" multiple="true" />
|
1
|
85 <param name="groupings" label="Enable amino acid grouping" type="boolean" checked="true" help="Counts amino acids in groups rather than individually (usually advantageous, see below)." />
|
0
|
86 </when>
|
|
87
|
|
88 <when value="nucleotide"><!-- ffpry -->
|
|
89 <param name="filesin" type="data" label="Select input file(s)" format="fasta" multiple="true" />
|
1
|
90 <param name="groupings" label="Enable purine / pyrimidine grouping" type="boolean" checked="true" help="Counts each nucleotide as a purine(R) or pyrimidine(Y) rather than individually (usually advantageous)." />
|
0
|
91 </when>
|
|
92
|
|
93 <when value="nucleotide-multi">
|
|
94 <param name="filesin" type="data" label="Select input file(s)" format="fasta" multiple="true" />
|
1
|
95 <param name="groupings" label="Enable purine / pyrimidine grouping" type="boolean" checked="true" help="Counts each nucleotide as a purine(R) or pyrimidine(Y) rather than individually (usually advantageous)." />
|
0
|
96 </when>
|
|
97
|
|
98 <when value="text"><!-- ffptxt -->
|
|
99 <param name="filesin" type="data" label="Select input file(s)" multiple="true"/><!-- plain-text input; no format attribute is set here -->
|
|
100 </when>
|
|
101
|
|
102
|
|
103 </conditional>
|
|
104
|
|
105 <conditional name="phylogeny">
|
|
106 <param type="select" name="phylo_type" label="Feature filtering">
|
|
107 <option value="all">Include all features (Phenetic phylogeny)</option>
|
|
108 <option value="filt">Include only filtered features (Core/evolutionary phylogeny) </option>
|
|
109 </param>
|
|
110 <when value="all"></when>
|
|
111 <when value="filt">
|
|
112 <conditional name="filt">
|
|
113
|
|
114 <param type="select" name="filter_type" label="Filter type" help="Features are included in profiles if at least 1 profile has lower count/percent, and no profile has more than upper count/percent">
|
|
115 <option value="count">lower / upper count limit</option>
|
|
116 <option value="f">raw frequencies</option>
|
|
117 <option value="n">normal distribution</option>
|
|
118 <option value="e">extreme value (Gumbel) distribution</option>
|
|
119 </param>
|
|
120 <when value="count">
|
|
121 <param name="lower" type="integer" label="lower count (one profile needs at least this)" value="0" min="0" />
|
|
122 <param name="upper" type="integer" label="upper count (no profile can have more than this)" value="0" min="0" />
|
|
123 </when>
|
|
124 <when value="f">
|
|
125 <param name="lower" type="float" label="lower %" value="0.05" min="0" max=".5" />
|
|
126 <param name="upper" type="float" label="upper %" value="0.95" min=".5" max="1" />
|
|
127 </when>
|
|
128 <when value="n">
|
|
129 <param name="lower" type="float" label="lower %" value="0.05" min="0" max=".5" />
|
|
130 <param name="upper" type="float" label="upper %" value="0.95" min=".5" max="1" />
|
|
131 </when>
|
|
132 <when value="e">
|
|
133 <param name="lower" type="float" label="lower %" value="0.05" min="0" max=".5" />
|
|
134 <param name="upper" type="float" label="upper %" value="0.95" min=".5" max="1" />
|
|
135 </when>
|
|
136
|
|
137 </conditional>
|
|
138 </when>
|
|
139
|
|
140 </conditional>
|
|
141
|
|
142 <param type="select" name="metric" label="Continuous Distance Measure" help="See ffpjsd documentation for details.">
|
|
143 <option value="" selected="true">Jensen Shannon divergence (default)</option>
|
|
144 <option value="euclid">Euclidean</option>
|
|
145 <option value="euclid2">Euclidean squared</option>
|
|
146 <option value="cosine">Cosine</option>
|
|
147 <option value="manhattan">Manhattan</option>
|
|
148 <option value="pearson">pearson correlation coefficient*</option>
|
|
149 <option value="chebyshev">Chebyshev</option>
|
|
150 <option value="canberra">Canberra</option>
|
|
151 <option value="hamming">Hamming</option>
|
|
152 <option value="evol">Evolutionary Distance used in E.coli Publications</option>
|
|
153
|
|
154 <!--
|
|
155
|
|
156 With these options the input FFPs are treated as binary data. When two FFPs (i and j) are compared each
|
|
157 distance measure uses a cross tabulation for pairwise feature comparison with sums A, B, C and D. A is
|
|
158 the number of features which are present in both vectors while D is the number of features that are absent in
|
|
159 both vectors. B means the feature is present in i and absent in j. C means the feature is absent in i but
|
|
160 present in j. N is the sum of A+B+C+D. All of the binary distance options can be used together with the -s
|
|
161 option to print a similarity matrix. The binary distances do not need to be normalized with ffprwn.
|
|
162
|
|
163 <option value="">BINARY DISTANCE MEASURES</option>
|
|
164
|
|
165 -->
|
|
166
|
|
167 <option value="matching">matching*</option>
|
|
168 <option value="jaccard">Jaccard*</option>
|
|
169 <option value="tanimoto">Rogers-Tanimoto*</option>
|
|
170 <option value="dice">Dice*</option>
|
|
171 <option value="antidice">anti-Dice*</option>
|
|
172 <option value="sneath">Sneath-Sokal*</option>
|
|
173 <option value="hamman">Hamman*</option>
|
|
174 <option value="phi">Pearson Phi*</option>
|
|
175 <option value="anderberg">Anderberg*</option>
|
|
176 <option value="gower">Gower*</option>
|
|
177 <option value="russel">Russel-Rao*</option>
|
|
178 <option value="yule">Yule*</option>
|
|
179 <option value="ochiai">Ochiai*</option>
|
|
180 <option value="kulczynski">Kulczynski*</option>
|
|
181
|
|
182 </param>
|
|
183
|
|
184 <param type="boolean" name="similarity" label="*Similarity Matrix" help="Print a similarity matrix rather than a distance matrix for items marked by asterisk(*). This option affects the output of distance metrics which have a value normalized from 0 to 1 or -1 to 1."/>
|
|
185
|
|
186 <param type="boolean" name="abbreviate" label="Short labels" help="Shorten tree taxonomy labels as much as possible."/>
|
|
187
|
|
188 <param type="boolean" name="tree" checked="true" label="Generate Tree Phylogeny" truevalue="1" falsevalue="0" />
|
|
189
|
|
190 </inputs>
|
|
191 <outputs>
|
|
192 <data name="info" format="nhx" label="Feature Frequency Profile">
|
|
193 <change_format>
|
|
194 <when input="tree" value="0" format="tabular"/>
|
|
195 </change_format>
|
|
196 <!-- doesn't work: filter>tree == "1"</filter -->
|
|
197 </data>
|
|
198 </outputs>
|
|
199
|
|
200 <tests>
|
|
201 <test>
|
|
202 <param name="length" value="1"/>
|
|
203 <param name="tree" value="0"/>
|
1
|
204 <param name="groupings" value="false"/>
|
0
|
205 <param name="file_type" value="nucleotide"/>
|
|
206 <param name="filesin" value="genome1,genome2"/>
|
|
207 <output name="info" file="test_length_1_output.tabular"/>
|
|
208 </test>
|
|
209 <test>
|
|
210 <param name="length" value="2"/>
|
|
211 <param name="tree" value="0"/>
|
1
|
212 <param name="groupings" value="false"/>
|
0
|
213 <param name="file_type" value="nucleotide"/>
|
|
214 <param name="filesin" value="genome1,genome2"/>
|
|
215 <output name="info" file="test_length_2_output.tabular"/>
|
|
216 </test>
|
1
|
217 <test>
|
|
218 <param name="length" value="2"/>
|
|
219 <param name="tree" value="0"/>
|
|
220 <param name="groupings" value="true"/>
|
|
221 <param name="file_type" value="nucleotide-multi"/>
|
|
222 <param name="filesin" value="genome1,genome2"/>
|
|
223 <output name="info" file="test_length_2b_output.tabular"/>
|
|
224 </test>
|
0
|
225 </tests>
|
|
226
|
|
227 <help><![CDATA[
|
|
228
|
|
229 .. class:: infomark
|
|
230
|
|
231
|
|
232 **What it does**
|
|
233
|
|
234 FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.
|
|
235
|
1
|
236 This galaxy tool prepares a mini-pipeline consisting of **[ffpry | ffpaa | ffptxt] > [ ffpfilt | ffpcol > ffprwn] > ffpjsd > ffptree** . The last step is optional - by deselecting the "Generate Tree Phylogeny" checkbox, the tool will output only the precursor distance matrix file rather than a Newick (.nhx) formatted tree file.
|
0
|
237
|
|
238 Each sequence or text file has a profile containing tallies of each feature found. A feature is a string of valid characters of given length.
|
|
239
|
|
240 For nucleotide data, by default each character (ATGC) is grouped as either purine(R) or pyrimidine(Y) before being counted.
|
|
241
|
|
242 For amino acid data, by default each character is grouped into one of the following:
|
|
243 (ST),(DE),(KQR),(IVLM),(FWY),C,G,A,N,H,P. Each group is represented by the first character in its series.
|
|
244
|
|
245 One other key concept is that a given feature, e.g. "TAA" is counted in forward
|
|
246 AND reverse directions, mirroring the idea that a feature's orientation is not
|
|
247 so important to distinguish when it comes to alignment-free comparison.
|
|
248 The counts for "TAA" and "AAT" are merged.
|
|
249
|
|
250 The labeling of the resulting counted feature items is perhaps the trickiest
|
|
251 concept to master. Due to computational efficiency measures taken by the
|
|
252 developers, a feature that we see on paper as "TAC" may be stored and labeled
|
|
253 internally as "GTA", its reverse complement. One must look for the alternative
|
|
254 if one does not find the original.
|
|
255
|
|
256 Also note that in amino acid sequences the stop codon "*" (or any other character
|
|
257 that is not in the Amino acid alphabet) causes that character frame not to be
|
|
258 counted. Also, character frames never span across fasta entries.
|
|
259
|
|
260 A few tutorials:
|
|
261 * http://sourceforge.net/projects/ffp-phylogeny/files/Documentation/tutorial.pdf
|
|
262 * https://github.com/apetkau/microbial-informatics-2014/tree/master/labs/ffp-phylogeny
|
|
263
|
|
264 -------
|
|
265
|
|
266 .. class:: warningmark
|
|
267
|
|
268 **Note**
|
|
269
|
|
270 Taxonomy label details: If each file contains one profile, the file's name is used to label the profile.
|
|
271 If each file contains fasta sequences to profile individually, their fasta identifiers will be used to label them.
|
|
272 The "short labels" option will find the shortest label that uniquely identifies each profile.
|
|
273 Either way, there are some quirks: ffpjsd clips labels to 10 characters if they are greater than 50 characters, so all labels are trimmed to 50 characters first.
|
|
274 Also "id" is prefixed to any numeric label since some tree visualizers won't show purely numeric labels.
|
|
275 In the accidental case where a Fasta sequence label is a duplicate of a previous one it will be prefixed by "DupLabel-".
|
|
276
|
|
277 The command line ffpjsd can hang if one provides an l-mer length greater than the length of file content.
|
|
278 One must identify its process id (">ps aux | grep ffpjsd") and kill it (">kill [process id]").
|
|
279 -------
|
|
280
|
|
281 **References**
|
|
282
|
|
283 The original ffp-phylogeny code is at http://ffp-phylogeny.sourceforge.net/ .
|
|
284 This tool uses Aaron Petkau's modified version: https://github.com/apetkau/ffp-3.19-custom .
|
|
285
|
|
286 The development of ffp-phylogeny should be attributed to:
|
|
287
|
|
288 Sims GE, Jun S-R, Wu GA, Kim S-H. Alignment-free genome comparison with feature frequency profiles (FFP) and optimal resolutions. Proceedings of the National Academy of Sciences of the United States of America 2009;106(8):2677-2682. doi:10.1073/pnas.0813249106.
|
|
289
|
|
290 ]]></help>
|
|
291 </tool>
|
|
292
|
|
293
|