Mercurial > repos > damion > ffp_phylogeny

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ffp_macros.xml	Mon Feb 23 18:25:25 2015 -0500
@@ -0,0 +1,21 @@
+<macros>
+
+    <xml name="stdio">
+        <stdio>
+            <!-- Anything other than zero is an error -->
+            <exit_code range="1:" />
+            <exit_code range=":-1" />
+            <!-- In case the return code has not been set properly, check stderr too -->
+            <regex match="Error:" />
+            <regex match="Exception:" />
+        </stdio>
+    </xml>
+
+    <xml name="requirements">
+        <requirements>
+            <requirement type="binary">@BINARY@</requirement>
+        </requirements>
+        <version_command interpreter="python">@BINARY@ --version</version_command>
+    </xml>
+
+</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ffp_phylogeny.py	Mon Feb 23 18:25:25 2015 -0500
@@ -0,0 +1,353 @@
+#!/usr/bin/python
+import optparse
+import time
+import os
+import tempfile
+import sys
+import shlex, subprocess
+from string import maketrans
+
+VERSION_NUMBER = "0.1.00"
+
+class MyParser(optparse.OptionParser):
+	"""
+	 OptionParser subclass that returns its epilog text exactly as written, so line breaks in the epilog are kept.
+	 From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
+	"""
+	def format_epilog(self, formatter):
+		# Hand back the raw epilog; the formatter argument is not used.
+		return self.epilog
+
+
+def stop_err( msg ):
+    sys.stderr.write("%s\n" % msg)
+    sys.exit(1)
+
+def getTaxonomyNames(type, multiple, abbreviate, filepaths, filenames):
+	"""
+	Returns a taxonomic list of names corresponding to each file being analyzed by ffp.
+	This may also include names for each fasta sequence found within a file if the
+	"-m" multiple option is provided. 	Default is to use the file names rather than fasta id's inside the files.
+	NOTE: THIS DOES NOT (MUST NOT) REORDER NAMES IN NAME ARRAY.
+	EACH NAME ENTRY IS TRIMMED AND MADE UNIQUE
+
+	@param type string ['text','amino','nucleotide']
+	@param multiple boolean Flag indicates to look within files for labels
+	@param abbreviate boolean Flag indicates to shorten labels
+	@filenames array original input file names as user selected them
+	@filepaths array resulting galaxy dataset file .dat paths
+
+	"""
+	# Take off prefix/suffix whitespace/comma :
+	taxonomy = filenames.strip().strip(',').split(',')
+	translations = maketrans(' .-	','____')
+	names=[]
+	ptr = 0
+
+	for file in filepaths:
+		# First, convert space, period to underscore in file names.	  ffprwn IS VERY SENSITIVE ABOUT THIS.
+		# Also trim labels to 50 characters.  Turns out ffpjsd is kneecapping a taxonomy label to 10 characters if it is greater than 50 chars.
+		taxonomyitem = taxonomy[ptr].strip().translate(translations)[:50]
+		# print taxonomyitem
+		if not type in 'text' and multiple:
+			#Must read each fasta file, looking for all lines beginning ">"
+			with open(file) as fastafile:
+				lineptr = 0
+				for line in fastafile:
+					if line[0] == '>':
+						name = line[1:].split(None,1)[0].strip()[:50]
+						# Odd case where no fasta description found
+						if name == '': name = taxonomyitem + '.' + str(lineptr)
+						names.append(name)
+						lineptr += 1
+		else:
+			names.append(taxonomyitem)
+
+		ptr += 1
+
+	if abbreviate:
+		names = trimCommonPrefixes(names)
+		names = trimCommonPrefixes(names, True) # reverse = Suffixes.
+
+	return names
+
+def trimCommonPrefixes(names, reverse=False):
+	"""
+	Shortens each label by removing the leading part it shares with a neighbour in sorted order.
+	Works through a sorted copy of names, comparing each entry with the one after it, and
+	writes every shortened label back over the first matching entry of names.
+	When reverse is False the list passed in is modified in place; when reverse is True
+	names is first rebound to a new list of reversed strings.
+	NOTE(review): names.index() on the result of map() relies on map() returning a list (Python 2).
+
+	@param names array of textual labels (file names or fasta taxonomy ids)
+	@param reverse boolean whether to reverse array strings before doing prefix trimming (so suffixes get trimmed).
+	@return array of trimmed labels
+	"""
+	# Characters treated as part of a label word; the removed prefix is never allowed to end on one of these.
+	wordybits = '|.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+
+	if reverse:
+		names = map(lambda name: name[::-1], names) #reverses characters in names
+
+	sortednames = sorted(names)
+	ptr = 0
+	sortedlen = len(sortednames)
+	oldprefixlen=0
+	prefixlen=0
+	for name in sortednames:
+		ptr += 1 # ptr now indexes the entry that follows name in sortednames
+
+		#If we're not at the very last item, reevaluate prefixlen
+		if ptr < sortedlen:
+
+			# Skip first item in any duplicate pair.  Leave duplicate name in full.
+			if name == sortednames[ptr]:
+				if reverse:
+					continue
+				else:
+					# Forward pass only: prefix the first occurrence in names with 'DupLabel-'.
+					names[names.index(name)] = 'DupLabel-' + name
+					continue
+
+			# See http://stackoverflow.com/questions/9114402/regexp-finding-longest-common-prefix-of-two-strings
+			# Length of the leading run of characters that name shares with the next sorted name.
+			prefixlen = len( name[:([x[0]==x[1] for x in zip(name, sortednames[ptr])]+[0]).index(0)] )
+
+		# oldprefixlen is the prefixlen left by the previously processed (non-skipped) name;
+		# cut at whichever of the two lengths is larger.
+		if prefixlen <= oldprefixlen:
+			newprefix = name[:oldprefixlen]
+		else:
+			newprefix = name[:prefixlen]
+		# Expands label to include any preceding characters that were probably part of it.
+		newprefix = newprefix.rstrip(wordybits)
+		newname = name[len(newprefix):]
+		# Some tree visualizers don't show numeric labels?!?!
+		# (digits only, once at most one '.' has been removed)
+		if not reverse and newname.replace('.','',1).isdigit():
+			newname = 'id_' + newname
+		names[names.index(name)] = newname #extract name after prefix part; has nl in it
+		oldprefixlen = prefixlen
+
+	if reverse:
+		names = map(lambda name: name[::-1], names) #now back to original direction
+
+	return names
+
+def getTaxonomyFile(names):
+	"""
+	FFP's ffpjsd -p [taxon file of labels] option creates a phylip tree with
+	given taxon labels
+
+	@param names array of datafile names or fasta sequence ids
+	"""
+
+	try:
+		temp = tempfile.NamedTemporaryFile(mode='w+t',delete=False)
+		taxonomyTempFile = temp.name
+		temp.writelines(name + '\n' for name in names)
+
+	except:
+		stop_err("Galaxy configuration error for ffp_phylogeny tool. Unable to write taxonomy file " + taxonomyTempFile)
+
+	finally:
+		temp.close()
+
+	return taxonomyTempFile
+
+
+def check_output(command):
+	"""
+	Execute a command line containing a series of pipes; and handle error cases by exiting at first error case.  This is a substitute for Python 2.7 subprocess.check_output() - allowing piped commands without shell=True call .  Based on Python subprocess docs 17.1.4.2
+
+	ISSUE: warnings on stderr are given with no exit code 0:
+		ffpry: Warning: No keys of length 6 found.
+		ffpcol: (null): Not a key valued FFP.
+
+	Can't use communicate() because this closes processes' stdout
+	file handle even without errors because of read to end of stdout:
+	(stdoutdata, stderrdata) = processes[ptr-1].communicate()
+
+	"""
+	commands = command.split("|")
+	processes = []
+	ptr = 0
+	for command_line in commands:
+		print command_line.strip()
+		args = shlex.split(command_line.strip())
+		if ptr == 0:
+			proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+			processes.append(proc)
+		else:
+			# It seems the act of reading standard error output is enough to trigger
+			# error code signal for that process, i.e. so that retcode returns a code.
+			stderrdata = processes[ptr-1].stderr.read()
+			retcode = processes[ptr-1].poll()
+			if retcode or len(stderrdata) > 0:
+				stop_err(stderrdata)
+
+			newProcess = subprocess.Popen(args, stdin=processes[ptr-1].stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+			processes.append(newProcess)
+			processes[ptr-1].stdout.close() # Allow prev. process to receive a SIGPIPE if current process exits.
+
+		ptr += 1
+
+	(stdoutdata, stderrdata) = processes[ptr-1].communicate()
+	retcode = processes[ptr-1].poll()
+	if retcode or len(stderrdata) > 0:
+		stop_err(stderrdata)
+
+	return stdoutdata
+
+
+class ReportEngine(object):
+
+	def __init__(self): pass
+
+	def __main__(self):
+
+
+		## *************************** Parse Command Line *****************************
+		parser = MyParser(
+			description = 'FFP (Feature frequency profile) is an alignment free comparison tool',
+			usage = 'python ffp_phylogeny.py [input_files] [output file] [options]',
+			epilog="""Details:
+
+			FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.
+
+		""")
+
+		parser.set_defaults(row_limit=0)
+		# Don't use "-h" , it is reserved for --help!
+
+		parser.add_option('-t', '--type', type='choice', dest='type', default='text',
+			choices=['amino','nucleotide','text'],
+			help='Choice of Amino acid, nucleotide or plain text sequences to find features in')
+
+		parser.add_option('-l', '--length', type='int', dest='length', default=6,
+			help='Features (any string of valid characters found in data) of this length will be counted.  Synonyms: l-mer, k-mer, n-gram, k-tuple')
+
+		#parser.add_option('-n', '--normalize', dest='normalize', default=True, action='store_true',
+		#	help='Normalize counts into relative frequency')
+
+		parser.add_option('-m', '--multiple', dest='multiple', default=False, action='store_true',
+			help='By default all sequences in a fasta file be treated as 1 sequence to profile.  This option enables each sequence found in a fasta file to have its own profile.')
+
+		parser.add_option('-M', '--metric', type='string', dest='metric',
+			help='Various metrics to measure count distances by.')
+
+		parser.add_option('-x', '--taxonomy', type='string', dest='taxonomy',
+			help='Taxanomic label for each profile/sequence.')
+
+		parser.add_option('-d', '--disable', dest='disable', default=False, action='store_true',
+			help='By default amino acid and nucleotide characters are grouped by functional category (protein or purine/pyrimidine group) before being counted.  Disable this to treat individual characters as distinct.')
+
+		parser.add_option('-a', '--abbreviate', dest='abbreviate', default=False, action='store_true',
+			help='Shorten tree taxonomy labels as much as possible.')
+
+		parser.add_option('-s', '--similarity', dest='similarity', default=False, action='store_true',
+			help='Enables pearson correlation coefficient matrix and any of the binary distance measures to be turned into similarity matrixes.')
+
+		parser.add_option('-f', '--filter', type='choice', dest='filter', default='none',
+			choices=['none','f','n','e','freq','norm','evd'],
+			help='Choice of [f=raw frequency|n=normal|e=extreme value (Gumbel)] distribution: Features are trimmed from the data based on lower/upper cutoff points according to the given distribution.')
+
+		parser.add_option('-L', '--lower', type='float', dest='lower',
+			help='Filter lower bound is a 0.00 percentages')
+
+		parser.add_option('-U', '--upper', type='float', dest='upper',
+			help='Filter upper bound is a 0.00 percentages')
+
+		parser.add_option('-o', '--output', type='string', dest='output',
+			help='Path of output file to create')
+
+		parser.add_option('-T', '--tree', dest='tree', default=False, action='store_true', help='Generate Phylogenetic Tree output file')
+
+		parser.add_option('-v', '--version', dest='version', default=False, action='store_true', help='Version number')
+
+		# Could also have -D INT decimal precision included for ffprwn .
+
+		options, args = parser.parse_args()
+
+		if options.version:
+			print VERSION_NUMBER
+			return
+
+		import time
+		time_start = time.time()
+
+		try:
+			in_files = args[:]
+
+		except:
+			stop_err("Expecting at least 1 input data file.")
+
+
+		#ffptxt / ffpaa / ffpry
+		if options.type in 'text':
+			command = 'ffptxt'
+
+		else:
+			if options.type == 'amino':
+				command = 'ffpaa'
+			else:
+				command = 'ffpry'
+
+			if options.disable:
+				command += ' -d'
+
+			if options.multiple:
+				command += ' -m'
+
+		command += ' -l ' + str(options.length)
+
+		if len(in_files): #Note: app isn't really suited to stdio
+			command += ' "' + '" "'.join(in_files) + '"'
+
+		#ffpcol / ffpfilt
+		if options.filter != 'none':
+			command += ' | ffpfilt'
+			if options.filter != 'count':
+				command += ' -' + options.filter
+			if options.lower > 0:
+				command += ' --lower ' + str(options.lower)
+			if options.upper > 0:
+				command += ' --upper ' + str(options.upper)
+
+		else:
+			command += ' | ffpcol'
+
+		if options.type in 'text':
+			command += ' -t'
+
+		else:
+
+			if options.type == 'amino':
+				command += ' -a'
+
+			if options.disable:
+				command += ' -d'
+
+		#if options.normalize:
+		command += ' | ffprwn'
+
+		#Now create a taxonomy label file, ensuring a name exists for each profile.
+		taxonomyNames = getTaxonomyNames(options.type, options.multiple, options.abbreviate, in_files, options.taxonomy)
+		taxonomyTempFile = getTaxonomyFile(taxonomyNames)
+		# -p = Include phylip format 'infile' of the taxon names to use.  Very simple, just a list of fasta identifier names.
+		command += ' | ffpjsd -p ' + taxonomyTempFile
+
+		if options.metric and len(options.metric) >0 :
+			command += ' --' + options.metric
+			if options.similarity:
+				command += ' -s'
+
+		# Generate Newick (.nhx) formatted tree if we have at least 3 taxonomy items:
+		if options.tree:
+			if len(taxonomyNames) > 2:
+				command += ' | ffptree -q'
+			else:
+				stop_err("For a phylogenetic tree display, one must have at least 3 ffp profiles.")
+
+		result = check_output(command)
+		with open(options.output,'w') as fw:
+			fw.writelines(result)
+		os.remove(taxonomyTempFile)
+
+if __name__ == '__main__':
+
+	time_start = time.time()
+
+	reportEngine = ReportEngine()
+	reportEngine.__main__()
+
+	print('Execution time (seconds): ' + str(int(time.time()-time_start)))
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ffp_phylogeny.xml	Mon Feb 23 18:25:25 2015 -0500
@@ -0,0 +1,284 @@
+<tool id="ffp_phylogeny" name="Feature Frequency Profile Phylogeny" version="0.1.00">
+	<description>An alignment free comparison tool for phylogenetic analysis and text comparison</description>
+	<requirements>
+		<requirement type="package" version="0.3.19_d4382db015acec0e5cc43d6c1ac80ae12cb7e6b3">ffp-phylogeny</requirement>
+	</requirements>
+
+	<macros>
+		<token name="@BINARY@">./ffp_phylogeny.py</token>
+		<import>ffp_macros.xml</import>
+	</macros>
+	<expand macro="requirements" />
+   <command interpreter="python"><![CDATA[
+		ffp_phylogeny.py
+		#for $i in $sequence.filesin
+			"$i" ## full file paths
+		#end for
+		-x "
+		#for $i in $sequence.filesin
+			$i.name, ## original file names
+		#end for
+		"
+		-t "$(sequence.file_type.split('-')[0])"
+		-l "$length"
+		-o "$info"
+		##if $normalize:
+		##	-n
+		##end if
+		#if $sequence.file_type != 'text':
+			#if $sequence.file_type.find('multi') > 0:
+				-m
+			#end if
+			#if $sequence.grouping:
+				-d
+			#end if
+			#if $metric:
+				-M "$metric"
+			#end if
+			#if $similarity:
+				-s
+			#end if
+			#if $abbreviate:
+				-a
+			#end if
+		#end if
+		## The phylo_type select offers the values 'all' and 'filt'; the filter
+		## arguments are passed only for 'filt'.
+		#if $phylogeny.phylo_type == 'filt':
+			-f "$phylogeny.filt.filter_type"
+			-L "$phylogeny.filt.lower"
+			-U "$phylogeny.filt.upper"
+		#end if
+		#if $tree:
+			-T
+		#end if
+
+		##ffpjsd -n FLOAT , --normval=FLOAT
+		## For option -e, --euclid, change the n-norm distance (Default is n=2) to any other value where n > 1
+
+    ]]></command>
+    <expand macro="stdio" />
+    <inputs>
+
+		<!-- Either amino acid or nucleotide input -->
+		<!-- Ideally we could determine from file content or suffix what type it is -->
+
+		<param name="length" type="integer" min="1" max="25" label="l-mer length" value="6" help="String of valid characters of this length will be counted.  Synonyms: feature, k-mer, n-gram, k-tuple" size="2"/>
+		<!--
+		<param name="normalize" label="Normalize counts into relative frequency" type="boolean" checked="true" help="" />
+		-->
+		<conditional name="sequence">
+			<param type="select" name="file_type" label="File type" help="Note: For phylogeny display, at least three profiles are required, as files or fasta sequences within a file.">
+				<option value="amino">Amino Acids, one sequence per file</option>
+				<option value="amino-multi">Amino Acids, multiple fasta sequences per file</option>
+				<option value="nucleotide">Nucleic acids, one sequence per file</option>
+				<option value="nucleotide-multi">Nucleic acids, multiple fasta sequences per file</option>
+				<option value="text">Text, single file</option>
+			</param>
+
+			<when value="amino"><!-- ffpaa -->
+				<param name="filesin" type="data" label="Select input file(s)" format="fasta" multiple="true" />
+				<param name="grouping" label="Enable amino acid grouping" type="boolean" checked="true" help="Counts amino acids in groups rather than individually (usually advantageous, see below)." />
+			</when>
+
+			<when value="amino-multi">
+				<param name="filesin" type="data" label="Select input file(s)" format="fasta" multiple="true" />
+				<param name="grouping" label="Enable amino acid grouping" type="boolean" checked="true" help="Counts amino acids in groups rather than individually (usually advantageous, see below)." />
+			</when>
+
+			<when value="nucleotide"><!-- ffpry -->
+				<param name="filesin" type="data" label="Select input file(s)" format="fasta" multiple="true" />
+				<param name="grouping" label="Enable purine / pyrimidine grouping" type="boolean" checked="true" help="Counts each nucleotide as a purine(R) or pyrimidine(Y) rather than individually (usually advantageous)." />
+			</when>
+
+			<when value="nucleotide-multi">
+				<param name="filesin" type="data" label="Select input file(s)" format="fasta" multiple="true" />
+				<param name="grouping" label="Enable purine / pyrimidine grouping" type="boolean" checked="true" help="Counts each nucleotide as a purine(R) or pyrimidine(Y) rather than individually (usually advantageous)." />
+			</when>
+
+			<when value="text"><!-- ffptxt -->
+				<param name="filesin" type="data" multiple="true"/>
+			</when>
+
+
+       	</conditional>
+
+		<conditional name="phylogeny">
+			<param type="select" name="phylo_type" label="Feature filtering">
+				<option value="all">Include all features (Phenetic phylogeny)</option>
+				<option value="filt">Include only filtered features (Core/evolutionary phylogeny) </option>
+			</param>
+			<when value="all"></when>
+			<when value="filt">
+				<conditional name="filt">
+
+					<param type="select" name="filter_type" label="Filter type" help="Features are included in profiles if at least 1 profile has lower count/percent, and no profile has more than upper count/percent">
+						<option value="count">lower / upper count limit</option>
+						<option value="f">raw frequencies</option>
+						<option value="n">normal distribution</option>
+						<option value="e">extreme value (Gumbel) distribution</option>
+					</param>
+					<when value="count">
+						<param name="lower" type="integer" label="lower count (one profile needs at least this)" value="0" min="0" />
+						<param name="upper" type="integer" label="upper count (no profile can have more than this)" value="0" min="0" />
+					</when>
+					<when value="f">
+						<param name="lower" type="float" label="lower &#37;" value="0.05" min="0" max=".5" />
+						<param name="upper" type="float" label="upper &#37;" value="0.95" min=".5" max="1" />
+					</when>
+					<when value="n">
+						<param name="lower" type="float" label="lower &#37;" value="0.05" min="0" max=".5" />
+						<param name="upper" type="float" label="upper &#37;" value="0.95" min=".5" max="1" />
+					</when>
+					<when value="e">
+						<param name="lower" type="float" label="lower &#37;" value="0.05" min="0" max=".5" />
+						<param name="upper" type="float" label="upper &#37;" value="0.95" min=".5" max="1" />
+					</when>
+
+				</conditional>
+			</when>
+
+		</conditional>
+
+		<param type="select" name="metric" label="Continuous Distance Measure" help="See ffpjsd documentation for details.">
+			<option value="" selected="true">Jensen Shannon divergence (default)</option>
+			<option value="euclid">Euclidean</option>
+			<option value="euclid2">Euclidean squared</option>
+			<option value="cosine">Cosine</option>
+			<option value="manhattan">Manhattan</option>
+			<option value="pearson">pearson correlation coefficient*</option>
+			<option value="chebyshev">Chebyshev</option>
+			<option value="canberra">Canberra</option>
+			<option value="hamming">Hamming</option>
+			<option value="evol">Evolutionary Distance used in E.coli Publications</option>
+
+<!--
+
+With these options the input FFPs are treated as binary data. When two FFPs (i and j) are compared each
+distance measure uses a cross tabulation for pairwise feature comparison with sums A, B, C and D. A is
+the number of features which are present in both vectors while D is the number of features that are absent in
+both vectors. B means the feature is present in i and absent in j. C means the feature is absent in i but
+present in j. N is the sum of A+B+C+D. All of the binary distance options can be used together with the -s
+option to print a similarity matrix. The binary distances do not need to be normalized with ffprwn.
+
+			<option value="">BINARY DISTANCE MEASURES</option>
+
+-->
+
+			<option value="matching">matching*</option>
+			<option value="jaccard">Jaccard*</option>
+			<option value="tanimoto">Rogers-Tanimoto*</option>
+			<option value="dice">Dice*</option>
+			<option value="antidice">anti-Dice*</option>
+			<option value="sneath">Sneath-Sokal*</option>
+			<option value="hamman">Hamman*</option>
+			<option value="phi">Pearson Phi*</option>
+			<option value="anderberg">Anderberg*</option>
+			<option value="gower">Gower*</option>
+			<option value="russel">Russel-Rao*</option>
+			<option value="yule">Yule*</option>
+			<option value="ochiai">Ochiai*</option>
+			<option value="kulczynski">Kulczynski*</option>
+
+		</param>
+
+		<param type="boolean" name="similarity" label="*Similarity Matrix" help="Print a similarity matrix rather than a distance matrix for items marked by asterisk(*). This option affects the output of distance metrics which have a value normalized from 0 to 1 or -1 to 1."/>
+
+		<param type="boolean" name="abbreviate" label="Short labels" help="Shorten tree taxonomy labels as much as possible."/>
+
+		<param type="boolean" name="tree" checked="true" label="Generate Tree Phylogeny" truevalue="1" falsevalue="0" />
+
+    </inputs>
+    <outputs>
+        <data name="info" format="nhx" label="Feature Frequency Profile">
+			<change_format>
+				<when input="tree" value="0" format="tabular"/>
+			</change_format>
+			<!-- doesn't work: filter>tree == "1"</filter -->
+		</data>
+    </outputs>
+
+	<tests>
+		<test>
+			<param name="length" value="1"/>
+			<param name="tree" value="0"/>
+			<param name="grouping" value="true"/>
+			<param name="file_type" value="nucleotide"/>
+			<param name="filesin" value="genome1,genome2"/>
+			<output name="info" file="test_length_1_output.tabular"/>
+		</test>
+		<test>
+			<param name="length" value="2"/>
+			<param name="tree" value="0"/>
+			<param name="grouping" value="true"/>
+			<param name="file_type" value="nucleotide"/>
+			<param name="filesin" value="genome1,genome2"/>
+			<output name="info" file="test_length_2_output.tabular"/>
+		</test>
+	</tests>
+
+    <help><![CDATA[
+
+.. class:: infomark
+
+
+**What it does**
+
+FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.
+
+This galaxy tool prepares a mini-pipeline consisting of **[ffpry | ffpaa | ffptxt] &gt; [ ffpfilt | ffpcol &gt; ffprwn] &gt; ffpjsd &gt; ffptree**  .  The last step is optional - by deselecting the "Generate Tree Phylogeny" checkbox, the tool will output a distance matrix rather than a Newick (.nhx) formatted tree file.
+
+Each sequence or text file has a profile containing tallies of each feature found.  A feature is a string of valid characters of given length.
+
+For nucleotide data, by default each character (ATGC) is grouped as either purine(R) or pyrimidine(Y) before being counted.
+
+For amino acid data, by default each character is grouped into one of the following:
+(ST),(DE),(KQR),(IVLM),(FWY),C,G,A,N,H,P. Each group is represented by the first character in its series.
+
+One other key concept is that a given feature, e.g. "TAA" is counted in forward
+AND reverse directions, mirroring the idea that a feature&apos;s orientation is not
+so important to distinguish when it comes to alignment-free comparison.
+The counts for "TAA" and "AAT" are merged.
+
+The labeling of the resulting counted feature items is perhaps the trickiest
+concept to master.  Due to computational efficiency measures taken by the
+developers, a feature that we see on paper as "TAC" may be stored and labeled
+internally as "GTA", its reverse complement.  One must look for the alternative
+if one does not find the original.
+
+Also note that in amino acid sequences the stop codon "*" (or any other character
+that is not in the Amino acid alphabet) causes that character frame not to be
+counted.  Also, character frames never span across fasta entries.
+
+A few tutorials:
+ * http://sourceforge.net/projects/ffp-phylogeny/files/Documentation/tutorial.pdf
+ * https://github.com/apetkau/microbial-informatics-2014/tree/master/labs/ffp-phylogeny
+
+-------
+
+.. class:: warningmark
+
+**Note**
+
+Taxonomy label details: If each file contains one profile, the file's name is used to label the profile.
+If each file contains fasta sequences to profile individually, their fasta identifiers will be used to label them.
+The "short labels" option will find the shortest label that uniquely identifies each profile.
+Either way, there are some quirks: ffpjsd clips labels to 10 characters if they are greater than 50 characters, so all labels are trimmed to 50 characters first.
+Also "id_" is prefixed to any numeric label since some tree visualizers won't show purely numeric labels.
+In the accidental case where a Fasta sequence label is a duplicate of a previous one it will be prefixed by "DupLabel-".
+
+The command line ffpjsd can hang if one provides an l-mer length greater than the length of file content.
+One must identify its process id (">ps aux | grep ffpjsd") and kill it (">kill [process id]").
+
+-------
+
+**References**
+
+The original ffp-phylogeny code is at http://ffp-phylogeny.sourceforge.net/ .
+This tool uses Aaron Petkau's modified version: https://github.com/apetkau/ffp-3.19-custom .
+
+The development of ffp-phylogeny should be attributed to:
+
+Sims GE, Jun S-R, Wu GA, Kim S-H. Alignment-free genome comparison with feature frequency profiles (FFP) and optimal resolutions. Proceedings of the National Academy of Sciences of the United States of America 2009;106(8):2677-2682. doi:10.1073/pnas.0813249106.
+
+    ]]></help>
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tarballit.sh	Mon Feb 23 18:25:25 2015 -0500
@@ -0,0 +1,2 @@
+#!/bin/bash
+# Package the tool, leaving out editor backups, test output and existing archives.
+# The --exclude options are placed before the file list: recent GNU tar releases
+# treat them as position-sensitive and apply them only to operands that follow.
+tar -zcvf ffp_phylogeny.tar.gz --exclude "*~" --exclude "tool_test_output*" --exclude "*gz" *
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome1	Mon Feb 23 18:25:25 2015 -0500
@@ -0,0 +1,2 @@
+>genome1
+AATT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/genome2	Mon Feb 23 18:25:25 2015 -0500
@@ -0,0 +1,2 @@
+>genome2
+AAGG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_length_1_output.tabular	Mon Feb 23 18:25:25 2015 -0500
@@ -0,0 +1,3 @@
+2
+genome1                                           0.00e+00 1.89e-01
+genome2                                           1.89e-01 0.00e+00
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_length_2_output.tabular	Mon Feb 23 18:25:25 2015 -0500
@@ -0,0 +1,3 @@
+2
+genome1                                           0.00e+00 4.58e-01
+genome2                                           4.58e-01 0.00e+00
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Feb 23 18:25:25 2015 -0500
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<tool_dependency>
+	<package name="ffp-phylogeny" version="0.3.19_d4382db015acec0e5cc43d6c1ac80ae12cb7e6b3">
+		<install version="1.0">
+			<actions>
+				<action type="shell_command">git clone https://github.com/apetkau/ffp-3.19-custom.git ffp-phylogeny</action>
+				<action type="shell_command">git reset --hard d4382db015acec0e5cc43d6c1ac80ae12cb7e6b3</action>
+				<action type="shell_command">./configure --disable-gui --prefix=$INSTALL_DIR</action>
+				<action type="make_install"></action>
+				<!-- action type="move_directory_files">
+					<source_directory>bin</source_directory>
+					<destination_directory>$INSTALL_DIR/bin</destination_directory>
+				</action -->
+				<action type="set_environment">
+					<environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+				</action>
+			</actions>
+		</install>
+		<readme>
+			apetkau/ffp-3.19-custom is a customized version of http://sourceforge.net/projects/ffp-phylogeny/
+		</readme>
+	</package>
+</tool_dependency>
+