comparison ffp_phylogeny.py @ 0:eb6e5e78a066 draft

Uploaded
author damion
date Mon, 23 Feb 2015 18:25:25 -0500
parents
children d1c88b118a3f
comparison
equal deleted inserted replaced
-1:000000000000 0:eb6e5e78a066
1 #!/usr/bin/python
2 import optparse
3 import time
4 import os
5 import tempfile
6 import sys
7 import shlex, subprocess
8 from string import maketrans
9
10 VERSION_NUMBER = "0.1.00"
11
12 class MyParser(optparse.OptionParser):
13 """
14 From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
15 Provides a better class for displaying formatted help info in epilog() portion of optParse; allows for carriage returns.
16 """
17 def format_epilog(self, formatter):
18 return self.epilog
19
20
21 def stop_err( msg ):
22 sys.stderr.write("%s\n" % msg)
23 sys.exit(1)
24
25 def getTaxonomyNames(type, multiple, abbreviate, filepaths, filenames):
26 """
27 Returns a taxonomic list of names corresponding to each file being analyzed by ffp.
28 This may also include names for each fasta sequence found within a file if the
29 "-m" multiple option is provided. Default is to use the file names rather than fasta id's inside the files.
30 NOTE: THIS DOES NOT (MUST NOT) REORDER NAMES IN NAME ARRAY.
31 EACH NAME ENTRY IS TRIMMED AND MADE UNIQUE
32
33 @param type string ['text','amino','nucleotide']
34 @param multiple boolean Flag indicates to look within files for labels
35 @param abbreviate boolean Flag indicates to shorten labels
36 @filenames array original input file names as user selected them
37 @filepaths array resulting galaxy dataset file .dat paths
38
39 """
40 # Take off prefix/suffix whitespace/comma :
41 taxonomy = filenames.strip().strip(',').split(',')
42 translations = maketrans(' .- ','____')
43 names=[]
44 ptr = 0
45
46 for file in filepaths:
47 # First, convert space, period to underscore in file names. ffprwn IS VERY SENSITIVE ABOUT THIS.
48 # Also trim labels to 50 characters. Turns out ffpjsd is kneecapping a taxonomy label to 10 characters if it is greater than 50 chars.
49 taxonomyitem = taxonomy[ptr].strip().translate(translations)[:50]
50 # print taxonomyitem
51 if not type in 'text' and multiple:
52 #Must read each fasta file, looking for all lines beginning ">"
53 with open(file) as fastafile:
54 lineptr = 0
55 for line in fastafile:
56 if line[0] == '>':
57 name = line[1:].split(None,1)[0].strip()[:50]
58 # Odd case where no fasta description found
59 if name == '': name = taxonomyitem + '.' + str(lineptr)
60 names.append(name)
61 lineptr += 1
62 else:
63 names.append(taxonomyitem)
64
65 ptr += 1
66
67 if abbreviate:
68 names = trimCommonPrefixes(names)
69 names = trimCommonPrefixes(names, True) # reverse = Suffixes.
70
71 return names
72
73 def trimCommonPrefixes(names, reverse=False):
74 """
75 Examines sorted array of names. Trims off prefix of each subsequent pair.
76
77 @param names array of textual labels (file names or fasta taxonomy ids)
78 @param reverse boolean whether to reverse array strings before doing prefix trimming.
79 """
80 wordybits = '|.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
81
82 if reverse:
83 names = map(lambda name: name[::-1], names) #reverses characters in names
84
85 sortednames = sorted(names)
86 ptr = 0
87 sortedlen = len(sortednames)
88 oldprefixlen=0
89 prefixlen=0
90 for name in sortednames:
91 ptr += 1
92
93 #If we're not at the very last item, reevaluate prefixlen
94 if ptr < sortedlen:
95
96 # Skip first item in an any duplicate pair. Leave duplicate name in full.
97 if name == sortednames[ptr]:
98 if reverse:
99 continue
100 else:
101 names[names.index(name)] = 'DupLabel-' + name
102 continue
103
104 # See http://stackoverflow.com/questions/9114402/regexp-finding-longest-common-prefix-of-two-strings
105 prefixlen = len( name[:([x[0]==x[1] for x in zip(name, sortednames[ptr])]+[0]).index(0)] )
106
107 if prefixlen <= oldprefixlen:
108 newprefix = name[:oldprefixlen]
109 else:
110 newprefix = name[:prefixlen]
111 # Expands label to include any preceeding characters that were probably part of it.
112 newprefix = newprefix.rstrip(wordybits)
113 newname = name[len(newprefix):]
114 # Some tree visualizers don't show numeric labels?!?!
115 if not reverse and newname.replace('.','',1).isdigit():
116 newname = 'id_' + newname
117 names[names.index(name)] = newname #extract name after prefix part; has nl in it
118 oldprefixlen = prefixlen
119
120 if reverse:
121 names = map(lambda name: name[::-1], names) #now back to original direction
122
123 return names
124
125 def getTaxonomyFile(names):
126 """
127 FFP's ffpjsd -p [taxon file of labels] option creates a phylip tree with
128 given taxon labels
129
130 @param names array of datafile names or fasta sequence ids
131 """
132
133 try:
134 temp = tempfile.NamedTemporaryFile(mode='w+t',delete=False)
135 taxonomyTempFile = temp.name
136 temp.writelines(name + '\n' for name in names)
137
138 except:
139 stop_err("Galaxy configuration error for ffp_phylogeny tool. Unable to write taxonomy file " + taxonomyTempFile)
140
141 finally:
142 temp.close()
143
144 return taxonomyTempFile
145
146
147 def check_output(command):
148 """
149 Execute a command line containing a series of pipes; and handle error cases by exiting at first error case. This is a substitute for Python 2.7 subprocess.check_output() - allowing piped commands without shell=True call . Based on Python subprocess docs 17.1.4.2
150
151 ISSUE: warnings on stderr are given with no exit code 0:
152 ffpry: Warning: No keys of length 6 found.
153 ffpcol: (null): Not a key valued FFP.
154
155 Can't use communicate() because this closes processes' stdout
156 file handle even without errors because of read to end of stdout:
157 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
158
159 """
160 commands = command.split("|")
161 processes = []
162 ptr = 0
163 for command_line in commands:
164 print command_line.strip()
165 args = shlex.split(command_line.strip())
166 if ptr == 0:
167 proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
168 processes.append(proc)
169 else:
170 # It seems the act of reading standard error output is enough to trigger
171 # error code signal for that process, i.e. so that retcode returns a code.
172 stderrdata = processes[ptr-1].stderr.read()
173 retcode = processes[ptr-1].poll()
174 if retcode or len(stderrdata) > 0:
175 stop_err(stderrdata)
176
177 newProcess = subprocess.Popen(args, stdin=processes[ptr-1].stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
178 processes.append(newProcess)
179 processes[ptr-1].stdout.close() # Allow prev. process to receive a SIGPIPE if current process exits.
180
181 ptr += 1
182
183 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
184 retcode = processes[ptr-1].poll()
185 if retcode or len(stderrdata) > 0:
186 stop_err(stderrdata)
187
188 return stdoutdata
189
190
191 class ReportEngine(object):
192
193 def __init__(self): pass
194
195 def __main__(self):
196
197
198 ## *************************** Parse Command Line *****************************
199 parser = MyParser(
200 description = 'FFP (Feature frequency profile) is an alignment free comparison tool',
201 usage = 'python ffp_phylogeny.py [input_files] [output file] [options]',
202 epilog="""Details:
203
204 FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.
205
206 """)
207
208 parser.set_defaults(row_limit=0)
209 # Don't use "-h" , it is reserved for --help!
210
211 parser.add_option('-t', '--type', type='choice', dest='type', default='text',
212 choices=['amino','nucleotide','text'],
213 help='Choice of Amino acid, nucleotide or plain text sequences to find features in')
214
215 parser.add_option('-l', '--length', type='int', dest='length', default=6,
216 help='Features (any string of valid characters found in data) of this length will be counted. Synonyms: l-mer, k-mer, n-gram, k-tuple')
217
218 #parser.add_option('-n', '--normalize', dest='normalize', default=True, action='store_true',
219 # help='Normalize counts into relative frequency')
220
221 parser.add_option('-m', '--multiple', dest='multiple', default=False, action='store_true',
222 help='By default all sequences in a fasta file be treated as 1 sequence to profile. This option enables each sequence found in a fasta file to have its own profile.')
223
224 parser.add_option('-M', '--metric', type='string', dest='metric',
225 help='Various metrics to measure count distances by.')
226
227 parser.add_option('-x', '--taxonomy', type='string', dest='taxonomy',
228 help='Taxanomic label for each profile/sequence.')
229
230 parser.add_option('-d', '--disable', dest='disable', default=False, action='store_true',
231 help='By default amino acid and nucleotide characters are grouped by functional category (protein or purine/pyrimidine group) before being counted. Disable this to treat individual characters as distinct.')
232
233 parser.add_option('-a', '--abbreviate', dest='abbreviate', default=False, action='store_true',
234 help='Shorten tree taxonomy labels as much as possible.')
235
236 parser.add_option('-s', '--similarity', dest='similarity', default=False, action='store_true',
237 help='Enables pearson correlation coefficient matrix and any of the binary distance measures to be turned into similarity matrixes.')
238
239 parser.add_option('-f', '--filter', type='choice', dest='filter', default='none',
240 choices=['none','f','n','e','freq','norm','evd'],
241 help='Choice of [f=raw frequency|n=normal|e=extreme value (Gumbel)] distribution: Features are trimmed from the data based on lower/upper cutoff points according to the given distribution.')
242
243 parser.add_option('-L', '--lower', type='float', dest='lower',
244 help='Filter lower bound is a 0.00 percentages')
245
246 parser.add_option('-U', '--upper', type='float', dest='upper',
247 help='Filter upper bound is a 0.00 percentages')
248
249 parser.add_option('-o', '--output', type='string', dest='output',
250 help='Path of output file to create')
251
252 parser.add_option('-T', '--tree', dest='tree', default=False, action='store_true', help='Generate Phylogenetic Tree output file')
253
254 parser.add_option('-v', '--version', dest='version', default=False, action='store_true', help='Version number')
255
256 # Could also have -D INT decimal precision included for ffprwn .
257
258 options, args = parser.parse_args()
259
260 if options.version:
261 print VERSION_NUMBER
262 return
263
264 import time
265 time_start = time.time()
266
267 try:
268 in_files = args[:]
269
270 except:
271 stop_err("Expecting at least 1 input data file.")
272
273
274 #ffptxt / ffpaa / ffpry
275 if options.type in 'text':
276 command = 'ffptxt'
277
278 else:
279 if options.type == 'amino':
280 command = 'ffpaa'
281 else:
282 command = 'ffpry'
283
284 if options.disable:
285 command += ' -d'
286
287 if options.multiple:
288 command += ' -m'
289
290 command += ' -l ' + str(options.length)
291
292 if len(in_files): #Note: app isn't really suited to stdio
293 command += ' "' + '" "'.join(in_files) + '"'
294
295 #ffpcol / ffpfilt
296 if options.filter != 'none':
297 command += ' | ffpfilt'
298 if options.filter != 'count':
299 command += ' -' + options.filter
300 if options.lower > 0:
301 command += ' --lower ' + str(options.lower)
302 if options.upper > 0:
303 command += ' --upper ' + str(options.upper)
304
305 else:
306 command += ' | ffpcol'
307
308 if options.type in 'text':
309 command += ' -t'
310
311 else:
312
313 if options.type == 'amino':
314 command += ' -a'
315
316 if options.disable:
317 command += ' -d'
318
319 #if options.normalize:
320 command += ' | ffprwn'
321
322 #Now create a taxonomy label file, ensuring a name exists for each profile.
323 taxonomyNames = getTaxonomyNames(options.type, options.multiple, options.abbreviate, in_files, options.taxonomy)
324 taxonomyTempFile = getTaxonomyFile(taxonomyNames)
325 # -p = Include phylip format 'infile' of the taxon names to use. Very simple, just a list of fasta identifier names.
326 command += ' | ffpjsd -p ' + taxonomyTempFile
327
328 if options.metric and len(options.metric) >0 :
329 command += ' --' + options.metric
330 if options.similarity:
331 command += ' -s'
332
333 # Generate Newick (.nhx) formatted tree if we have at least 3 taxonomy items:
334 if options.tree:
335 if len(taxonomyNames) > 2:
336 command += ' | ffptree -q'
337 else:
338 stop_err("For a phylogenetic tree display, one must have at least 3 ffp profiles.")
339
340 result = check_output(command)
341 with open(options.output,'w') as fw:
342 fw.writelines(result)
343 os.remove(taxonomyTempFile)
344
345 if __name__ == '__main__':
346
347 time_start = time.time()
348
349 reportEngine = ReportEngine()
350 reportEngine.__main__()
351
352 print('Execution time (seconds): ' + str(int(time.time()-time_start)))
353