0
|
1 #!/usr/bin/python
|
|
2 import optparse
|
1
|
3 import re
|
0
|
4 import time
|
|
5 import os
|
|
6 import tempfile
|
|
7 import sys
|
|
8 import shlex, subprocess
|
|
9 from string import maketrans
|
|
10
|
2
|
11 VERSION_NUMBER = "0.1.03"
|
0
|
12
|
|
13 class MyParser(optparse.OptionParser):
|
|
14 """
|
|
15 From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
|
|
16 Provides a better class for displaying formatted help info in epilog() portion of optParse; allows for carriage returns.
|
|
17 """
|
|
18 def format_epilog(self, formatter):
|
|
19 return self.epilog
|
|
20
|
|
21
|
|
22 def stop_err( msg ):
|
|
23 sys.stderr.write("%s\n" % msg)
|
|
24 sys.exit(1)
|
|
25
|
|
26 def getTaxonomyNames(type, multiple, abbreviate, filepaths, filenames):
|
|
27 """
|
|
28 Returns a taxonomic list of names corresponding to each file being analyzed by ffp.
|
|
29 This may also include names for each fasta sequence found within a file if the
|
|
30 "-m" multiple option is provided. Default is to use the file names rather than fasta id's inside the files.
|
|
31 NOTE: THIS DOES NOT (MUST NOT) REORDER NAMES IN NAME ARRAY.
|
|
32 EACH NAME ENTRY IS TRIMMED AND MADE UNIQUE
|
|
33
|
|
34 @param type string ['text','amino','nucleotide']
|
|
35 @param multiple boolean Flag indicates to look within files for labels
|
|
36 @param abbreviate boolean Flag indicates to shorten labels
|
|
37 @filenames array original input file names as user selected them
|
|
38 @filepaths array resulting galaxy dataset file .dat paths
|
|
39
|
|
40 """
|
|
41 # Take off prefix/suffix whitespace/comma :
|
|
42 taxonomy = filenames.strip().strip(',').split(',')
|
|
43 names=[]
|
|
44 ptr = 0
|
|
45
|
|
46 for file in filepaths:
|
2
|
47 # Trim labels to 50 characters max. ffpjsd kneecaps a taxonomy label to 10 characters if it is greater than 50 chars.
|
|
48 taxonomyitem = taxonomy[ptr].strip()[:50] #.translate(translations)
|
|
49 # Convert non-alphanumeric characters to underscore in taxonomy names. ffprwn IS VERY SENSITIVE ABOUT THIS.
|
|
50 taxonomyitem = re.sub('[^0-9a-zA-Z]+', '_', taxonomyitem)
|
|
51
|
|
52 if (not type in 'text') and multiple:
|
0
|
53 #Must read each fasta file, looking for all lines beginning ">"
|
|
54 with open(file) as fastafile:
|
|
55 lineptr = 0
|
|
56 for line in fastafile:
|
|
57 if line[0] == '>':
|
|
58 name = line[1:].split(None,1)[0].strip()[:50]
|
|
59 # Odd case where no fasta description found
|
|
60 if name == '': name = taxonomyitem + '.' + str(lineptr)
|
|
61 names.append(name)
|
|
62 lineptr += 1
|
|
63 else:
|
2
|
64
|
0
|
65 names.append(taxonomyitem)
|
|
66
|
|
67 ptr += 1
|
|
68
|
|
69 if abbreviate:
|
|
70 names = trimCommonPrefixes(names)
|
|
71 names = trimCommonPrefixes(names, True) # reverse = Suffixes.
|
|
72
|
|
73 return names
|
|
74
|
|
75 def trimCommonPrefixes(names, reverse=False):
|
|
76 """
|
|
77 Examines sorted array of names. Trims off prefix of each subsequent pair.
|
|
78
|
|
79 @param names array of textual labels (file names or fasta taxonomy ids)
|
|
80 @param reverse boolean whether to reverse array strings before doing prefix trimming.
|
|
81 """
|
|
82 wordybits = '|.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
|
83
|
|
84 if reverse:
|
|
85 names = map(lambda name: name[::-1], names) #reverses characters in names
|
|
86
|
|
87 sortednames = sorted(names)
|
|
88 ptr = 0
|
|
89 sortedlen = len(sortednames)
|
|
90 oldprefixlen=0
|
|
91 prefixlen=0
|
|
92 for name in sortednames:
|
|
93 ptr += 1
|
|
94
|
|
95 #If we're not at the very last item, reevaluate prefixlen
|
|
96 if ptr < sortedlen:
|
|
97
|
|
98 # Skip first item in an any duplicate pair. Leave duplicate name in full.
|
|
99 if name == sortednames[ptr]:
|
|
100 if reverse:
|
|
101 continue
|
|
102 else:
|
|
103 names[names.index(name)] = 'DupLabel-' + name
|
|
104 continue
|
|
105
|
|
106 # See http://stackoverflow.com/questions/9114402/regexp-finding-longest-common-prefix-of-two-strings
|
|
107 prefixlen = len( name[:([x[0]==x[1] for x in zip(name, sortednames[ptr])]+[0]).index(0)] )
|
|
108
|
|
109 if prefixlen <= oldprefixlen:
|
|
110 newprefix = name[:oldprefixlen]
|
|
111 else:
|
|
112 newprefix = name[:prefixlen]
|
|
113 # Expands label to include any preceeding characters that were probably part of it.
|
|
114 newprefix = newprefix.rstrip(wordybits)
|
|
115 newname = name[len(newprefix):]
|
|
116 # Some tree visualizers don't show numeric labels?!?!
|
|
117 if not reverse and newname.replace('.','',1).isdigit():
|
|
118 newname = 'id_' + newname
|
|
119 names[names.index(name)] = newname #extract name after prefix part; has nl in it
|
|
120 oldprefixlen = prefixlen
|
|
121
|
|
122 if reverse:
|
|
123 names = map(lambda name: name[::-1], names) #now back to original direction
|
|
124
|
|
125 return names
|
|
126
|
|
127 def getTaxonomyFile(names):
|
|
128 """
|
|
129 FFP's ffpjsd -p [taxon file of labels] option creates a phylip tree with
|
|
130 given taxon labels
|
|
131
|
|
132 @param names array of datafile names or fasta sequence ids
|
|
133 """
|
|
134
|
|
135 try:
|
|
136 temp = tempfile.NamedTemporaryFile(mode='w+t',delete=False)
|
|
137 taxonomyTempFile = temp.name
|
|
138 temp.writelines(name + '\n' for name in names)
|
|
139
|
|
140 except:
|
|
141 stop_err("Galaxy configuration error for ffp_phylogeny tool. Unable to write taxonomy file " + taxonomyTempFile)
|
|
142
|
|
143 finally:
|
|
144 temp.close()
|
|
145
|
|
146 return taxonomyTempFile
|
|
147
|
|
148
|
|
149 def check_output(command):
|
|
150 """
|
|
151 Execute a command line containing a series of pipes; and handle error cases by exiting at first error case. This is a substitute for Python 2.7 subprocess.check_output() - allowing piped commands without shell=True call . Based on Python subprocess docs 17.1.4.2
|
|
152
|
|
153 ISSUE: warnings on stderr are given with no exit code 0:
|
|
154 ffpry: Warning: No keys of length 6 found.
|
|
155 ffpcol: (null): Not a key valued FFP.
|
|
156
|
|
157 Can't use communicate() because this closes processes' stdout
|
|
158 file handle even without errors because of read to end of stdout:
|
|
159 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
|
|
160
|
|
161 """
|
|
162 commands = command.split("|")
|
|
163 processes = []
|
|
164 ptr = 0
|
1
|
165 substantive = re.compile('[a-zA-Z0-9]+')
|
|
166
|
0
|
167 for command_line in commands:
|
|
168 print command_line.strip()
|
|
169 args = shlex.split(command_line.strip())
|
|
170 if ptr == 0:
|
|
171 proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
172 processes.append(proc)
|
|
173 else:
|
1
|
174
|
|
175 #this has to come before error processing?
|
|
176 newProcess = subprocess.Popen(args, stdin=processes[ptr-1].stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
177
|
0
|
178 # It seems the act of reading standard error output is enough to trigger
|
|
179 # error code signal for that process, i.e. so that retcode returns a code.
|
1
|
180 retcode = processes[ptr-1].poll()
|
0
|
181 stderrdata = processes[ptr-1].stderr.read()
|
2
|
182 #Issue with ffptree is it outputs ---- ... ---- on stderr even when ok.
|
1
|
183 if retcode or (len(stderrdata) > 0 and substantive.search(stderrdata)):
|
|
184 stop_err(stderrdata)
|
0
|
185
|
1
|
186 processes.append(newProcess)
|
0
|
187 processes[ptr-1].stdout.close() # Allow prev. process to receive a SIGPIPE if current process exits.
|
|
188
|
|
189 ptr += 1
|
|
190
|
1
|
191 retcode = processes[ptr-1].poll()
|
0
|
192 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
|
1
|
193 if retcode or (len(stderrdata) > 0 and substantive.search(stderrdata)):
|
0
|
194 stop_err(stderrdata)
|
|
195
|
|
196 return stdoutdata
|
|
197
|
|
198
|
|
199 class ReportEngine(object):
|
|
200
|
|
201 def __init__(self): pass
|
|
202
|
|
203 def __main__(self):
|
|
204
|
|
205
|
|
206 ## *************************** Parse Command Line *****************************
|
|
207 parser = MyParser(
|
|
208 description = 'FFP (Feature frequency profile) is an alignment free comparison tool',
|
|
209 usage = 'python ffp_phylogeny.py [input_files] [output file] [options]',
|
|
210 epilog="""Details:
|
|
211
|
|
212 FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.
|
|
213
|
|
214 """)
|
|
215
|
|
216 parser.set_defaults(row_limit=0)
|
|
217 # Don't use "-h" , it is reserved for --help!
|
|
218
|
|
219 parser.add_option('-t', '--type', type='choice', dest='type', default='text',
|
|
220 choices=['amino','nucleotide','text'],
|
|
221 help='Choice of Amino acid, nucleotide or plain text sequences to find features in')
|
|
222
|
|
223 parser.add_option('-l', '--length', type='int', dest='length', default=6,
|
|
224 help='Features (any string of valid characters found in data) of this length will be counted. Synonyms: l-mer, k-mer, n-gram, k-tuple')
|
|
225
|
|
226 #parser.add_option('-n', '--normalize', dest='normalize', default=True, action='store_true',
|
|
227 # help='Normalize counts into relative frequency')
|
|
228
|
|
229 parser.add_option('-m', '--multiple', dest='multiple', default=False, action='store_true',
|
|
230 help='By default all sequences in a fasta file be treated as 1 sequence to profile. This option enables each sequence found in a fasta file to have its own profile.')
|
|
231
|
|
232 parser.add_option('-M', '--metric', type='string', dest='metric',
|
|
233 help='Various metrics to measure count distances by.')
|
|
234
|
|
235 parser.add_option('-x', '--taxonomy', type='string', dest='taxonomy',
|
|
236 help='Taxanomic label for each profile/sequence.')
|
|
237
|
|
238 parser.add_option('-d', '--disable', dest='disable', default=False, action='store_true',
|
|
239 help='By default amino acid and nucleotide characters are grouped by functional category (protein or purine/pyrimidine group) before being counted. Disable this to treat individual characters as distinct.')
|
|
240
|
|
241 parser.add_option('-a', '--abbreviate', dest='abbreviate', default=False, action='store_true',
|
|
242 help='Shorten tree taxonomy labels as much as possible.')
|
|
243
|
|
244 parser.add_option('-s', '--similarity', dest='similarity', default=False, action='store_true',
|
|
245 help='Enables pearson correlation coefficient matrix and any of the binary distance measures to be turned into similarity matrixes.')
|
|
246
|
|
247 parser.add_option('-f', '--filter', type='choice', dest='filter', default='none',
|
3
|
248 choices=['none','count','f','n','e','freq','norm','evd'],
|
0
|
249 help='Choice of [f=raw frequency|n=normal|e=extreme value (Gumbel)] distribution: Features are trimmed from the data based on lower/upper cutoff points according to the given distribution.')
|
|
250
|
|
251 parser.add_option('-L', '--lower', type='float', dest='lower',
|
|
252 help='Filter lower bound is a 0.00 percentages')
|
|
253
|
|
254 parser.add_option('-U', '--upper', type='float', dest='upper',
|
|
255 help='Filter upper bound is a 0.00 percentages')
|
|
256
|
|
257 parser.add_option('-o', '--output', type='string', dest='output',
|
|
258 help='Path of output file to create')
|
|
259
|
|
260 parser.add_option('-T', '--tree', dest='tree', default=False, action='store_true', help='Generate Phylogenetic Tree output file')
|
|
261
|
|
262 parser.add_option('-v', '--version', dest='version', default=False, action='store_true', help='Version number')
|
|
263
|
|
264 # Could also have -D INT decimal precision included for ffprwn .
|
|
265
|
|
266 options, args = parser.parse_args()
|
|
267
|
|
268 if options.version:
|
|
269 print VERSION_NUMBER
|
|
270 return
|
|
271
|
|
272 import time
|
|
273 time_start = time.time()
|
|
274
|
|
275 try:
|
|
276 in_files = args[:]
|
|
277
|
|
278 except:
|
|
279 stop_err("Expecting at least 1 input data file.")
|
|
280
|
|
281
|
|
282 #ffptxt / ffpaa / ffpry
|
|
283 if options.type in 'text':
|
|
284 command = 'ffptxt'
|
|
285
|
|
286 else:
|
|
287 if options.type == 'amino':
|
|
288 command = 'ffpaa'
|
|
289 else:
|
|
290 command = 'ffpry'
|
|
291
|
|
292 if options.disable:
|
|
293 command += ' -d'
|
|
294
|
|
295 if options.multiple:
|
|
296 command += ' -m'
|
|
297
|
|
298 command += ' -l ' + str(options.length)
|
|
299
|
|
300 if len(in_files): #Note: app isn't really suited to stdio
|
|
301 command += ' "' + '" "'.join(in_files) + '"'
|
|
302
|
|
303 #ffpcol / ffpfilt
|
|
304 if options.filter != 'none':
|
|
305 command += ' | ffpfilt'
|
|
306 if options.filter != 'count':
|
|
307 command += ' -' + options.filter
|
|
308 if options.lower > 0:
|
|
309 command += ' --lower ' + str(options.lower)
|
|
310 if options.upper > 0:
|
|
311 command += ' --upper ' + str(options.upper)
|
|
312
|
|
313 else:
|
|
314 command += ' | ffpcol'
|
|
315
|
|
316 if options.type in 'text':
|
|
317 command += ' -t'
|
|
318
|
|
319 else:
|
|
320
|
|
321 if options.type == 'amino':
|
|
322 command += ' -a'
|
|
323
|
|
324 if options.disable:
|
|
325 command += ' -d'
|
|
326
|
|
327 #if options.normalize:
|
|
328 command += ' | ffprwn'
|
|
329
|
|
330 #Now create a taxonomy label file, ensuring a name exists for each profile.
|
|
331 taxonomyNames = getTaxonomyNames(options.type, options.multiple, options.abbreviate, in_files, options.taxonomy)
|
|
332 taxonomyTempFile = getTaxonomyFile(taxonomyNames)
|
1
|
333
|
0
|
334 # -p = Include phylip format 'infile' of the taxon names to use. Very simple, just a list of fasta identifier names.
|
|
335 command += ' | ffpjsd -p ' + taxonomyTempFile
|
|
336
|
|
337 if options.metric and len(options.metric) >0 :
|
|
338 command += ' --' + options.metric
|
|
339 if options.similarity:
|
|
340 command += ' -s'
|
|
341
|
|
342 # Generate Newick (.nhx) formatted tree if we have at least 3 taxonomy items:
|
|
343 if options.tree:
|
|
344 if len(taxonomyNames) > 2:
|
|
345 command += ' | ffptree -q'
|
|
346 else:
|
|
347 stop_err("For a phylogenetic tree display, one must have at least 3 ffp profiles.")
|
|
348
|
1
|
349 #print command
|
|
350
|
0
|
351 result = check_output(command)
|
|
352 with open(options.output,'w') as fw:
|
|
353 fw.writelines(result)
|
|
354 os.remove(taxonomyTempFile)
|
|
355
|
|
356 if __name__ == '__main__':
|
|
357
|
|
358 time_start = time.time()
|
|
359
|
|
360 reportEngine = ReportEngine()
|
|
361 reportEngine.__main__()
|
|
362
|
|
363 print('Execution time (seconds): ' + str(int(time.time()-time_start)))
|
|
364
|