0
|
1 #!/usr/bin/python
|
|
2 import optparse
|
1
|
3 import re
|
0
|
4 import time
|
|
5 import os
|
|
6 import tempfile
|
|
7 import sys
|
|
8 import shlex, subprocess
|
|
9 from string import maketrans
|
|
10
|
|
11 VERSION_NUMBER = "0.1.00"
|
|
12
|
|
13 class MyParser(optparse.OptionParser):
|
|
14 """
|
|
15 From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
|
|
16 Provides a better class for displaying formatted help info in epilog() portion of optParse; allows for carriage returns.
|
|
17 """
|
|
18 def format_epilog(self, formatter):
|
|
19 return self.epilog
|
|
20
|
|
21
|
|
22 def stop_err( msg ):
|
|
23 sys.stderr.write("%s\n" % msg)
|
|
24 sys.exit(1)
|
|
25
|
|
26 def getTaxonomyNames(type, multiple, abbreviate, filepaths, filenames):
|
|
27 """
|
|
28 Returns a taxonomic list of names corresponding to each file being analyzed by ffp.
|
|
29 This may also include names for each fasta sequence found within a file if the
|
|
30 "-m" multiple option is provided. Default is to use the file names rather than fasta id's inside the files.
|
|
31 NOTE: THIS DOES NOT (MUST NOT) REORDER NAMES IN NAME ARRAY.
|
|
32 EACH NAME ENTRY IS TRIMMED AND MADE UNIQUE
|
|
33
|
|
34 @param type string ['text','amino','nucleotide']
|
|
35 @param multiple boolean Flag indicates to look within files for labels
|
|
36 @param abbreviate boolean Flag indicates to shorten labels
|
|
37 @filenames array original input file names as user selected them
|
|
38 @filepaths array resulting galaxy dataset file .dat paths
|
|
39
|
|
40 """
|
|
41 # Take off prefix/suffix whitespace/comma :
|
|
42 taxonomy = filenames.strip().strip(',').split(',')
|
|
43 translations = maketrans(' .- ','____')
|
|
44 names=[]
|
|
45 ptr = 0
|
|
46
|
|
47 for file in filepaths:
|
|
48 # First, convert space, period to underscore in file names. ffprwn IS VERY SENSITIVE ABOUT THIS.
|
|
49 # Also trim labels to 50 characters. Turns out ffpjsd is kneecapping a taxonomy label to 10 characters if it is greater than 50 chars.
|
|
50 taxonomyitem = taxonomy[ptr].strip().translate(translations)[:50]
|
|
51 # print taxonomyitem
|
|
52 if not type in 'text' and multiple:
|
|
53 #Must read each fasta file, looking for all lines beginning ">"
|
|
54 with open(file) as fastafile:
|
|
55 lineptr = 0
|
|
56 for line in fastafile:
|
|
57 if line[0] == '>':
|
|
58 name = line[1:].split(None,1)[0].strip()[:50]
|
|
59 # Odd case where no fasta description found
|
|
60 if name == '': name = taxonomyitem + '.' + str(lineptr)
|
|
61 names.append(name)
|
|
62 lineptr += 1
|
|
63 else:
|
|
64 names.append(taxonomyitem)
|
|
65
|
|
66 ptr += 1
|
|
67
|
|
68 if abbreviate:
|
|
69 names = trimCommonPrefixes(names)
|
|
70 names = trimCommonPrefixes(names, True) # reverse = Suffixes.
|
|
71
|
|
72 return names
|
|
73
|
|
74 def trimCommonPrefixes(names, reverse=False):
|
|
75 """
|
|
76 Examines sorted array of names. Trims off prefix of each subsequent pair.
|
|
77
|
|
78 @param names array of textual labels (file names or fasta taxonomy ids)
|
|
79 @param reverse boolean whether to reverse array strings before doing prefix trimming.
|
|
80 """
|
|
81 wordybits = '|.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
|
82
|
|
83 if reverse:
|
|
84 names = map(lambda name: name[::-1], names) #reverses characters in names
|
|
85
|
|
86 sortednames = sorted(names)
|
|
87 ptr = 0
|
|
88 sortedlen = len(sortednames)
|
|
89 oldprefixlen=0
|
|
90 prefixlen=0
|
|
91 for name in sortednames:
|
|
92 ptr += 1
|
|
93
|
|
94 #If we're not at the very last item, reevaluate prefixlen
|
|
95 if ptr < sortedlen:
|
|
96
|
|
97 # Skip first item in an any duplicate pair. Leave duplicate name in full.
|
|
98 if name == sortednames[ptr]:
|
|
99 if reverse:
|
|
100 continue
|
|
101 else:
|
|
102 names[names.index(name)] = 'DupLabel-' + name
|
|
103 continue
|
|
104
|
|
105 # See http://stackoverflow.com/questions/9114402/regexp-finding-longest-common-prefix-of-two-strings
|
|
106 prefixlen = len( name[:([x[0]==x[1] for x in zip(name, sortednames[ptr])]+[0]).index(0)] )
|
|
107
|
|
108 if prefixlen <= oldprefixlen:
|
|
109 newprefix = name[:oldprefixlen]
|
|
110 else:
|
|
111 newprefix = name[:prefixlen]
|
|
112 # Expands label to include any preceeding characters that were probably part of it.
|
|
113 newprefix = newprefix.rstrip(wordybits)
|
|
114 newname = name[len(newprefix):]
|
|
115 # Some tree visualizers don't show numeric labels?!?!
|
|
116 if not reverse and newname.replace('.','',1).isdigit():
|
|
117 newname = 'id_' + newname
|
|
118 names[names.index(name)] = newname #extract name after prefix part; has nl in it
|
|
119 oldprefixlen = prefixlen
|
|
120
|
|
121 if reverse:
|
|
122 names = map(lambda name: name[::-1], names) #now back to original direction
|
|
123
|
|
124 return names
|
|
125
|
|
126 def getTaxonomyFile(names):
|
|
127 """
|
|
128 FFP's ffpjsd -p [taxon file of labels] option creates a phylip tree with
|
|
129 given taxon labels
|
|
130
|
|
131 @param names array of datafile names or fasta sequence ids
|
|
132 """
|
|
133
|
|
134 try:
|
|
135 temp = tempfile.NamedTemporaryFile(mode='w+t',delete=False)
|
|
136 taxonomyTempFile = temp.name
|
|
137 temp.writelines(name + '\n' for name in names)
|
|
138
|
|
139 except:
|
|
140 stop_err("Galaxy configuration error for ffp_phylogeny tool. Unable to write taxonomy file " + taxonomyTempFile)
|
|
141
|
|
142 finally:
|
|
143 temp.close()
|
|
144
|
|
145 return taxonomyTempFile
|
|
146
|
|
147
|
|
148 def check_output(command):
|
|
149 """
|
|
150 Execute a command line containing a series of pipes; and handle error cases by exiting at first error case. This is a substitute for Python 2.7 subprocess.check_output() - allowing piped commands without shell=True call . Based on Python subprocess docs 17.1.4.2
|
|
151
|
|
152 ISSUE: warnings on stderr are given with no exit code 0:
|
|
153 ffpry: Warning: No keys of length 6 found.
|
|
154 ffpcol: (null): Not a key valued FFP.
|
|
155
|
|
156 Can't use communicate() because this closes processes' stdout
|
|
157 file handle even without errors because of read to end of stdout:
|
|
158 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
|
|
159
|
|
160 """
|
|
161 commands = command.split("|")
|
|
162 processes = []
|
|
163 ptr = 0
|
1
|
164 substantive = re.compile('[a-zA-Z0-9]+')
|
|
165
|
0
|
166 for command_line in commands:
|
|
167 print command_line.strip()
|
|
168 args = shlex.split(command_line.strip())
|
|
169 if ptr == 0:
|
|
170 proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
171 processes.append(proc)
|
|
172 else:
|
1
|
173
|
|
174 #this has to come before error processing?
|
|
175 newProcess = subprocess.Popen(args, stdin=processes[ptr-1].stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
176
|
0
|
177 # It seems the act of reading standard error output is enough to trigger
|
|
178 # error code signal for that process, i.e. so that retcode returns a code.
|
1
|
179 retcode = processes[ptr-1].poll()
|
0
|
180 stderrdata = processes[ptr-1].stderr.read()
|
1
|
181 #Issue with ffptree is it outputs ----....---- on stderr
|
|
182 if retcode or (len(stderrdata) > 0 and substantive.search(stderrdata)):
|
|
183 stop_err(stderrdata)
|
0
|
184
|
1
|
185 processes.append(newProcess)
|
0
|
186 processes[ptr-1].stdout.close() # Allow prev. process to receive a SIGPIPE if current process exits.
|
|
187
|
|
188 ptr += 1
|
|
189
|
1
|
190 retcode = processes[ptr-1].poll()
|
0
|
191 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
|
1
|
192 if retcode or (len(stderrdata) > 0 and substantive.search(stderrdata)):
|
0
|
193 stop_err(stderrdata)
|
|
194
|
|
195 return stdoutdata
|
|
196
|
|
197
|
|
198 class ReportEngine(object):
|
|
199
|
|
200 def __init__(self): pass
|
|
201
|
|
202 def __main__(self):
|
|
203
|
|
204
|
|
205 ## *************************** Parse Command Line *****************************
|
|
206 parser = MyParser(
|
|
207 description = 'FFP (Feature frequency profile) is an alignment free comparison tool',
|
|
208 usage = 'python ffp_phylogeny.py [input_files] [output file] [options]',
|
|
209 epilog="""Details:
|
|
210
|
|
211 FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.
|
|
212
|
|
213 """)
|
|
214
|
|
215 parser.set_defaults(row_limit=0)
|
|
216 # Don't use "-h" , it is reserved for --help!
|
|
217
|
|
218 parser.add_option('-t', '--type', type='choice', dest='type', default='text',
|
|
219 choices=['amino','nucleotide','text'],
|
|
220 help='Choice of Amino acid, nucleotide or plain text sequences to find features in')
|
|
221
|
|
222 parser.add_option('-l', '--length', type='int', dest='length', default=6,
|
|
223 help='Features (any string of valid characters found in data) of this length will be counted. Synonyms: l-mer, k-mer, n-gram, k-tuple')
|
|
224
|
|
225 #parser.add_option('-n', '--normalize', dest='normalize', default=True, action='store_true',
|
|
226 # help='Normalize counts into relative frequency')
|
|
227
|
|
228 parser.add_option('-m', '--multiple', dest='multiple', default=False, action='store_true',
|
|
229 help='By default all sequences in a fasta file be treated as 1 sequence to profile. This option enables each sequence found in a fasta file to have its own profile.')
|
|
230
|
|
231 parser.add_option('-M', '--metric', type='string', dest='metric',
|
|
232 help='Various metrics to measure count distances by.')
|
|
233
|
|
234 parser.add_option('-x', '--taxonomy', type='string', dest='taxonomy',
|
|
235 help='Taxanomic label for each profile/sequence.')
|
|
236
|
|
237 parser.add_option('-d', '--disable', dest='disable', default=False, action='store_true',
|
|
238 help='By default amino acid and nucleotide characters are grouped by functional category (protein or purine/pyrimidine group) before being counted. Disable this to treat individual characters as distinct.')
|
|
239
|
|
240 parser.add_option('-a', '--abbreviate', dest='abbreviate', default=False, action='store_true',
|
|
241 help='Shorten tree taxonomy labels as much as possible.')
|
|
242
|
|
243 parser.add_option('-s', '--similarity', dest='similarity', default=False, action='store_true',
|
|
244 help='Enables pearson correlation coefficient matrix and any of the binary distance measures to be turned into similarity matrixes.')
|
|
245
|
|
246 parser.add_option('-f', '--filter', type='choice', dest='filter', default='none',
|
|
247 choices=['none','f','n','e','freq','norm','evd'],
|
|
248 help='Choice of [f=raw frequency|n=normal|e=extreme value (Gumbel)] distribution: Features are trimmed from the data based on lower/upper cutoff points according to the given distribution.')
|
|
249
|
|
250 parser.add_option('-L', '--lower', type='float', dest='lower',
|
|
251 help='Filter lower bound is a 0.00 percentages')
|
|
252
|
|
253 parser.add_option('-U', '--upper', type='float', dest='upper',
|
|
254 help='Filter upper bound is a 0.00 percentages')
|
|
255
|
|
256 parser.add_option('-o', '--output', type='string', dest='output',
|
|
257 help='Path of output file to create')
|
|
258
|
|
259 parser.add_option('-T', '--tree', dest='tree', default=False, action='store_true', help='Generate Phylogenetic Tree output file')
|
|
260
|
|
261 parser.add_option('-v', '--version', dest='version', default=False, action='store_true', help='Version number')
|
|
262
|
|
263 # Could also have -D INT decimal precision included for ffprwn .
|
|
264
|
|
265 options, args = parser.parse_args()
|
|
266
|
|
267 if options.version:
|
|
268 print VERSION_NUMBER
|
|
269 return
|
|
270
|
|
271 import time
|
|
272 time_start = time.time()
|
|
273
|
|
274 try:
|
|
275 in_files = args[:]
|
|
276
|
|
277 except:
|
|
278 stop_err("Expecting at least 1 input data file.")
|
|
279
|
|
280
|
|
281 #ffptxt / ffpaa / ffpry
|
|
282 if options.type in 'text':
|
|
283 command = 'ffptxt'
|
|
284
|
|
285 else:
|
|
286 if options.type == 'amino':
|
|
287 command = 'ffpaa'
|
|
288 else:
|
|
289 command = 'ffpry'
|
|
290
|
|
291 if options.disable:
|
|
292 command += ' -d'
|
|
293
|
|
294 if options.multiple:
|
|
295 command += ' -m'
|
|
296
|
|
297 command += ' -l ' + str(options.length)
|
|
298
|
|
299 if len(in_files): #Note: app isn't really suited to stdio
|
|
300 command += ' "' + '" "'.join(in_files) + '"'
|
|
301
|
|
302 #ffpcol / ffpfilt
|
|
303 if options.filter != 'none':
|
|
304 command += ' | ffpfilt'
|
|
305 if options.filter != 'count':
|
|
306 command += ' -' + options.filter
|
|
307 if options.lower > 0:
|
|
308 command += ' --lower ' + str(options.lower)
|
|
309 if options.upper > 0:
|
|
310 command += ' --upper ' + str(options.upper)
|
|
311
|
|
312 else:
|
|
313 command += ' | ffpcol'
|
|
314
|
|
315 if options.type in 'text':
|
|
316 command += ' -t'
|
|
317
|
|
318 else:
|
|
319
|
|
320 if options.type == 'amino':
|
|
321 command += ' -a'
|
|
322
|
|
323 if options.disable:
|
|
324 command += ' -d'
|
|
325
|
|
326 #if options.normalize:
|
|
327 command += ' | ffprwn'
|
|
328
|
|
329 #Now create a taxonomy label file, ensuring a name exists for each profile.
|
|
330 taxonomyNames = getTaxonomyNames(options.type, options.multiple, options.abbreviate, in_files, options.taxonomy)
|
|
331 taxonomyTempFile = getTaxonomyFile(taxonomyNames)
|
1
|
332
|
0
|
333 # -p = Include phylip format 'infile' of the taxon names to use. Very simple, just a list of fasta identifier names.
|
|
334 command += ' | ffpjsd -p ' + taxonomyTempFile
|
|
335
|
|
336 if options.metric and len(options.metric) >0 :
|
|
337 command += ' --' + options.metric
|
|
338 if options.similarity:
|
|
339 command += ' -s'
|
|
340
|
|
341 # Generate Newick (.nhx) formatted tree if we have at least 3 taxonomy items:
|
|
342 if options.tree:
|
|
343 if len(taxonomyNames) > 2:
|
|
344 command += ' | ffptree -q'
|
|
345 else:
|
|
346 stop_err("For a phylogenetic tree display, one must have at least 3 ffp profiles.")
|
|
347
|
1
|
348 #print command
|
|
349
|
0
|
350 result = check_output(command)
|
|
351 with open(options.output,'w') as fw:
|
|
352 fw.writelines(result)
|
|
353 os.remove(taxonomyTempFile)
|
|
354
|
|
355 if __name__ == '__main__':
|
|
356
|
|
357 time_start = time.time()
|
|
358
|
|
359 reportEngine = ReportEngine()
|
|
360 reportEngine.__main__()
|
|
361
|
|
362 print('Execution time (seconds): ' + str(int(time.time()-time_start)))
|
|
363
|