0
|
1 #!/usr/bin/python
|
|
2 import optparse
|
|
3 import time
|
|
4 import os
|
|
5 import tempfile
|
|
6 import sys
|
|
7 import shlex, subprocess
|
|
8 from string import maketrans
|
|
9
|
|
10 VERSION_NUMBER = "0.1.00"
|
|
11
|
|
class MyParser(optparse.OptionParser):
    """
    Option parser whose epilog is emitted verbatim.

    Adapted from http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
    The stock optparse help formatter reflows the epilog text; overriding
    format_epilog() preserves embedded carriage returns so multi-line help
    renders exactly as written.
    """
    def format_epilog(self, formatter):
        # Bypass the formatter entirely so newlines in the epilog survive.
        epilog_text = self.epilog
        return epilog_text
|
|
19
|
|
20
|
|
def stop_err( msg ):
    """Report a fatal error on stderr and abort the program with exit code 1.

    @param msg string message to display before exiting
    """
    text = "%s\n" % msg
    sys.stderr.write(text)
    sys.exit(1)
|
|
24
|
|
def getTaxonomyNames(type, multiple, abbreviate, filepaths, filenames):
    """
    Returns a taxonomic list of names corresponding to each file being analyzed by ffp.
    This may also include names for each fasta sequence found within a file if the
    "-m" multiple option is provided. Default is to use the file names rather than fasta id's inside the files.
    NOTE: THIS DOES NOT (MUST NOT) REORDER NAMES IN NAME ARRAY.
    EACH NAME ENTRY IS TRIMMED AND MADE UNIQUE

    @param type string ['text','amino','nucleotide']
    @param multiple boolean Flag indicates to look within files for labels
    @param abbreviate boolean Flag indicates to shorten labels
    @param filepaths array resulting galaxy dataset file .dat paths
    @param filenames string comma-separated original input file names as user selected them
    @return array of cleaned taxonomy labels, one per profile
    """
    # Take off prefix/suffix whitespace/comma :
    taxonomy = filenames.strip().strip(',').split(',')
    # Convert space, period and hyphen to underscore in labels. ffprwn IS VERY
    # SENSITIVE ABOUT THIS.  (The original table mapped ' .- ' -> '____'; the
    # duplicated space entry was redundant, so it is dropped here.)
    translations = maketrans(' .-', '___')
    names = []
    ptr = 0

    # Renamed loop variable from 'file' to avoid shadowing the Python 2 builtin.
    for filepath in filepaths:
        # Trim labels to 50 characters. Turns out ffpjsd is kneecapping a taxonomy
        # label to 10 characters if it is greater than 50 chars.
        # NOTE(review): assumes len(taxonomy) >= len(filepaths); a short -x list
        # raises IndexError here -- confirm upstream validation in the Galaxy tool.
        taxonomyitem = taxonomy[ptr].strip().translate(translations)[:50]

        if not type in 'text' and multiple:
            # Must read each fasta file, looking for all lines beginning ">"
            with open(filepath) as fastafile:
                lineptr = 0
                for line in fastafile:
                    if line[0] == '>':
                        # First whitespace-delimited token of the fasta header, capped at 50 chars.
                        name = line[1:].split(None, 1)[0].strip()[:50]
                        # Odd case where no fasta description found
                        if name == '':
                            name = taxonomyitem + '.' + str(lineptr)
                        names.append(name)
                        lineptr += 1
        else:
            names.append(taxonomyitem)

        ptr += 1

    if abbreviate:
        names = trimCommonPrefixes(names)
        names = trimCommonPrefixes(names, True)  # reverse = Suffixes.

    return names
|
|
72
|
|
def trimCommonPrefixes(names, reverse=False):
    """
    Examines sorted array of names. Trims off prefix of each subsequent pair.

    @param names array of textual labels (file names or fasta taxonomy ids)
    @param reverse boolean whether to reverse array strings before doing prefix trimming.
    @return array labels with shared prefixes (suffixes when reverse=True) trimmed in place
    """
    # Characters considered "word-like": a computed prefix is pulled back past any
    # trailing run of these so a label is not cut in the middle of a word/number.
    wordybits = '|.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

    if reverse:
        # NOTE(review): map() returns a list only under Python 2 (this script's
        # target); the names.index()/assignment calls below rely on that.
        names = map(lambda name: name[::-1], names) #reverses characters in names

    # Work over a sorted copy so lexicographic neighbours are adjacent; 'names'
    # itself is updated by index so caller-visible ordering is preserved.
    sortednames = sorted(names)
    ptr = 0
    sortedlen = len(sortednames)
    oldprefixlen=0
    prefixlen=0
    for name in sortednames:
        ptr += 1

        #If we're not at the very last item, reevaluate prefixlen
        if ptr < sortedlen:

            # Skip first item in an any duplicate pair. Leave duplicate name in full.
            if name == sortednames[ptr]:
                if reverse:
                    continue
                else:
                    # Tag the first duplicate so downstream labels stay unique.
                    names[names.index(name)] = 'DupLabel-' + name
                    continue

            # Length of the longest common prefix between this name and its sorted successor.
            # See http://stackoverflow.com/questions/9114402/regexp-finding-longest-common-prefix-of-two-strings
            prefixlen = len( name[:([x[0]==x[1] for x in zip(name, sortednames[ptr])]+[0]).index(0)] )

        # Trim by whichever neighbour (predecessor via oldprefixlen, or successor
        # via prefixlen) shares the longer common prefix with this name.
        if prefixlen <= oldprefixlen:
            newprefix = name[:oldprefixlen]
        else:
            newprefix = name[:prefixlen]
        # Expands label to include any preceeding characters that were probably part of it.
        newprefix = newprefix.rstrip(wordybits)
        newname = name[len(newprefix):]
        # Some tree visualizers don't show numeric labels?!?!
        if not reverse and newname.replace('.','',1).isdigit():
            newname = 'id_' + newname
        names[names.index(name)] = newname #extract name after prefix part; has nl in it
        oldprefixlen = prefixlen

    if reverse:
        names = map(lambda name: name[::-1], names) #now back to original direction

    return names
|
|
124
|
|
def getTaxonomyFile(names):
    """
    FFP's ffpjsd -p [taxon file of labels] option creates a phylip tree with
    given taxon labels

    Writes one label per line to a persistent temporary file.

    @param names array of datafile names or fasta sequence ids
    @return string path of the temporary taxonomy file (caller is responsible for deleting it)
    """
    # Pre-initialize so the error/cleanup paths below can never hit a NameError.
    # (The original bare except referenced taxonomyTempFile, and the finally
    # referenced temp, before either was assigned when tempfile creation failed.)
    temp = None
    taxonomyTempFile = ''
    try:
        # delete=False: the file must outlive this handle so ffpjsd can read it.
        temp = tempfile.NamedTemporaryFile(mode='w+t', delete=False)
        taxonomyTempFile = temp.name
        temp.writelines(name + '\n' for name in names)

    except EnvironmentError:
        # Narrowed from a bare except: only I/O / OS failures are expected here.
        stop_err("Galaxy configuration error for ffp_phylogeny tool. Unable to write taxonomy file " + taxonomyTempFile)

    finally:
        if temp is not None:
            temp.close()

    return taxonomyTempFile
|
|
145
|
|
146
|
|
def check_output(command):
    """
    Execute a command line containing a series of pipes; and handle error cases by exiting at first error case. This is a substitute for Python 2.7 subprocess.check_output() - allowing piped commands without shell=True call . Based on Python subprocess docs 17.1.4.2

    ISSUE: warnings on stderr are given with no exit code 0:
        ffpry: Warning: No keys of length 6 found.
        ffpcol: (null): Not a key valued FFP.

    Can't use communicate() because this closes processes' stdout
    file handle even without errors because of read to end of stdout:
    (stdoutdata, stderrdata) = processes[ptr-1].communicate()

    @param command string shell-style pipeline; stages separated by "|"
    @return string stdout of the final pipeline stage
    """
    # Split the pipeline into stages; each stage is launched with Popen and
    # chained stdin<-stdout manually instead of using shell=True.
    commands = command.split("|")
    processes = []
    ptr = 0
    for command_line in commands:
        # Echo each stage (useful in the Galaxy job log).
        print command_line.strip()
        args = shlex.split(command_line.strip())
        if ptr == 0:
            # First stage: no stdin; capture both stdout and stderr.
            proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            processes.append(proc)
        else:
            # It seems the act of reading standard error output is enough to trigger
            # error code signal for that process, i.e. so that retcode returns a code.
            stderrdata = processes[ptr-1].stderr.read()
            retcode = processes[ptr-1].poll()
            # NOTE(review): ANY stderr text is treated as fatal here, so the
            # zero-exit warnings listed in the ISSUE above also abort the run.
            if retcode or len(stderrdata) > 0:
                stop_err(stderrdata)

            # Chain this stage's stdin to the previous stage's stdout.
            newProcess = subprocess.Popen(args, stdin=processes[ptr-1].stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            processes.append(newProcess)
            processes[ptr-1].stdout.close() # Allow prev. process to receive a SIGPIPE if current process exits.

        ptr += 1

    # Last stage: communicate() is safe here because nothing downstream still
    # needs this process's stdout handle.
    (stdoutdata, stderrdata) = processes[ptr-1].communicate()
    retcode = processes[ptr-1].poll()
    if retcode or len(stderrdata) > 0:
        stop_err(stderrdata)

    return stdoutdata
|
|
189
|
|
190
|
|
class ReportEngine(object):
    """Command-line driver: parses options, assembles the ffp shell pipeline and writes its output.

    Pipeline shape: ffptxt|ffpaa|ffpry -> ffpfilt|ffpcol -> ffprwn -> ffpjsd [-> ffptree]
    """

    def __init__(self): pass

    def __main__(self):
        # Entry point: parse options, build the piped ffp command string,
        # run it via check_output() and write the result to options.output.

        ## *************************** Parse Command Line *****************************
        parser = MyParser(
            description = 'FFP (Feature frequency profile) is an alignment free comparison tool',
            usage = 'python ffp_phylogeny.py [input_files] [output file] [options]',
            epilog="""Details:

FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.

""")

        parser.set_defaults(row_limit=0)
        # Don't use "-h" , it is reserved for --help!

        parser.add_option('-t', '--type', type='choice', dest='type', default='text',
            choices=['amino','nucleotide','text'],
            help='Choice of Amino acid, nucleotide or plain text sequences to find features in')

        parser.add_option('-l', '--length', type='int', dest='length', default=6,
            help='Features (any string of valid characters found in data) of this length will be counted. Synonyms: l-mer, k-mer, n-gram, k-tuple')

        #parser.add_option('-n', '--normalize', dest='normalize', default=True, action='store_true',
        #   help='Normalize counts into relative frequency')

        parser.add_option('-m', '--multiple', dest='multiple', default=False, action='store_true',
            help='By default all sequences in a fasta file be treated as 1 sequence to profile. This option enables each sequence found in a fasta file to have its own profile.')

        parser.add_option('-M', '--metric', type='string', dest='metric',
            help='Various metrics to measure count distances by.')

        parser.add_option('-x', '--taxonomy', type='string', dest='taxonomy',
            help='Taxanomic label for each profile/sequence.')

        parser.add_option('-d', '--disable', dest='disable', default=False, action='store_true',
            help='By default amino acid and nucleotide characters are grouped by functional category (protein or purine/pyrimidine group) before being counted. Disable this to treat individual characters as distinct.')

        parser.add_option('-a', '--abbreviate', dest='abbreviate', default=False, action='store_true',
            help='Shorten tree taxonomy labels as much as possible.')

        parser.add_option('-s', '--similarity', dest='similarity', default=False, action='store_true',
            help='Enables pearson correlation coefficient matrix and any of the binary distance measures to be turned into similarity matrixes.')

        parser.add_option('-f', '--filter', type='choice', dest='filter', default='none',
            choices=['none','f','n','e','freq','norm','evd'],
            help='Choice of [f=raw frequency|n=normal|e=extreme value (Gumbel)] distribution: Features are trimmed from the data based on lower/upper cutoff points according to the given distribution.')

        parser.add_option('-L', '--lower', type='float', dest='lower',
            help='Filter lower bound is a 0.00 percentages')

        parser.add_option('-U', '--upper', type='float', dest='upper',
            help='Filter upper bound is a 0.00 percentages')

        parser.add_option('-o', '--output', type='string', dest='output',
            help='Path of output file to create')

        parser.add_option('-T', '--tree', dest='tree', default=False, action='store_true', help='Generate Phylogenetic Tree output file')

        parser.add_option('-v', '--version', dest='version', default=False, action='store_true', help='Version number')

        # Could also have -D INT decimal precision included for ffprwn .

        options, args = parser.parse_args()

        if options.version:
            print VERSION_NUMBER
            return

        # NOTE(review): 'time' is already imported at module level; this local
        # import and time_start are redundant (the __main__ guard times the run).
        import time
        time_start = time.time()

        try:
            in_files = args[:]

        except:
            # NOTE(review): args[:] on a list cannot raise, so this message is
            # effectively unreachable -- an explicit len(args) check would be needed.
            stop_err("Expecting at least 1 input data file.")


        #ffptxt / ffpaa / ffpry
        # Stage 1: pick the feature-counting program by sequence type.
        if options.type in 'text':
            command = 'ffptxt'

        else:
            if options.type == 'amino':
                command = 'ffpaa'
            else:
                command = 'ffpry'

            # -d / -m only apply to the amino/nucleotide counters.
            if options.disable:
                command += ' -d'

            if options.multiple:
                command += ' -m'

        command += ' -l ' + str(options.length)

        if len(in_files): #Note: app isn't really suited to stdio
            command += ' "' + '" "'.join(in_files) + '"'

        #ffpcol / ffpfilt
        # Stage 2: either filter features by distribution cutoffs, or just columnize.
        if options.filter != 'none':
            command += ' | ffpfilt'
            # NOTE(review): 'count' is not among the -f choices, so this test is
            # always true as written -- confirm intent.
            if options.filter != 'count':
                command += ' -' + options.filter
            if options.lower > 0:
                command += ' --lower ' + str(options.lower)
            if options.upper > 0:
                command += ' --upper ' + str(options.upper)

        else:
            command += ' | ffpcol'

            if options.type in 'text':
                command += ' -t'

            else:

                if options.type == 'amino':
                    command += ' -a'

                if options.disable:
                    command += ' -d'

        #if options.normalize:
        # Stage 3: row-normalize counts into relative frequencies.
        command += ' | ffprwn'

        #Now create a taxonomy label file, ensuring a name exists for each profile.
        taxonomyNames = getTaxonomyNames(options.type, options.multiple, options.abbreviate, in_files, options.taxonomy)
        taxonomyTempFile = getTaxonomyFile(taxonomyNames)
        # -p = Include phylip format 'infile' of the taxon names to use. Very simple, just a list of fasta identifier names.
        # Stage 4: distance matrix via ffpjsd.
        command += ' | ffpjsd -p ' + taxonomyTempFile

        if options.metric and len(options.metric) >0 :
            command += ' --' + options.metric
            if options.similarity:
                command += ' -s'

        # Generate Newick (.nhx) formatted tree if we have at least 3 taxonomy items:
        if options.tree:
            if len(taxonomyNames) > 2:
                command += ' | ffptree -q'
            else:
                stop_err("For a phylogenetic tree display, one must have at least 3 ffp profiles.")

        # Run the assembled pipeline and persist its stdout.
        result = check_output(command)
        with open(options.output,'w') as fw:
            fw.writelines(result)
        os.remove(taxonomyTempFile)
|
|
344
|
|
if __name__ == '__main__':
    # Run the report engine and report wall-clock runtime when done.
    launch_time = time.time()

    engine = ReportEngine()
    engine.__main__()

    elapsed_seconds = int(time.time() - launch_time)
    print('Execution time (seconds): ' + str(elapsed_seconds))
|
|
353
|