annotate ffp_phylogeny.py @ 1:d1c88b118a3f draft

Uploaded
author damion
date Fri, 13 Mar 2015 20:59:28 -0400
parents eb6e5e78a066
children 671667722d3d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
1 #!/usr/bin/python
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
2 import optparse
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
3 import re
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
4 import time
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
5 import os
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
6 import tempfile
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
7 import sys
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
8 import shlex, subprocess
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
9 from string import maketrans
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
10
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
11 VERSION_NUMBER = "0.1.00"
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
12
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
13 class MyParser(optparse.OptionParser):
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
14 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
15 From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
16 Provides a better class for displaying formatted help info in epilog() portion of optParse; allows for carriage returns.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
17 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
18 def format_epilog(self, formatter):
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
19 return self.epilog
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
20
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
21
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
22 def stop_err( msg ):
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
23 sys.stderr.write("%s\n" % msg)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
24 sys.exit(1)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
25
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
26 def getTaxonomyNames(type, multiple, abbreviate, filepaths, filenames):
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
27 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
28 Returns a taxonomic list of names corresponding to each file being analyzed by ffp.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
29 This may also include names for each fasta sequence found within a file if the
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
30 "-m" multiple option is provided. Default is to use the file names rather than fasta id's inside the files.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
31 NOTE: THIS DOES NOT (MUST NOT) REORDER NAMES IN NAME ARRAY.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
32 EACH NAME ENTRY IS TRIMMED AND MADE UNIQUE
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
33
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
34 @param type string ['text','amino','nucleotide']
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
35 @param multiple boolean Flag indicates to look within files for labels
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
36 @param abbreviate boolean Flag indicates to shorten labels
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
37 @filenames array original input file names as user selected them
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
38 @filepaths array resulting galaxy dataset file .dat paths
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
39
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
40 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
41 # Take off prefix/suffix whitespace/comma :
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
42 taxonomy = filenames.strip().strip(',').split(',')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
43 translations = maketrans(' .- ','____')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
44 names=[]
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
45 ptr = 0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
46
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
47 for file in filepaths:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
48 # First, convert space, period to underscore in file names. ffprwn IS VERY SENSITIVE ABOUT THIS.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
49 # Also trim labels to 50 characters. Turns out ffpjsd is kneecapping a taxonomy label to 10 characters if it is greater than 50 chars.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
50 taxonomyitem = taxonomy[ptr].strip().translate(translations)[:50]
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
51 # print taxonomyitem
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
52 if not type in 'text' and multiple:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
53 #Must read each fasta file, looking for all lines beginning ">"
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
54 with open(file) as fastafile:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
55 lineptr = 0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
56 for line in fastafile:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
57 if line[0] == '>':
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
58 name = line[1:].split(None,1)[0].strip()[:50]
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
59 # Odd case where no fasta description found
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
60 if name == '': name = taxonomyitem + '.' + str(lineptr)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
61 names.append(name)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
62 lineptr += 1
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
63 else:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
64 names.append(taxonomyitem)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
65
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
66 ptr += 1
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
67
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
68 if abbreviate:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
69 names = trimCommonPrefixes(names)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
70 names = trimCommonPrefixes(names, True) # reverse = Suffixes.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
71
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
72 return names
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
73
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
74 def trimCommonPrefixes(names, reverse=False):
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
75 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
76 Examines sorted array of names. Trims off prefix of each subsequent pair.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
77
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
78 @param names array of textual labels (file names or fasta taxonomy ids)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
79 @param reverse boolean whether to reverse array strings before doing prefix trimming.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
80 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
81 wordybits = '|.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
82
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
83 if reverse:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
84 names = map(lambda name: name[::-1], names) #reverses characters in names
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
85
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
86 sortednames = sorted(names)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
87 ptr = 0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
88 sortedlen = len(sortednames)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
89 oldprefixlen=0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
90 prefixlen=0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
91 for name in sortednames:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
92 ptr += 1
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
93
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
94 #If we're not at the very last item, reevaluate prefixlen
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
95 if ptr < sortedlen:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
96
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
97 # Skip first item in an any duplicate pair. Leave duplicate name in full.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
98 if name == sortednames[ptr]:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
99 if reverse:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
100 continue
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
101 else:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
102 names[names.index(name)] = 'DupLabel-' + name
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
103 continue
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
104
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
105 # See http://stackoverflow.com/questions/9114402/regexp-finding-longest-common-prefix-of-two-strings
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
106 prefixlen = len( name[:([x[0]==x[1] for x in zip(name, sortednames[ptr])]+[0]).index(0)] )
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
107
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
108 if prefixlen <= oldprefixlen:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
109 newprefix = name[:oldprefixlen]
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
110 else:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
111 newprefix = name[:prefixlen]
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
112 # Expands label to include any preceeding characters that were probably part of it.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
113 newprefix = newprefix.rstrip(wordybits)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
114 newname = name[len(newprefix):]
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
115 # Some tree visualizers don't show numeric labels?!?!
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
116 if not reverse and newname.replace('.','',1).isdigit():
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
117 newname = 'id_' + newname
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
118 names[names.index(name)] = newname #extract name after prefix part; has nl in it
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
119 oldprefixlen = prefixlen
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
120
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
121 if reverse:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
122 names = map(lambda name: name[::-1], names) #now back to original direction
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
123
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
124 return names
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
125
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
126 def getTaxonomyFile(names):
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
127 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
128 FFP's ffpjsd -p [taxon file of labels] option creates a phylip tree with
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
129 given taxon labels
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
130
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
131 @param names array of datafile names or fasta sequence ids
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
132 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
133
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
134 try:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
135 temp = tempfile.NamedTemporaryFile(mode='w+t',delete=False)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
136 taxonomyTempFile = temp.name
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
137 temp.writelines(name + '\n' for name in names)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
138
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
139 except:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
140 stop_err("Galaxy configuration error for ffp_phylogeny tool. Unable to write taxonomy file " + taxonomyTempFile)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
141
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
142 finally:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
143 temp.close()
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
144
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
145 return taxonomyTempFile
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
146
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
147
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
148 def check_output(command):
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
149 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
150 Execute a command line containing a series of pipes; and handle error cases by exiting at first error case. This is a substitute for Python 2.7 subprocess.check_output() - allowing piped commands without shell=True call . Based on Python subprocess docs 17.1.4.2
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
151
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
152 ISSUE: warnings on stderr are given with no exit code 0:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
153 ffpry: Warning: No keys of length 6 found.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
154 ffpcol: (null): Not a key valued FFP.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
155
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
156 Can't use communicate() because this closes processes' stdout
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
157 file handle even without errors because of read to end of stdout:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
158 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
159
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
160 """
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
161 commands = command.split("|")
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
162 processes = []
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
163 ptr = 0
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
164 substantive = re.compile('[a-zA-Z0-9]+')
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
165
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
166 for command_line in commands:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
167 print command_line.strip()
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
168 args = shlex.split(command_line.strip())
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
169 if ptr == 0:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
170 proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
171 processes.append(proc)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
172 else:
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
173
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
174 #this has to come before error processing?
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
175 newProcess = subprocess.Popen(args, stdin=processes[ptr-1].stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
176
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
177 # It seems the act of reading standard error output is enough to trigger
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
178 # error code signal for that process, i.e. so that retcode returns a code.
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
179 retcode = processes[ptr-1].poll()
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
180 stderrdata = processes[ptr-1].stderr.read()
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
181 #Issue with ffptree is it outputs ----....---- on stderr
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
182 if retcode or (len(stderrdata) > 0 and substantive.search(stderrdata)):
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
183 stop_err(stderrdata)
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
184
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
185 processes.append(newProcess)
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
186 processes[ptr-1].stdout.close() # Allow prev. process to receive a SIGPIPE if current process exits.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
187
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
188 ptr += 1
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
189
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
190 retcode = processes[ptr-1].poll()
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
191 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
192 if retcode or (len(stderrdata) > 0 and substantive.search(stderrdata)):
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
193 stop_err(stderrdata)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
194
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
195 return stdoutdata
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
196
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
197
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
198 class ReportEngine(object):
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
199
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
200 def __init__(self): pass
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
201
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
202 def __main__(self):
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
203
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
204
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
205 ## *************************** Parse Command Line *****************************
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
206 parser = MyParser(
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
207 description = 'FFP (Feature frequency profile) is an alignment free comparison tool',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
208 usage = 'python ffp_phylogeny.py [input_files] [output file] [options]',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
209 epilog="""Details:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
210
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
211 FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
212
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
213 """)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
214
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
215 parser.set_defaults(row_limit=0)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
216 # Don't use "-h" , it is reserved for --help!
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
217
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
218 parser.add_option('-t', '--type', type='choice', dest='type', default='text',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
219 choices=['amino','nucleotide','text'],
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
220 help='Choice of Amino acid, nucleotide or plain text sequences to find features in')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
221
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
222 parser.add_option('-l', '--length', type='int', dest='length', default=6,
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
223 help='Features (any string of valid characters found in data) of this length will be counted. Synonyms: l-mer, k-mer, n-gram, k-tuple')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
224
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
225 #parser.add_option('-n', '--normalize', dest='normalize', default=True, action='store_true',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
226 # help='Normalize counts into relative frequency')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
227
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
228 parser.add_option('-m', '--multiple', dest='multiple', default=False, action='store_true',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
229 help='By default all sequences in a fasta file be treated as 1 sequence to profile. This option enables each sequence found in a fasta file to have its own profile.')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
230
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
231 parser.add_option('-M', '--metric', type='string', dest='metric',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
232 help='Various metrics to measure count distances by.')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
233
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
234 parser.add_option('-x', '--taxonomy', type='string', dest='taxonomy',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
235 help='Taxanomic label for each profile/sequence.')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
236
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
237 parser.add_option('-d', '--disable', dest='disable', default=False, action='store_true',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
238 help='By default amino acid and nucleotide characters are grouped by functional category (protein or purine/pyrimidine group) before being counted. Disable this to treat individual characters as distinct.')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
239
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
240 parser.add_option('-a', '--abbreviate', dest='abbreviate', default=False, action='store_true',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
241 help='Shorten tree taxonomy labels as much as possible.')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
242
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
243 parser.add_option('-s', '--similarity', dest='similarity', default=False, action='store_true',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
244 help='Enables pearson correlation coefficient matrix and any of the binary distance measures to be turned into similarity matrixes.')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
245
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
246 parser.add_option('-f', '--filter', type='choice', dest='filter', default='none',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
247 choices=['none','f','n','e','freq','norm','evd'],
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
248 help='Choice of [f=raw frequency|n=normal|e=extreme value (Gumbel)] distribution: Features are trimmed from the data based on lower/upper cutoff points according to the given distribution.')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
249
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
250 parser.add_option('-L', '--lower', type='float', dest='lower',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
251 help='Filter lower bound is a 0.00 percentages')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
252
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
253 parser.add_option('-U', '--upper', type='float', dest='upper',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
254 help='Filter upper bound is a 0.00 percentages')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
255
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
256 parser.add_option('-o', '--output', type='string', dest='output',
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
257 help='Path of output file to create')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
258
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
259 parser.add_option('-T', '--tree', dest='tree', default=False, action='store_true', help='Generate Phylogenetic Tree output file')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
260
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
261 parser.add_option('-v', '--version', dest='version', default=False, action='store_true', help='Version number')
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
262
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
263 # Could also have -D INT decimal precision included for ffprwn .
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
264
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
265 options, args = parser.parse_args()
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
266
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
267 if options.version:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
268 print VERSION_NUMBER
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
269 return
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
270
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
271 import time
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
272 time_start = time.time()
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
273
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
274 try:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
275 in_files = args[:]
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
276
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
277 except:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
278 stop_err("Expecting at least 1 input data file.")
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
279
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
280
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
281 #ffptxt / ffpaa / ffpry
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
282 if options.type in 'text':
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
283 command = 'ffptxt'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
284
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
285 else:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
286 if options.type == 'amino':
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
287 command = 'ffpaa'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
288 else:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
289 command = 'ffpry'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
290
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
291 if options.disable:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
292 command += ' -d'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
293
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
294 if options.multiple:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
295 command += ' -m'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
296
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
297 command += ' -l ' + str(options.length)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
298
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
299 if len(in_files): #Note: app isn't really suited to stdio
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
300 command += ' "' + '" "'.join(in_files) + '"'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
301
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
302 #ffpcol / ffpfilt
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
303 if options.filter != 'none':
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
304 command += ' | ffpfilt'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
305 if options.filter != 'count':
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
306 command += ' -' + options.filter
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
307 if options.lower > 0:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
308 command += ' --lower ' + str(options.lower)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
309 if options.upper > 0:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
310 command += ' --upper ' + str(options.upper)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
311
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
312 else:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
313 command += ' | ffpcol'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
314
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
315 if options.type in 'text':
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
316 command += ' -t'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
317
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
318 else:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
319
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
320 if options.type == 'amino':
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
321 command += ' -a'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
322
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
323 if options.disable:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
324 command += ' -d'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
325
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
326 #if options.normalize:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
327 command += ' | ffprwn'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
328
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
329 #Now create a taxonomy label file, ensuring a name exists for each profile.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
330 taxonomyNames = getTaxonomyNames(options.type, options.multiple, options.abbreviate, in_files, options.taxonomy)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
331 taxonomyTempFile = getTaxonomyFile(taxonomyNames)
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
332
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
333 # -p = Include phylip format 'infile' of the taxon names to use. Very simple, just a list of fasta identifier names.
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
334 command += ' | ffpjsd -p ' + taxonomyTempFile
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
335
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
336 if options.metric and len(options.metric) >0 :
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
337 command += ' --' + options.metric
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
338 if options.similarity:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
339 command += ' -s'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
340
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
341 # Generate Newick (.nhx) formatted tree if we have at least 3 taxonomy items:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
342 if options.tree:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
343 if len(taxonomyNames) > 2:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
344 command += ' | ffptree -q'
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
345 else:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
346 stop_err("For a phylogenetic tree display, one must have at least 3 ffp profiles.")
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
347
1
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
348 #print command
d1c88b118a3f Uploaded
damion
parents: 0
diff changeset
349
0
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
350 result = check_output(command)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
351 with open(options.output,'w') as fw:
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
352 fw.writelines(result)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
353 os.remove(taxonomyTempFile)
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
354
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
355 if __name__ == '__main__':
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
356
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
357 time_start = time.time()
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
358
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
359 reportEngine = ReportEngine()
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
360 reportEngine.__main__()
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
361
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
362 print('Execution time (seconds): ' + str(int(time.time()-time_start)))
eb6e5e78a066 Uploaded
damion
parents:
diff changeset
363