comparison phylogenies/phytab_raxml_using_ptree.parallel.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/env python
2
3 ##
4 ## This tool runs RAxML to optimize branch lengths on a tree. (Multiple trees if multi-gene phytab provided).
5 ## If N = # of nodes requested in job runner, then N RAxML jobs will run simultaneously. Make sure that the
6 ## number of processors ('ppn') in the job runner matches the 'numthreads' commandline arguement.
7 ##
8 ## Usage: ./phytab_raxml_using_ptree.parallel.py -i <phytabinput> -e <model> -f <modelfile> -T 4
9 ## example: ./phytab_raxml_using_ptree.parallel.py -i myphytab.txt -e PROTGAMMAWAG -f None -T 4
10 ## or: ./phytab_raxml_using_ptree.parallel.py -i myphtab.txt -e None -f modelsforeachpartition.txt -T 4
11 ##
12 import optparse
13 import os
14 import subprocess
15 import multiprocessing
16
17 RESULTS_DIR = 'results'
18 RESULTS_FILE = 'results.phy'
19 RAXML_PREFIX = 'RAxML_result.'
20 #NUMTHREADS = '4'
21
22 def unescape(string):
23 mapped_chars = {
24 '>': '__gt__',
25 '<': '__lt__',
26 "'": '__sq__',
27 '"': '__dq__',
28 '[': '__ob__',
29 ']': '__cb__',
30 '{': '__oc__',
31 '}': '__cc__',
32 '@': '__at__',
33 '\n': '__cn__',
34 '\r': '__cr__',
35 '\t': '__tc__',
36 '#': '__pd__'
37 }
38
39 for key, value in mapped_chars.iteritems():
40 string = string.replace(value, key)
41
42 return string
43
44
45 class Species:
46 def __init__(self, string):
47 lis = string.split('\t')
48 # print lis
49 self.species = lis[0]
50 self.gene = lis[1]
51 self.name = lis[2]
52 self.sequence = lis[3]
53
54 def toString(self):
55 return self.species + '\t' + self.sequence
56
57 class Gene:
58 def __init__(self, name):
59 self.name = name
60 self.count = 0
61 self.length = 0
62 self.species = []
63
64 def output(self):
65 file_name = self.name + ".phy"
66 location = RESULTS_DIR + os.sep + file_name
67 with open(location, 'w') as f:
68 f.write(str(self.count) + '\t' + str(self.length) + '\n')
69 for s in self.species:
70 f.write(s.toString())
71 return file_name
72
73 def add(self, species):
74 if species.name == "":
75 return
76 self.species.append(species)
77 self.count += 1
78 if self.length == 0:
79 self.length = len(species.sequence) - 1
80
81
82 def output_species(species):
83 file_name = species.gene + ".phy"
84 location = RESULTS_DIR + os.sep + file_name
85 with open(location, 'a') as f:
86 f.write(species.toString())
87 return file_name
88
89
90 def process_phytab(input):
91 files = set()
92 genes = dict()
93 with open(input) as f:
94 for line in f:
95 if len(line) < 4:
96 continue
97 species = Species(line)
98 if species.gene in genes:
99 genes[species.gene].add(species)
100 else:
101 gene = Gene(species.gene)
102 gene.add(species)
103 genes[gene.name] = gene
104 for k, gene in genes.iteritems():
105 files.add(gene.output())
106 return files
107
108
109 def runRaxml(list_of_files, evo, evoDict, list_of_ptrees,NUMTHREADS):
110 list_of_ptrees = sorted(list_of_ptrees)
111 count = 0
112 for ptre in list_of_ptrees:
113 count+=1
114 matching_gene_file = [file for file in list_of_files if file.startswith(ptre[:-5])]
115 gene_file = ''.join(matching_gene_file)
116
117 if gene_file.split(".")[0] in evoDict:
118 newEvo = evoDict[gene_file.split(".")[0]]
119 else:
120 newEvo = evo
121 #cpu_count = str(multiprocessing.cpu_count())
122 file_name = RESULTS_DIR + os.sep + gene_file
123 # to run parsimony trees:
124 # popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count,'-f', 'd', '-s', file_name,'-y', '-m', newEvo, '-n', gene_file[:-4]+'.tre', '-p', '34'])
125 # to run likelihood trees:
126 # popen = subprocess.Popen(['raxmlHPC-PTHREADS', "-T", cpu_count, "-s", file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34'])
127 # to run likelihood trees using starting tree:
128 # popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count, '-f', 'e','-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-t', ptre])
129 # popen.wait()
130 raxml_cmd = ['raxmlHPC-PTHREADS', '-T', NUMTHREADS, '-f' 'e', '-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-t', ptre]
131 if count == len(list_of_ptrees):
132 run = subprocess.Popen(raxml_cmd)
133 run.wait()
134 else:
135 run = subprocess.Popen(raxml_cmd)
136 run.communicate()[0]
137
138 def readEfile(efile):
139 evoDict = {}
140 with open(efile, "r") as f:
141 for line in f:
142 pair = line.split("\t")
143 evoDict[pair[0].strip()] = pair[1].strip()
144 return evoDict
145
146 def main():
147 usage = """%prog [options]
148 options (listed below) default to 'None' if omitted
149 """
150 parser = optparse.OptionParser(usage=usage)
151
152 parser.add_option(
153 '-i', '--in',
154 dest='input',
155 action='store',
156 type='string',
157 metavar="FILE",
158 help='Name of input data.')
159
160 parser.add_option(
161 '-e', '--evo',
162 dest='evo',
163 action='store',
164 type='string',
165 metavar="EVO",
166 help='Evolution model.')
167
168 parser.add_option(
169 '-f', '--evo-file',
170 dest='efile',
171 action='store',
172 type='string',
173 metavar="EVO_FILE",
174 help='Evolution model file. Format is gene_name [tab] evolution_model.')
175
176 parser.add_option(
177 '-t', '--starting-tree',
178 dest='ptre',
179 action='store',
180 type='string',
181 metavar="PTRE",
182 help='File of starting trees.')
183
184 parser.add_option('-T', '--numthread',dest='numthreads', action='store',type='int', metavar="NUMT", help='Provide number of threads for RAxML')
185 options, args = parser.parse_args()
186
187 os.mkdir(RESULTS_DIR)
188
189 list_of_species_files = process_phytab(unescape(options.input))
190
191 try:
192 evoDict = readEfile(unescape(options.efile))
193 except IOError:
194 print "No sequence model file provide...using", unescape(options.evo), "as the model"
195 evoDict = {}
196
197 #read in starting treelist
198 with open(options.ptre, 'r') as MPtrees:
199 lines = MPtrees.readlines()
200 for each in lines:
201 if len(each)> 1:
202 line = each.split('\t')
203 gene = line[0]
204 parsTree = line[1]
205 tmptreefile = gene+'.ptre'
206 with open(tmptreefile, 'wb') as tmp:
207 tmp.write(parsTree)
208 list_of_ptrees = [file for file in os.listdir('./') if file.endswith('.ptre')]
209
210 runRaxml(list_of_species_files, unescape(options.evo), evoDict, list_of_ptrees, str(options.numthreads))
211
212 result = [file for file in os.listdir('./') if file.startswith(RAXML_PREFIX)]
213 result = sorted(result)
214 with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f:
215 for file in result:
216 with open(file, "r") as r:
217 f.write(file[len(RAXML_PREFIX):] + '\t' + r.read())
218
219 if __name__ == '__main__':
220 main()