comparison phylogenies/phytab_raxml_using_ptree.serial.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/env python
2
3 import optparse
4 import os
5 import subprocess
6 import multiprocessing
7
8 RESULTS_DIR = 'results'
9 RESULTS_FILE = 'results.phy'
10 RAXML_PREFIX = 'RAxML_result.'
11 NUMTHREADS = '4'
12
13 def unescape(string):
14 mapped_chars = {
15 '>': '__gt__',
16 '<': '__lt__',
17 "'": '__sq__',
18 '"': '__dq__',
19 '[': '__ob__',
20 ']': '__cb__',
21 '{': '__oc__',
22 '}': '__cc__',
23 '@': '__at__',
24 '\n': '__cn__',
25 '\r': '__cr__',
26 '\t': '__tc__',
27 '#': '__pd__'
28 }
29
30 for key, value in mapped_chars.iteritems():
31 string = string.replace(value, key)
32
33 return string
34
35
36 class Species:
37 def __init__(self, string):
38 lis = string.split('\t')
39 # print lis
40 self.species = lis[0]
41 self.gene = lis[1]
42 self.name = lis[2]
43 self.sequence = lis[3]
44
45 def toString(self):
46 return self.species + '\t' + self.sequence
47
48
49 class Gene:
50 def __init__(self, name):
51 self.name = name
52 self.count = 0
53 self.length = 0
54 self.species = []
55
56 def output(self):
57 file_name = self.name + ".phy"
58 location = RESULTS_DIR + os.sep + file_name
59 with open(location, 'w') as f:
60 f.write(str(self.count) + '\t' + str(self.length) + '\n')
61 for s in self.species:
62 f.write(s.toString())
63 return file_name
64
65 def add(self, species):
66 if species.name == "":
67 return
68 self.species.append(species)
69 self.count += 1
70 if self.length == 0:
71 self.length = len(species.sequence) - 1
72
73
74 def output_species(species):
75 file_name = species.gene + ".phy"
76 location = RESULTS_DIR + os.sep + file_name
77 with open(location, 'a') as f:
78 f.write(species.toString())
79 return file_name
80
81
82 def process_phytab(input):
83 files = set()
84 genes = dict()
85 with open(input) as f:
86 for line in f:
87 if len(line) < 4:
88 continue
89 species = Species(line)
90 if species.gene in genes:
91 genes[species.gene].add(species)
92 else:
93 gene = Gene(species.gene)
94 gene.add(species)
95 genes[gene.name] = gene
96 for k, gene in genes.iteritems():
97 files.add(gene.output())
98 return files
99
100
101 def runRaxml(list_of_files, evo, evoDict, list_of_ptrees):
102 # ptreelist = [file for file in list_of_ptrees if file.endswith('.ptre')]
103 for ptre in list_of_ptrees:
104 matching_gene_file = [file for file in list_of_files if file.startswith(ptre[:-5])]
105 gene_file = ''.join(matching_gene_file)
106
107 if gene_file.split(".")[0] in evoDict:
108 newEvo = evoDict[gene_file.split(".")[0]]
109 else:
110 newEvo = evo
111 # cpu_count = str(multiprocessing.cpu_count())
112 file_name = RESULTS_DIR + os.sep + gene_file
113 # to run parsimony trees:
114 # popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count,'-f', 'd', '-s', file_name,'-y', '-m', newEvo, '-n', gene_file[:-4]+'.tre', '-p', '34'])
115 # to run likelihood trees:
116 # popen = subprocess.Popen(['raxmlHPC-PTHREADS', "-T", cpu_count, "-s", file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34'])
117 # to run likelihood trees using starting tree:
118 popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', NUMTHREADS, '-f', 'e','-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-t', ptre])
119
120 popen.wait()
121
122 def readEfile(efile):
123 evoDict = {}
124 with open(efile, "r") as f:
125 for line in f:
126 pair = line.split("\t")
127 evoDict[pair[0].strip()] = pair[1].strip()
128 return evoDict
129
130 def main():
131 usage = """%prog [options]
132 options (listed below) default to 'None' if omitted
133 """
134 parser = optparse.OptionParser(usage=usage)
135
136 parser.add_option(
137 '-i', '--in',
138 dest='input',
139 action='store',
140 type='string',
141 metavar="FILE",
142 help='Name of input data.')
143
144 parser.add_option(
145 '-e', '--evo',
146 dest='evo',
147 action='store',
148 type='string',
149 metavar="EVO",
150 help='Evolution model.')
151
152 parser.add_option(
153 '-f', '--evo-file',
154 dest='efile',
155 action='store',
156 type='string',
157 metavar="EVO_FILE",
158 help='Evolution model file. Format is gene_name [tab] evolution_model.')
159
160 parser.add_option(
161 '-t', '--starting-tree',
162 dest='ptre',
163 action='store',
164 type='string',
165 metavar="PTRE",
166 help='File of starting trees.')
167
168 options, args = parser.parse_args()
169
170 os.mkdir(RESULTS_DIR)
171
172 list_of_species_files = process_phytab(unescape(options.input))
173
174 try:
175 evoDict = readEfile(unescape(options.efile))
176 except IOError:
177 print "Could not find evolution model file, using:", unescape(options.evo)
178 evoDict = {}
179
180 #read in starting treelist
181 with open(options.ptre, 'r') as MPtrees:
182 lines = MPtrees.readlines()
183 for each in lines:
184 if len(each)> 1:
185 line = each.split('\t')
186 gene = line[0]
187 parsTree = line[1]
188 tmptreefile = gene+'.ptre'
189 with open(tmptreefile, 'wb') as tmp:
190 tmp.write(parsTree)
191 list_of_ptrees = [file for file in os.listdir('./') if file.endswith('.ptre')]
192
193 runRaxml(list_of_species_files, unescape(options.evo), evoDict, list_of_ptrees)
194
195 result = [file for file in os.listdir('./') if file.startswith(RAXML_PREFIX)]
196 with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f:
197 for file in result:
198 with open(file, "r") as r:
199 f.write(file[len(RAXML_PREFIX):] + '\t' + r.read())
200
201 if __name__ == '__main__':
202 main()
203