comparison phylogenies/phytab_raxml_pars.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/env python
2
3 ## This tool runs RAxML's parsimony inference on a phytab input.
4 ## If N = # of nodes requested in job runner, then N RAxML jobs will run simultaneously. Make sure that the
5 ## number of processors ('ppn') in the job runner matches the 'numthreads' commandline argument -T.
6 ##
7 ## Usage: ./phytab_raxml_using_ptree.parallel.py -i <phytabinput> -e <model> -f <modelfile> -T 4
8 ## example: ./phytab_raxml_using_ptree.parallel.py -i myphytab.txt -e PROTGAMMAWAG -f None -T 4
9 ## or: ./phytab_raxml_using_ptree.parallel.py -i myphtab.txt -e None -f modelsforeachpartition.txt -T 4
10 ##
11 ## outputs a tab-delimited file with gene-partition and newick parsimony tree on each line.
12
13 import optparse
14 import os
15 import subprocess
16 import multiprocessing
17
18 RESULTS_DIR = 'results'
19 RESULTS_FILE = 'parsimony_results.txt'
20 RAXML_PREFIX = 'RAxML_parsimonyTree.'
21
22 def unescape(string):
23 mapped_chars = {
24 '>': '__gt__',
25 '<': '__lt__',
26 "'": '__sq__',
27 '"': '__dq__',
28 '[': '__ob__',
29 ']': '__cb__',
30 '{': '__oc__',
31 '}': '__cc__',
32 '@': '__at__',
33 '\n': '__cn__',
34 '\r': '__cr__',
35 '\t': '__tc__',
36 '#': '__pd__'
37 }
38
39 for key, value in mapped_chars.iteritems():
40 string = string.replace(value, key)
41
42 return string
43
44
45 class Species:
46 def __init__(self, string):
47 lis = string.split('\t')
48 # print lis
49 self.species = lis[0]
50 self.gene = lis[1]
51 self.name = lis[2]
52 self.sequence = lis[3]
53
54 def toString(self):
55 return self.species + '\t' + self.sequence
56
57
58 class Gene:
59 def __init__(self, name):
60 self.name = name
61 self.count = 0
62 self.length = 0
63 self.species = []
64
65 def output(self):
66 file_name = self.name + ".phy"
67 location = RESULTS_DIR + os.sep + file_name
68 with open(location, 'w') as f:
69 f.write(str(self.count) + '\t' + str(self.length) + '\n')
70 for s in self.species:
71 f.write(s.toString())
72 return file_name
73
74 def add(self, species):
75 if species.name == "":
76 return
77 self.species.append(species)
78 self.count += 1
79 if self.length == 0:
80 self.length = len(species.sequence) - 1
81
82
83 def output_species(species):
84 file_name = species.gene + ".phy"
85 location = RESULTS_DIR + os.sep + file_name
86 with open(location, 'a') as f:
87 f.write(species.toString())
88 return file_name
89
90
91 def process_phytab(input):
92 files = set()
93 genes = dict()
94 with open(input) as f:
95 for line in f:
96 if len(line) < 4:
97 continue
98 species = Species(line)
99 if species.gene in genes:
100 genes[species.gene].add(species)
101 else:
102 gene = Gene(species.gene)
103 gene.add(species)
104 genes[gene.name] = gene
105 for k, gene in genes.iteritems():
106 files.add(gene.output())
107 return files
108
109
110 def runRaxml(list_of_files, evo, evoDict,NUMTHREADS):
111 for gene_file in list_of_files:
112 if gene_file.split(".")[0] in evoDict:
113 newEvo = evoDict[gene_file.split(".")[0]]
114 else:
115 newEvo = evo
116 # cpu_count = str(multiprocessing.cpu_count())
117 file_name = RESULTS_DIR + os.sep + gene_file
118 # to run parsimony trees:
119 popen = subprocess.Popen(['raxmlHPC-PTHREADS', '-T', cpu_count,'-f', 'd', '-s', file_name,'-y', '-m', newEvo, '-n', gene_file[:-4]+'.tre', '-p', '34'])
120 # to run likelihood trees:
121 # popen = subprocess.Popen(['raxmlHPC-PTHREADS', "-T", NUMTHREADS, "-s", file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34'])
122 popen.wait()
123
124
125 def toData(text, name):
126 text = name + "\t" + text.replace("\n", "\\n")
127 return text
128
129 def readEfile(efile):
130 evoDict = {}
131 with open(efile, "r") as f:
132 for line in f:
133 pair = line.split("\t")
134 evoDict[pair[0].strip()] = pair[1].strip()
135 return evoDict
136
137 def main():
138 usage = """%prog [options]
139 options (listed below) default to 'None' if omitted
140 """
141 parser = optparse.OptionParser(usage=usage)
142
143 parser.add_option(
144 '-i', '--in',
145 dest='input',
146 action='store',
147 type='string',
148 metavar="FILE",
149 help='Name of input data.')
150
151 parser.add_option(
152 '-e', '--evo',
153 dest='evo',
154 action='store',
155 type='string',
156 metavar="EVO",
157 help='Evolution model.')
158
159 parser.add_option(
160 '-f', '--evo-file',
161 dest='efile',
162 action='store',
163 type='string',
164 metavar="EVO_FILE",
165 help='Evolution model file. Format is gene_name [tab] evolution_model.')
166
167 parser.add_option('-T', '--numthread',dest='numthreads', action='store',type='int', metavar="NUMT", help='Provide number of threads for RAxML')
168 options, args = parser.parse_args()
169
170 os.mkdir(RESULTS_DIR)
171
172 list_of_species_files = process_phytab(unescape(options.input))
173
174
175 try:
176 evoDict = readEfile(unescape(options.efile))
177 except IOError:
178 print "Could not find evolution model file, using:", unescape(options.evo)
179 evoDict = {}
180
181 runRaxml(list_of_species_files, unescape(options.evo), evoDict,str(options.numthreads))
182
183 result = [file for file in os.listdir('./') if file.startswith(RAXML_PREFIX)]
184 with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f:
185 for file in result:
186 with open(file, "r") as r:
187 f.write(file[len(RAXML_PREFIX):-4] + '\t' + r.read())
188
189 if __name__ == '__main__':
190 main()