comparison cpo_galaxy_tree.py @ 6:cabceaa239e4 draft

planemo upload
author jjjjia
date Thu, 23 Aug 2018 12:21:15 -0400
parents fea89c4d5227
children 4d2777aa99db
comparison
equal deleted inserted replaced
5:698579246d0d 6:cabceaa239e4
22 import time 22 import time
23 import urllib.request 23 import urllib.request
24 import gzip 24 import gzip
25 import collections 25 import collections
26 import json 26 import json
27 import ete3
28 import numpy 27 import numpy
28 import ete3 as e
29
29 30
30 #parses some parameters 31 #parses some parameters
31 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") 32 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
32 parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate") 33 parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate")
33 parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)") 34 parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)")
103 with gzip.open(inputpath, 'rb') as f: 104 with gzip.open(inputpath, 'rb') as f:
104 gzContent = f.read() 105 gzContent = f.read()
105 with open(outputpath, 'wb') as out: 106 with open(outputpath, 'wb') as out:
106 out.write(gzContent) 107 out.write(gzContent)
107 return True 108 return True
109 def addFace(name):
110 #if its the reference branch, populate the faces with column headers
111 face = e.faces.TextFace(name,fsize=10,tight_text=True)
112 face.border.margin = 5
113 face.margin_right = 5
114 face.margin_left = 5
115 return face
108 #endregion 116 #endregion
109 117
110 #region functions to parse result files 118 #region functions to parse result files
111 def ParseWorkflowResults(pathToResult): 119 def ParseWorkflowResults(pathToResult):
112 _worflowResult = {} 120 _worflowResult = {}
113 r = pandas.read_csv(pathToResult, delimiter='\t', header=None) #read the kraken2report.tsv 121 r = pandas.read_csv(pathToResult, delimiter='\t', header=0)
114 r = r.replace(numpy.nan, '', regex=True) 122 r = r.replace(numpy.nan, '', regex=True)
115 for i in range(len(r.index)): 123 for i in range(len(r.index)):
116 _results = workflowResult() 124 _results = workflowResult()
117 if(str(r.iloc[i,0]).lower() == "new"): 125 if(str(r.loc[r.index[i], 'new']).lower() == "new"):
118 _results.new = True 126 _results.new = True
119 else: 127 else:
120 _results.new = False 128 _results.new = False
121 _results.ID = str(r.iloc[i,1]) 129 _results.ID = str(r.loc[r.index[i], 'ID'])
122 _results.ExpectedSpecies = str(r.iloc[i,2]) 130 _results.ExpectedSpecies = str(r.loc[r.index[i], 'Expected Species'])
123 _results.MLSTSpecies = str(r.iloc[i,3]) 131 _results.MLSTSpecies = str(r.loc[r.index[i], 'MLST Species'])
124 _results.SequenceType = str(r.iloc[i,4]) 132 _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type'])
125 _results.MLSTScheme = (str(r.iloc[i,5])) 133 _results.MLSTScheme = (str(r.loc[r.index[i], 'MLST Scheme']))
126 _results.CarbapenemResistanceGenes = (str(r.iloc[i,6])) 134 _results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes']))
127 _results.OtherAMRGenes = (str(r.iloc[i,7])) 135 _results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes']))
128 _results.TotalPlasmids = int(r.iloc[i,8]) 136 _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids'])
129 for j in range(0,_results.TotalPlasmids): 137 for j in range(0,_results.TotalPlasmids):
130 _plasmid = plasmidObj() 138 _plasmid = plasmidObj()
131 _plasmid.PlasmidsID =(((str(r.iloc[i,9])).split(";"))[j]) 139 _plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j])
132 _plasmid.Num_Contigs = (((str(r.iloc[i,10])).split(";"))[j]) 140 _plasmid.Num_Contigs = (((str(r.loc[r.index[i], 'Num_Contigs'])).split(";"))[j])
133 _plasmid.PlasmidLength = (((str(r.iloc[i,11])).split(";"))[j]) 141 _plasmid.PlasmidLength = (((str(r.loc[r.index[i], 'Plasmid Length'])).split(";"))[j])
134 _plasmid.PlasmidRepType = (((str(r.iloc[i,12])).split(";"))[j]) 142 _plasmid.PlasmidRepType = (((str(r.loc[r.index[i], 'Plasmid RepType'])).split(";"))[j])
135 _plasmid.PlasmidMobility = ((str(r.iloc[i,13])).split(";"))[j] 143 _plasmid.PlasmidMobility = ((str(r.loc[r.index[i], 'Plasmid Mobility'])).split(";"))[j]
136 _plasmid.NearestReference = ((str(r.iloc[i,14])).split(";"))[j] 144 _plasmid.NearestReference = ((str(r.loc[r.index[i], 'Nearest Reference'])).split(";"))[j]
137 _results.plasmids.append(_plasmid) 145 _results.plasmids.append(_plasmid)
138 _results.DefinitelyPlasmidContigs = (str(r.iloc[i,15])) 146 _results.DefinitelyPlasmidContigs = (str(r.loc[r.index[i], 'Definitely Plasmid Contigs']))
139 _results.LikelyPlasmidContigs = (str(r.iloc[i,16])) 147 _results.LikelyPlasmidContigs = (str(r.loc[r.index[i], 'Likely Plasmid Contigs']))
140 _results.row = "\t".join(str(x) for x in r.ix[i].tolist()) 148 _results.row = "\t".join(str(x) for x in r.ix[i].tolist())
141 _worflowResult[_results.ID] = _results 149 _worflowResult[_results.ID] = _results
142 return _worflowResult 150 return _worflowResult
143 151
144 #endregion 152 #endregion
146 def Main(): 154 def Main():
147 metadata = ParseWorkflowResults(metadataPath) 155 metadata = ParseWorkflowResults(metadataPath)
148 distance = read(distancePath) 156 distance = read(distancePath)
149 treeFile = "".join(read(treePath)) 157 treeFile = "".join(read(treePath))
150 158
151 distanceDict = {} 159 distanceDict = {} #store the distance matrix as rowname:list<string>
152 for i in range(len(distance)): 160 for i in range(len(distance)):
153 temp = distance[i].split("\t") 161 temp = distance[i].split("\t")
154 distanceDict[temp[0]] = temp[1:] 162 distanceDict[temp[0]] = temp[1:]
155 #region step5: tree construction 163 #region step5: tree construction
156 t = ete3.Tree(treeFile) 164
165 '''
166 #region create detailed tree
167
168 plasmidCount = 0
169 for n in t.traverse():
170 if (n.is_leaf() and not n.name == "Reference"):
171 mData = metadata[n.name.replace(".fa","")]
172 face = faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
173 face.border.margin = 5
174 face.margin_left = 10
175 face.margin_right = 10
176 n.add_face(face, 0, "aligned")
177 face = faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
178 face.border.margin = 5
179 face.margin_right = 10
180 n.add_face(face, 1, "aligned")
181 face = faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
182 face.border.margin = 5
183 face.margin_right = 10
184 n.add_face(face, 2, "aligned")
185 index = 3
186 if (mData.TotalPlasmids > plasmidCount):
187 plasmidCount = mData.TotalPlasmids
188 for i in range(0, mData.TotalPlasmids):
189 face = faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
190 face.border.margin = 5
191 face.margin_right = 10
192 n.add_face(face, index, "aligned")
193 index+=1
194 face = faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
195 face.border.margin = 5
196 face.margin_right = 10
197 n.add_face(face, index, "aligned")
198 index+=1
199
200 face = faces.TextFace("Species",fsize=10,tight_text=True)
201 face.border.margin = 5
202 face.margin_right = 10
203 face.margin_left = 10
204 (t&"Reference").add_face(face, 0, "aligned")
205 face = faces.TextFace("Sequence Type",fsize=10,tight_text=True)
206 face.border.margin = 5
207 face.margin_right = 10
208 (t&"Reference").add_face(face, 1, "aligned")
209 face = faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
210 face.border.margin = 5
211 face.margin_right = 10
212 (t&"Reference").add_face(face, 2, "aligned")
213 index = 3
214 for i in range(0, plasmidCount):
215 face = faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
216 face.border.margin = 5
217 face.margin_right = 10
218 (t&"Reference").add_face(face, index, "aligned")
219 index+=1
220 face = faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
221 face.border.margin = 5
222 face.margin_right = 10
223 (t&"Reference").add_face(face, index, "aligned")
224 index+=1
225
226 t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
227
228 #endregion
229 '''
230 #region create box tree
231 #region step5: tree construction
232 treeFile = "".join(read(treePath))
233 t = e.Tree(treeFile)
157 t.set_outgroup(t&"Reference") 234 t.set_outgroup(t&"Reference")
158 235
159 ts = ete3.TreeStyle() 236 #set the tree style
160 ts.show_leaf_name = True 237 ts = e.TreeStyle()
238 ts.show_leaf_name = False
161 ts.show_branch_length = True 239 ts.show_branch_length = True
162 ts.scale = 2000 #pixel per branch length unit 240 ts.scale = 2000 #pixel per branch length unit
163 ts.branch_vertical_margin = 15 #pixel between branches 241 ts.branch_vertical_margin = 15 #pixel between branches
164 style2 = ete3.NodeStyle() 242 style2 = e.NodeStyle()
165 style2["fgcolor"] = "#000000" 243 style2["fgcolor"] = "#000000"
166 style2["shape"] = "circle" 244 style2["shape"] = "circle"
167 style2["vt_line_color"] = "#0000aa" 245 style2["vt_line_color"] = "#0000aa"
168 style2["hz_line_color"] = "#0000aa" 246 style2["hz_line_color"] = "#0000aa"
169 style2["vt_line_width"] = 2 247 style2["vt_line_width"] = 2
170 style2["hz_line_width"] = 2 248 style2["hz_line_width"] = 2
171 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted 249 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
172 style2["hz_line_type"] = 0 250 style2["hz_line_type"] = 0
173 for n in t.traverse(): 251 for n in t.traverse():
174 n.set_style(style2) 252 n.set_style(style2)
175 ''' 253
176 #region create detailed tree 254 #find the plasmid origins
177
178 plasmidCount = 0
179 for n in t.traverse():
180 if (n.is_leaf() and not n.name == "Reference"):
181 mData = metadata[n.name.replace(".fa","")]
182 face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
183 face.border.margin = 5
184 face.margin_left = 10
185 face.margin_right = 10
186 n.add_face(face, 0, "aligned")
187 face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
188 face.border.margin = 5
189 face.margin_right = 10
190 n.add_face(face, 1, "aligned")
191 face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
192 face.border.margin = 5
193 face.margin_right = 10
194 n.add_face(face, 2, "aligned")
195 index = 3
196 if (mData.TotalPlasmids > plasmidCount):
197 plasmidCount = mData.TotalPlasmids
198 for i in range(0, mData.TotalPlasmids):
199 face = ete3.faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
200 face.border.margin = 5
201 face.margin_right = 10
202 n.add_face(face, index, "aligned")
203 index+=1
204 face = ete3.faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
205 face.border.margin = 5
206 face.margin_right = 10
207 n.add_face(face, index, "aligned")
208 index+=1
209
210 face = ete3.faces.TextFace("Species",fsize=10,tight_text=True)
211 face.border.margin = 5
212 face.margin_right = 10
213 face.margin_left = 10
214 (t&"Reference").add_face(face, 0, "aligned")
215 face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True)
216 face.border.margin = 5
217 face.margin_right = 10
218 (t&"Reference").add_face(face, 1, "aligned")
219 face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
220 face.border.margin = 5
221 face.margin_right = 10
222 (t&"Reference").add_face(face, 2, "aligned")
223 index = 3
224 for i in range(0, plasmidCount):
225 face = ete3.faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
226 face.border.margin = 5
227 face.margin_right = 10
228 (t&"Reference").add_face(face, index, "aligned")
229 index+=1
230 face = ete3.faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
231 face.border.margin = 5
232 face.margin_right = 10
233 (t&"Reference").add_face(face, index, "aligned")
234 index+=1
235
236 t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
237
238 #endregion
239 '''
240 #region create box tree
241 #region step5: tree construction
242 treeFile = "".join(read("./pipelineTest/tree.txt"))
243 t = ete3.Tree(treeFile)
244 t.set_outgroup(t&"Reference")
245
246 ts = ete3.TreeStyle()
247 ts.show_leaf_name = True
248 ts.show_branch_length = True
249 ts.scale = 2000 #pixel per branch length unit
250 ts.branch_vertical_margin = 15 #pixel between branches
251 style2 = ete3.NodeStyle()
252 style2["fgcolor"] = "#000000"
253 style2["shape"] = "circle"
254 style2["vt_line_color"] = "#0000aa"
255 style2["hz_line_color"] = "#0000aa"
256 style2["vt_line_width"] = 2
257 style2["hz_line_width"] = 2
258 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
259 style2["hz_line_type"] = 0
260 for n in t.traverse():
261 n.set_style(style2)
262 plasmidIncs = {} 255 plasmidIncs = {}
263 for key in metadata: 256 for key in metadata:
264 for plasmid in metadata[key].plasmids: 257 for plasmid in metadata[key].plasmids:
265 for inc in plasmid.PlasmidRepType.split(","): 258 for inc in plasmid.PlasmidRepType.split(","):
266 if (inc.lower().find("inc") > -1): 259 if (inc.lower().find("inc") > -1):
268 plasmidIncs[inc] = [metadata[key].ID] 261 plasmidIncs[inc] = [metadata[key].ID]
269 else: 262 else:
270 if metadata[key].ID not in plasmidIncs[inc]: 263 if metadata[key].ID not in plasmidIncs[inc]:
271 plasmidIncs[inc].append(metadata[key].ID) 264 plasmidIncs[inc].append(metadata[key].ID)
272 #plasmidIncs = sorted(plasmidIncs) 265 #plasmidIncs = sorted(plasmidIncs)
273 for n in t.traverse(): 266 for n in t.traverse(): #loop through the nodes of a tree
274 if (n.is_leaf() and n.name == "Reference"): 267 if (n.is_leaf() and n.name == "Reference"):
275 face = ete3.faces.TextFace("New?",fsize=10,tight_text=True) 268 #if its the reference branch, populate the faces with column headers
276 face.border.margin = 5 269 index = 0
277 face.margin_right = 5 270 (t&"Reference").add_face(addFace("SampleID"), index, "aligned")
278 face.margin_left = 5 271 index = index + 1
279 (t&"Reference").add_face(face, 0, "aligned") 272 (t&"Reference").add_face(addFace("New?"), index, "aligned")
273 index = index + 1
280 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node 274 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node
281 face = ete3.faces.TextFace(list(plasmidIncs.keys())[i],fsize=10,tight_text=True) 275 (t&"Reference").add_face(addFace(list(plasmidIncs.keys())[i]), i + index, "aligned")
282 face.border.margin = 5 276 index = index + len(plasmidIncs)
283 face.margin_right = 5 277 (t&"Reference").add_face(addFace("MLSTScheme"), index, "aligned")
284 face.margin_left = 5 278 index = index + 1
285 (t&"Reference").add_face(face, i + 1, "aligned") 279 (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned")
286 face = ete3.faces.TextFace("MLSTScheme",fsize=10,tight_text=True) 280 index = index + 1
287 face.border.margin = 5 281 (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned")
288 face.margin_right = 5 282 index = index + 1
289 face.margin_left = 5 283 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix
290 (t&"Reference").add_face(face, len(plasmidIncs) + 0 + 1, "aligned") 284 (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned")
291 face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True) 285 index = index + len(distanceDict[list(distanceDict.keys())[0]])
292 face.border.margin = 5 286 elif (n.is_leaf() and not n.name == "Reference"):
293 face.margin_right = 5 287 #not reference branches, populate with metadata
294 face.margin_left = 5 288 index = 0
295 (t&"Reference").add_face(face, len(plasmidIncs) + 1 + 1, "aligned") 289 mData = metadata[n.name.replace(".fa","")]
296 face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True) 290 n.add_face(addFace(mData.ID), index, "aligned")
297 face.border.margin = 5 291 index = index + 1
298 face.margin_right = 5 292 if (metadata[n.name.replace(".fa","")].new == True): #new column
299 face.margin_left = 5 293 face = e.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
300 (t&"Reference").add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
301 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the columns (aka the incs) to the reference node
302 face = ete3.faces.TextFace(distanceDict[list(distanceDict.keys())[0]][i],fsize=10,tight_text=True)
303 face.border.margin = 5
304 face.margin_right = 5
305 face.margin_left = 5
306 (t&"Reference").add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
307 elif (n.is_leaf() and not n.name == "Reference"):
308 if (metadata[n.name.replace(".fa","")].new == True):
309 face = ete3.faces.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
310 face.border.margin = 5 294 face.border.margin = 5
311 face.margin_right = 5 295 face.margin_right = 5
312 face.margin_left = 5 296 face.margin_left = 5
313 face.vt_align = 1 297 face.vt_align = 1
314 face.ht_align = 1 298 face.ht_align = 1
315 n.add_face(face, 0, "aligned") 299 n.add_face(face, index, "aligned")
300 index = index + 1
316 for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes 301 for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes
317 if (n.name.replace(".fa","") in plasmidIncs[incs]): 302 if (n.name.replace(".fa","") in plasmidIncs[incs]):
318 face = ete3.faces.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True) 303 face = e.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True)
319 face.border.margin = 5 304 face.border.margin = 5
320 face.margin_right = 5 305 face.margin_right = 5
321 face.margin_left = 5 306 face.margin_left = 5
322 face.vt_align = 1 307 face.vt_align = 1
323 face.ht_align = 1 308 face.ht_align = 1
324 n.add_face(face, list(plasmidIncs.keys()).index(incs) + 1, "aligned") 309 n.add_face(face, list(plasmidIncs.keys()).index(incs) + index, "aligned")
325 mData = metadata[n.name.replace(".fa","")] 310 index = index + len(plasmidIncs)
326 face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True) 311 n.add_face(addFace(mData.MLSTSpecies), index, "aligned")
327 face.border.margin = 5 312 index = index + 1
328 face.margin_right = 5 313 n.add_face(addFace(mData.SequenceType), index, "aligned")
329 face.margin_left = 5 314 index = index + 1
330 n.add_face(face, len(plasmidIncs) + 0 + 1, "aligned") 315 n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned")
331 face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True) 316 index = index + 1
332 face.border.margin = 5
333 face.margin_right = 5
334 face.margin_left = 5
335 n.add_face(face, len(plasmidIncs) + 1 + 1, "aligned")
336 face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
337 face.margin_right = 5
338 face.margin_left = 5
339 n.add_face(face, len(plasmidIncs) + 2 + 1, "aligned")
340 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix 317 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix
341 face = ete3.faces.TextFace(list(distanceDict[n.name])[i],fsize=10,tight_text=True) 318 n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned")
342 face.border.margin = 5 319
343 face.margin_right = 5 320 t.render("./tree.png", w=5000,units="mm", tree_style=ts) #save it as a png. or an phyloxml
344 face.margin_left = 5
345 n.add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned")
346
347 t.render("./tree.png", w=5000,units="mm", tree_style=ts)
348 321
349 #endregion 322 #endregion
350 #endregion 323 #endregion
351 324
352 325