Mercurial > repos > jjjjia > cpo_prediction
diff cpo_galaxy_tree.py @ 6:cabceaa239e4 draft
planemo upload
author | jjjjia |
---|---|
date | Thu, 23 Aug 2018 12:21:15 -0400 |
parents | fea89c4d5227 |
children | 4d2777aa99db |
line wrap: on
line diff
--- a/cpo_galaxy_tree.py Tue Aug 21 17:53:08 2018 -0400 +++ b/cpo_galaxy_tree.py Thu Aug 23 12:21:15 2018 -0400 @@ -24,8 +24,9 @@ import gzip import collections import json -import ete3 import numpy +import ete3 as e + #parses some parameters parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") @@ -105,38 +106,45 @@ with open(outputpath, 'wb') as out: out.write(gzContent) return True +def addFace(name): + #if its the reference branch, populate the faces with column headers + face = e.faces.TextFace(name,fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 5 + face.margin_left = 5 + return face #endregion #region functions to parse result files def ParseWorkflowResults(pathToResult): _worflowResult = {} - r = pandas.read_csv(pathToResult, delimiter='\t', header=None) #read the kraken2report.tsv + r = pandas.read_csv(pathToResult, delimiter='\t', header=0) r = r.replace(numpy.nan, '', regex=True) for i in range(len(r.index)): _results = workflowResult() - if(str(r.iloc[i,0]).lower() == "new"): + if(str(r.loc[r.index[i], 'new']).lower() == "new"): _results.new = True else: _results.new = False - _results.ID = str(r.iloc[i,1]) - _results.ExpectedSpecies = str(r.iloc[i,2]) - _results.MLSTSpecies = str(r.iloc[i,3]) - _results.SequenceType = str(r.iloc[i,4]) - _results.MLSTScheme = (str(r.iloc[i,5])) - _results.CarbapenemResistanceGenes = (str(r.iloc[i,6])) - _results.OtherAMRGenes = (str(r.iloc[i,7])) - _results.TotalPlasmids = int(r.iloc[i,8]) + _results.ID = str(r.loc[r.index[i], 'ID']) + _results.ExpectedSpecies = str(r.loc[r.index[i], 'Expected Species']) + _results.MLSTSpecies = str(r.loc[r.index[i], 'MLST Species']) + _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type']) + _results.MLSTScheme = (str(r.loc[r.index[i], 'MLST Scheme'])) + _results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes'])) + _results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes'])) + _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids']) for j in range(0,_results.TotalPlasmids): _plasmid = plasmidObj() - _plasmid.PlasmidsID =(((str(r.iloc[i,9])).split(";"))[j]) - _plasmid.Num_Contigs = (((str(r.iloc[i,10])).split(";"))[j]) - _plasmid.PlasmidLength = (((str(r.iloc[i,11])).split(";"))[j]) - _plasmid.PlasmidRepType = (((str(r.iloc[i,12])).split(";"))[j]) - _plasmid.PlasmidMobility = ((str(r.iloc[i,13])).split(";"))[j] - _plasmid.NearestReference = ((str(r.iloc[i,14])).split(";"))[j] + _plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j]) + _plasmid.Num_Contigs = (((str(r.loc[r.index[i], 'Num_Contigs'])).split(";"))[j]) + _plasmid.PlasmidLength = (((str(r.loc[r.index[i], 'Plasmid Length'])).split(";"))[j]) + _plasmid.PlasmidRepType = (((str(r.loc[r.index[i], 'Plasmid RepType'])).split(";"))[j]) + _plasmid.PlasmidMobility = ((str(r.loc[r.index[i], 'Plasmid Mobility'])).split(";"))[j] + _plasmid.NearestReference = ((str(r.loc[r.index[i], 'Nearest Reference'])).split(";"))[j] _results.plasmids.append(_plasmid) - _results.DefinitelyPlasmidContigs = (str(r.iloc[i,15])) - _results.LikelyPlasmidContigs = (str(r.iloc[i,16])) + _results.DefinitelyPlasmidContigs = (str(r.loc[r.index[i], 'Definitely Plasmid Contigs'])) + _results.LikelyPlasmidContigs = (str(r.loc[r.index[i], 'Likely Plasmid Contigs'])) _results.row = "\t".join(str(x) for x in r.ix[i].tolist()) _worflowResult[_results.ID] = _results return _worflowResult @@ -148,20 +156,90 @@ distance = read(distancePath) treeFile = "".join(read(treePath)) - distanceDict = {} + distanceDict = {} #store the distance matrix as rowname:list<string> for i in range(len(distance)): temp = distance[i].split("\t") distanceDict[temp[0]] = temp[1:] #region step5: tree construction - t = ete3.Tree(treeFile) + + ''' + #region create detailed tree + + plasmidCount = 0 + for n in t.traverse(): + if (n.is_leaf() and not n.name == "Reference"): + mData = metadata[n.name.replace(".fa","")] + face = faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_left = 10 + face.margin_right = 10 + n.add_face(face, 0, "aligned") + face = faces.TextFace(mData.SequenceType,fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 10 + n.add_face(face, 1, "aligned") + face = faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 10 + n.add_face(face, 2, "aligned") + index = 3 + if (mData.TotalPlasmids > plasmidCount): + plasmidCount = mData.TotalPlasmids + for i in range(0, mData.TotalPlasmids): + face = faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 10 + n.add_face(face, index, "aligned") + index+=1 + face = faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 10 + n.add_face(face, index, "aligned") + index+=1 + + face = faces.TextFace("Species",fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 10 + face.margin_left = 10 + (t&"Reference").add_face(face, 0, "aligned") + face = faces.TextFace("Sequence Type",fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 10 + (t&"Reference").add_face(face, 1, "aligned") + face = faces.TextFace("Carbapenamases",fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 10 + (t&"Reference").add_face(face, 2, "aligned") + index = 3 + for i in range(0, plasmidCount): + face = faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 10 + (t&"Reference").add_face(face, index, "aligned") + index+=1 + face = faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True) + face.border.margin = 5 + face.margin_right = 10 + (t&"Reference").add_face(face, index, "aligned") + index+=1 + + t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts) + + #endregion + ''' + #region create box tree + #region step5: tree construction + treeFile = "".join(read(treePath)) + t = e.Tree(treeFile) t.set_outgroup(t&"Reference") - ts = ete3.TreeStyle() - ts.show_leaf_name = True + #set the tree style + ts = e.TreeStyle() + ts.show_leaf_name = False ts.show_branch_length = True ts.scale = 2000 #pixel per branch length unit ts.branch_vertical_margin = 15 #pixel between branches - style2 = ete3.NodeStyle() + style2 = e.NodeStyle() style2["fgcolor"] = "#000000" style2["shape"] = "circle" style2["vt_line_color"] = "#0000aa" @@ -172,93 +250,8 @@ style2["hz_line_type"] = 0 for n in t.traverse(): n.set_style(style2) - ''' - #region create detailed tree - - plasmidCount = 0 - for n in t.traverse(): - if (n.is_leaf() and not n.name == "Reference"): - mData = metadata[n.name.replace(".fa","")] - face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_left = 10 - face.margin_right = 10 - n.add_face(face, 0, "aligned") - face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 10 - n.add_face(face, 1, "aligned") - face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 10 - n.add_face(face, 2, "aligned") - index = 3 - if (mData.TotalPlasmids > plasmidCount): - plasmidCount = mData.TotalPlasmids - for i in range(0, mData.TotalPlasmids): - face = ete3.faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 10 - n.add_face(face, index, "aligned") - index+=1 - face = ete3.faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 10 - n.add_face(face, index, "aligned") - index+=1 - face = ete3.faces.TextFace("Species",fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 10 - face.margin_left = 10 - (t&"Reference").add_face(face, 0, "aligned") - face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 10 - (t&"Reference").add_face(face, 1, "aligned") - face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 10 - (t&"Reference").add_face(face, 2, "aligned") - index = 3 - for i in range(0, plasmidCount): - face = ete3.faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 10 - (t&"Reference").add_face(face, index, "aligned") - index+=1 - face = ete3.faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 10 - (t&"Reference").add_face(face, index, "aligned") - index+=1 - - t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts) - - #endregion - ''' - #region create box tree - #region step5: tree construction - treeFile = "".join(read("./pipelineTest/tree.txt")) - t = ete3.Tree(treeFile) - t.set_outgroup(t&"Reference") - - ts = ete3.TreeStyle() - ts.show_leaf_name = True - ts.show_branch_length = True - ts.scale = 2000 #pixel per branch length unit - ts.branch_vertical_margin = 15 #pixel between branches - style2 = ete3.NodeStyle() - style2["fgcolor"] = "#000000" - style2["shape"] = "circle" - style2["vt_line_color"] = "#0000aa" - style2["hz_line_color"] = "#0000aa" - style2["vt_line_width"] = 2 - style2["hz_line_width"] = 2 - style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted - style2["hz_line_type"] = 0 - for n in t.traverse(): - n.set_style(style2) + #find the plasmid origins plasmidIncs = {} for key in metadata: for plasmid in metadata[key].plasmids: @@ -270,81 +263,61 @@ if metadata[key].ID not in plasmidIncs[inc]: plasmidIncs[inc].append(metadata[key].ID) #plasmidIncs = sorted(plasmidIncs) - for n in t.traverse(): + for n in t.traverse(): #loop through the nodes of a tree if (n.is_leaf() and n.name == "Reference"): - face = ete3.faces.TextFace("New?",fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 5 - face.margin_left = 5 - (t&"Reference").add_face(face, 0, "aligned") + #if its the reference branch, populate the faces with column headers + index = 0 + (t&"Reference").add_face(addFace("SampleID"), index, "aligned") + index = index + 1 + (t&"Reference").add_face(addFace("New?"), index, "aligned") + index = index + 1 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node - face = ete3.faces.TextFace(list(plasmidIncs.keys())[i],fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 5 - face.margin_left = 5 - (t&"Reference").add_face(face, i + 1, "aligned") - face = ete3.faces.TextFace("MLSTScheme",fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 5 - face.margin_left = 5 - (t&"Reference").add_face(face, len(plasmidIncs) + 0 + 1, "aligned") - face = ete3.faces.TextFace("Sequence Type",fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 5 - face.margin_left = 5 - (t&"Reference").add_face(face, len(plasmidIncs) + 1 + 1, "aligned") - face = ete3.faces.TextFace("Carbapenamases",fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 5 - face.margin_left = 5 - (t&"Reference").add_face(face, len(plasmidIncs) + 2 + 1, "aligned") - for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the columns (aka the incs) to the reference node - face = ete3.faces.TextFace(distanceDict[list(distanceDict.keys())[0]][i],fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 5 - face.margin_left = 5 - (t&"Reference").add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned") - elif (n.is_leaf() and not n.name == "Reference"): - if (metadata[n.name.replace(".fa","")].new == True): - face = ete3.faces.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True) + (t&"Reference").add_face(addFace(list(plasmidIncs.keys())[i]), i + index, "aligned") + index = index + len(plasmidIncs) + (t&"Reference").add_face(addFace("MLSTScheme"), index, "aligned") + index = index + 1 + (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned") + index = index + 1 + (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned") + index = index + 1 + for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix + (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned") + index = index + len(distanceDict[list(distanceDict.keys())[0]]) + elif (n.is_leaf() and not n.name == "Reference"): + #not reference branches, populate with metadata + index = 0 + mData = metadata[n.name.replace(".fa","")] + n.add_face(addFace(mData.ID), index, "aligned") + index = index + 1 + if (metadata[n.name.replace(".fa","")].new == True): #new column + face = e.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True) face.border.margin = 5 face.margin_right = 5 face.margin_left = 5 face.vt_align = 1 face.ht_align = 1 - n.add_face(face, 0, "aligned") + n.add_face(face, index, "aligned") + index = index + 1 for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes if (n.name.replace(".fa","") in plasmidIncs[incs]): - face = ete3.faces.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True) + face = e.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True) face.border.margin = 5 face.margin_right = 5 face.margin_left = 5 face.vt_align = 1 face.ht_align = 1 - n.add_face(face, list(plasmidIncs.keys()).index(incs) + 1, "aligned") - mData = metadata[n.name.replace(".fa","")] - face = ete3.faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 5 - face.margin_left = 5 - n.add_face(face, len(plasmidIncs) + 0 + 1, "aligned") - face = ete3.faces.TextFace(mData.SequenceType,fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 5 - face.margin_left = 5 - n.add_face(face, len(plasmidIncs) + 1 + 1, "aligned") - face = ete3.faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True) - face.margin_right = 5 - face.margin_left = 5 - n.add_face(face, len(plasmidIncs) + 2 + 1, "aligned") + n.add_face(face, list(plasmidIncs.keys()).index(incs) + index, "aligned") + index = index + len(plasmidIncs) + n.add_face(addFace(mData.MLSTSpecies), index, "aligned") + index = index + 1 + n.add_face(addFace(mData.SequenceType), index, "aligned") + index = index + 1 + n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned") + index = index + 1 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix - face = ete3.faces.TextFace(list(distanceDict[n.name])[i],fsize=10,tight_text=True) - face.border.margin = 5 - face.margin_right = 5 - face.margin_left = 5 - n.add_face(face, len(plasmidIncs) + 2 + i + 1 + 1, "aligned") - - t.render("./tree.png", w=5000,units="mm", tree_style=ts) + n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned") + + t.render("./tree.png", w=5000,units="mm", tree_style=ts) #save it as a png. or an phyloxml #endregion #endregion