Mercurial > repos > jjjjia > cpo_prediction
comparison cpo_galaxy_tree.py @ 18:596bf8a792de draft
planemo upload
author | jjjjia |
---|---|
date | Tue, 28 Aug 2018 15:15:09 -0400 |
parents | a14b12a71a53 |
children | e5a7da2239af |
comparison
equal
deleted
inserted
replaced
17:ed3b291693fc | 18:596bf8a792de |
---|---|
37 import ete3 as e #conda ete3 3.1.1**** >requires pyqt5 | 37 import ete3 as e #conda ete3 3.1.1**** >requires pyqt5 |
38 | 38 |
39 | 39 |
40 #parses some parameters | 40 #parses some parameters |
41 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") | 41 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") |
42 parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate") | 42 parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="absolute file path to phylip tree") |
43 parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)") | 43 parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path to distance matrix") |
44 parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)") | 44 parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to metadata file") |
45 parser.add_option("-o", "--output_file", dest="outputFile", type="string", default="tree.png", help="Output graphics file. Use ending 'png', 'pdf' or 'svg' to specify file format.") | |
46 | |
47 # sensitive data adder | |
48 parser.add_option("-p", "--sensitive_data", dest="sensitivePath", type="string", default="", help="Spreadsheet (CSV) with sensitive metadata") | |
49 parser.add_option("-c", "--sensitive_cols", dest="sensitiveCols", type="string", default="", help="CSV list of column names from sensitive metadata spreadsheet to use as labels on dendrogram") | |
50 parser.add_option("-b", "--bcid_column", dest="bcidCol", type="string", default="BCID", help="Column name of BCID in sensitive metadata file") | |
51 parser.add_option("-n", "--missing_value", dest="naValue", type="string", default="NA", help="Value to write for missing data.") | |
52 | |
45 (options,args) = parser.parse_args() | 53 (options,args) = parser.parse_args() |
46 treePath = str(options.treePath).lstrip().rstrip() | 54 treePath = str(options.treePath).lstrip().rstrip() |
47 distancePath = str(options.distancePath).lstrip().rstrip() | 55 distancePath = str(options.distancePath).lstrip().rstrip() |
48 metadataPath = str(options.metadataPath).lstrip().rstrip() | 56 metadataPath = str(options.metadataPath).lstrip().rstrip() |
49 | 57 |
58 sensitivePath = str(options.sensitivePath).lstrip().rstrip() | |
59 sensitiveCols = str(options.sensitiveCols).lstrip().rstrip() | |
60 outputFile = str(options.outputFile).lstrip().rstrip() | |
61 bcidCol = str( str(options.bcidCol).lstrip().rstrip() ) | |
62 naValue = str( str(options.naValue).lstrip().rstrip() ) | |
63 | |
50 | 64 |
51 #region result objects | 65 #region result objects |
52 #define some objects to store values from results | 66 #define some objects to store values from results |
53 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). | 67 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). |
68 | |
69 class SensitiveMetadata(object): | |
70 def __init__(self): | |
71 x = pandas.read_csv( sensitivePath ) | |
72 col_names = [ s for s in sensitiveCols.split(',')] # convert to 0 offset | |
73 if not bcidCol in col_names: | |
74 col_names.append( bcidCol ) | |
75 all_cols = [ str(col) for col in x.columns ] | |
76 col_idxs = [ all_cols.index(col) for col in col_names ] | |
77 self.sensitive_data = x.iloc[:, col_idxs] | |
78 def get_columns(self): | |
79 cols = [ str(x) for x in self.sensitive_data.columns ] | |
80 return cols | |
81 def get_value( self, bcid, column_name ): # might be nice to get them all in single call via an input list of bcids ... for later | |
82 bcids= list( self.sensitive_data.loc[:, bcidCol ] ) # get the list of all BCIDs in sensitive metadata | |
83 if not bcid in bcids: | |
84 return naValue | |
85 else: | |
86 row_idx = bcids.index( bcid ) # lookup the row for this BCID | |
87 return self.sensitive_data.loc[ row_idx, column_name ] # return the one value based on the column (col_idx) and this row | |
88 | |
54 class workflowResult(object): | 89 class workflowResult(object): |
55 def __init__(self): | 90 def __init__(self): |
56 self.new = False | 91 self.new = False |
57 self.ID = "?" | 92 self.ID = "?" |
58 self.ExpectedSpecies = "?" | 93 self.ExpectedSpecies = "?" |
59 self.MLSTSpecies = "?" | 94 self.MLSTSpecies = "?" |
60 self.SequenceType = "?" | 95 self.SequenceType = "?" |
61 self.MLSTScheme = "?" | 96 self.MLSTScheme = "?" |
62 self.CarbapenemResistanceGenes ="?" | 97 self.CarbapenemResistanceGenes ="?" |
98 self.plasmidBestMatch ="?" | |
63 self.OtherAMRGenes="?" | 99 self.OtherAMRGenes="?" |
64 self.TotalPlasmids = -1 | 100 self.TotalPlasmids = -1 |
65 self.plasmids = [] | 101 self.plasmids = [] |
66 self.DefinitelyPlasmidContigs ="?" | 102 self.DefinitelyPlasmidContigs ="?" |
67 self.LikelyPlasmidContigs="?" | 103 self.LikelyPlasmidContigs="?" |
143 _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type']) | 179 _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type']) |
144 _results.MLSTScheme = (str(r.loc[r.index[i], 'MLST Scheme'])) | 180 _results.MLSTScheme = (str(r.loc[r.index[i], 'MLST Scheme'])) |
145 _results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes'])) | 181 _results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes'])) |
146 _results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes'])) | 182 _results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes'])) |
147 _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids']) | 183 _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids']) |
184 _results.plasmidBestMatch = str(r.loc[r.index[i], 'Plasmid Best Match']) | |
148 for j in range(0,_results.TotalPlasmids): | 185 for j in range(0,_results.TotalPlasmids): |
149 _plasmid = plasmidObj() | 186 _plasmid = plasmidObj() |
150 _plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j]) | 187 _plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j]) |
151 _plasmid.Num_Contigs = (((str(r.loc[r.index[i], 'Num_Contigs'])).split(";"))[j]) | 188 _plasmid.Num_Contigs = (((str(r.loc[r.index[i], 'Num_Contigs'])).split(";"))[j]) |
152 _plasmid.PlasmidLength = (((str(r.loc[r.index[i], 'Plasmid Length'])).split(";"))[j]) | 189 _plasmid.PlasmidLength = (((str(r.loc[r.index[i], 'Plasmid Length'])).split(";"))[j]) |
161 return _worflowResult | 198 return _worflowResult |
162 | 199 |
163 #endregion | 200 #endregion |
164 | 201 |
165 def Main(): | 202 def Main(): |
203 if len(sensitivePath)>0: | |
204 sensitive_meta_data = SensitiveMetadata() | |
205 | |
166 metadata = ParseWorkflowResults(metadataPath) | 206 metadata = ParseWorkflowResults(metadataPath) |
167 distance = read(distancePath) | 207 distance = read(distancePath) |
168 treeFile = "".join(read(treePath)) | 208 treeFile = "".join(read(treePath)) |
169 | 209 |
170 distanceDict = {} #store the distance matrix as rowname:list<string> | 210 distanceDict = {} #store the distance matrix as rowname:list<string> |
210 #plasmidIncs = sorted(plasmidIncs) | 250 #plasmidIncs = sorted(plasmidIncs) |
211 for n in t.traverse(): #loop through the nodes of a tree | 251 for n in t.traverse(): #loop through the nodes of a tree |
212 if (n.is_leaf() and n.name == "Reference"): | 252 if (n.is_leaf() and n.name == "Reference"): |
213 #if its the reference branch, populate the faces with column headers | 253 #if its the reference branch, populate the faces with column headers |
214 index = 0 | 254 index = 0 |
255 | |
256 if len(sensitivePath)>0: #sensitive metadat @ chris | |
257 for sensitive_data_column in sensitive_meta_data.get_columns(): | |
258 (t&"Reference").add_face(addFace(sensitive_data_column), index, "aligned") | |
259 index = index + 1 | |
260 | |
215 (t&"Reference").add_face(addFace("SampleID"), index, "aligned") | 261 (t&"Reference").add_face(addFace("SampleID"), index, "aligned") |
216 index = index + 1 | 262 index = index + 1 |
217 (t&"Reference").add_face(addFace("New?"), index, "aligned") | 263 (t&"Reference").add_face(addFace("New?"), index, "aligned") |
218 index = index + 1 | 264 index = index + 1 |
219 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node | 265 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node |
222 (t&"Reference").add_face(addFace("MLSTScheme"), index, "aligned") | 268 (t&"Reference").add_face(addFace("MLSTScheme"), index, "aligned") |
223 index = index + 1 | 269 index = index + 1 |
224 (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned") | 270 (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned") |
225 index = index + 1 | 271 index = index + 1 |
226 (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned") | 272 (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned") |
273 index = index + 1 | |
274 (t&"Reference").add_face(addFace("Plasmid Best Match"), index, "aligned") | |
227 index = index + 1 | 275 index = index + 1 |
228 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix | 276 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix |
229 (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned") | 277 (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned") |
230 index = index + len(distanceDict[list(distanceDict.keys())[0]]) | 278 index = index + len(distanceDict[list(distanceDict.keys())[0]]) |
231 elif (n.is_leaf() and not n.name == "Reference"): | 279 elif (n.is_leaf() and not n.name == "Reference"): |
232 #not reference branches, populate with metadata | 280 #not reference branches, populate with metadata |
233 index = 0 | 281 index = 0 |
282 | |
283 if len(sensitivePath)>0: #sensitive metadata @ chris | |
284 # pushing in sensitive data | |
285 for sensitive_data_column in sensitive_meta_data.get_columns(): | |
286 # tree uses bcids like BC18A021A_S12 | |
287 # while sens meta-data uses BC18A021A | |
288 # trim the "_S.*" if present | |
289 bcid = str(mData.ID) | |
290 if bcid.find( "_S" ) != -1: | |
291 bcid = bcid[ 0:bcid.find( "_S" ) ] | |
292 sens_col_val = sensitive_meta_data.get_value(bcid=bcid, column_name=sensitive_data_column ) | |
293 n.add_face(addFace(sens_col_val), index, "aligned") | |
294 index = index + 1 | |
295 | |
234 if (n.name.replace(".fa","") in metadata.keys()): | 296 if (n.name.replace(".fa","") in metadata.keys()): |
235 mData = metadata[n.name.replace(".fa","")] | 297 mData = metadata[n.name.replace(".fa","")] |
236 else: | 298 else: |
237 mData = metadata["na"] | 299 mData = metadata["na"] |
238 n.add_face(addFace(mData.ID), index, "aligned") | 300 n.add_face(addFace(mData.ID), index, "aligned") |
260 index = index + 1 | 322 index = index + 1 |
261 n.add_face(addFace(mData.SequenceType), index, "aligned") | 323 n.add_face(addFace(mData.SequenceType), index, "aligned") |
262 index = index + 1 | 324 index = index + 1 |
263 n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned") | 325 n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned") |
264 index = index + 1 | 326 index = index + 1 |
327 n.add_face(addFace(mData.plasmidBestMatch), index, "aligned") | |
328 index = index + 1 | |
265 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix | 329 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix |
266 if (n.name in distanceDict): #make sure the column is in the distance matrice | 330 if (n.name in distanceDict): #make sure the column is in the distance matrice |
267 n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned") | 331 n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned") |
268 | 332 |
269 t.render("./tree.pdf", w=5000,units="mm", tree_style=ts) #save it as a png. or an phyloxml | 333 t.render(outputFile, w=5000,units="mm", tree_style=ts) #save it as a png, pdf, svg or an phyloxml |
270 | 334 |
271 #endregion | 335 #endregion |
272 #endregion | 336 #endregion |
273 | 337 |
274 | 338 |