12
+ − 1 #!/home/jjjjia/.conda/envs/py36/bin/python
+ − 2
+ − 3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python
+ − 4 #$ -V # Pass environment variables to the job
+ − 5 #$ -N CPO_pipeline # Replace with a more specific job name
+ − 6 #$ -wd /home/jjjjia/testCases # Use the current working dir
+ − 7 #$ -pe smp 1 # Parallel Environment (how many cores)
+ − 8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core*
+ − 9 #$ -e ./logs/$JOB_ID.err
+ − 10 #$ -o ./logs/$JOB_ID.log
+ − 11 #$ -m ea
+ − 12 #$ -M bja20@sfu.ca
+ − 13
+ − 14 # >python cpo_galaxy_tree.py -t /path/to/tree.ph -d /path/to/distance/matrix -m /path/to/metadata
+ − 15 # python cpo_galaxy_tree.py -t tree.txt -d ./dist.tabular -m ./metadata.tsv
+ − 16
+ − 17 # <requirements>
+ − 18 # <requirement type="package" version="0.23.4">pandas</requirement>
+ − 19 # <requirement type="package" version="3.6">python</requirement>
+ − 20 # <requirement type="package" version="3.1.1">ete3</requirement>
+ − 21 # <requirement type="package" version="5.9.3">pyqt</requirement>
+ − 22 # </requirements>
+ − 23
+ − 24 import subprocess
+ − 25 import pandas #conda pandas
+ − 26 import optparse
+ − 27 import os
+ − 28 import datetime
+ − 29 import sys
+ − 30 import time
+ − 31 import urllib.request
+ − 32 import gzip
+ − 33 import collections
+ − 34 import json
+ − 35 import numpy #conda numpy
+ − 36 import ete3 as e #conda ete3 3.1.1**** >requires pyqt5
+ − 37 import csv
+ − 38
+ − 39 #parses some parameters
+ − 40 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
+ − 41 parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate")
+ − 42 parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)")
+ − 43 parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)")
+ − 44 parser.add_option("-p", "--sensitive_data", dest="sensitivePath", type="string", default="", help="Spreadsheet (CSV) with sensitive metadata")
+ − 45 parser.add_option("-c", "--sensitive_cols", dest="sensitiveCols", type="string", default="", help="CSV list of column names from sensitive metadata spreadsheet to use as labels on dendrogram")
+ − 46 parser.add_option("-o", "--output_file", dest="outputFile", type="string", default="tree.png", help="Output graphics file. Use ending 'png', 'pdf' or 'svg' to specify file format.")
+ − 47 parser.add_option("-b", "--bcid_column", dest="bcidCol", type="string", default="BCID", help="Column name of BCID in sensitive metadata file")
+ − 48 parser.add_option("-n", "--missing_value", dest="naValue", type="string", default="NA", help="Value to write for missing data.")
+ − 49
+ − 50 (options,args) = parser.parse_args()
+ − 51 treePath = str(options.treePath).lstrip().rstrip()
+ − 52 distancePath = str(options.distancePath).lstrip().rstrip()
+ − 53 metadataPath = str(options.metadataPath).lstrip().rstrip()
+ − 54 sensitivePath = str(options.sensitivePath).lstrip().rstrip()
+ − 55 sensitiveCols = str(options.sensitiveCols).lstrip().rstrip()
+ − 56 outputFile = str(options.outputFile).lstrip().rstrip()
+ − 57 bcidCol = str( str(options.bcidCol).lstrip().rstrip() )
+ − 58 naValue = str( str(options.naValue).lstrip().rstrip() )
+ − 59
+ − 60 if len(sensitivePath) == 0:
+ − 61 print("Must give a file with sensitive meta data. Option -p, or --sensitive_data")
+ − 62
+ − 63 ### test values to uncomment
+ − 64 # sensitivePath = "./sensitive_metadata.csv"
+ − 65 # sensitiveCols = "Name,Care facility"
+ − 66 # outputFile = "newtree_test.png"
+ − 67 # bcidCol = "BCID"
+ − 68
+ − 69
+ − 70
+ − 71 import pandas
+ − 72 class SensitiveMetadata(object):
+ − 73 def __init__(self):
+ − 74 x = pandas.read_csv( sensitivePath )
+ − 75 col_names = [ s for s in sensitiveCols.split(',')] # convert to 0 offset
+ − 76 if not bcidCol in col_names:
+ − 77 col_names.append( bcidCol )
+ − 78 all_cols = [ str(col) for col in x.columns ]
+ − 79 col_idxs = [ all_cols.index(col) for col in col_names ]
+ − 80 self.sensitive_data = x.iloc[:, col_idxs]
+ − 81 def get_columns(self):
+ − 82 cols = [ str(x) for x in self.sensitive_data.columns ]
+ − 83 return cols
+ − 84 def get_value( self, bcid, column_name ): # might be nice to get them all in single call via an input list of bcids ... for later
+ − 85 bcids= list( self.sensitive_data.loc[:, bcidCol ] ) # get the list of all BCIDs in sensitive metadata
+ − 86 if not bcid in bcids:
+ − 87 return naValue
+ − 88 else:
+ − 89 row_idx = bcids.index( bcid ) # lookup the row for this BCID
+ − 90 return self.sensitive_data.loc[ row_idx, column_name ] # return the one value based on the column (col_idx) and this row
+ − 91
+ − 92
+ − 93 #region result objects
+ − 94 #define some objects to store values from results
+ − 95 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
+ − 96 class workflowResult(object):
+ − 97 def __init__(self):
+ − 98 self.new = False
+ − 99 self.ID = ""
+ − 100 self.ExpectedSpecies = ""
+ − 101 self.MLSTSpecies = ""
+ − 102 self.SequenceType = ""
+ − 103 self.MLSTScheme = ""
+ − 104 self.CarbapenemResistanceGenes =""
+ − 105 self.OtherAMRGenes=""
+ − 106 self.TotalPlasmids = 0
+ − 107 self.plasmids = []
+ − 108 self.DefinitelyPlasmidContigs =""
+ − 109 self.LikelyPlasmidContigs=""
+ − 110 self.row = ""
+ − 111 class plasmidObj(object):
+ − 112 def __init__(self):
+ − 113 self.PlasmidsID = 0
+ − 114 self.Num_Contigs = 0
+ − 115 self.PlasmidLength = 0
+ − 116 self.PlasmidRepType = ""
+ − 117 self.PlasmidMobility = ""
+ − 118 self.NearestReference = ""
+ − 119
+ − 120 #endregion
+ − 121
+ − 122 #region useful functions
+ − 123 def read(path): #read in a text file to a list
+ − 124 return [line.rstrip('\n') for line in open(path)]
+ − 125 def execute(command): #subprocess.popen call bash command
+ − 126 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ − 127
+ − 128 # Poll process for new output until finished
+ − 129 while True:
+ − 130 nextline = process.stdout.readline()
+ − 131 if nextline == '' and process.poll() is not None:
+ − 132 break
+ − 133 sys.stdout.write(nextline)
+ − 134 sys.stdout.flush()
+ − 135
+ − 136 output = process.communicate()[0]
+ − 137 exitCode = process.returncode
+ − 138
+ − 139 if (exitCode == 0):
+ − 140 return output
+ − 141 else:
+ − 142 raise subprocess.CalledProcessError(exitCode, command)
+ − 143 def httpGetFile(url, filepath=""): #download a file from the web
+ − 144 if (filepath == ""):
+ − 145 return urllib.request.urlretrieve(url)
+ − 146 else:
+ − 147 urllib.request.urlretrieve(url, filepath)
+ − 148 return True
+ − 149 def gunzip(inputpath="", outputpath=""): #gunzip
+ − 150 if (outputpath == ""):
+ − 151 with gzip.open(inputpath, 'rb') as f:
+ − 152 gzContent = f.read()
+ − 153 return gzContent
+ − 154 else:
+ − 155 with gzip.open(inputpath, 'rb') as f:
+ − 156 gzContent = f.read()
+ − 157 with open(outputpath, 'wb') as out:
+ − 158 out.write(gzContent)
+ − 159 return True
+ − 160 def addFace(name): #function to add a facet to a tree
+ − 161 #if its the reference branch, populate the faces with column headers
+ − 162 face = e.faces.TextFace(name,fsize=10,tight_text=True)
+ − 163 face.border.margin = 5
+ − 164 face.margin_right = 5
+ − 165 face.margin_left = 5
+ − 166 return face
+ − 167 #endregion
+ − 168
+ − 169 #region functions to parse result files
+ − 170 def ParseWorkflowResults(pathToResult):
+ − 171 _worflowResult = {}
+ − 172 r = pandas.read_csv(pathToResult, delimiter='\t', header=0)
+ − 173 r = r.replace(numpy.nan, '', regex=True)
+ − 174 for i in range(len(r.index)):
+ − 175 _results = workflowResult()
+ − 176 if(str(r.loc[r.index[i], 'new']).lower() == "new"):
+ − 177 _results.new = True
+ − 178 else:
+ − 179 _results.new = False
+ − 180 _results.ID = str(r.loc[r.index[i], 'ID'])
+ − 181 _results.ExpectedSpecies = str(r.loc[r.index[i], 'Expected Species'])
+ − 182 _results.MLSTSpecies = str(r.loc[r.index[i], 'MLST Species'])
+ − 183 _results.SequenceType = str(r.loc[r.index[i], 'Sequence Type'])
+ − 184 _results.MLSTScheme = (str(r.loc[r.index[i], 'MLST Scheme']))
+ − 185 _results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes']))
+ − 186 _results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes']))
+ − 187 _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids'])
+ − 188 for j in range(0,_results.TotalPlasmids):
+ − 189 _plasmid = plasmidObj()
+ − 190 _plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j])
+ − 191 _plasmid.Num_Contigs = (((str(r.loc[r.index[i], 'Num_Contigs'])).split(";"))[j])
+ − 192 _plasmid.PlasmidLength = (((str(r.loc[r.index[i], 'Plasmid Length'])).split(";"))[j])
+ − 193 _plasmid.PlasmidRepType = (((str(r.loc[r.index[i], 'Plasmid RepType'])).split(";"))[j])
+ − 194 _plasmid.PlasmidMobility = ((str(r.loc[r.index[i], 'Plasmid Mobility'])).split(";"))[j]
+ − 195 _plasmid.NearestReference = ((str(r.loc[r.index[i], 'Nearest Reference'])).split(";"))[j]
+ − 196 _results.plasmids.append(_plasmid)
+ − 197 _results.DefinitelyPlasmidContigs = (str(r.loc[r.index[i], 'Definitely Plasmid Contigs']))
+ − 198 _results.LikelyPlasmidContigs = (str(r.loc[r.index[i], 'Likely Plasmid Contigs']))
+ − 199 _results.row = "\t".join(str(x) for x in r.ix[i].tolist())
+ − 200 _worflowResult[_results.ID] = _results
+ − 201 return _worflowResult
+ − 202
+ − 203 #endregion
+ − 204
+ − 205 def Main():
+ − 206 sensitive_meta_data = SensitiveMetadata()
+ − 207 # print( sensitive_meta_data.get_columns() )
+ − 208 metadata = ParseWorkflowResults(metadataPath)
+ − 209 distance = read(distancePath)
+ − 210 treeFile = "".join(read(treePath))
+ − 211
+ − 212 distanceDict = {} #store the distance matrix as rowname:list<string>
+ − 213 for i in range(len(distance)):
+ − 214 temp = distance[i].split("\t")
+ − 215 distanceDict[temp[0]] = temp[1:]
+ − 216 #region step5: tree construction
+ − 217
+ − 218 '''
+ − 219 #region create detailed tree
+ − 220
+ − 221 plasmidCount = 0
+ − 222 for n in t.traverse():
+ − 223 if (n.is_leaf() and not n.name == "Reference"):
+ − 224 mData = metadata[n.name.replace(".fa","")]
+ − 225 face = faces.TextFace(mData.MLSTSpecies,fsize=10,tight_text=True)
+ − 226 face.border.margin = 5
+ − 227 face.margin_left = 10
+ − 228 face.margin_right = 10
+ − 229 n.add_face(face, 0, "aligned")
+ − 230 face = faces.TextFace(mData.SequenceType,fsize=10,tight_text=True)
+ − 231 face.border.margin = 5
+ − 232 face.margin_right = 10
+ − 233 n.add_face(face, 1, "aligned")
+ − 234 face = faces.TextFace(mData.CarbapenemResistanceGenes,fsize=10,tight_text=True)
+ − 235 face.border.margin = 5
+ − 236 face.margin_right = 10
+ − 237 n.add_face(face, 2, "aligned")
+ − 238 index = 3
+ − 239 if (mData.TotalPlasmids > plasmidCount):
+ − 240 plasmidCount = mData.TotalPlasmids
+ − 241 for i in range(0, mData.TotalPlasmids):
+ − 242 face = faces.TextFace(mData.plasmids[i].PlasmidRepType,fsize=10,tight_text=True)
+ − 243 face.border.margin = 5
+ − 244 face.margin_right = 10
+ − 245 n.add_face(face, index, "aligned")
+ − 246 index+=1
+ − 247 face = faces.TextFace(mData.plasmids[i].PlasmidMobility,fsize=10,tight_text=True)
+ − 248 face.border.margin = 5
+ − 249 face.margin_right = 10
+ − 250 n.add_face(face, index, "aligned")
+ − 251 index+=1
+ − 252
+ − 253 face = faces.TextFace("Species",fsize=10,tight_text=True)
+ − 254 face.border.margin = 5
+ − 255 face.margin_right = 10
+ − 256 face.margin_left = 10
+ − 257 (t&"Reference").add_face(face, 0, "aligned")
+ − 258 face = faces.TextFace("Sequence Type",fsize=10,tight_text=True)
+ − 259 face.border.margin = 5
+ − 260 face.margin_right = 10
+ − 261 (t&"Reference").add_face(face, 1, "aligned")
+ − 262 face = faces.TextFace("Carbapenamases",fsize=10,tight_text=True)
+ − 263 face.border.margin = 5
+ − 264 face.margin_right = 10
+ − 265 (t&"Reference").add_face(face, 2, "aligned")
+ − 266 index = 3
+ − 267 for i in range(0, plasmidCount):
+ − 268 face = faces.TextFace("plasmid " + str(i) + " replicons",fsize=10,tight_text=True)
+ − 269 face.border.margin = 5
+ − 270 face.margin_right = 10
+ − 271 (t&"Reference").add_face(face, index, "aligned")
+ − 272 index+=1
+ − 273 face = faces.TextFace("plasmid " + str(i) + " mobility",fsize=10,tight_text=True)
+ − 274 face.border.margin = 5
+ − 275 face.margin_right = 10
+ − 276 (t&"Reference").add_face(face, index, "aligned")
+ − 277 index+=1
+ − 278
+ − 279 t.render("./pipelineTest/tree.png", w=5000,units="mm", tree_style=ts)
+ − 280
+ − 281 #endregion
+ − 282 '''
+ − 283 #region create box tree
+ − 284 #region step5: tree construction
+ − 285 treeFile = "".join(read(treePath))
+ − 286 t = e.Tree(treeFile)
+ − 287 t.set_outgroup(t&"Reference")
+ − 288
+ − 289 #set the tree style
+ − 290 ts = e.TreeStyle()
+ − 291 ts.show_leaf_name = False
+ − 292 ts.show_branch_length = True
+ − 293 ts.scale = 2000 #pixel per branch length unit
+ − 294 ts.branch_vertical_margin = 15 #pixel between branches
+ − 295 style2 = e.NodeStyle()
+ − 296 style2["fgcolor"] = "#000000"
+ − 297 style2["shape"] = "circle"
+ − 298 style2["vt_line_color"] = "#0000aa"
+ − 299 style2["hz_line_color"] = "#0000aa"
+ − 300 style2["vt_line_width"] = 2
+ − 301 style2["hz_line_width"] = 2
+ − 302 style2["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
+ − 303 style2["hz_line_type"] = 0
+ − 304 for n in t.traverse():
+ − 305 n.set_style(style2)
+ − 306
+ − 307 #find the plasmid origins
+ − 308 plasmidIncs = {}
+ − 309 for key in metadata:
+ − 310 for plasmid in metadata[key].plasmids:
+ − 311 for inc in plasmid.PlasmidRepType.split(","):
+ − 312 if (inc.lower().find("inc") > -1):
+ − 313 if not (inc in plasmidIncs):
+ − 314 plasmidIncs[inc] = [metadata[key].ID]
+ − 315 else:
+ − 316 if metadata[key].ID not in plasmidIncs[inc]:
+ − 317 plasmidIncs[inc].append(metadata[key].ID)
+ − 318 #plasmidIncs = sorted(plasmidIncs)
+ − 319 for n in t.traverse(): #loop through the nodes of a tree
+ − 320 if (n.is_leaf() and n.name == "Reference"):
+ − 321 #if its the reference branch, populate the faces with column headers
+ − 322 index = 0
+ − 323
+ − 324 for sensitive_data_column in sensitive_meta_data.get_columns():
+ − 325 (t&"Reference").add_face(addFace(sensitive_data_column), index, "aligned")
+ − 326 index = index + 1
+ − 327
+ − 328 (t&"Reference").add_face(addFace("SampleID"), index, "aligned")
+ − 329 index = index + 1
+ − 330 (t&"Reference").add_face(addFace("New?"), index, "aligned")
+ − 331 index = index + 1
+ − 332 for i in range(len(plasmidIncs)): #this loop adds the columns (aka the incs) to the reference node
+ − 333 (t&"Reference").add_face(addFace(list(plasmidIncs.keys())[i]), i + index, "aligned")
+ − 334 index = index + len(plasmidIncs)
+ − 335 (t&"Reference").add_face(addFace("MLSTScheme"), index, "aligned")
+ − 336 index = index + 1
+ − 337 (t&"Reference").add_face(addFace("Sequence Type"), index, "aligned")
+ − 338 index = index + 1
+ − 339 (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned")
+ − 340 index = index + 1
+ − 341 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix
+ − 342 (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned")
+ − 343 index = index + len(distanceDict[list(distanceDict.keys())[0]])
+ − 344 elif (n.is_leaf() and not n.name == "Reference"):
+ − 345 #not reference branches, populate with metadata
+ − 346 index = 0
+ − 347 mData = metadata[n.name.replace(".fa","")]
+ − 348
+ − 349 # pushing in sensitive data
+ − 350 for sensitive_data_column in sensitive_meta_data.get_columns():
+ − 351 sens_col_val = sensitive_meta_data.get_value(bcid=mData.ID, column_name=sensitive_data_column )
+ − 352 n.add_face(addFace(sens_col_val), index, "aligned")
+ − 353 index = index + 1
+ − 354
+ − 355 n.add_face(addFace(mData.ID), index, "aligned")
+ − 356 index = index + 1
+ − 357 if (metadata[n.name.replace(".fa","")].new == True): #new column
+ − 358 face = e.RectFace(30,30,"green","green") # TextFace("Y",fsize=10,tight_text=True)
+ − 359 face.border.margin = 5
+ − 360 face.margin_right = 5
+ − 361 face.margin_left = 5
+ − 362 face.vt_align = 1
+ − 363 face.ht_align = 1
+ − 364 n.add_face(face, index, "aligned")
+ − 365 index = index + 1
+ − 366 for incs in plasmidIncs: #this loop adds presence/absence to the sample nodes
+ − 367 if (n.name.replace(".fa","") in plasmidIncs[incs]):
+ − 368 face = e.RectFace(30,30,"black","black") # TextFace("Y",fsize=10,tight_text=True)
+ − 369 face.border.margin = 5
+ − 370 face.margin_right = 5
+ − 371 face.margin_left = 5
+ − 372 face.vt_align = 1
+ − 373 face.ht_align = 1
+ − 374 n.add_face(face, list(plasmidIncs.keys()).index(incs) + index, "aligned")
+ − 375 index = index + len(plasmidIncs)
+ − 376 n.add_face(addFace(mData.MLSTSpecies), index, "aligned")
+ − 377 index = index + 1
+ − 378 n.add_face(addFace(mData.SequenceType), index, "aligned")
+ − 379 index = index + 1
+ − 380 n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned")
+ − 381 index = index + 1
+ − 382 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix
+ − 383 n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned")
+ − 384
+ − 385 t.render( outputFile, w=5000,units="mm", tree_style=ts) #save it as a png. or an phyloxml
+ − 386
+ − 387 #endregion
+ − 388 #endregion
+ − 389
+ − 390
+ − 391 start = time.time()#time the analysis
+ − 392
+ − 393 #analysis time
+ − 394 Main()
+ − 395
+ − 396 end = time.time()
+ − 397 print("Finished!\nThe analysis used: " + str(end-start) + " seconds")