Mercurial > repos > jjjjia > cpo_prediction
view cpo_galaxy_tree.py @ 3:e6027598a35c draft
planemo upload
author | jjjjia |
---|---|
date | Mon, 20 Aug 2018 17:53:59 -0400 |
parents | fea89c4d5227 |
children | cabceaa239e4 |
line wrap: on
line source
#!/home/jjjjia/.conda/envs/py36/bin/python

#$ -S /home/jjjjia/.conda/envs/py36/bin/python
#$ -V # Pass environment variables to the job
#$ -N CPO_pipeline # Replace with a more specific job name
#$ -wd /home/jjjjia/testCases # Use the current working dir
#$ -pe smp 8 # Parallel Environment (how many cores)
#$ -l h_vmem=11G # Memory (RAM) allocation *per core*
#$ -e ./logs/$JOB_ID.err
#$ -o ./logs/$JOB_ID.log
#$ -m ea
#$ -M bja20@sfu.ca

#~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae"

# Standard library
import collections
import datetime
import gzip
import json
import optparse
import os
import subprocess
import sys
import time
import urllib.request

# Third-party
import ete3
import numpy
import pandas

# Command-line options: the three input files this script reads.
# The help texts describe the file each option actually points to.
parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="path to the tree file (Newick text)")
parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="path to the tab-separated distance matrix")
parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv", help="path to the tab-separated workflow results (metadata) table")
(options, args) = parser.parse_args()

# Surrounding whitespace is stripped from the supplied paths before use.
treePath = str(options.treePath).strip()
distancePath = str(options.distancePath).strip()
metadataPath = str(options.metadataPath).strip()

#region result objects
#define some objects to store values from results
#//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults instead of specified in init(). Also, use property(def getVar, def setVar).
class workflowResult(object):
    """Values parsed from one row of the workflow results table."""

    def __init__(self):
        self.new = False  # True when the row's first column reads "new"
        self.ID = ""
        self.ExpectedSpecies = ""
        self.MLSTSpecies = ""
        self.SequenceType = ""
        self.MLSTScheme = ""
        self.CarbapenemResistanceGenes = ""
        self.OtherAMRGenes = ""
        self.TotalPlasmids = 0
        self.plasmids = []  # one plasmidObj per plasmid listed in the row
        self.DefinitelyPlasmidContigs = ""
        self.LikelyPlasmidContigs = ""
        self.row = ""  # the whole row re-joined with tabs


class plasmidObj(object):
    """Values describing a single plasmid of one isolate."""

    def __init__(self):
        self.PlasmidsID = 0
        self.Num_Contigs = 0
        self.PlasmidLength = 0
        self.PlasmidRepType = ""
        self.PlasmidMobility = ""
        self.NearestReference = ""
#endregion


#region useful functions
def read(path):
    """Return the lines of the text file at *path* without trailing newlines."""
    # The context manager closes the handle; the bare open() used to leak it.
    with open(path) as handle:
        return [line.rstrip('\n') for line in handle]


def execute(command, cwd=None):
    """Run *command*, echo its output as it arrives and return that output.

    Args:
        command: argument list handed to subprocess.Popen (shell=False).
        cwd: working directory for the child; None inherits the current one.

    Returns:
        Everything the child wrote to stdout/stderr (stderr is merged in).

    Raises:
        subprocess.CalledProcessError: the child exited with a non-zero code.
    """
    captured = []
    with subprocess.Popen(command, shell=False, cwd=cwd, universal_newlines=True,
                          stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
        # Stream line by line so long-running jobs show progress, while
        # keeping a copy so the caller actually receives the output.
        for nextline in process.stdout:
            captured.append(nextline)
            sys.stdout.write(nextline)
            sys.stdout.flush()
    # Leaving the with-block waits for the child, so returncode is set.
    exitCode = process.returncode
    output = "".join(captured)
    if exitCode != 0:
        raise subprocess.CalledProcessError(exitCode, command, output=output)
    return output


def httpGetFile(url, filepath=""):
    """Download *url*.

    Without *filepath* the result of urllib.request.urlretrieve(url) is
    returned; with *filepath* the download is stored there and True is
    returned.
    """
    if filepath == "":
        return urllib.request.urlretrieve(url)
    urllib.request.urlretrieve(url, filepath)
    return True


def gunzip(inputpath="", outputpath=""):
    """Decompress the gzip file at *inputpath*.

    Returns the decompressed bytes when *outputpath* is empty; otherwise
    writes them to *outputpath* and returns True.
    """
    with gzip.open(inputpath, 'rb') as f:
        gzContent = f.read()
    if outputpath == "":
        return gzContent
    with open(outputpath, 'wb') as out:
        out.write(gzContent)
    return True
#endregion


#region functions to parse result files
def ParseWorkflowResults(pathToResult):
    """Parse the tab-separated workflow results table.

    The file has no header row. Columns 9-14 hold ';'-separated per-plasmid
    values; column 8 holds the number of plasmids to read from them.

    Args:
        pathToResult: path to the TSV file.

    Returns:
        dict mapping each row's ID (column 1) to its workflowResult.
    """
    _worflowResult = {}
    r = pandas.read_csv(pathToResult, delimiter='\t', header=None)  # read the workflow results table
    r = r.replace(numpy.nan, '', regex=True)  # empty cells become empty strings
    for i in range(len(r.index)):
        _results = workflowResult()
        _results.new = str(r.iloc[i, 0]).lower() == "new"
        _results.ID = str(r.iloc[i, 1])
        _results.ExpectedSpecies = str(r.iloc[i, 2])
        _results.MLSTSpecies = str(r.iloc[i, 3])
        _results.SequenceType = str(r.iloc[i, 4])
        _results.MLSTScheme = str(r.iloc[i, 5])
        _results.CarbapenemResistanceGenes = str(r.iloc[i, 6])
        _results.OtherAMRGenes = str(r.iloc[i, 7])
        _results.TotalPlasmids = int(r.iloc[i, 8])
        # Split each per-plasmid column once instead of once per plasmid.
        plasmidColumns = [str(r.iloc[i, column]).split(";") for column in range(9, 15)]
        for j in range(_results.TotalPlasmids):
            _plasmid = plasmidObj()
            (_plasmid.PlasmidsID,
             _plasmid.Num_Contigs,
             _plasmid.PlasmidLength,
             _plasmid.PlasmidRepType,
             _plasmid.PlasmidMobility,
             _plasmid.NearestReference) = [values[j] for values in plasmidColumns]
            _results.plasmids.append(_plasmid)
        _results.DefinitelyPlasmidContigs = str(r.iloc[i, 15])
        _results.LikelyPlasmidContigs = str(r.iloc[i, 16])
        # DataFrame.ix was removed from pandas; with header=None the index is
        # positional, so iloc selects the same row.
        _results.row = "\t".join(str(x) for x in r.iloc[i].tolist())
        _worflowResult[_results.ID] = _results
    return _worflowResult
#endregion


def _aligned_text(node, text, column):
    """Attach *text* to *node* as an aligned text cell in *column*."""
    face = ete3.faces.TextFace(text, fsize=10, tight_text=True)
    face.border.margin = 5
    face.margin_right = 5
    face.margin_left = 5
    node.add_face(face, column, "aligned")


def _aligned_box(node, colour, column):
    """Attach a filled 30x30 rectangle of *colour* to *node* in *column*."""
    face = ete3.faces.RectFace(30, 30, colour, colour)
    face.border.margin = 5
    face.margin_right = 5
    face.margin_left = 5
    face.vt_align = 1
    face.ht_align = 1
    node.add_face(face, column, "aligned")


def _add_header_row(node, incNames, distanceHeader):
    """Write the column titles of the annotation table onto *node*."""
    _aligned_text(node, "New?", 0)
    for column, inc in enumerate(incNames, start=1):
        _aligned_text(node, inc, column)
    base = len(incNames) + 1  # first column after the inc columns
    # NOTE(review): this column is titled "MLSTScheme" but the sample rows
    # put MLSTSpecies in it - confirm which one is intended.
    _aligned_text(node, "MLSTScheme", base)
    _aligned_text(node, "Sequence Type", base + 1)
    _aligned_text(node, "Carbapenamases", base + 2)
    for offset, label in enumerate(distanceHeader):
        _aligned_text(node, label, base + 3 + offset)


def _add_sample_row(node, mData, plasmidIncs, distances):
    """Write one sample's cells (new flag, inc boxes, typing, distances)."""
    if mData.new:
        _aligned_box(node, "green", 0)
    sampleID = node.name.replace(".fa", "")
    for column, inc in enumerate(plasmidIncs, start=1):
        # A black box marks that this sample carries the inc type.
        if sampleID in plasmidIncs[inc]:
            _aligned_box(node, "black", column)
    base = len(plasmidIncs) + 1
    _aligned_text(node, mData.MLSTSpecies, base)
    _aligned_text(node, mData.SequenceType, base + 1)
    _aligned_text(node, mData.CarbapenemResistanceGenes, base + 2)
    for offset, value in enumerate(distances):
        _aligned_text(node, value, base + 3 + offset)


def Main():
    """Render ./tree.png: the tree plus an aligned per-sample table.

    Reads the module-level metadataPath, distancePath and treePath. The leaf
    named "Reference" becomes the outgroup and carries the column titles;
    every other leaf is looked up in the metadata by its name without ".fa"
    and in the distance table by its full name.
    """
    metadata = ParseWorkflowResults(metadataPath)

    # First field of each distance line is the row name, the rest its values.
    distanceDict = {}
    for line in read(distancePath):
        fields = line.split("\t")
        distanceDict[fields[0]] = fields[1:]
    # The first row of the file supplies the titles of the distance columns.
    distanceHeader = next(iter(distanceDict.values()), [])

    #region create box tree
    # Build the tree once, from the path given on the command line.
    t = ete3.Tree("".join(read(treePath)))
    t.set_outgroup(t & "Reference")

    ts = ete3.TreeStyle()
    ts.show_leaf_name = True
    ts.show_branch_length = True
    ts.scale = 2000  # pixel per branch length unit
    ts.branch_vertical_margin = 15  # pixel between branches

    style2 = ete3.NodeStyle()
    style2["fgcolor"] = "#000000"
    style2["shape"] = "circle"
    style2["vt_line_color"] = "#0000aa"
    style2["hz_line_color"] = "#0000aa"
    style2["vt_line_width"] = 2
    style2["hz_line_width"] = 2
    style2["vt_line_type"] = 0  # 0 solid, 1 dashed, 2 dotted
    style2["hz_line_type"] = 0
    for n in t.traverse():
        n.set_style(style2)

    # Map every replicon type containing "inc" to the IDs that carry it;
    # insertion order fixes the column order of the table.
    plasmidIncs = {}
    for result in metadata.values():
        for plasmid in result.plasmids:
            for inc in plasmid.PlasmidRepType.split(","):
                if "inc" in inc.lower():
                    plasmidIncs.setdefault(inc, set()).add(result.ID)

    for n in t.traverse():
        if not n.is_leaf():
            continue
        if n.name == "Reference":
            _add_header_row(n, list(plasmidIncs), distanceHeader)
        else:
            mData = metadata[n.name.replace(".fa", "")]
            # Only as many distance cells as there are titled columns.
            distances = distanceDict[n.name][:len(distanceHeader)]
            _add_sample_row(n, mData, plasmidIncs, distances)

    t.render("./tree.png", w=5000, units="mm", tree_style=ts)
    #endregion


if __name__ == "__main__":
    start = time.time()  # time the analysis
    # analysis time
    Main()
    end = time.time()
    print("Finished!\nThe analysis used: " + str(end-start) + " seconds")