# HG changeset patch
# User jjjjia
# Date 1535483709 14400
# Node ID 596bf8a792dec0144c6fc234e566de4835a12d46
# Parent ed3b291693fc26ff11428473175c83f31184daf9
planemo upload
diff -r ed3b291693fc -r 596bf8a792de cpo_galaxy_prediction.py
--- a/cpo_galaxy_prediction.py Mon Aug 27 19:58:24 2018 -0400
+++ b/cpo_galaxy_prediction.py Tue Aug 28 15:15:09 2018 -0400
@@ -42,6 +42,7 @@
parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ")
+ parser.add_options("-d", "--mash", dest="mash", type="string", help="absolute file path to mash plasmiddb result")
#parallelization, useless, these are hard coded to 8cores/64G RAM
#parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
@@ -60,6 +61,7 @@
expectedSpecies = str(options.expectedSpecies).lstrip().rstrip()
mlstScheme = str(options.mlstScheme).lstrip().rstrip()
plasmidfinder = str(options.plasmidfinder).lstrip().rstrip()
+ mash = str(options.mash).lstrip().rstrip()
outputDir = "./"
print(mlst)
print(mobfindercontig)
@@ -68,6 +70,8 @@
print(rgi)
print(expectedSpecies)
print(mlstScheme)
+ print(mash)
+
else:
curDir = os.getcwd()
ID = "BC11"
@@ -79,6 +83,7 @@
expectedSpecies = "Escherichia coli"
mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab"
plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins"
+ mash = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\mash.tsv"
outputDir = "./"
#region result objects
@@ -190,6 +195,26 @@
self.source = ""
self.row = ""
+class MashResult(object):
+ def __init__(self):
+ self.size = 0.0
+ self.depth = 0.0
+ self.identity = 0.0
+ self.sharedHashes = ""
+ self.medianMultiplicity = 0
+ self.pvalue = 0.0
+ self.queryID= ""
+ self.queryComment = ""
+ self.species = ""
+ self.row = ""
+ self.accession = ""
+ self.gcf=""
+ self.assembly=""
+
+ def toDict(self): #doesnt actually work
+ return dict((name, getattr(self, name)) for name in dir(self) if not name.startswith('__'))
+
+
#endregion
#region useful functions
@@ -476,6 +501,22 @@
#plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
#origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
return _pFinder
+
+def ParseMashResult(pathToMashScreen):
+ mashScreen = pandas.read_csv(pathToMashScreen, delimiter='\t', header=None)
+
+ _mashPlasmidHits = {} #***********************
+ #parse what the species are.
+ for i in (range(len(mashScreen.index))):
+ mr = MashResult()
+ mr.identity = float(mashScreen.ix[i, 0])
+ mr.sharedHashes = mashScreen.ix[i, 1]
+ mr.medianMultiplicity = int(mashScreen.ix[i, 2])
+ mr.pvalue = float(mashScreen.ix[i, 3])
+ mr.name = mashScreen.ix[i, 4] #accession
+ mr.row = "\t".join(str(x) for x in mashScreen.ix[i].tolist())
+ _mashPlasmidHits[mr.name] = mr
+ return _mashPlasmidHits
#endregion
def Main():
@@ -531,6 +572,9 @@
rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
ToJson(rgiAMR, "rgi.json") #*************
+ plasmidFamily = ParseMashResult(mash)
+ ToJson(plasmidFamily, "mash.json")
+
carbapenamases = []
resfinderCarbas = [] #list of rfinder objects for lindaout list
amrGenes = []
@@ -613,20 +657,27 @@
lindaTemp += "\t\t" #sero and kcap
#resfinderCarbas
+ index = 0
for carbs in resfinderCarbas:
if (carbs.source == "plasmid"): #
- lindaTemp += "\t\t\t\t\t" #plasmid 1 rflp plasmid 1 family information. PLASMID_1_FAMILY\tPLASMID_1_BEST_MATCH\tPLASMID_1_COVERAGE\tPLASMID_1_SNVS_TO_BEST_MATCH
+ lindaTemp += "\t"
+ plasmid = plasmidFamily[list(plasmidFamily.keys())[index]]
+ lindaTemp += plasmid.name + "\t"
+ lindaTemp += str(plasmid.identity) + "\t"
+ lindaTemp += plasmid.sharedHashes + "\t"
lindaTemp += carbs.shortGene + "\t" #found an carbapenase
contig = carbs.sequence[6:] #this is the contig number
for i in mSuite.keys():
if (str(mSuite[i].contig_num) == str(contig)): #found the right plasmid
- lindaTemp += mSuite[i].rep_type
+ clusterid = mSuite[i].cluster_id
+ rep_types = mSuitePlasmids["plasmid_" + str(clusterid) + ".fasta"].rep_types
+ lindaTemp += rep_types
lindaOut.append(lindaTemp)
out = open("summary.linda.tsv", 'w')
for item in lindaOut:
out.write("%s\n" % item)
- tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
+ tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tPlasmid Best Match\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
#start with ID
temp = "\t"
temp += (ID + "\t")
@@ -642,6 +693,7 @@
temp += ";".join(amrGenes) + "\t"
#lastly plasmids
+ temp += str(plasmidFamily[list(plasmidFamily.keys())[0]].name)
temp+= str(len(mSuitePlasmids)) + "\t"
plasmidID = ""
contigs = ""
diff -r ed3b291693fc -r 596bf8a792de cpo_galaxy_predictions.xml
--- a/cpo_galaxy_predictions.xml Mon Aug 27 19:58:24 2018 -0400
+++ b/cpo_galaxy_predictions.xml Tue Aug 28 15:15:09 2018 -0400
@@ -11,11 +11,12 @@
'-m $mlst'
'-c $mobsuitecontig'
'-f $mobsuiteaggregate'
- '-a $abricate'
+ '-a $resfinder'
'-r $rgi'
'-e $expected'
'-s $__tool_directory__/scheme_species_map.tab'
'-p $plasmidfinder'
+ '-d $mash'
]]>
@@ -23,15 +24,16 @@
-
+
+
-
-
+
+
diff -r ed3b291693fc -r 596bf8a792de cpo_galaxy_tree.py
--- a/cpo_galaxy_tree.py Mon Aug 27 19:58:24 2018 -0400
+++ b/cpo_galaxy_tree.py Tue Aug 28 15:15:09 2018 -0400
@@ -39,18 +39,53 @@
#parses some parameters
parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
-parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate")
-parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)")
-parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)")
+parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="absolute file path to phylip tree")
+parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path to distance matrix")
+parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to metadata file")
+parser.add_option("-o", "--output_file", dest="outputFile", type="string", default="tree.png", help="Output graphics file. Use ending 'png', 'pdf' or 'svg' to specify file format.")
+
+# sensitive data adder
+parser.add_option("-p", "--sensitive_data", dest="sensitivePath", type="string", default="", help="Spreadsheet (CSV) with sensitive metadata")
+parser.add_option("-c", "--sensitive_cols", dest="sensitiveCols", type="string", default="", help="CSV list of column names from sensitive metadata spreadsheet to use as labels on dendrogram")
+parser.add_option("-b", "--bcid_column", dest="bcidCol", type="string", default="BCID", help="Column name of BCID in sensitive metadata file")
+parser.add_option("-n", "--missing_value", dest="naValue", type="string", default="NA", help="Value to write for missing data.")
+
(options,args) = parser.parse_args()
treePath = str(options.treePath).lstrip().rstrip()
distancePath = str(options.distancePath).lstrip().rstrip()
metadataPath = str(options.metadataPath).lstrip().rstrip()
+sensitivePath = str(options.sensitivePath).lstrip().rstrip()
+sensitiveCols = str(options.sensitiveCols).lstrip().rstrip()
+outputFile = str(options.outputFile).lstrip().rstrip()
+bcidCol = str( str(options.bcidCol).lstrip().rstrip() )
+naValue = str( str(options.naValue).lstrip().rstrip() )
+
#region result objects
#define some objects to store values from results
#//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
+
+class SensitiveMetadata(object):
+ def __init__(self):
+ x = pandas.read_csv( sensitivePath )
+ col_names = [ s for s in sensitiveCols.split(',')] # convert to 0 offset
+ if not bcidCol in col_names:
+ col_names.append( bcidCol )
+ all_cols = [ str(col) for col in x.columns ]
+ col_idxs = [ all_cols.index(col) for col in col_names ]
+ self.sensitive_data = x.iloc[:, col_idxs]
+ def get_columns(self):
+ cols = [ str(x) for x in self.sensitive_data.columns ]
+ return cols
+ def get_value( self, bcid, column_name ): # might be nice to get them all in single call via an input list of bcids ... for later
+ bcids= list( self.sensitive_data.loc[:, bcidCol ] ) # get the list of all BCIDs in sensitive metadata
+ if not bcid in bcids:
+ return naValue
+ else:
+ row_idx = bcids.index( bcid ) # lookup the row for this BCID
+ return self.sensitive_data.loc[ row_idx, column_name ] # return the one value based on the column (col_idx) and this row
+
class workflowResult(object):
def __init__(self):
self.new = False
@@ -60,6 +95,7 @@
self.SequenceType = "?"
self.MLSTScheme = "?"
self.CarbapenemResistanceGenes ="?"
+ self.plasmidBestMatch ="?"
self.OtherAMRGenes="?"
self.TotalPlasmids = -1
self.plasmids = []
@@ -145,6 +181,7 @@
_results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes']))
_results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes']))
_results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids'])
+ _results.plasmidBestMatch = str(r.loc[r.index[i], 'Plasmid Best Match'])
for j in range(0,_results.TotalPlasmids):
_plasmid = plasmidObj()
_plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j])
@@ -163,6 +200,9 @@
#endregion
def Main():
+ if len(sensitivePath)>0:
+ sensitive_meta_data = SensitiveMetadata()
+
metadata = ParseWorkflowResults(metadataPath)
distance = read(distancePath)
treeFile = "".join(read(treePath))
@@ -212,6 +252,12 @@
if (n.is_leaf() and n.name == "Reference"):
#if its the reference branch, populate the faces with column headers
index = 0
+
+ if len(sensitivePath)>0: #sensitive metadat @ chris
+ for sensitive_data_column in sensitive_meta_data.get_columns():
+ (t&"Reference").add_face(addFace(sensitive_data_column), index, "aligned")
+ index = index + 1
+
(t&"Reference").add_face(addFace("SampleID"), index, "aligned")
index = index + 1
(t&"Reference").add_face(addFace("New?"), index, "aligned")
@@ -225,12 +271,28 @@
index = index + 1
(t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned")
index = index + 1
+ (t&"Reference").add_face(addFace("Plasmid Best Match"), index, "aligned")
+ index = index + 1
for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix
(t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned")
index = index + len(distanceDict[list(distanceDict.keys())[0]])
elif (n.is_leaf() and not n.name == "Reference"):
#not reference branches, populate with metadata
index = 0
+
+ if len(sensitivePath)>0: #sensitive metadata @ chris
+ # pushing in sensitive data
+ for sensitive_data_column in sensitive_meta_data.get_columns():
+ # tree uses bcids like BC18A021A_S12
+ # while sens meta-data uses BC18A021A
+ # trim the "_S.*" if present
+ bcid = str(mData.ID)
+ if bcid.find( "_S" ) != -1:
+ bcid = bcid[ 0:bcid.find( "_S" ) ]
+ sens_col_val = sensitive_meta_data.get_value(bcid=bcid, column_name=sensitive_data_column )
+ n.add_face(addFace(sens_col_val), index, "aligned")
+ index = index + 1
+
if (n.name.replace(".fa","") in metadata.keys()):
mData = metadata[n.name.replace(".fa","")]
else:
@@ -262,11 +324,13 @@
index = index + 1
n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned")
index = index + 1
+ n.add_face(addFace(mData.plasmidBestMatch), index, "aligned")
+ index = index + 1
for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix
if (n.name in distanceDict): #make sure the column is in the distance matrice
n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned")
- t.render("./tree.pdf", w=5000,units="mm", tree_style=ts) #save it as a png. or an phyloxml
+ t.render(outputFile, w=5000,units="mm", tree_style=ts) #save it as a png, pdf, svg or an phyloxml
#endregion
#endregion
diff -r ed3b291693fc -r 596bf8a792de cpo_mash.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpo_mash.xml Tue Aug 28 15:15:09 2018 -0400
@@ -0,0 +1,31 @@
+
+ Modified version of mash 2.0 with custom database
+
+ mash
+
+
+ mashresult.tsv
+ ]]>
+
+
+
+
+
+
+
+
+ mash screen bcPlasmidDB.msh $input | sort -gr | head -10 > mashresult.tsv
+
+
+
+ @misc{githubmob-suite,
+ author = {Robertson J, Nash J},
+ title = {MOB-Suite: Software tools for clustering, reconstruction and typing of plasmids from draft assemblies.},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ doi = {10.1099/mgen.0.000206},
+ url = {https://github.com/phac-nml/mob-suite}
+ }
+
+
\ No newline at end of file