1
+ − 1 #!/home/jjjjia/.conda/envs/py36/bin/python
+ − 2
+ − 3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python
+ − 4 #$ -V # Pass environment variables to the job
+ − 5 #$ -N CPO_pipeline # Replace with a more specific job name
+ − 6 #$ -wd /home/jjjjia/testCases # Use the current working dir
+ − 7 #$ -pe smp 8 # Parallel Environment (how many cores)
+ − 8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core*
+ − 9 #$ -e ./logs/$JOB_ID.err
+ − 10 #$ -o ./logs/$JOB_ID.log
+ − 11 #$ -m ea
+ − 12 #$ -M bja20@sfu.ca
+ − 13
+ − 14 #~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae"
+ − 15
+ − 16 import subprocess
+ − 17 import pandas
+ − 18 import optparse
+ − 19 import os
+ − 20 import datetime
+ − 21 import sys
+ − 22 import time
+ − 23 import urllib.request
+ − 24 import gzip
+ − 25 import collections
+ − 26 import json
+ − 27 import numpy
+ − 28
+ − 29
+ − 30 debug = True #debug skips the shell scripts and also dump out a ton of debugging messages
+ − 31
+ − 32 if not debug:
+ − 33 #parses some parameters
+ − 34 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...")
+ − 35 #required
+ − 36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate")
+ − 37 parser.add_option("-a", "--assembly", dest="assemblyPath", type="string", help="absolute file path to contigs fasta")
+ − 38 parser.add_option("-c", "--card-db", dest="cardDB", default = "/home/jjjjia/databases/card202.json", type="string", help="absolute path to card reference database")
+ − 39 parser.add_option("-o", "--output", dest="output", default='./', type="string", help="absolute path to output folder")
+ − 40 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate")
+ − 41
+ − 42 #optionals
+ − 43 parser.add_option("-k", "--script-path", dest="scriptDir", default="/home/jjjjia/scripts", type="string", help="absolute file path to this script folder")
+ − 44 parser.add_option("-b", "--update-abricate-path", dest="updateAbPath", default = "", type="string", help="absolute file path to fasta sequence used for abricate database")
+ − 45 parser.add_option("-m", "--update-abricate-dbname", dest="updateAbName", default = "default", type="string", help="name of abricate database to update")
+ − 46 parser.add_option("-u", "--update-mlst", dest="updateMLST", default = "False", type="string", help="True = update MLST")
+ − 47 #used for parsing
+ − 48 parser.add_option("-s", "--mlst-scheme", dest="mlst", default= "/home/jjjjia/databases/scheme_species_map.tab", type="string", help="absolute file path to mlst scheme")
+ − 49
+ − 50
+ − 51 #parallelization, useless, these are hard coded to 8cores/64G RAM
+ − 52 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use")
+ − 53 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB")
+ − 54
+ − 55 (options,args) = parser.parse_args()
+ − 56 #if len(args) != 8:
+ − 57 #parser.error("incorrect number of arguments, all 7 is required")
+ − 58 curDir = os.getcwd()
+ − 59 outputDir = options.output
+ − 60 expectedSpecies = options.expectedSpecies
+ − 61 mlstScheme = options.mlst
+ − 62 tempDir = outputDir + "/shovillTemp"
+ − 63 scriptDir = options.scriptDir
+ − 64 updateAbName = options.updateAbName
+ − 65 updateAbPath = options.updateAbPath
+ − 66 updateMLST = options.updateMLST
+ − 67 cardDB=options.cardDB
+ − 68 assemblyPath=options.assemblyPath
+ − 69 ID = options.id
+ − 70 else:
+ − 71 manifestDir = ""
+ − 72 curDir = os.getcwd()
+ − 73 outputDir = "pipelineTest"
+ − 74 expectedSpecies = "Escherichia coli"
+ − 75 #threads = 8
+ − 76 #memory = 64
+ − 77 mlstScheme = outputDir + "/scheme_species_map.tab"
+ − 78 tempDir= outputDir + "/shovillTemp"
+ − 79 scriptDir = "~/scripts"
+ − 80 updateAbName = "cpo"
+ − 81 updateAbPath = "~/database/bccdcCPO.seq"
+ − 82 updateMLST = True
+ − 83 assemblyPath = "./"
+ − 84 cardDB = "./"
+ − 85 ID = "BC11-Kpn005_S2"
+ − 86
+ − 87 #region result objects
+ − 88 #define some objects to store values from results
+ − 89 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar).
+ − 90 class starFinders(object):
+ − 91 def __init__(self):
+ − 92 self.file = ""
+ − 93 self.sequence = ""
+ − 94 self.start = 0
+ − 95 self.end = 0
+ − 96 self.gene = ""
+ − 97 self.shortGene = ""
+ − 98 self.coverage = ""
+ − 99 self.coverage_map = ""
+ − 100 self.gaps = ""
+ − 101 self.pCoverage = 100.00
+ − 102 self.pIdentity = 100.00
+ − 103 self.database = ""
+ − 104 self.accession = ""
+ − 105 self.product = ""
+ − 106 self.source = "chromosome"
+ − 107 self.row = ""
+ − 108
+ − 109 class PlasFlowResult(object):
+ − 110 def __init__(self):
+ − 111 self.sequence = ""
+ − 112 self.length = 0
+ − 113 self.label = ""
+ − 114 self.confidence = 0
+ − 115 self.usefulRow = ""
+ − 116 self.row = ""
+ − 117
+ − 118 class MlstResult(object):
+ − 119 def __init__(self):
+ − 120 self.file = ""
+ − 121 self.speciesID = ""
+ − 122 self.seqType = 0
+ − 123 self.scheme = ""
+ − 124 self.species = ""
+ − 125 self.row=""
+ − 126
+ − 127 class mobsuiteResult(object):
+ − 128 def __init__(self):
+ − 129 self.file_id = ""
+ − 130 self.cluster_id = ""
+ − 131 self.contig_id = ""
+ − 132 self.contig_num = 0
+ − 133 self.contig_length = 0
+ − 134 self.circularity_status = ""
+ − 135 self.rep_type = ""
+ − 136 self.rep_type_accession = ""
+ − 137 self.relaxase_type = ""
+ − 138 self.relaxase_type_accession = ""
+ − 139 self.mash_nearest_neighbor = ""
+ − 140 self.mash_neighbor_distance = 0.00
+ − 141 self.repetitive_dna_id = ""
+ − 142 self.match_type = ""
+ − 143 self.score = 0
+ − 144 self.contig_match_start = 0
+ − 145 self.contig_match_end = 0
+ − 146 self.row = ""
+ − 147
+ − 148 class mobsuitePlasmids(object):
+ − 149 def __init__(self):
+ − 150 self.file_id = ""
+ − 151 self.num_contigs = 0
+ − 152 self.total_length = 0
+ − 153 self.gc = ""
+ − 154 self.rep_types = ""
+ − 155 self.rep_typeAccession = ""
+ − 156 self.relaxase_type= ""
+ − 157 self.relaxase_type_accession = ""
+ − 158 self.mpf_type = ""
+ − 159 self.mpf_type_accession= ""
+ − 160 self.orit_type = ""
+ − 161 self.orit_accession = ""
+ − 162 self.PredictedMobility = ""
+ − 163 self.mash_nearest_neighbor = ""
+ − 164 self.mash_neighbor_distance = 0.00
+ − 165 self.mash_neighbor_cluster= 0
+ − 166 self.row = ""
+ − 167 class RGIResult(object):
+ − 168 def __init__(self):
+ − 169 self.ORF_ID = ""
+ − 170 self.Contig = ""
+ − 171 self.Start = -1
+ − 172 self.Stop = -1
+ − 173 self.Orientation = ""
+ − 174 self.Cut_Off = ""
+ − 175 self.Pass_Bitscore = 100000
+ − 176 self.Best_Hit_Bitscore = 0.00
+ − 177 self.Best_Hit_ARO = ""
+ − 178 self.Best_Identities = 0.00
+ − 179 self.ARO = 0
+ − 180 self.Model_type = ""
+ − 181 self.SNPs_in_Best_Hit_ARO = ""
+ − 182 self.Other_SNPs = ""
+ − 183 self.Drug_Class = ""
+ − 184 self.Resistance_Mechanism = ""
+ − 185 self.AMR_Gene_Family = ""
+ − 186 self.Predicted_DNA = ""
+ − 187 self.Predicted_Protein = ""
+ − 188 self.CARD_Protein_Sequence = ""
+ − 189 self.Percentage_Length_of_Reference_Sequence = 0.00
+ − 190 self.ID = ""
+ − 191 self.Model_ID = 0
+ − 192 self.source = ""
+ − 193 self.row = ""
+ − 194
+ − 195 #endregion
+ − 196
+ − 197 #region useful functions
+ − 198 def read(path):
+ − 199 return [line.rstrip('\n') for line in open(path)]
+ − 200 def execute(command):
+ − 201 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ − 202
+ − 203 # Poll process for new output until finished
+ − 204 while True:
+ − 205 nextline = process.stdout.readline()
+ − 206 if nextline == '' and process.poll() is not None:
+ − 207 break
+ − 208 sys.stdout.write(nextline)
+ − 209 sys.stdout.flush()
+ − 210
+ − 211 output = process.communicate()[0]
+ − 212 exitCode = process.returncode
+ − 213
+ − 214 if (exitCode == 0):
+ − 215 return output
+ − 216 else:
+ − 217 raise subprocess.CalledProcessError(exitCode, command)
+ − 218 def httpGetFile(url, filepath=""):
+ − 219 if (filepath == ""):
+ − 220 return urllib.request.urlretrieve(url)
+ − 221 else:
+ − 222 urllib.request.urlretrieve(url, filepath)
+ − 223 return True
+ − 224 def gunzip(inputpath="", outputpath=""):
+ − 225 if (outputpath == ""):
+ − 226 with gzip.open(inputpath, 'rb') as f:
+ − 227 gzContent = f.read()
+ − 228 return gzContent
+ − 229 else:
+ − 230 with gzip.open(inputpath, 'rb') as f:
+ − 231 gzContent = f.read()
+ − 232 with open(outputpath, 'wb') as out:
+ − 233 out.write(gzContent)
+ − 234 return True
+ − 235 def ToJson(dictObject, outputPath):
+ − 236 outDir = outputDir + '/summary/' + ID + ".json/"
+ − 237 if not (os.path.exists(outDir)):
+ − 238 os.makedirs(outDir)
+ − 239 with open(outDir + outputPath, 'w') as f:
+ − 240 json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False)
+ − 241 #endregion
+ − 242
+ − 243 #region functions to parse result files
+ − 244 def ParseMLSTResult(pathToMLSTResult):
+ − 245 _mlstResult = {}
+ − 246 scheme = pandas.read_csv(mlstScheme, delimiter='\t', header=0)
+ − 247 scheme = scheme.replace(numpy.nan, '', regex=True)
+ − 248
+ − 249 taxon = {}
+ − 250 #record the scheme as a dictionary
+ − 251 taxon["-"] = "No MLST Match"
+ − 252 for i in range(len(scheme.index)):
+ − 253 key = scheme.iloc[i,0]
+ − 254 if (str(scheme.iloc[i,2]) == "nan"):
+ − 255 value = str(scheme.iloc[i,1])
+ − 256 else:
+ − 257 value = str(scheme.iloc[i,1]) + " " + str(scheme.iloc[i,2])
+ − 258
+ − 259 if (key in taxon.keys()):
+ − 260 taxon[key] = taxon.get(key) + ";" + value
+ − 261 else:
+ − 262 taxon[key] = value
+ − 263 #read in the mlst result
+ − 264 mlst = pandas.read_csv(pathToMLSTResult, delimiter='\t', header=None)
+ − 265 _mlstHit = MlstResult()
+ − 266
+ − 267 _mlstHit.file = mlst.iloc[0,0]
+ − 268 _mlstHit.speciesID = (mlst.iloc[0,1])
+ − 269 _mlstHit.seqType = str(mlst.iloc[0,2])
+ − 270 for i in range(3, len(mlst.columns)):
+ − 271 _mlstHit.scheme += mlst.iloc[0,i] + ";"
+ − 272 _mlstHit.species = taxon[_mlstHit.speciesID]
+ − 273 _mlstHit.row = "\t".join(str(x) for x in mlst.ix[0].tolist())
+ − 274 _mlstResult[_mlstHit.speciesID]=_mlstHit
+ − 275
+ − 276 return _mlstResult
+ − 277
+ − 278 def ParsePlasmidFinderResult(pathToPlasmidFinderResult):
+ − 279 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918
+ − 280 #example resfinder:
+ − 281 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181
+ − 282
+ − 283 _pFinder = {} #***********************
+ − 284 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0)
+ − 285 plasmidFinder = plasmidFinder.replace(numpy.nan, '', regex=True)
+ − 286
+ − 287
+ − 288 for i in range(len(plasmidFinder.index)):
+ − 289 pf = starFinders()
+ − 290 pf.file = str(plasmidFinder.iloc[i,0])
+ − 291 pf.sequence = str(plasmidFinder.iloc[i,1])
+ − 292 pf.start = int(plasmidFinder.iloc[i,2])
+ − 293 pf.end = int(plasmidFinder.iloc[i,3])
+ − 294 pf.gene = str(plasmidFinder.iloc[i,4])
+ − 295 pf.shortGene = pf.gene[:pf.gene.index("_")]
+ − 296 pf.coverage = str(plasmidFinder.iloc[i,5])
+ − 297 pf.coverage_map = str(plasmidFinder.iloc[i,6])
+ − 298 pf.gaps = str(plasmidFinder.iloc[i,7])
+ − 299 pf.pCoverage = float(plasmidFinder.iloc[i,8])
+ − 300 pf.pIdentity = float(plasmidFinder.iloc[i,9])
+ − 301 pf.database = str(plasmidFinder.iloc[i,10])
+ − 302 pf.accession = str(plasmidFinder.iloc[i,11])
+ − 303 pf.product = str(plasmidFinder.iloc[i,12])
+ − 304 pf.source = "plasmid"
+ − 305 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
+ − 306 _pFinder[pf.gene]=pf
+ − 307 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist())
+ − 308 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1]))
+ − 309 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")]))
+ − 310 return _pFinder
+ − 311
+ − 312 def ParseMobsuiteResult(pathToMobsuiteResult):
+ − 313 _mobsuite = {}
+ − 314 mResult = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
+ − 315 mResult = mResult.replace(numpy.nan, '', regex=True)
+ − 316
+ − 317 for i in range(len(mResult.index)):
+ − 318 mr = mobsuiteResult()
+ − 319 mr.file_id = str(mResult.iloc[i,0])
+ − 320 mr.cluster_id = str(mResult.iloc[i,1])
+ − 321 if (mr.cluster_id == "chromosome"):
+ − 322 break
+ − 323 mr.contig_id = str(mResult.iloc[i,2])
+ − 324 mr.contig_num = mr.contig_id[(mr.contig_id.find("contig")+6):mr.contig_id.find("_len=")]
+ − 325 mr.contig_length = int(mResult.iloc[i,3])
+ − 326 mr.circularity_status = str(mResult.iloc[i,4])
+ − 327 mr.rep_type = str(mResult.iloc[i,5])
+ − 328 mr.rep_type_accession = str(mResult.iloc[i,6])
+ − 329 mr.relaxase_type = str(mResult.iloc[i,7])
+ − 330 mr.relaxase_type_accession = str(mResult.iloc[i,8])
+ − 331 mr.mash_nearest_neighbor = str(mResult.iloc[i,9])
+ − 332 mr.mash_neighbor_distance = float(mResult.iloc[i,10])
+ − 333 mr.repetitive_dna_id = str(mResult.iloc[i,11])
+ − 334 mr.match_type = str(mResult.iloc[i,12])
+ − 335 if (mr.match_type == ""):
+ − 336 mr.score = -1
+ − 337 mr.contig_match_start = -1
+ − 338 mr.contig_match_end = -1
+ − 339 else:
+ − 340 mr.score = int(mResult.iloc[i,13])
+ − 341 mr.contig_match_start = int(mResult.iloc[i,14])
+ − 342 mr.contig_match_end = int(mResult.iloc[i,15])
+ − 343 mr.row = "\t".join(str(x) for x in mResult.ix[i].tolist())
+ − 344 _mobsuite[mr.contig_id]=(mr)
+ − 345 return _mobsuite
+ − 346
+ − 347 def ParseMobsuitePlasmids(pathToMobsuiteResult):
+ − 348 _mobsuite = {}
+ − 349 mResults = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0)
+ − 350 mResults = mResults.replace(numpy.nan, '', regex=True)
+ − 351
+ − 352 for i in range(len(mResults.index)):
+ − 353 mr = mobsuitePlasmids()
+ − 354 mr.file_id = str(mResults.iloc[i,0])
+ − 355 mr.num_contigs = int(mResults.iloc[i,1])
+ − 356 mr.total_length = int(mResults.iloc[i,2])
+ − 357 mr.gc = int(mResults.iloc[i,3])
+ − 358 mr.rep_types = str(mResults.iloc[i,4])
+ − 359 mr.rep_typeAccession = str(mResults.iloc[i,5])
+ − 360 mr.relaxase_type = str(mResults.iloc[i,6])
+ − 361 mr.relaxase_type_accession = str(mResults.iloc[i,7])
+ − 362 mr.mpf_type = str(mResults.iloc[i,8])
+ − 363 mr.mpf_type_accession = str(mResults.iloc[i,9])
+ − 364 mr.orit_type = str(mResults.iloc[i,10])
+ − 365 mr.orit_accession = str(mResults.iloc[i,11])
+ − 366 mr.PredictedMobility = str(mResults.iloc[i,12])
+ − 367 mr.mash_nearest_neighbor = str(mResults.iloc[i,13])
+ − 368 mr.mash_neighbor_distance = float(mResults.iloc[i,14])
+ − 369 mr.mash_neighbor_cluster = int(mResults.iloc[i,15])
+ − 370 mr.row = "\t".join(str(x) for x in mResults.ix[i].tolist())
+ − 371 _mobsuite[mr.file_id] = mr
+ − 372 return _mobsuite
+ − 373
+ − 374 def ParseResFinderResult(pathToResFinderResults, plasmidContigs, likelyPlasmidContigs):
+ − 375 _rFinder = {}
+ − 376 resFinder = pandas.read_csv(pathToResFinderResults, delimiter='\t', header=0)
+ − 377 resFinder = resFinder.replace(numpy.nan, '', regex=True)
+ − 378
+ − 379 for i in range(len(resFinder.index)):
+ − 380 rf = starFinders()
+ − 381 rf.file = str(resFinder.iloc[i,0])
+ − 382 rf.sequence = str(resFinder.iloc[i,1])
+ − 383 rf.start = int(resFinder.iloc[i,2])
+ − 384 rf.end = int(resFinder.iloc[i,3])
+ − 385 rf.gene = str(resFinder.iloc[i,4])
+ − 386 rf.shortGene = rf.gene
+ − 387 rf.coverage = str(resFinder.iloc[i,5])
+ − 388 rf.coverage_map = str(resFinder.iloc[i,6])
+ − 389 rf.gaps = str(resFinder.iloc[i,7])
+ − 390 rf.pCoverage = float(resFinder.iloc[i,8])
+ − 391 rf.pIdentity = float(resFinder.iloc[i,9])
+ − 392 rf.database = str(resFinder.iloc[i,10])
+ − 393 rf.accession = str(resFinder.iloc[i,11])
+ − 394 rf.product = str(resFinder.iloc[i,12])
+ − 395 rf.row = "\t".join(str(x) for x in resFinder.ix[i].tolist())
+ − 396 if (rf.sequence[6:] in plasmidContigs):
+ − 397 rf.source = "plasmid"
+ − 398 elif (rf.sequence[6:] in likelyPlasmidContigs):
+ − 399 rf.source = "likely plasmid"
+ − 400 else:
+ − 401 rf.source = "likely chromosome"
+ − 402 _rFinder[rf.gene]=rf
+ − 403 return _rFinder
+ − 404
+ − 405 def ParseRGIResult(pathToRGIResults, plasmidContigs, likelyPlasmidContigs):
+ − 406 _rgiR = {}
+ − 407 RGI = pandas.read_csv(pathToRGIResults, delimiter='\t', header=0)
+ − 408 RGI = RGI.replace(numpy.nan, '', regex=True)
+ − 409
+ − 410 for i in range(len(RGI.index)):
+ − 411 r = RGIResult()
+ − 412 r.ORF_ID = str(RGI.iloc[i,0])
+ − 413 r.Contig = str(RGI.iloc[i,1])
+ − 414 r.Contig_Num = r.Contig[6:r.Contig.find("_")]
+ − 415 r.Start = int(RGI.iloc[i,2])
+ − 416 r.Stop = int(RGI.iloc[i,3])
+ − 417 r.Orientation = str(RGI.iloc[i,4])
+ − 418 r.Cut_Off = str(RGI.iloc[i,5])
+ − 419 r.Pass_Bitscore = int(RGI.iloc[i,6])
+ − 420 r.Best_Hit_Bitscore = float(RGI.iloc[i,7])
+ − 421 r.Best_Hit_ARO = str(RGI.iloc[i,8])
+ − 422 r.Best_Identities = float(RGI.iloc[i,9])
+ − 423 r.ARO = int(RGI.iloc[i,10])
+ − 424 r.Model_type = str(RGI.iloc[i,11])
+ − 425 r.SNPs_in_Best_Hit_ARO = str(RGI.iloc[i,12])
+ − 426 r.Other_SNPs = str(RGI.iloc[i,13])
+ − 427 r.Drug_Class = str(RGI.iloc[i,14])
+ − 428 r.Resistance_Mechanism = str(RGI.iloc[i,15])
+ − 429 r.AMR_Gene_Family = str(RGI.iloc[i,16])
+ − 430 r.Predicted_DNA = str(RGI.iloc[i,17])
+ − 431 r.Predicted_Protein = str(RGI.iloc[i,18])
+ − 432 r.CARD_Protein_Sequence = str(RGI.iloc[i,19])
+ − 433 r.Percentage_Length_of_Reference_Sequence = float(RGI.iloc[i,20])
+ − 434 r.ID = str(RGI.iloc[i,21])
+ − 435 r.Model_ID = int(RGI.iloc[i,22])
+ − 436 r.row = "\t".join(str(x) for x in RGI.ix[i].tolist())
+ − 437 if (r.Contig_Num in plasmidContigs):
+ − 438 r.source = "plasmid"
+ − 439 elif (r.Contig_Num in likelyPlasmidContigs):
+ − 440 r.source = "likely plasmid"
+ − 441 else:
+ − 442 r.source = "likely chromosome"
+ − 443 _rgiR[r.Model_ID]=r
+ − 444 return _rgiR
+ − 445 #endregion
+ − 446
+ − 447 def Main():
+ − 448 notes = []
+ − 449 #init the output list
+ − 450 output = []
+ − 451 jsonOutput = []
+ − 452
+ − 453 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath)
+ − 454 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath)
+ − 455
+ − 456 #region update databases if update=true
+ − 457 if not debug:
+ − 458 #update databases if necessary
+ − 459 if not (updateAbPath == "" and updateAbName == "default"):
+ − 460 print("updating abricate database: " + updateAbName + " @fasta path: " + updateAbPath)
+ − 461 cmd = [scriptDir + "/pipeline_updateAbricateDB.sh", updateAbPath, updateAbName]
+ − 462 update = execute(cmd)
+ − 463 if (updateMLST.lower() == "true"):
+ − 464 print("updating mlst database... ")
+ − 465 cmd = [scriptDir + "/pipeline_updateMLST.sh"]
+ − 466 update = execute(cmd)
+ − 467 #endregion
+ − 468
+ − 469 print("step 3: parsing the mlst results")
+ − 470
+ − 471 print("performing MLST")
+ − 472 #region call the mlst script
+ − 473 if not debug:
+ − 474 print("running pipeline_prediction.sh")
+ − 475 #input parameters: 1=ID 2 = assemblyPath, 3= outputdir, 4=card.json
+ − 476 cmd = [scriptDir + "/pipeline_prediction.sh", ID, assemblyPath, outputDir, cardDB]
+ − 477 result = execute(cmd)
+ − 478 #endregion
+ − 479
+ − 480 #region parse the mlst results
+ − 481 print("step 3: parsing mlst, plasmid, and amr results")
+ − 482
+ − 483 print("identifying MLST")
+ − 484 pathToMLSTScheme = outputDir + "/predictions/" + ID + ".mlst"
+ − 485 mlstHit = ParseMLSTResult(pathToMLSTScheme)#***********************
+ − 486 ToJson(mlstHit, "mlst.json") #write it to a json output
+ − 487 mlstHit = list(mlstHit.values())[0]
+ − 488
+ − 489 #endregion
+ − 490
+ − 491 #region parse mobsuite, resfinder and rgi results
+ − 492 print("identifying plasmid contigs and amr genes")
+ − 493
+ − 494 plasmidContigs = []
+ − 495 likelyPlasmidContigs = []
+ − 496 origins = []
+ − 497
+ − 498 #parse mobsuite results
+ − 499 mSuite = ParseMobsuiteResult(outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#*************
+ − 500 ToJson(mSuite, "mobsuite.json") #*************
+ − 501 mSuitePlasmids = ParseMobsuitePlasmids(outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#*************
+ − 502 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #*************
+ − 503
+ − 504 for key in mSuite:
+ − 505 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs:
+ − 506 if not (mSuite[key].rep_type == ''):
+ − 507 plasmidContigs.append(mSuite[key].contig_num)
+ − 508 else:
+ − 509 likelyPlasmidContigs.append(mSuite[key].contig_num)
+ − 510 for key in mSuite:
+ − 511 if mSuite[key].rep_type not in origins:
+ − 512 origins.append(mSuite[key].rep_type)
+ − 513
+ − 514 #parse resfinder AMR results
+ − 515 rFinder = ParseResFinderResult(outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #**********************
+ − 516 ToJson(rFinder, "resfinder.json") #*************
+ − 517
+ − 518 rgiAMR = ParseRGIResult(outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#***********************
+ − 519 ToJson(rgiAMR, "rgi.json") #*************
+ − 520
+ − 521 carbapenamases = []
+ − 522 amrGenes = []
+ − 523 for keys in rFinder:
+ − 524 carbapenamases.append(rFinder[keys].shortGene + "(" + rFinder[keys].source + ")")
+ − 525 for keys in rgiAMR:
+ − 526 if (rgiAMR[keys].Drug_Class.find("carbapenem") > -1):
+ − 527 if (rgiAMR[keys].Best_Hit_ARO not in carbapenamases):
+ − 528 carbapenamases.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
+ − 529 else:
+ − 530 if (rgiAMR[keys].Best_Hit_ARO not in amrGenes):
+ − 531 amrGenes.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")")
+ − 532 #endregion
+ − 533
+ − 534 #region output parsed mlst information
+ − 535 print("formatting mlst outputs")
+ − 536 output.append("\n\n\n~~~~~~~MLST summary~~~~~~~")
+ − 537 output.append("MLST determined species: " + mlstHit.species)
+ − 538 output.append("\nMLST Details: ")
+ − 539 output.append(mlstHit.row)
+ − 540
+ − 541 output.append("\nMLST information: ")
+ − 542 if (mlstHit.species == expectedSpecies):
+ − 543 output.append("MLST determined species is the same as expected species")
+ − 544 #notes.append("MLST determined species is the same as expected species")
+ − 545 else:
+ − 546 output.append("!!!MLST determined species is NOT the same as expected species, contamination? mislabeling?")
+ − 547 notes.append("MLST: Not expected species. Possible contamination or mislabeling")
+ − 548
+ − 549 #endregion
+ − 550
+ − 551 #region output the parsed plasmid/amr results
+ − 552 output.append("\n\n\n~~~~~~~~Plasmids~~~~~~~~\n")
+ − 553
+ − 554 output.append("predicted plasmid origins: ")
+ − 555 output.append(";".join(origins))
+ − 556
+ − 557 output.append("\ndefinitely plasmid contigs")
+ − 558 output.append(";".join(plasmidContigs))
+ − 559
+ − 560 output.append("\nlikely plasmid contigs")
+ − 561 output.append(";".join(likelyPlasmidContigs))
+ − 562
+ − 563 output.append("\nmob-suite prediction details: ")
+ − 564 for key in mSuite:
+ − 565 output.append(mSuite[key].row)
+ − 566
+ − 567 output.append("\n\n\n~~~~~~~~AMR Genes~~~~~~~~\n")
+ − 568 output.append("predicted carbapenamase Genes: ")
+ − 569 output.append(",".join(carbapenamases))
+ − 570 output.append("other RGI AMR Genes: ")
+ − 571 for key in rgiAMR:
+ − 572 output.append(rgiAMR[key].Best_Hit_ARO + "(" + rgiAMR[key].source + ")")
+ − 573
+ − 574 output.append("\nDetails about the carbapenamase Genes: ")
+ − 575 for key in rFinder:
+ − 576 output.append(rFinder[key].row)
+ − 577 output.append("\nDetails about the RGI AMR Genes: ")
+ − 578 for key in rgiAMR:
+ − 579 output.append(rgiAMR[key].row)
+ − 580
+ − 581 #write summary to a file
+ − 582 summaryDir = outputDir + "/summary/" + ID
+ − 583 out = open(summaryDir + ".txt", 'w')
+ − 584 for item in output:
+ − 585 out.write("%s\n" % item)
+ − 586
+ − 587
+ − 588 #TSV output
+ − 589 tsvOut = []
+ − 590 tsvOut.append("ID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs")
+ − 591 #start with ID
+ − 592 temp = ""
+ − 593 temp += (ID + "\t")
+ − 594 temp += expectedSpecies + "\t"
+ − 595
+ − 596 #move into MLST
+ − 597 temp += mlstHit.species + "\t"
+ − 598 temp += str(mlstHit.seqType) + "\t"
+ − 599 temp += mlstHit.scheme + "\t"
+ − 600
+ − 601 #now onto AMR genes
+ − 602 temp += ";".join(carbapenamases) + "\t"
+ − 603 temp += ";".join(amrGenes) + "\t"
+ − 604
+ − 605 #lastly plasmids
+ − 606 temp+= str(len(mSuitePlasmids)) + "\t"
+ − 607 plasmidID = ""
+ − 608 contigs = ""
+ − 609 lengths = ""
+ − 610 rep_type = ""
+ − 611 mobility = ""
+ − 612 neighbour = ""
+ − 613 for keys in mSuitePlasmids:
+ − 614 plasmidID += str(mSuitePlasmids[keys].mash_neighbor_cluster) + ";"
+ − 615 contigs += str(mSuitePlasmids[keys].num_contigs) + ";"
+ − 616 lengths += str(mSuitePlasmids[keys].total_length) + ";"
+ − 617 rep_type += str(mSuitePlasmids[keys].rep_types) + ";"
+ − 618 mobility += str(mSuitePlasmids[keys].PredictedMobility) + ";"
+ − 619 neighbour += str(mSuitePlasmids[keys].mash_nearest_neighbor) + ";"
+ − 620 temp += plasmidID + "\t" + contigs + "\t" + lengths + "\t" + rep_type + "\t" + mobility + "\t" + neighbour + "\t"
+ − 621 temp += ";".join(plasmidContigs) + "\t"
+ − 622 temp += ";".join(likelyPlasmidContigs)
+ − 623 tsvOut.append(temp)
+ − 624
+ − 625 summaryDir = outputDir + "/summary/" + ID
+ − 626 out = open(summaryDir + ".tsv", 'w')
+ − 627 for item in tsvOut:
+ − 628 out.write("%s\n" % item)
+ − 629 #endregion
+ − 630
+ − 631
+ − 632 start = time.time()#time the analysis
+ − 633 print("Starting workflow...")
+ − 634 #analysis time
+ − 635 Main()
+ − 636
+ − 637 end = time.time()
+ − 638 print("Finished!\nThe analysis used: " + str(end-start) + " seconds")