Mercurial > repos > jjjjia > cpo_prediction
comparison cpo_galaxy_prediction.py @ 1:fea89c4d5227 draft
Uploaded
| author | jjjjia |
|---|---|
| date | Thu, 16 Aug 2018 19:27:05 -0400 |
| parents | |
| children | 29302ffdf137 |
comparison
equal
deleted
inserted
replaced
| 0:917a05a03ac9 | 1:fea89c4d5227 |
|---|---|
| 1 #!/home/jjjjia/.conda/envs/py36/bin/python | |
| 2 | |
| 3 #$ -S /home/jjjjia/.conda/envs/py36/bin/python | |
| 4 #$ -V # Pass environment variables to the job | |
| 5 #$ -N CPO_pipeline # Replace with a more specific job name | |
| 6 #$ -wd /home/jjjjia/testCases # Use the current working dir | |
| 7 #$ -pe smp 8 # Parallel Environment (how many cores) | |
| 8 #$ -l h_vmem=11G # Memory (RAM) allocation *per core* | |
| 9 #$ -e ./logs/$JOB_ID.err | |
| 10 #$ -o ./logs/$JOB_ID.log | |
| 11 #$ -m ea | |
| 12 #$ -M bja20@sfu.ca | |
| 13 | |
| 14 #~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" | |
| 15 | |
| 16 import subprocess | |
| 17 import pandas | |
| 18 import optparse | |
| 19 import os | |
| 20 import datetime | |
| 21 import sys | |
| 22 import time | |
| 23 import urllib.request | |
| 24 import gzip | |
| 25 import collections | |
| 26 import json | |
| 27 import numpy | |
| 28 | |
| 29 | |
| 30 debug = True #debug skips the shell scripts and also dump out a ton of debugging messages | |
| 31 | |
| 32 if not debug: | |
| 33 #parses some parameters | |
| 34 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") | |
| 35 #required | |
| 36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate") | |
| 37 parser.add_option("-a", "--assembly", dest="assemblyPath", type="string", help="absolute file path to contigs fasta") | |
| 38 parser.add_option("-c", "--card-db", dest="cardDB", default = "/home/jjjjia/databases/card202.json", type="string", help="absolute path to card reference database") | |
| 39 parser.add_option("-o", "--output", dest="output", default='./', type="string", help="absolute path to output folder") | |
| 40 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") | |
| 41 | |
| 42 #optionals | |
| 43 parser.add_option("-k", "--script-path", dest="scriptDir", default="/home/jjjjia/scripts", type="string", help="absolute file path to this script folder") | |
| 44 parser.add_option("-b", "--update-abricate-path", dest="updateAbPath", default = "", type="string", help="absolute file path to fasta sequence used for abricate database") | |
| 45 parser.add_option("-m", "--update-abricate-dbname", dest="updateAbName", default = "default", type="string", help="name of abricate database to update") | |
| 46 parser.add_option("-u", "--update-mlst", dest="updateMLST", default = "False", type="string", help="True = update MLST") | |
| 47 #used for parsing | |
| 48 parser.add_option("-s", "--mlst-scheme", dest="mlst", default= "/home/jjjjia/databases/scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") | |
| 49 | |
| 50 | |
| 51 #parallelization, useless, these are hard coded to 8cores/64G RAM | |
| 52 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") | |
| 53 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") | |
| 54 | |
| 55 (options,args) = parser.parse_args() | |
| 56 #if len(args) != 8: | |
| 57 #parser.error("incorrect number of arguments, all 7 is required") | |
| 58 curDir = os.getcwd() | |
| 59 outputDir = options.output | |
| 60 expectedSpecies = options.expectedSpecies | |
| 61 mlstScheme = options.mlst | |
| 62 tempDir = outputDir + "/shovillTemp" | |
| 63 scriptDir = options.scriptDir | |
| 64 updateAbName = options.updateAbName | |
| 65 updateAbPath = options.updateAbPath | |
| 66 updateMLST = options.updateMLST | |
| 67 cardDB=options.cardDB | |
| 68 assemblyPath=options.assemblyPath | |
| 69 ID = options.id | |
| 70 else: | |
| 71 manifestDir = "" | |
| 72 curDir = os.getcwd() | |
| 73 outputDir = "pipelineTest" | |
| 74 expectedSpecies = "Escherichia coli" | |
| 75 #threads = 8 | |
| 76 #memory = 64 | |
| 77 mlstScheme = outputDir + "/scheme_species_map.tab" | |
| 78 tempDir= outputDir + "/shovillTemp" | |
| 79 scriptDir = "~/scripts" | |
| 80 updateAbName = "cpo" | |
| 81 updateAbPath = "~/database/bccdcCPO.seq" | |
| 82 updateMLST = True | |
| 83 assemblyPath = "./" | |
| 84 cardDB = "./" | |
| 85 ID = "BC11-Kpn005_S2" | |
| 86 | |
| 87 #region result objects | |
| 88 #define some objects to store values from results | |
| 89 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). | |
| 90 class starFinders(object): | |
| 91 def __init__(self): | |
| 92 self.file = "" | |
| 93 self.sequence = "" | |
| 94 self.start = 0 | |
| 95 self.end = 0 | |
| 96 self.gene = "" | |
| 97 self.shortGene = "" | |
| 98 self.coverage = "" | |
| 99 self.coverage_map = "" | |
| 100 self.gaps = "" | |
| 101 self.pCoverage = 100.00 | |
| 102 self.pIdentity = 100.00 | |
| 103 self.database = "" | |
| 104 self.accession = "" | |
| 105 self.product = "" | |
| 106 self.source = "chromosome" | |
| 107 self.row = "" | |
| 108 | |
| 109 class PlasFlowResult(object): | |
| 110 def __init__(self): | |
| 111 self.sequence = "" | |
| 112 self.length = 0 | |
| 113 self.label = "" | |
| 114 self.confidence = 0 | |
| 115 self.usefulRow = "" | |
| 116 self.row = "" | |
| 117 | |
| 118 class MlstResult(object): | |
| 119 def __init__(self): | |
| 120 self.file = "" | |
| 121 self.speciesID = "" | |
| 122 self.seqType = 0 | |
| 123 self.scheme = "" | |
| 124 self.species = "" | |
| 125 self.row="" | |
| 126 | |
| 127 class mobsuiteResult(object): | |
| 128 def __init__(self): | |
| 129 self.file_id = "" | |
| 130 self.cluster_id = "" | |
| 131 self.contig_id = "" | |
| 132 self.contig_num = 0 | |
| 133 self.contig_length = 0 | |
| 134 self.circularity_status = "" | |
| 135 self.rep_type = "" | |
| 136 self.rep_type_accession = "" | |
| 137 self.relaxase_type = "" | |
| 138 self.relaxase_type_accession = "" | |
| 139 self.mash_nearest_neighbor = "" | |
| 140 self.mash_neighbor_distance = 0.00 | |
| 141 self.repetitive_dna_id = "" | |
| 142 self.match_type = "" | |
| 143 self.score = 0 | |
| 144 self.contig_match_start = 0 | |
| 145 self.contig_match_end = 0 | |
| 146 self.row = "" | |
| 147 | |
| 148 class mobsuitePlasmids(object): | |
| 149 def __init__(self): | |
| 150 self.file_id = "" | |
| 151 self.num_contigs = 0 | |
| 152 self.total_length = 0 | |
| 153 self.gc = "" | |
| 154 self.rep_types = "" | |
| 155 self.rep_typeAccession = "" | |
| 156 self.relaxase_type= "" | |
| 157 self.relaxase_type_accession = "" | |
| 158 self.mpf_type = "" | |
| 159 self.mpf_type_accession= "" | |
| 160 self.orit_type = "" | |
| 161 self.orit_accession = "" | |
| 162 self.PredictedMobility = "" | |
| 163 self.mash_nearest_neighbor = "" | |
| 164 self.mash_neighbor_distance = 0.00 | |
| 165 self.mash_neighbor_cluster= 0 | |
| 166 self.row = "" | |
| 167 class RGIResult(object): | |
| 168 def __init__(self): | |
| 169 self.ORF_ID = "" | |
| 170 self.Contig = "" | |
| 171 self.Start = -1 | |
| 172 self.Stop = -1 | |
| 173 self.Orientation = "" | |
| 174 self.Cut_Off = "" | |
| 175 self.Pass_Bitscore = 100000 | |
| 176 self.Best_Hit_Bitscore = 0.00 | |
| 177 self.Best_Hit_ARO = "" | |
| 178 self.Best_Identities = 0.00 | |
| 179 self.ARO = 0 | |
| 180 self.Model_type = "" | |
| 181 self.SNPs_in_Best_Hit_ARO = "" | |
| 182 self.Other_SNPs = "" | |
| 183 self.Drug_Class = "" | |
| 184 self.Resistance_Mechanism = "" | |
| 185 self.AMR_Gene_Family = "" | |
| 186 self.Predicted_DNA = "" | |
| 187 self.Predicted_Protein = "" | |
| 188 self.CARD_Protein_Sequence = "" | |
| 189 self.Percentage_Length_of_Reference_Sequence = 0.00 | |
| 190 self.ID = "" | |
| 191 self.Model_ID = 0 | |
| 192 self.source = "" | |
| 193 self.row = "" | |
| 194 | |
| 195 #endregion | |
| 196 | |
| 197 #region useful functions | |
| 198 def read(path): | |
| 199 return [line.rstrip('\n') for line in open(path)] | |
| 200 def execute(command): | |
| 201 process = subprocess.Popen(command, shell=False, cwd=curDir, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) | |
| 202 | |
| 203 # Poll process for new output until finished | |
| 204 while True: | |
| 205 nextline = process.stdout.readline() | |
| 206 if nextline == '' and process.poll() is not None: | |
| 207 break | |
| 208 sys.stdout.write(nextline) | |
| 209 sys.stdout.flush() | |
| 210 | |
| 211 output = process.communicate()[0] | |
| 212 exitCode = process.returncode | |
| 213 | |
| 214 if (exitCode == 0): | |
| 215 return output | |
| 216 else: | |
| 217 raise subprocess.CalledProcessError(exitCode, command) | |
| 218 def httpGetFile(url, filepath=""): | |
| 219 if (filepath == ""): | |
| 220 return urllib.request.urlretrieve(url) | |
| 221 else: | |
| 222 urllib.request.urlretrieve(url, filepath) | |
| 223 return True | |
| 224 def gunzip(inputpath="", outputpath=""): | |
| 225 if (outputpath == ""): | |
| 226 with gzip.open(inputpath, 'rb') as f: | |
| 227 gzContent = f.read() | |
| 228 return gzContent | |
| 229 else: | |
| 230 with gzip.open(inputpath, 'rb') as f: | |
| 231 gzContent = f.read() | |
| 232 with open(outputpath, 'wb') as out: | |
| 233 out.write(gzContent) | |
| 234 return True | |
| 235 def ToJson(dictObject, outputPath): | |
| 236 outDir = outputDir + '/summary/' + ID + ".json/" | |
| 237 if not (os.path.exists(outDir)): | |
| 238 os.makedirs(outDir) | |
| 239 with open(outDir + outputPath, 'w') as f: | |
| 240 json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False) | |
| 241 #endregion | |
| 242 | |
| 243 #region functions to parse result files | |
| 244 def ParseMLSTResult(pathToMLSTResult): | |
| 245 _mlstResult = {} | |
| 246 scheme = pandas.read_csv(mlstScheme, delimiter='\t', header=0) | |
| 247 scheme = scheme.replace(numpy.nan, '', regex=True) | |
| 248 | |
| 249 taxon = {} | |
| 250 #record the scheme as a dictionary | |
| 251 taxon["-"] = "No MLST Match" | |
| 252 for i in range(len(scheme.index)): | |
| 253 key = scheme.iloc[i,0] | |
| 254 if (str(scheme.iloc[i,2]) == "nan"): | |
| 255 value = str(scheme.iloc[i,1]) | |
| 256 else: | |
| 257 value = str(scheme.iloc[i,1]) + " " + str(scheme.iloc[i,2]) | |
| 258 | |
| 259 if (key in taxon.keys()): | |
| 260 taxon[key] = taxon.get(key) + ";" + value | |
| 261 else: | |
| 262 taxon[key] = value | |
| 263 #read in the mlst result | |
| 264 mlst = pandas.read_csv(pathToMLSTResult, delimiter='\t', header=None) | |
| 265 _mlstHit = MlstResult() | |
| 266 | |
| 267 _mlstHit.file = mlst.iloc[0,0] | |
| 268 _mlstHit.speciesID = (mlst.iloc[0,1]) | |
| 269 _mlstHit.seqType = str(mlst.iloc[0,2]) | |
| 270 for i in range(3, len(mlst.columns)): | |
| 271 _mlstHit.scheme += mlst.iloc[0,i] + ";" | |
| 272 _mlstHit.species = taxon[_mlstHit.speciesID] | |
| 273 _mlstHit.row = "\t".join(str(x) for x in mlst.ix[0].tolist()) | |
| 274 _mlstResult[_mlstHit.speciesID]=_mlstHit | |
| 275 | |
| 276 return _mlstResult | |
| 277 | |
| 278 def ParsePlasmidFinderResult(pathToPlasmidFinderResult): | |
| 279 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918 | |
| 280 #example resfinder: | |
| 281 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181 | |
| 282 | |
| 283 _pFinder = {} #*********************** | |
| 284 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0) | |
| 285 plasmidFinder = plasmidFinder.replace(numpy.nan, '', regex=True) | |
| 286 | |
| 287 | |
| 288 for i in range(len(plasmidFinder.index)): | |
| 289 pf = starFinders() | |
| 290 pf.file = str(plasmidFinder.iloc[i,0]) | |
| 291 pf.sequence = str(plasmidFinder.iloc[i,1]) | |
| 292 pf.start = int(plasmidFinder.iloc[i,2]) | |
| 293 pf.end = int(plasmidFinder.iloc[i,3]) | |
| 294 pf.gene = str(plasmidFinder.iloc[i,4]) | |
| 295 pf.shortGene = pf.gene[:pf.gene.index("_")] | |
| 296 pf.coverage = str(plasmidFinder.iloc[i,5]) | |
| 297 pf.coverage_map = str(plasmidFinder.iloc[i,6]) | |
| 298 pf.gaps = str(plasmidFinder.iloc[i,7]) | |
| 299 pf.pCoverage = float(plasmidFinder.iloc[i,8]) | |
| 300 pf.pIdentity = float(plasmidFinder.iloc[i,9]) | |
| 301 pf.database = str(plasmidFinder.iloc[i,10]) | |
| 302 pf.accession = str(plasmidFinder.iloc[i,11]) | |
| 303 pf.product = str(plasmidFinder.iloc[i,12]) | |
| 304 pf.source = "plasmid" | |
| 305 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) | |
| 306 _pFinder[pf.gene]=pf | |
| 307 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) | |
| 308 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) | |
| 309 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) | |
| 310 return _pFinder | |
| 311 | |
| 312 def ParseMobsuiteResult(pathToMobsuiteResult): | |
| 313 _mobsuite = {} | |
| 314 mResult = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0) | |
| 315 mResult = mResult.replace(numpy.nan, '', regex=True) | |
| 316 | |
| 317 for i in range(len(mResult.index)): | |
| 318 mr = mobsuiteResult() | |
| 319 mr.file_id = str(mResult.iloc[i,0]) | |
| 320 mr.cluster_id = str(mResult.iloc[i,1]) | |
| 321 if (mr.cluster_id == "chromosome"): | |
| 322 break | |
| 323 mr.contig_id = str(mResult.iloc[i,2]) | |
| 324 mr.contig_num = mr.contig_id[(mr.contig_id.find("contig")+6):mr.contig_id.find("_len=")] | |
| 325 mr.contig_length = int(mResult.iloc[i,3]) | |
| 326 mr.circularity_status = str(mResult.iloc[i,4]) | |
| 327 mr.rep_type = str(mResult.iloc[i,5]) | |
| 328 mr.rep_type_accession = str(mResult.iloc[i,6]) | |
| 329 mr.relaxase_type = str(mResult.iloc[i,7]) | |
| 330 mr.relaxase_type_accession = str(mResult.iloc[i,8]) | |
| 331 mr.mash_nearest_neighbor = str(mResult.iloc[i,9]) | |
| 332 mr.mash_neighbor_distance = float(mResult.iloc[i,10]) | |
| 333 mr.repetitive_dna_id = str(mResult.iloc[i,11]) | |
| 334 mr.match_type = str(mResult.iloc[i,12]) | |
| 335 if (mr.match_type == ""): | |
| 336 mr.score = -1 | |
| 337 mr.contig_match_start = -1 | |
| 338 mr.contig_match_end = -1 | |
| 339 else: | |
| 340 mr.score = int(mResult.iloc[i,13]) | |
| 341 mr.contig_match_start = int(mResult.iloc[i,14]) | |
| 342 mr.contig_match_end = int(mResult.iloc[i,15]) | |
| 343 mr.row = "\t".join(str(x) for x in mResult.ix[i].tolist()) | |
| 344 _mobsuite[mr.contig_id]=(mr) | |
| 345 return _mobsuite | |
| 346 | |
| 347 def ParseMobsuitePlasmids(pathToMobsuiteResult): | |
| 348 _mobsuite = {} | |
| 349 mResults = pandas.read_csv(pathToMobsuiteResult, delimiter='\t', header=0) | |
| 350 mResults = mResults.replace(numpy.nan, '', regex=True) | |
| 351 | |
| 352 for i in range(len(mResults.index)): | |
| 353 mr = mobsuitePlasmids() | |
| 354 mr.file_id = str(mResults.iloc[i,0]) | |
| 355 mr.num_contigs = int(mResults.iloc[i,1]) | |
| 356 mr.total_length = int(mResults.iloc[i,2]) | |
| 357 mr.gc = int(mResults.iloc[i,3]) | |
| 358 mr.rep_types = str(mResults.iloc[i,4]) | |
| 359 mr.rep_typeAccession = str(mResults.iloc[i,5]) | |
| 360 mr.relaxase_type = str(mResults.iloc[i,6]) | |
| 361 mr.relaxase_type_accession = str(mResults.iloc[i,7]) | |
| 362 mr.mpf_type = str(mResults.iloc[i,8]) | |
| 363 mr.mpf_type_accession = str(mResults.iloc[i,9]) | |
| 364 mr.orit_type = str(mResults.iloc[i,10]) | |
| 365 mr.orit_accession = str(mResults.iloc[i,11]) | |
| 366 mr.PredictedMobility = str(mResults.iloc[i,12]) | |
| 367 mr.mash_nearest_neighbor = str(mResults.iloc[i,13]) | |
| 368 mr.mash_neighbor_distance = float(mResults.iloc[i,14]) | |
| 369 mr.mash_neighbor_cluster = int(mResults.iloc[i,15]) | |
| 370 mr.row = "\t".join(str(x) for x in mResults.ix[i].tolist()) | |
| 371 _mobsuite[mr.file_id] = mr | |
| 372 return _mobsuite | |
| 373 | |
| 374 def ParseResFinderResult(pathToResFinderResults, plasmidContigs, likelyPlasmidContigs): | |
| 375 _rFinder = {} | |
| 376 resFinder = pandas.read_csv(pathToResFinderResults, delimiter='\t', header=0) | |
| 377 resFinder = resFinder.replace(numpy.nan, '', regex=True) | |
| 378 | |
| 379 for i in range(len(resFinder.index)): | |
| 380 rf = starFinders() | |
| 381 rf.file = str(resFinder.iloc[i,0]) | |
| 382 rf.sequence = str(resFinder.iloc[i,1]) | |
| 383 rf.start = int(resFinder.iloc[i,2]) | |
| 384 rf.end = int(resFinder.iloc[i,3]) | |
| 385 rf.gene = str(resFinder.iloc[i,4]) | |
| 386 rf.shortGene = rf.gene | |
| 387 rf.coverage = str(resFinder.iloc[i,5]) | |
| 388 rf.coverage_map = str(resFinder.iloc[i,6]) | |
| 389 rf.gaps = str(resFinder.iloc[i,7]) | |
| 390 rf.pCoverage = float(resFinder.iloc[i,8]) | |
| 391 rf.pIdentity = float(resFinder.iloc[i,9]) | |
| 392 rf.database = str(resFinder.iloc[i,10]) | |
| 393 rf.accession = str(resFinder.iloc[i,11]) | |
| 394 rf.product = str(resFinder.iloc[i,12]) | |
| 395 rf.row = "\t".join(str(x) for x in resFinder.ix[i].tolist()) | |
| 396 if (rf.sequence[6:] in plasmidContigs): | |
| 397 rf.source = "plasmid" | |
| 398 elif (rf.sequence[6:] in likelyPlasmidContigs): | |
| 399 rf.source = "likely plasmid" | |
| 400 else: | |
| 401 rf.source = "likely chromosome" | |
| 402 _rFinder[rf.gene]=rf | |
| 403 return _rFinder | |
| 404 | |
| 405 def ParseRGIResult(pathToRGIResults, plasmidContigs, likelyPlasmidContigs): | |
| 406 _rgiR = {} | |
| 407 RGI = pandas.read_csv(pathToRGIResults, delimiter='\t', header=0) | |
| 408 RGI = RGI.replace(numpy.nan, '', regex=True) | |
| 409 | |
| 410 for i in range(len(RGI.index)): | |
| 411 r = RGIResult() | |
| 412 r.ORF_ID = str(RGI.iloc[i,0]) | |
| 413 r.Contig = str(RGI.iloc[i,1]) | |
| 414 r.Contig_Num = r.Contig[6:r.Contig.find("_")] | |
| 415 r.Start = int(RGI.iloc[i,2]) | |
| 416 r.Stop = int(RGI.iloc[i,3]) | |
| 417 r.Orientation = str(RGI.iloc[i,4]) | |
| 418 r.Cut_Off = str(RGI.iloc[i,5]) | |
| 419 r.Pass_Bitscore = int(RGI.iloc[i,6]) | |
| 420 r.Best_Hit_Bitscore = float(RGI.iloc[i,7]) | |
| 421 r.Best_Hit_ARO = str(RGI.iloc[i,8]) | |
| 422 r.Best_Identities = float(RGI.iloc[i,9]) | |
| 423 r.ARO = int(RGI.iloc[i,10]) | |
| 424 r.Model_type = str(RGI.iloc[i,11]) | |
| 425 r.SNPs_in_Best_Hit_ARO = str(RGI.iloc[i,12]) | |
| 426 r.Other_SNPs = str(RGI.iloc[i,13]) | |
| 427 r.Drug_Class = str(RGI.iloc[i,14]) | |
| 428 r.Resistance_Mechanism = str(RGI.iloc[i,15]) | |
| 429 r.AMR_Gene_Family = str(RGI.iloc[i,16]) | |
| 430 r.Predicted_DNA = str(RGI.iloc[i,17]) | |
| 431 r.Predicted_Protein = str(RGI.iloc[i,18]) | |
| 432 r.CARD_Protein_Sequence = str(RGI.iloc[i,19]) | |
| 433 r.Percentage_Length_of_Reference_Sequence = float(RGI.iloc[i,20]) | |
| 434 r.ID = str(RGI.iloc[i,21]) | |
| 435 r.Model_ID = int(RGI.iloc[i,22]) | |
| 436 r.row = "\t".join(str(x) for x in RGI.ix[i].tolist()) | |
| 437 if (r.Contig_Num in plasmidContigs): | |
| 438 r.source = "plasmid" | |
| 439 elif (r.Contig_Num in likelyPlasmidContigs): | |
| 440 r.source = "likely plasmid" | |
| 441 else: | |
| 442 r.source = "likely chromosome" | |
| 443 _rgiR[r.Model_ID]=r | |
| 444 return _rgiR | |
| 445 #endregion | |
| 446 | |
| 447 def Main(): | |
| 448 notes = [] | |
| 449 #init the output list | |
| 450 output = [] | |
| 451 jsonOutput = [] | |
| 452 | |
| 453 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) | |
| 454 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) | |
| 455 | |
| 456 #region update databases if update=true | |
| 457 if not debug: | |
| 458 #update databases if necessary | |
| 459 if not (updateAbPath == "" and updateAbName == "default"): | |
| 460 print("updating abricate database: " + updateAbName + " @fasta path: " + updateAbPath) | |
| 461 cmd = [scriptDir + "/pipeline_updateAbricateDB.sh", updateAbPath, updateAbName] | |
| 462 update = execute(cmd) | |
| 463 if (updateMLST.lower() == "true"): | |
| 464 print("updating mlst database... ") | |
| 465 cmd = [scriptDir + "/pipeline_updateMLST.sh"] | |
| 466 update = execute(cmd) | |
| 467 #endregion | |
| 468 | |
| 469 print("step 3: parsing the mlst results") | |
| 470 | |
| 471 print("performing MLST") | |
| 472 #region call the mlst script | |
| 473 if not debug: | |
| 474 print("running pipeline_prediction.sh") | |
| 475 #input parameters: 1=ID 2 = assemblyPath, 3= outputdir, 4=card.json | |
| 476 cmd = [scriptDir + "/pipeline_prediction.sh", ID, assemblyPath, outputDir, cardDB] | |
| 477 result = execute(cmd) | |
| 478 #endregion | |
| 479 | |
| 480 #region parse the mlst results | |
| 481 print("step 3: parsing mlst, plasmid, and amr results") | |
| 482 | |
| 483 print("identifying MLST") | |
| 484 pathToMLSTScheme = outputDir + "/predictions/" + ID + ".mlst" | |
| 485 mlstHit = ParseMLSTResult(pathToMLSTScheme)#*********************** | |
| 486 ToJson(mlstHit, "mlst.json") #write it to a json output | |
| 487 mlstHit = list(mlstHit.values())[0] | |
| 488 | |
| 489 #endregion | |
| 490 | |
| 491 #region parse mobsuite, resfinder and rgi results | |
| 492 print("identifying plasmid contigs and amr genes") | |
| 493 | |
| 494 plasmidContigs = [] | |
| 495 likelyPlasmidContigs = [] | |
| 496 origins = [] | |
| 497 | |
| 498 #parse mobsuite results | |
| 499 mSuite = ParseMobsuiteResult(outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#************* | |
| 500 ToJson(mSuite, "mobsuite.json") #************* | |
| 501 mSuitePlasmids = ParseMobsuitePlasmids(outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#************* | |
| 502 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #************* | |
| 503 | |
| 504 for key in mSuite: | |
| 505 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs: | |
| 506 if not (mSuite[key].rep_type == ''): | |
| 507 plasmidContigs.append(mSuite[key].contig_num) | |
| 508 else: | |
| 509 likelyPlasmidContigs.append(mSuite[key].contig_num) | |
| 510 for key in mSuite: | |
| 511 if mSuite[key].rep_type not in origins: | |
| 512 origins.append(mSuite[key].rep_type) | |
| 513 | |
| 514 #parse resfinder AMR results | |
| 515 rFinder = ParseResFinderResult(outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** | |
| 516 ToJson(rFinder, "resfinder.json") #************* | |
| 517 | |
| 518 rgiAMR = ParseRGIResult(outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** | |
| 519 ToJson(rgiAMR, "rgi.json") #************* | |
| 520 | |
| 521 carbapenamases = [] | |
| 522 amrGenes = [] | |
| 523 for keys in rFinder: | |
| 524 carbapenamases.append(rFinder[keys].shortGene + "(" + rFinder[keys].source + ")") | |
| 525 for keys in rgiAMR: | |
| 526 if (rgiAMR[keys].Drug_Class.find("carbapenem") > -1): | |
| 527 if (rgiAMR[keys].Best_Hit_ARO not in carbapenamases): | |
| 528 carbapenamases.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")") | |
| 529 else: | |
| 530 if (rgiAMR[keys].Best_Hit_ARO not in amrGenes): | |
| 531 amrGenes.append(rgiAMR[keys].Best_Hit_ARO+ "(" + rgiAMR[keys].source + ")") | |
| 532 #endregion | |
| 533 | |
| 534 #region output parsed mlst information | |
| 535 print("formatting mlst outputs") | |
| 536 output.append("\n\n\n~~~~~~~MLST summary~~~~~~~") | |
| 537 output.append("MLST determined species: " + mlstHit.species) | |
| 538 output.append("\nMLST Details: ") | |
| 539 output.append(mlstHit.row) | |
| 540 | |
| 541 output.append("\nMLST information: ") | |
| 542 if (mlstHit.species == expectedSpecies): | |
| 543 output.append("MLST determined species is the same as expected species") | |
| 544 #notes.append("MLST determined species is the same as expected species") | |
| 545 else: | |
| 546 output.append("!!!MLST determined species is NOT the same as expected species, contamination? mislabeling?") | |
| 547 notes.append("MLST: Not expected species. Possible contamination or mislabeling") | |
| 548 | |
| 549 #endregion | |
| 550 | |
| 551 #region output the parsed plasmid/amr results | |
| 552 output.append("\n\n\n~~~~~~~~Plasmids~~~~~~~~\n") | |
| 553 | |
| 554 output.append("predicted plasmid origins: ") | |
| 555 output.append(";".join(origins)) | |
| 556 | |
| 557 output.append("\ndefinitely plasmid contigs") | |
| 558 output.append(";".join(plasmidContigs)) | |
| 559 | |
| 560 output.append("\nlikely plasmid contigs") | |
| 561 output.append(";".join(likelyPlasmidContigs)) | |
| 562 | |
| 563 output.append("\nmob-suite prediction details: ") | |
| 564 for key in mSuite: | |
| 565 output.append(mSuite[key].row) | |
| 566 | |
| 567 output.append("\n\n\n~~~~~~~~AMR Genes~~~~~~~~\n") | |
| 568 output.append("predicted carbapenamase Genes: ") | |
| 569 output.append(",".join(carbapenamases)) | |
| 570 output.append("other RGI AMR Genes: ") | |
| 571 for key in rgiAMR: | |
| 572 output.append(rgiAMR[key].Best_Hit_ARO + "(" + rgiAMR[key].source + ")") | |
| 573 | |
| 574 output.append("\nDetails about the carbapenamase Genes: ") | |
| 575 for key in rFinder: | |
| 576 output.append(rFinder[key].row) | |
| 577 output.append("\nDetails about the RGI AMR Genes: ") | |
| 578 for key in rgiAMR: | |
| 579 output.append(rgiAMR[key].row) | |
| 580 | |
| 581 #write summary to a file | |
| 582 summaryDir = outputDir + "/summary/" + ID | |
| 583 out = open(summaryDir + ".txt", 'w') | |
| 584 for item in output: | |
| 585 out.write("%s\n" % item) | |
| 586 | |
| 587 | |
| 588 #TSV output | |
| 589 tsvOut = [] | |
| 590 tsvOut.append("ID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") | |
| 591 #start with ID | |
| 592 temp = "" | |
| 593 temp += (ID + "\t") | |
| 594 temp += expectedSpecies + "\t" | |
| 595 | |
| 596 #move into MLST | |
| 597 temp += mlstHit.species + "\t" | |
| 598 temp += str(mlstHit.seqType) + "\t" | |
| 599 temp += mlstHit.scheme + "\t" | |
| 600 | |
| 601 #now onto AMR genes | |
| 602 temp += ";".join(carbapenamases) + "\t" | |
| 603 temp += ";".join(amrGenes) + "\t" | |
| 604 | |
| 605 #lastly plasmids | |
| 606 temp+= str(len(mSuitePlasmids)) + "\t" | |
| 607 plasmidID = "" | |
| 608 contigs = "" | |
| 609 lengths = "" | |
| 610 rep_type = "" | |
| 611 mobility = "" | |
| 612 neighbour = "" | |
| 613 for keys in mSuitePlasmids: | |
| 614 plasmidID += str(mSuitePlasmids[keys].mash_neighbor_cluster) + ";" | |
| 615 contigs += str(mSuitePlasmids[keys].num_contigs) + ";" | |
| 616 lengths += str(mSuitePlasmids[keys].total_length) + ";" | |
| 617 rep_type += str(mSuitePlasmids[keys].rep_types) + ";" | |
| 618 mobility += str(mSuitePlasmids[keys].PredictedMobility) + ";" | |
| 619 neighbour += str(mSuitePlasmids[keys].mash_nearest_neighbor) + ";" | |
| 620 temp += plasmidID + "\t" + contigs + "\t" + lengths + "\t" + rep_type + "\t" + mobility + "\t" + neighbour + "\t" | |
| 621 temp += ";".join(plasmidContigs) + "\t" | |
| 622 temp += ";".join(likelyPlasmidContigs) | |
| 623 tsvOut.append(temp) | |
| 624 | |
| 625 summaryDir = outputDir + "/summary/" + ID | |
| 626 out = open(summaryDir + ".tsv", 'w') | |
| 627 for item in tsvOut: | |
| 628 out.write("%s\n" % item) | |
| 629 #endregion | |
| 630 | |
| 631 | |
| 632 start = time.time()#time the analysis | |
| 633 print("Starting workflow...") | |
| 634 #analysis time | |
| 635 Main() | |
| 636 | |
| 637 end = time.time() | |
| 638 print("Finished!\nThe analysis used: " + str(end-start) + " seconds") |
