Mercurial > repos > jjjjia > cpo_prediction
comparison cpo_galaxy_prediction.py @ 3:e6027598a35c draft
planemo upload
author | jjjjia |
---|---|
date | Mon, 20 Aug 2018 17:53:59 -0400 |
parents | 29302ffdf137 |
children | bd6f5844d60e |
comparison
equal
deleted
inserted
replaced
2:29302ffdf137 | 3:e6027598a35c |
---|---|
9 #$ -e ./logs/$JOB_ID.err | 9 #$ -e ./logs/$JOB_ID.err |
10 #$ -o ./logs/$JOB_ID.log | 10 #$ -o ./logs/$JOB_ID.log |
11 #$ -m ea | 11 #$ -m ea |
12 #$ -M bja20@sfu.ca | 12 #$ -M bja20@sfu.ca |
13 | 13 |
14 #~/scripts/pipeline.py -i BC11-Kpn005_S2 -f /data/jjjjia/R1/BC11-Kpn005_S2_L001_R1_001.fastq.gz -r /data/jjjjia/R2/BC11-Kpn005_S2_L001_R2_001.fastq.gz -o pipelineResultsQsub -e "Klebsiella pneumoniae" | 14 #./prediction.py -i ~/testCases/cpoResults/contigs/BC11-Kpn005_S2.fa -m ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.mlst -c ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/contig_report.txt -f ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.recon/mobtyper_aggregate_report.txt -a ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.cp -r ~/testCases/predictionResultsQsubTest/predictions/BC11-Kpn005_S2.rgi.txt -e "Klebsiella" |
15 | |
16 import subprocess | 15 import subprocess |
17 import pandas | 16 import pandas |
18 import optparse | 17 import optparse |
19 import os | 18 import os |
20 import datetime | 19 import datetime |
25 import collections | 24 import collections |
26 import json | 25 import json |
27 import numpy | 26 import numpy |
28 | 27 |
29 | 28 |
30 debug = False #True #debug skips the shell scripts and also dump out a ton of debugging messages | 29 debug = True #debug skips the shell scripts and also dump out a ton of debugging messages |
31 | 30 |
32 if not debug: | 31 if not debug: |
33 #parses some parameters | 32 #parses some parameters |
34 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") | 33 parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") |
35 #required | 34 #required |
35 #MLSTHIT, mobsuite, resfinder, rgi, mlstscheme | |
36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate") | 36 parser.add_option("-i", "--id", dest="id", type="string", help="identifier of the isolate") |
37 parser.add_option("-a", "--assembly", dest="assemblyPath", type="string", help="absolute file path to contigs fasta") | 37 parser.add_option("-m", "--mlst", dest="mlst", type="string", help="absolute file path to mlst result") |
38 parser.add_option("-c", "--card-db", dest="cardDB", default = "/home/jjjjia/databases/card202.json", type="string", help="absolute path to card reference database") | 38 parser.add_option("-c", "--mobfinderContig", dest="mobfinderContig", type="string", help="absolute path to mobfinder aggregate result") |
39 parser.add_option("-o", "--output", dest="output", default='./', type="string", help="absolute path to output folder") | 39 parser.add_option("-f", "--mobfinderAggregate", dest="mobfinderAggregate", type="string", help="absolute path to mobfinder plasmid results") |
40 parser.add_option("-a", "--abricate", dest="abricate", type="string", help="absolute path to abricate results") | |
41 parser.add_option("-r", "--rgi", dest="rgi", type="string", help="absolute path to rgi results") | |
40 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") | 42 parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") |
41 | 43 parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") |
42 #optionals | 44 parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ") |
43 parser.add_option("-k", "--script-path", dest="scriptDir", default="/home/jjjjia/scripts", type="string", help="absolute file path to this script folder") | 45 |
44 parser.add_option("-b", "--update-abricate-path", dest="updateAbPath", default = "", type="string", help="absolute file path to fasta sequence used for abricate database") | |
45 parser.add_option("-m", "--update-abricate-dbname", dest="updateAbName", default = "default", type="string", help="name of abricate database to update") | |
46 parser.add_option("-u", "--update-mlst", dest="updateMLST", default = "False", type="string", help="True = update MLST") | |
47 #used for parsing | |
48 parser.add_option("-s", "--mlst-scheme", dest="mlst", default= "/home/jjjjia/databases/scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") | |
49 | |
50 | |
51 #parallelization, useless, these are hard coded to 8cores/64G RAM | 46 #parallelization, useless, these are hard coded to 8cores/64G RAM |
52 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") | 47 #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") |
53 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") | 48 #parser.add_option("-p", "--memory", dest="memory", default=64, type="int", help="memory to use in GB") |
54 | 49 |
55 (options,args) = parser.parse_args() | 50 (options,args) = parser.parse_args() |
56 #if len(args) != 8: | 51 #if len(args) != 8: |
57 #parser.error("incorrect number of arguments, all 7 is required") | 52 #parser.error("incorrect number of arguments, all 7 is required") |
58 curDir = os.getcwd() | 53 curDir = os.getcwd() |
59 outputDir = options.output | 54 ID = str(options.id).lstrip().rstrip() |
60 expectedSpecies = options.expectedSpecies | 55 mlst = str(options.mlst).lstrip().rstrip() |
61 mlstScheme = options.mlst | 56 mobfindercontig = str(options.mobfinderContig).lstrip().rstrip() |
62 tempDir = outputDir + "/shovillTemp" | 57 mobfinderaggregate = str(options.mobfinderAggregate).lstrip().rstrip() |
63 scriptDir = options.scriptDir | 58 abricate = str(options.abricate).lstrip().rstrip() |
64 updateAbName = options.updateAbName | 59 rgi = str(options.rgi).lstrip().rstrip() |
65 updateAbPath = options.updateAbPath | 60 expectedSpecies = str(options.expectedSpecies).lstrip().rstrip() |
66 updateMLST = options.updateMLST | 61 mlstScheme = str(options.mlstScheme).lstrip().rstrip() |
67 cardDB=options.cardDB | 62 plasmidfinder = str(options.plasmidfinder).lstrip().rstrip() |
68 assemblyPath=options.assemblyPath | 63 outputDir = "./" |
69 ID = options.id | 64 print(mlst) |
65 print(mobfindercontig) | |
66 print(mobfinderaggregate) | |
67 print(abricate) | |
68 print(rgi) | |
69 print(expectedSpecies) | |
70 print(mlstScheme) | |
70 else: | 71 else: |
71 manifestDir = "" | |
72 curDir = os.getcwd() | 72 curDir = os.getcwd() |
73 outputDir = "pipelineTest" | 73 ID = "BC11" |
74 mlst = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.mlst" | |
75 mobfindercontig = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\contig_report.txt" | |
76 mobfinderaggregate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.recon\mobtyper_aggregate_report.txt" | |
77 abricate = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.cp" | |
78 rgi = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.rgi.txt" | |
74 expectedSpecies = "Escherichia coli" | 79 expectedSpecies = "Escherichia coli" |
75 #threads = 8 | 80 mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab" |
76 #memory = 64 | 81 plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins" |
77 mlstScheme = outputDir + "/scheme_species_map.tab" | 82 outputDir = "./" |
78 tempDir= outputDir + "/shovillTemp" | |
79 scriptDir = "~/scripts" | |
80 updateAbName = "cpo" | |
81 updateAbPath = "~/database/bccdcCPO.seq" | |
82 updateMLST = True | |
83 assemblyPath = "./" | |
84 cardDB = "./" | |
85 ID = "BC11-Kpn005_S2" | |
86 | 83 |
87 #region result objects | 84 #region result objects |
88 #define some objects to store values from results | 85 #define some objects to store values from results |
89 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). | 86 #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). |
90 class starFinders(object): | 87 class starFinders(object): |
162 self.PredictedMobility = "" | 159 self.PredictedMobility = "" |
163 self.mash_nearest_neighbor = "" | 160 self.mash_nearest_neighbor = "" |
164 self.mash_neighbor_distance = 0.00 | 161 self.mash_neighbor_distance = 0.00 |
165 self.mash_neighbor_cluster= 0 | 162 self.mash_neighbor_cluster= 0 |
166 self.row = "" | 163 self.row = "" |
164 | |
167 class RGIResult(object): | 165 class RGIResult(object): |
168 def __init__(self): | 166 def __init__(self): |
169 self.ORF_ID = "" | 167 self.ORF_ID = "" |
170 self.Contig = "" | 168 self.Contig = "" |
171 self.Start = -1 | 169 self.Start = -1 |
231 gzContent = f.read() | 229 gzContent = f.read() |
232 with open(outputpath, 'wb') as out: | 230 with open(outputpath, 'wb') as out: |
233 out.write(gzContent) | 231 out.write(gzContent) |
234 return True | 232 return True |
235 def ToJson(dictObject, outputPath): | 233 def ToJson(dictObject, outputPath): |
236 outDir = outputDir + '/summary/' + ID + ".json/" | 234 #outDir = outputDir + '/summary/' + ID + ".json/" |
237 if not (os.path.exists(outDir)): | 235 #if not (os.path.exists(outDir)): |
238 os.makedirs(outDir) | 236 #os.makedirs(outDir) |
239 with open(outDir + outputPath, 'w') as f: | 237 #with open(outputPath, 'w') as f: |
240 json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False) | 238 #json.dump([ob.__dict__ for ob in dictObject.values()], f, ensure_ascii=False) |
239 return "" | |
241 #endregion | 240 #endregion |
242 | 241 |
243 #region functions to parse result files | 242 #region functions to parse result files |
244 def ParseMLSTResult(pathToMLSTResult): | 243 def ParseMLSTResult(pathToMLSTResult, scheme): |
245 _mlstResult = {} | 244 _mlstResult = {} |
246 scheme = pandas.read_csv(mlstScheme, delimiter='\t', header=0) | 245 scheme = pandas.read_csv(scheme, delimiter='\t', header=0) |
247 scheme = scheme.replace(numpy.nan, '', regex=True) | 246 scheme = scheme.replace(numpy.nan, '', regex=True) |
248 | 247 |
249 taxon = {} | 248 taxon = {} |
250 #record the scheme as a dictionary | 249 #record the scheme as a dictionary |
251 taxon["-"] = "No MLST Match" | 250 taxon["-"] = "No MLST Match" |
440 r.source = "likely plasmid" | 439 r.source = "likely plasmid" |
441 else: | 440 else: |
442 r.source = "likely chromosome" | 441 r.source = "likely chromosome" |
443 _rgiR[r.Model_ID]=r | 442 _rgiR[r.Model_ID]=r |
444 return _rgiR | 443 return _rgiR |
444 | |
445 def ParsePlasmidFinderResult(pathToPlasmidFinderResult): | |
446 #pipelineTest/contigs/BC110-Kpn005.fa contig00019 45455 45758 IncFIC(FII)_1 8-308/499 ========/=..... 8/11 59.52 75.65 plasmidfinder AP001918 IncFIC(FII)_1__AP001918 | |
447 #example resfinder: | |
448 #pipelineTest/contigs/BC110-Kpn005.fa contig00038 256 1053 OXA-181 1-798/798 =============== 0/0 100.00 100.00 bccdc AEP16366.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-181 | |
449 | |
450 _pFinder = {} #*********************** | |
451 plasmidFinder = pandas.read_csv(pathToPlasmidFinderResult, delimiter='\t', header=0) | |
452 | |
453 for i in range(len(plasmidFinder.index)): | |
454 pf = starFinders() | |
455 pf.file = str(plasmidFinder.iloc[i,0]) | |
456 pf.sequence = str(plasmidFinder.iloc[i,1]) | |
457 pf.start = int(plasmidFinder.iloc[i,2]) | |
458 pf.end = int(plasmidFinder.iloc[i,3]) | |
459 pf.gene = str(plasmidFinder.iloc[i,4]) | |
460 pf.shortGene = pf.gene[:pf.gene.index("_")] | |
461 pf.coverage = str(plasmidFinder.iloc[i,5]) | |
462 pf.coverage_map = str(plasmidFinder.iloc[i,6]) | |
463 pf.gaps = str(plasmidFinder.iloc[i,7]) | |
464 pf.pCoverage = float(plasmidFinder.iloc[i,8]) | |
465 pf.pIdentity = float(plasmidFinder.iloc[i,9]) | |
466 pf.database = str(plasmidFinder.iloc[i,10]) | |
467 pf.accession = str(plasmidFinder.iloc[i,11]) | |
468 pf.product = str(plasmidFinder.iloc[i,12]) | |
469 pf.source = "plasmid" | |
470 pf.row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) | |
471 _pFinder[pf.gene]=pf | |
472 #row = "\t".join(str(x) for x in plasmidFinder.ix[i].tolist()) | |
473 #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) | |
474 #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) | |
475 return _pFinder | |
445 #endregion | 476 #endregion |
446 | 477 |
447 def Main(): | 478 def Main(): |
479 outputDir = "./" | |
448 notes = [] | 480 notes = [] |
449 #init the output list | 481 #init the output list |
450 output = [] | 482 output = [] |
451 jsonOutput = [] | 483 jsonOutput = [] |
452 | 484 |
453 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) | 485 print(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID) |
454 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + assemblyPath) | 486 output.append(str(datetime.datetime.now()) + "\n\nID: " + ID + "\nAssembly: " + ID) |
455 | |
456 #region update databases if update=true | |
457 if not debug: | |
458 #update databases if necessary | |
459 if not (updateAbPath == "" and updateAbName == "default"): | |
460 print("updating abricate database: " + updateAbName + " @fasta path: " + updateAbPath) | |
461 cmd = [scriptDir + "/pipeline_updateAbricateDB.sh", updateAbPath, updateAbName] | |
462 update = execute(cmd) | |
463 if (updateMLST.lower() == "true"): | |
464 print("updating mlst database... ") | |
465 cmd = [scriptDir + "/pipeline_updateMLST.sh"] | |
466 update = execute(cmd) | |
467 #endregion | |
468 | |
469 print("step 3: parsing the mlst results") | |
470 | |
471 print("performing MLST") | |
472 #region call the mlst script | |
473 if not debug: | |
474 print("running pipeline_prediction.sh") | |
475 #input parameters: 1=ID 2 = assemblyPath, 3= outputdir, 4=card.json | |
476 cmd = [scriptDir + "/pipeline_prediction.sh", ID, assemblyPath, outputDir, cardDB] | |
477 result = execute(cmd) | |
478 #endregion | |
479 | 487 |
480 #region parse the mlst results | 488 #region parse the mlst results |
481 print("step 3: parsing mlst, plasmid, and amr results") | 489 print("step 3: parsing mlst, plasmid, and amr results") |
482 | 490 |
483 print("identifying MLST") | 491 print("identifying MLST") |
484 pathToMLSTScheme = outputDir + "/predictions/" + ID + ".mlst" | 492 mlstHit = ParseMLSTResult(mlst, str(mlstScheme))#*********************** |
485 mlstHit = ParseMLSTResult(pathToMLSTScheme)#*********************** | |
486 ToJson(mlstHit, "mlst.json") #write it to a json output | 493 ToJson(mlstHit, "mlst.json") #write it to a json output |
487 mlstHit = list(mlstHit.values())[0] | 494 mlstHit = list(mlstHit.values())[0] |
488 | 495 |
489 #endregion | 496 #endregion |
490 | 497 |
494 plasmidContigs = [] | 501 plasmidContigs = [] |
495 likelyPlasmidContigs = [] | 502 likelyPlasmidContigs = [] |
496 origins = [] | 503 origins = [] |
497 | 504 |
498 #parse mobsuite results | 505 #parse mobsuite results |
499 mSuite = ParseMobsuiteResult(outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#************* | 506 mSuite = ParseMobsuiteResult(mobfindercontig) #outputDir + "/predictions/" + ID + ".recon/contig_report.txt")#************* |
500 ToJson(mSuite, "mobsuite.json") #************* | 507 ToJson(mSuite, "mobsuite.json") #************* |
501 mSuitePlasmids = ParseMobsuitePlasmids(outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#************* | 508 mSuitePlasmids = ParseMobsuitePlasmids(mobfinderaggregate)#outputDir + "/predictions/" + ID + ".recon/mobtyper_aggregate_report.txt")#************* |
502 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #************* | 509 ToJson(mSuitePlasmids, "mobsuitePlasmids.json") #************* |
503 | 510 |
504 for key in mSuite: | 511 for key in mSuite: |
505 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs: | 512 if mSuite[key].contig_num not in plasmidContigs and mSuite[key].contig_num not in likelyPlasmidContigs: |
506 if not (mSuite[key].rep_type == ''): | 513 if not (mSuite[key].rep_type == ''): |
510 for key in mSuite: | 517 for key in mSuite: |
511 if mSuite[key].rep_type not in origins: | 518 if mSuite[key].rep_type not in origins: |
512 origins.append(mSuite[key].rep_type) | 519 origins.append(mSuite[key].rep_type) |
513 | 520 |
514 #parse resfinder AMR results | 521 #parse resfinder AMR results |
515 rFinder = ParseResFinderResult(outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** | 522 pFinder = ParsePlasmidFinderResult(plasmidfinder) |
523 ToJson(pFinder, "origins.json") | |
524 | |
525 rFinder = ParseResFinderResult(abricate, plasmidContigs, likelyPlasmidContigs)#outputDir + "/predictions/" + ID + ".cp", plasmidContigs, likelyPlasmidContigs) #********************** | |
516 ToJson(rFinder, "resfinder.json") #************* | 526 ToJson(rFinder, "resfinder.json") #************* |
517 | 527 |
518 rgiAMR = ParseRGIResult(outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** | 528 rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** |
519 ToJson(rgiAMR, "rgi.json") #************* | 529 ToJson(rgiAMR, "rgi.json") #************* |
520 | 530 |
521 carbapenamases = [] | 531 carbapenamases = [] |
522 amrGenes = [] | 532 amrGenes = [] |
523 for keys in rFinder: | 533 for keys in rFinder: |
578 for key in rgiAMR: | 588 for key in rgiAMR: |
579 output.append(rgiAMR[key].row) | 589 output.append(rgiAMR[key].row) |
580 | 590 |
581 #write summary to a file | 591 #write summary to a file |
582 summaryDir = outputDir + "/summary/" + ID | 592 summaryDir = outputDir + "/summary/" + ID |
583 out = open(summaryDir + ".txt", 'w') | 593 out = open("summary.txt", 'w') |
584 for item in output: | 594 for item in output: |
585 out.write("%s\n" % item) | 595 out.write("%s\n" % item) |
586 | 596 |
587 | 597 |
588 #TSV output | 598 #TSV output |
621 temp += ";".join(plasmidContigs) + "\t" | 631 temp += ";".join(plasmidContigs) + "\t" |
622 temp += ";".join(likelyPlasmidContigs) | 632 temp += ";".join(likelyPlasmidContigs) |
623 tsvOut.append(temp) | 633 tsvOut.append(temp) |
624 | 634 |
625 summaryDir = outputDir + "/summary/" + ID | 635 summaryDir = outputDir + "/summary/" + ID |
626 out = open(summaryDir + ".tsv", 'w') | 636 out = open("summary.tsv", 'w') |
627 for item in tsvOut: | 637 for item in tsvOut: |
628 out.write("%s\n" % item) | 638 out.write("%s\n" % item) |
629 #endregion | 639 #endregion |
630 | 640 |
631 | 641 |