Mercurial > repos > greg > lrn_risk
comparison lrn_risk.py @ 0:99e04eba4033 draft
Uploaded
| author | greg |
|---|---|
| date | Thu, 27 Apr 2023 19:22:36 +0000 |
| parents | |
| children | f98c92618a6c |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:99e04eba4033 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import argparse | |
| 4 | |
| 5 | |
| 6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category'] | |
| 7 VFDB_HEADER = ['Gene', 'Contig', '% Identity', '% Coverage', 'E-Value', 'Annotation', 'Comparison to Publicly Available Genomes'] | |
| 8 | |
| 9 | |
| 10 def get_species_from_gtdb(f): | |
| 11 # get GTDB species | |
| 12 # assumes there is one genome in the GTDB-Tk output file | |
| 13 with open(f, 'r') as fh: | |
| 14 for line in fh: | |
| 15 if line.find('user_genome') < 0: | |
| 16 items = line.split('\t') | |
| 17 tax = items[1].strip() | |
| 18 tax = tax.split(';')[-1].strip() | |
| 19 # split on GTDB species tag | |
| 20 tax = tax.split('s__')[1].strip() | |
| 21 if len(tax) == 0: | |
| 22 tax = '(Unknown Species)' | |
| 23 return tax | |
| 24 | |
| 25 | |
| 26 def get_blast_genes(f): | |
| 27 # reads genes detected via BLAST | |
| 28 # BLAST header is as follows: | |
| 29 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen | |
| 30 d = {} | |
| 31 with open(f, 'r') as fh: | |
| 32 for line in fh: | |
| 33 items = line.split('\t') | |
| 34 gene = items[0] | |
| 35 # contig = items[1] | |
| 36 # pid = items[2] | |
| 37 alen = items[3] | |
| 38 # e = items[-4] | |
| 39 qlen = items[-1] | |
| 40 # calculate query coverage by dividing alignment length by query length | |
| 41 qcov = round(float(alen) / float(qlen) * 100.0, 2) | |
| 42 if gene not in d.keys(): | |
| 43 d[gene] = [] | |
| 44 d[gene].append('%s\t%s' % (line, str(qcov))) | |
| 45 return d | |
| 46 | |
| 47 | |
| 48 def get_blacklist(v, b): | |
| 49 # identify high-risk isolates based on blacklisted genes | |
| 50 # blacklisted genes file contains two columns: | |
| 51 # column 0=the gene name as it appears in the gene database | |
| 52 # column 1=the reason why the gene was blacklisted, which will be reported | |
| 53 # e.g., 'ANTHRAX TOXIN' | |
| 54 bdict = {} | |
| 55 with open(b, 'r') as fh: | |
| 56 for line in fh: | |
| 57 items = line.split('\t') | |
| 58 gene = items[0].strip() | |
| 59 val = items[1].strip() | |
| 60 bdict[gene] = val | |
| 61 blacklist_present = {} | |
| 62 for key in v.keys(): | |
| 63 if key in bdict.keys(): | |
| 64 val = bdict[key] | |
| 65 blacklist_present[key] = val | |
| 66 return blacklist_present | |
| 67 | |
| 68 | |
| 69 def gene_dist(f, blast, gtdb): | |
| 70 # get within-species prevalence of genes | |
| 71 # for virulence factors (VFs): uses VFDB VFs detected via ABRicate's VFDB db | |
| 72 # for AMR genes: uses AMR genes detected via ABRicate's ResFinder db | |
| 73 # for VFs and AMR genes: genes were detected via ABRicate XXX | |
| 74 # minimum nucleotide identity and coverage values >=80% | |
| 75 # total of 61,161 genomes queried | |
| 76 # takes VFDB or AMR gene distribution file as input (f) | |
| 77 # BLAST file of VFDB or AMR genes (blast) | |
| 78 # GTDB species (gtdb) | |
| 79 # create dictionaries based on gene distribution | |
| 80 d = {} | |
| 81 annd = {} | |
| 82 gtdbd = {} | |
| 83 with open(f, 'r') as fh: | |
| 84 for line in fh: | |
| 85 items = line.split('\t') | |
| 86 tax = items[0].strip() | |
| 87 tax = tax.split('s__')[1].strip() | |
| 88 if len(tax) == 0: | |
| 89 tax = '(Unknown Species)' | |
| 90 gene = items[1].strip() | |
| 91 ann = items[-1].strip() | |
| 92 denom = items[3].strip() | |
| 93 d['%s___%s' % (tax, gene)] = line | |
| 94 annd[gene] = ann | |
| 95 gtdbd[tax] = denom | |
| 96 # parse BLAST results | |
| 97 finallines = [] | |
| 98 for key in blast.keys(): | |
| 99 blastval = blast[key] | |
| 100 for bv in blastval: | |
| 101 testkey = '%s___%s' % (gtdb, key) | |
| 102 if testkey in d.keys() and gtdb != '(Unknown Species)': | |
| 103 taxval = d[testkey] | |
| 104 items = taxval.split('\t') | |
| 105 tax = items[0].strip() | |
| 106 tax = tax.split('s__')[1].strip() | |
| 107 if len(tax) == 0: | |
| 108 tax = '(Unknown Species)' | |
| 109 gene = items[1].strip() | |
| 110 pres = items[2].strip() | |
| 111 denom = items[3].strip() | |
| 112 perc = items[4].strip() | |
| 113 perc = str(round(float(perc), 2)) | |
| 114 ann = items[-1].strip() | |
| 115 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) | |
| 116 elif gtdb != '(Unknown Species)': | |
| 117 ann = annd[key] | |
| 118 denom = gtdbd[gtdb] | |
| 119 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) | |
| 120 else: | |
| 121 ann = annd[key] | |
| 122 freetext = 'WARNING: Genome belongs to an undescribed species. Interpret with caution!' | |
| 123 finalline = '%s\t%s\t%s' % (bv, ann, freetext) | |
| 124 finallines.append(finalline) | |
| 125 return finallines | |
| 126 | |
| 127 | |
| 128 def output_blacklist(blacklist, blacklist_output_file): | |
| 129 # takes detected blacklisted genes as input (blacklist) | |
| 130 # blacklist results | |
| 131 with open(blacklist_output_file, 'w') as fh: | |
| 132 fh.write('%s\n' % '\t'.join(BLACKLIST_HEADER)) | |
| 133 if len(blacklist.keys()) == 0: | |
| 134 # print this if no blacklisted genes are detected | |
| 135 fh.write('(No blacklisted genes detected)\tNA\tNot high risk\n') | |
| 136 else: | |
| 137 # print this if blacklisted genes are detected | |
| 138 # print a table with one row per detected blacklisted gene | |
| 139 for key in blacklist.keys(): | |
| 140 val = blacklist[key] | |
| 141 fh.write('%s\t%s\tHIGH RISK\n' % (key, val)) | |
| 142 | |
| 143 | |
| 144 def output_vfdb(vfdist, vfdb_output_file): | |
| 145 # takes distribution of virulence factors as input (vfdist) | |
| 146 # VFDB results | |
| 147 with open(vfdb_output_file, 'w') as fh: | |
| 148 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | |
| 149 if len(vfdist) == 0: | |
| 150 # print this if no VFs detected | |
| 151 fh.write('%s\n' % '\t'.join(['(No VFs Detected)'] * 7)) | |
| 152 else: | |
| 153 # print table of VFs if VFs detected | |
| 154 for vline in vfdist: | |
| 155 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] | |
| 156 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] | |
| 157 items = vline.split('\t') | |
| 158 vgene = items[0].strip() | |
| 159 vcontig = items[1].strip() | |
| 160 vid = items[2].strip() | |
| 161 vcov = items[-3].strip() | |
| 162 veval = items[-7].strip() | |
| 163 vann = items[-2].strip() | |
| 164 vnotes = items[-1].strip() | |
| 165 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] | |
| 166 vfinal = '\t'.join(vfinal).strip() | |
| 167 fh.write('%s\n' % vfinal) | |
| 168 | |
| 169 | |
| 170 def output_amr(amrdist, amr_output_file): | |
| 171 # takes distribution of AMR genes as input (amrdist) | |
| 172 # AMR results | |
| 173 with open(amr_output_file, 'w') as fh: | |
| 174 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | |
| 175 if len(amrdist) == 0: | |
| 176 # print this if no AMR genes detected | |
| 177 fh.write('%s\n' % '\t'.join(['(No AMR Genes Detected)'] * 7)) | |
| 178 else: | |
| 179 # print this if AMR genes detected | |
| 180 for aline in amrdist: | |
| 181 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] | |
| 182 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] | |
| 183 items = aline.split('\t') | |
| 184 agene = items[0].strip() | |
| 185 acontig = items[1].strip() | |
| 186 aid = items[2].strip() | |
| 187 acov = items[-3].strip() | |
| 188 aeval = items[-7].strip() | |
| 189 aann = items[-2].strip() | |
| 190 anotes = items[-1].strip() | |
| 191 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] | |
| 192 afinal = '\t'.join(afinal).strip() | |
| 193 fh.write('%s\n' % afinal) | |
| 194 | |
| 195 | |
| 196 # lrnrisk_prototype arguments | |
| 197 parser = argparse.ArgumentParser() | |
| 198 | |
| 199 parser.add_argument('--gtdb_file', action='store', dest='gtdb_file', help='Path to gtdbtk tsv file') | |
| 200 parser.add_argument('--virulence_factors_file', action='store', dest='virulence_factors_file', help='Path to tsv virulence factors file') | |
| 201 parser.add_argument('--amr_determinants_file', action='store', dest='amr_determinants_file', help='Path to AMR determinants tsv file') | |
| 202 parser.add_argument('--blacklist_file', action='store', dest='blacklist_file', help='Path to blacklisted high-risk virulence factors tsv file') | |
| 203 parser.add_argument('--vf_distribution_file', action='store', dest='vf_distribution_file', help='Path to virulence factor distribution tsv file') | |
| 204 parser.add_argument('--amr_distribution_file', action='store', dest='amr_distribution_file', help='Path to AMR determinant distribution tsv file') | |
| 205 parser.add_argument('--blacklist_output_file', action='store', dest='blacklist_output_file', help='Path to blacklist output file') | |
| 206 parser.add_argument('--vfdb_output_file', action='store', dest='vfdb_output_file', help='Path to vfdb output file') | |
| 207 parser.add_argument('--amr_output_file', action='store', dest='amr_output_file', help='Path to amr output file') | |
| 208 | |
| 209 # parse arguments and run pipeline | |
| 210 args = parser.parse_args() | |
| 211 | |
| 212 # print_output(blacklist, vf_distribution, amr_distribution, args.output, species) | |
| 213 virulence_genes = get_blast_genes(args.virulence_factors_file) | |
| 214 species = get_species_from_gtdb(args.gtdb_file) | |
| 215 | |
| 216 blacklist = get_blacklist(virulence_genes, args.blacklist_file) | |
| 217 output_blacklist(blacklist, args.blacklist_output_file) | |
| 218 | |
| 219 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species) | |
| 220 output_vfdb(vf_distribution, args.vfdb_output_file) | |
| 221 | |
| 222 amr_genes = get_blast_genes(args.amr_determinants_file) | |
| 223 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species) | |
| 224 output_amr(amr_distribution, args.amr_output_file) |
