comparison lrn_risk.py @ 0:99e04eba4033 draft

Uploaded
author greg
date Thu, 27 Apr 2023 19:22:36 +0000
parents
children f98c92618a6c
comparison
equal deleted inserted replaced
-1:000000000000 0:99e04eba4033
1 #!/usr/bin/env python
2
3 import argparse
4
5
6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category']
7 VFDB_HEADER = ['Gene', 'Contig', '% Identity', '% Coverage', 'E-Value', 'Annotation', 'Comparison to Publicly Available Genomes']
8
9
10 def get_species_from_gtdb(f):
11 # get GTDB species
12 # assumes there is one genome in the GTDB-Tk output file
13 with open(f, 'r') as fh:
14 for line in fh:
15 if line.find('user_genome') < 0:
16 items = line.split('\t')
17 tax = items[1].strip()
18 tax = tax.split(';')[-1].strip()
19 # split on GTDB species tag
20 tax = tax.split('s__')[1].strip()
21 if len(tax) == 0:
22 tax = '(Unknown Species)'
23 return tax
24
25
26 def get_blast_genes(f):
27 # reads genes detected via BLAST
28 # BLAST header is as follows:
29 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen
30 d = {}
31 with open(f, 'r') as fh:
32 for line in fh:
33 items = line.split('\t')
34 gene = items[0]
35 # contig = items[1]
36 # pid = items[2]
37 alen = items[3]
38 # e = items[-4]
39 qlen = items[-1]
40 # calculate query coverage by dividing alignment length by query length
41 qcov = round(float(alen) / float(qlen) * 100.0, 2)
42 if gene not in d.keys():
43 d[gene] = []
44 d[gene].append('%s\t%s' % (line, str(qcov)))
45 return d
46
47
48 def get_blacklist(v, b):
49 # identify high-risk isolates based on blacklisted genes
50 # blacklisted genes file contains two columns:
51 # column 0=the gene name as it appears in the gene database
52 # column 1=the reason why the gene was blacklisted, which will be reported
53 # e.g., 'ANTHRAX TOXIN'
54 bdict = {}
55 with open(b, 'r') as fh:
56 for line in fh:
57 items = line.split('\t')
58 gene = items[0].strip()
59 val = items[1].strip()
60 bdict[gene] = val
61 blacklist_present = {}
62 for key in v.keys():
63 if key in bdict.keys():
64 val = bdict[key]
65 blacklist_present[key] = val
66 return blacklist_present
67
68
69 def gene_dist(f, blast, gtdb):
70 # get within-species prevalence of genes
71 # for virulence factors (VFs): uses VFDB VFs detected via ABRicate's VFDB db
72 # for AMR genes: uses AMR genes detected via ABRicate's ResFinder db
73 # for VFs and AMR genes: genes were detected via ABRicate XXX
74 # minimum nucleotide identity and coverage values >=80%
75 # total of 61,161 genomes queried
76 # takes VFDB or AMR gene distribution file as input (f)
77 # BLAST file of VFDB or AMR genes (blast)
78 # GTDB species (gtdb)
79 # create dictionaries based on gene distribution
80 d = {}
81 annd = {}
82 gtdbd = {}
83 with open(f, 'r') as fh:
84 for line in fh:
85 items = line.split('\t')
86 tax = items[0].strip()
87 tax = tax.split('s__')[1].strip()
88 if len(tax) == 0:
89 tax = '(Unknown Species)'
90 gene = items[1].strip()
91 ann = items[-1].strip()
92 denom = items[3].strip()
93 d['%s___%s' % (tax, gene)] = line
94 annd[gene] = ann
95 gtdbd[tax] = denom
96 # parse BLAST results
97 finallines = []
98 for key in blast.keys():
99 blastval = blast[key]
100 for bv in blastval:
101 testkey = '%s___%s' % (gtdb, key)
102 if testkey in d.keys() and gtdb != '(Unknown Species)':
103 taxval = d[testkey]
104 items = taxval.split('\t')
105 tax = items[0].strip()
106 tax = tax.split('s__')[1].strip()
107 if len(tax) == 0:
108 tax = '(Unknown Species)'
109 gene = items[1].strip()
110 pres = items[2].strip()
111 denom = items[3].strip()
112 perc = items[4].strip()
113 perc = str(round(float(perc), 2))
114 ann = items[-1].strip()
115 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom)
116 elif gtdb != '(Unknown Species)':
117 ann = annd[key]
118 denom = gtdbd[gtdb]
119 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom)
120 else:
121 ann = annd[key]
122 freetext = 'WARNING: Genome belongs to an undescribed species. Interpret with caution!'
123 finalline = '%s\t%s\t%s' % (bv, ann, freetext)
124 finallines.append(finalline)
125 return finallines
126
127
128 def output_blacklist(blacklist, blacklist_output_file):
129 # takes detected blacklisted genes as input (blacklist)
130 # blacklist results
131 with open(blacklist_output_file, 'w') as fh:
132 fh.write('%s\n' % '\t'.join(BLACKLIST_HEADER))
133 if len(blacklist.keys()) == 0:
134 # print this if no blacklisted genes are detected
135 fh.write('(No blacklisted genes detected)\tNA\tNot high risk\n')
136 else:
137 # print this if blacklisted genes are detected
138 # print a table with one row per detected blacklisted gene
139 for key in blacklist.keys():
140 val = blacklist[key]
141 fh.write('%s\t%s\tHIGH RISK\n' % (key, val))
142
143
144 def output_vfdb(vfdist, vfdb_output_file):
145 # takes distribution of virulence factors as input (vfdist)
146 # VFDB results
147 with open(vfdb_output_file, 'w') as fh:
148 fh.write('%s\n' % '\t'.join(VFDB_HEADER))
149 if len(vfdist) == 0:
150 # print this if no VFs detected
151 fh.write('%s\n' % '\t'.join(['(No VFs Detected)'] * 7))
152 else:
153 # print table of VFs if VFs detected
154 for vline in vfdist:
155 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length']
156 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
157 items = vline.split('\t')
158 vgene = items[0].strip()
159 vcontig = items[1].strip()
160 vid = items[2].strip()
161 vcov = items[-3].strip()
162 veval = items[-7].strip()
163 vann = items[-2].strip()
164 vnotes = items[-1].strip()
165 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes]
166 vfinal = '\t'.join(vfinal).strip()
167 fh.write('%s\n' % vfinal)
168
169
170 def output_amr(amrdist, amr_output_file):
171 # takes distribution of AMR genes as input (amrdist)
172 # AMR results
173 with open(amr_output_file, 'w') as fh:
174 fh.write('%s\n' % '\t'.join(VFDB_HEADER))
175 if len(amrdist) == 0:
176 # print this if no AMR genes detected
177 fh.write('%s\n' % '\t'.join(['(No AMR Genes Detected)'] * 7))
178 else:
179 # print this if AMR genes detected
180 for aline in amrdist:
181 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length']
182 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
183 items = aline.split('\t')
184 agene = items[0].strip()
185 acontig = items[1].strip()
186 aid = items[2].strip()
187 acov = items[-3].strip()
188 aeval = items[-7].strip()
189 aann = items[-2].strip()
190 anotes = items[-1].strip()
191 afinal = [agene, acontig, aid, acov, aeval, aann, anotes]
192 afinal = '\t'.join(afinal).strip()
193 fh.write('%s\n' % afinal)
194
195
196 # lrnrisk_prototype arguments
197 parser = argparse.ArgumentParser()
198
199 parser.add_argument('--gtdb_file', action='store', dest='gtdb_file', help='Path to gtdbtk tsv file')
200 parser.add_argument('--virulence_factors_file', action='store', dest='virulence_factors_file', help='Path to tsv virulence factors file')
201 parser.add_argument('--amr_determinants_file', action='store', dest='amr_determinants_file', help='Path to AMR determinants tsv file')
202 parser.add_argument('--blacklist_file', action='store', dest='blacklist_file', help='Path to blacklisted high-risk virulence factors tsv file')
203 parser.add_argument('--vf_distribution_file', action='store', dest='vf_distribution_file', help='Path to virulence factor distribution tsv file')
204 parser.add_argument('--amr_distribution_file', action='store', dest='amr_distribution_file', help='Path to AMR determinant distribution tsv file')
205 parser.add_argument('--blacklist_output_file', action='store', dest='blacklist_output_file', help='Path to blacklist output file')
206 parser.add_argument('--vfdb_output_file', action='store', dest='vfdb_output_file', help='Path to vfdb output file')
207 parser.add_argument('--amr_output_file', action='store', dest='amr_output_file', help='Path to amr output file')
208
209 # parse arguments and run pipeline
210 args = parser.parse_args()
211
212 # print_output(blacklist, vf_distribution, amr_distribution, args.output, species)
213 virulence_genes = get_blast_genes(args.virulence_factors_file)
214 species = get_species_from_gtdb(args.gtdb_file)
215
216 blacklist = get_blacklist(virulence_genes, args.blacklist_file)
217 output_blacklist(blacklist, args.blacklist_output_file)
218
219 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species)
220 output_vfdb(vf_distribution, args.vfdb_output_file)
221
222 amr_genes = get_blast_genes(args.amr_determinants_file)
223 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species)
224 output_amr(amr_distribution, args.amr_output_file)