0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import argparse
|
|
4
|
|
5
|
|
6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category']
|
3
|
7 VFDB_HEADER = ['Gene', 'Contig', '%ID', '%COV', 'E', 'Annotation', 'Distribution']
|
0
|
8
|
|
9
|
|
10 def get_species_from_gtdb(f):
|
|
11 # get GTDB species
|
|
12 # assumes there is one genome in the GTDB-Tk output file
|
|
13 with open(f, 'r') as fh:
|
1
|
14 for i, line in enumerate(fh):
|
|
15 if i == 0:
|
|
16 # Skip header.
|
|
17 continue
|
2
|
18 try:
|
|
19 items = line.split('\t')
|
|
20 tax = items[1]
|
|
21 tax = tax.split(';')[-1]
|
|
22 # split on GTDB species tag
|
|
23 tax = tax.split('s__')[1]
|
|
24 except Exception:
|
|
25 return '(Unknown Species)'
|
1
|
26 if len(tax) == 0:
|
2
|
27 return '(Unknown Species)'
|
0
|
28 return tax
|
|
29
|
|
30
|
|
31 def get_blast_genes(f):
|
|
32 # reads genes detected via BLAST
|
|
33 # BLAST header is as follows:
|
|
34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen
|
|
35 d = {}
|
|
36 with open(f, 'r') as fh:
|
|
37 for line in fh:
|
2
|
38 try:
|
3
|
39 line = line.strip()
|
2
|
40 items = line.split('\t')
|
|
41 gene = items[0]
|
|
42 # contig = items[1]
|
|
43 # pid = items[2]
|
|
44 alen = items[3]
|
|
45 # e = items[-4]
|
|
46 qlen = items[-1]
|
|
47 # calculate query coverage by dividing alignment length by query length
|
|
48 qcov = round(float(alen) / float(qlen) * 100.0, 2)
|
|
49 if gene not in d.keys():
|
|
50 d[gene] = []
|
|
51 d[gene].append('%s\t%s' % (line, str(qcov)))
|
|
52 except Exception:
|
|
53 return d
|
0
|
54 return d
|
|
55
|
|
56
|
|
57 def get_blacklist(v, b):
|
|
58 # identify high-risk isolates based on blacklisted genes
|
|
59 # blacklisted genes file contains two columns:
|
|
60 # column 0=the gene name as it appears in the gene database
|
|
61 # column 1=the reason why the gene was blacklisted, which will be reported
|
|
62 # e.g., 'ANTHRAX TOXIN'
|
|
63 bdict = {}
|
2
|
64 blacklist_present = {}
|
0
|
65 with open(b, 'r') as fh:
|
|
66 for line in fh:
|
2
|
67 try:
|
3
|
68 line = line.strip()
|
2
|
69 items = line.split('\t')
|
|
70 gene = items[0]
|
|
71 val = items[1]
|
|
72 bdict[gene] = val
|
|
73 except Exception:
|
|
74 return blacklist_present
|
0
|
75 for key in v.keys():
|
|
76 if key in bdict.keys():
|
|
77 val = bdict[key]
|
|
78 blacklist_present[key] = val
|
|
79 return blacklist_present
|
|
80
|
|
81
|
|
82 def gene_dist(f, blast, gtdb):
|
|
83 # get within-species prevalence of genes
|
|
84 # for virulence factors (VFs): uses VFDB VFs detected via ABRicate's VFDB db
|
4
|
85 # for AMR genes: uses AMR genes detected via ABRicate + PIMA db
|
0
|
86 # for VFs and AMR genes: genes were detected via ABRicate XXX
|
|
87 # minimum nucleotide identity and coverage values >=80%
|
|
88 # total of 61,161 genomes queried
|
|
89 # takes VFDB or AMR gene distribution file as input (f)
|
|
90 # BLAST file of VFDB or AMR genes (blast)
|
|
91 # GTDB species (gtdb)
|
|
92 # create dictionaries based on gene distribution
|
|
93 d = {}
|
|
94 annd = {}
|
|
95 gtdbd = {}
|
2
|
96 finallines = []
|
3
|
97 warnings = []
|
0
|
98 with open(f, 'r') as fh:
|
|
99 for line in fh:
|
2
|
100 try:
|
3
|
101 line = line.strip()
|
2
|
102 items = line.split('\t')
|
|
103 tax = items[0]
|
|
104 tax = tax.split('s__')[1]
|
|
105 if len(tax) == 0:
|
|
106 tax = '(Unknown Species)'
|
|
107 gene = items[1]
|
|
108 ann = items[-1]
|
|
109 denom = items[3]
|
|
110 d['%s___%s' % (tax, gene)] = line
|
|
111 annd[gene] = ann
|
|
112 gtdbd[tax] = denom
|
|
113 except Exception:
|
|
114 return finallines
|
0
|
115 # parse BLAST results
|
|
116 for key in blast.keys():
|
|
117 blastval = blast[key]
|
|
118 for bv in blastval:
|
|
119 testkey = '%s___%s' % (gtdb, key)
|
|
120 if testkey in d.keys() and gtdb != '(Unknown Species)':
|
|
121 taxval = d[testkey]
|
|
122 items = taxval.split('\t')
|
2
|
123 tax = items[0]
|
|
124 tax = tax.split('s__')[1]
|
0
|
125 if len(tax) == 0:
|
|
126 tax = '(Unknown Species)'
|
2
|
127 gene = items[1]
|
|
128 pres = items[2]
|
|
129 denom = items[3]
|
|
130 perc = items[4]
|
0
|
131 perc = str(round(float(perc), 2))
|
2
|
132 ann = items[-1]
|
3
|
133 freetext = '{0}/{1} ({2}%)'.format(pres, denom, perc)
|
0
|
134 elif gtdb != '(Unknown Species)':
|
4
|
135 ann = 'NA'
|
5
|
136 if gtdb in gtdbd.keys():
|
|
137 denom = gtdbd[gtdb]
|
|
138 else:
|
|
139 denom = 'NA'
|
3
|
140 freetext = "*WARNING"
|
5
|
141 warnings.append("*WARNING: This gene has never been detected in this species and/or this species has not been included in the LRNRisk database! Interpret with caution!")
|
0
|
142 else:
|
4
|
143 ann = 'NA'
|
5
|
144 denom = 'NA'
|
3
|
145 freetext = "**WARNING"
|
|
146 warnings.append("**WARNING: This genome belongs to an undescribed species. Interpret with caution!")
|
|
147 finallines.append('%s\t%s\t%s' % (bv, ann, freetext))
|
|
148 return [finallines, warnings]
|
0
|
149
|
|
150
|
|
151 def output_blacklist(blacklist, blacklist_output_file):
|
|
152 # takes detected blacklisted genes as input (blacklist)
|
|
153 # blacklist results
|
|
154 with open(blacklist_output_file, 'w') as fh:
|
|
155 fh.write('%s\n' % '\t'.join(BLACKLIST_HEADER))
|
|
156 if len(blacklist.keys()) == 0:
|
|
157 # print this if no blacklisted genes are detected
|
|
158 fh.write('(No blacklisted genes detected)\tNA\tNot high risk\n')
|
|
159 else:
|
|
160 # print this if blacklisted genes are detected
|
|
161 # print a table with one row per detected blacklisted gene
|
|
162 for key in blacklist.keys():
|
|
163 val = blacklist[key]
|
|
164 fh.write('%s\t%s\tHIGH RISK\n' % (key, val))
|
|
165
|
|
166
|
3
|
167 def output_vfdb(vfdist, vfdb_output_file, vf_warnings):
|
0
|
168 # takes distribution of virulence factors as input (vfdist)
|
|
169 # VFDB results
|
|
170 with open(vfdb_output_file, 'w') as fh:
|
|
171 fh.write('%s\n' % '\t'.join(VFDB_HEADER))
|
|
172 if len(vfdist) == 0:
|
|
173 # print this if no VFs detected
|
|
174 fh.write('%s\n' % '\t'.join(['(No VFs Detected)'] * 7))
|
|
175 else:
|
|
176 # print table of VFs if VFs detected
|
|
177 for vline in vfdist:
|
|
178 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length']
|
|
179 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
|
|
180 items = vline.split('\t')
|
2
|
181 vgene = items[0]
|
|
182 vcontig = items[1]
|
|
183 vid = items[2]
|
|
184 vcov = items[-3]
|
|
185 veval = items[-7]
|
|
186 vann = items[-2]
|
|
187 vnotes = items[-1]
|
0
|
188 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes]
|
3
|
189 fh.write('%s\n' % '\t'.join(vfinal))
|
|
190 for vfw in sorted(vf_warnings, key=lambda x: x.count('*')):
|
|
191 fh.write('%s\n' % vfw)
|
0
|
192
|
|
193
|
3
|
194 def output_amr(amrdist, amr_output_file, amr_warnings):
|
0
|
195 # takes distribution of AMR genes as input (amrdist)
|
|
196 # AMR results
|
|
197 with open(amr_output_file, 'w') as fh:
|
|
198 fh.write('%s\n' % '\t'.join(VFDB_HEADER))
|
|
199 if len(amrdist) == 0:
|
|
200 # print this if no AMR genes detected
|
|
201 fh.write('%s\n' % '\t'.join(['(No AMR Genes Detected)'] * 7))
|
|
202 else:
|
|
203 # print this if AMR genes detected
|
|
204 for aline in amrdist:
|
|
205 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length']
|
|
206 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
|
|
207 items = aline.split('\t')
|
2
|
208 agene = items[0]
|
|
209 acontig = items[1]
|
|
210 aid = items[2]
|
|
211 acov = items[-3]
|
|
212 aeval = items[-7]
|
|
213 aann = items[-2]
|
|
214 anotes = items[-1]
|
0
|
215 afinal = [agene, acontig, aid, acov, aeval, aann, anotes]
|
3
|
216 fh.write('%s\n' % '\t'.join(afinal))
|
|
217 for amrw in sorted(amr_warnings, key=lambda x: x.count('*')):
|
|
218 fh.write('%s\n' % amrw)
|
0
|
219
|
|
220
|
|
221 # lrnrisk_prototype arguments
|
|
222 parser = argparse.ArgumentParser()
|
|
223
|
|
224 parser.add_argument('--gtdb_file', action='store', dest='gtdb_file', help='Path to gtdbtk tsv file')
|
|
225 parser.add_argument('--virulence_factors_file', action='store', dest='virulence_factors_file', help='Path to tsv virulence factors file')
|
|
226 parser.add_argument('--amr_determinants_file', action='store', dest='amr_determinants_file', help='Path to AMR determinants tsv file')
|
|
227 parser.add_argument('--blacklist_file', action='store', dest='blacklist_file', help='Path to blacklisted high-risk virulence factors tsv file')
|
|
228 parser.add_argument('--vf_distribution_file', action='store', dest='vf_distribution_file', help='Path to virulence factor distribution tsv file')
|
|
229 parser.add_argument('--amr_distribution_file', action='store', dest='amr_distribution_file', help='Path to AMR determinant distribution tsv file')
|
|
230 parser.add_argument('--blacklist_output_file', action='store', dest='blacklist_output_file', help='Path to blacklist output file')
|
|
231 parser.add_argument('--vfdb_output_file', action='store', dest='vfdb_output_file', help='Path to vfdb output file')
|
|
232 parser.add_argument('--amr_output_file', action='store', dest='amr_output_file', help='Path to amr output file')
|
|
233
|
|
234 # parse arguments and run pipeline
|
|
235 args = parser.parse_args()
|
|
236
|
|
237 # print_output(blacklist, vf_distribution, amr_distribution, args.output, species)
|
|
238 virulence_genes = get_blast_genes(args.virulence_factors_file)
|
|
239 species = get_species_from_gtdb(args.gtdb_file)
|
|
240
|
|
241 blacklist = get_blacklist(virulence_genes, args.blacklist_file)
|
|
242 output_blacklist(blacklist, args.blacklist_output_file)
|
|
243
|
|
244 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species)
|
3
|
245 vf_warnings = vf_distribution[1]
|
|
246 vf_distribution = vf_distribution[0]
|
|
247 output_vfdb(vf_distribution, args.vfdb_output_file, vf_warnings)
|
0
|
248
|
|
249 amr_genes = get_blast_genes(args.amr_determinants_file)
|
|
250 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species)
|
3
|
251 amr_warnings = amr_distribution[1]
|
|
252 amr_distribution = amr_distribution[0]
|
|
253 output_amr(amr_distribution, args.amr_output_file, amr_warnings)
|