Mercurial > repos > greg > lrn_risk
comparison lrn_risk.py @ 0:99e04eba4033 draft
Uploaded
author | greg |
---|---|
date | Thu, 27 Apr 2023 19:22:36 +0000 |
parents | |
children | f98c92618a6c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:99e04eba4033 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 | |
5 | |
6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category'] | |
7 VFDB_HEADER = ['Gene', 'Contig', '% Identity', '% Coverage', 'E-Value', 'Annotation', 'Comparison to Publicly Available Genomes'] | |
8 | |
9 | |
10 def get_species_from_gtdb(f): | |
11 # get GTDB species | |
12 # assumes there is one genome in the GTDB-Tk output file | |
13 with open(f, 'r') as fh: | |
14 for line in fh: | |
15 if line.find('user_genome') < 0: | |
16 items = line.split('\t') | |
17 tax = items[1].strip() | |
18 tax = tax.split(';')[-1].strip() | |
19 # split on GTDB species tag | |
20 tax = tax.split('s__')[1].strip() | |
21 if len(tax) == 0: | |
22 tax = '(Unknown Species)' | |
23 return tax | |
24 | |
25 | |
26 def get_blast_genes(f): | |
27 # reads genes detected via BLAST | |
28 # BLAST header is as follows: | |
29 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen | |
30 d = {} | |
31 with open(f, 'r') as fh: | |
32 for line in fh: | |
33 items = line.split('\t') | |
34 gene = items[0] | |
35 # contig = items[1] | |
36 # pid = items[2] | |
37 alen = items[3] | |
38 # e = items[-4] | |
39 qlen = items[-1] | |
40 # calculate query coverage by dividing alignment length by query length | |
41 qcov = round(float(alen) / float(qlen) * 100.0, 2) | |
42 if gene not in d.keys(): | |
43 d[gene] = [] | |
44 d[gene].append('%s\t%s' % (line, str(qcov))) | |
45 return d | |
46 | |
47 | |
48 def get_blacklist(v, b): | |
49 # identify high-risk isolates based on blacklisted genes | |
50 # blacklisted genes file contains two columns: | |
51 # column 0=the gene name as it appears in the gene database | |
52 # column 1=the reason why the gene was blacklisted, which will be reported | |
53 # e.g., 'ANTHRAX TOXIN' | |
54 bdict = {} | |
55 with open(b, 'r') as fh: | |
56 for line in fh: | |
57 items = line.split('\t') | |
58 gene = items[0].strip() | |
59 val = items[1].strip() | |
60 bdict[gene] = val | |
61 blacklist_present = {} | |
62 for key in v.keys(): | |
63 if key in bdict.keys(): | |
64 val = bdict[key] | |
65 blacklist_present[key] = val | |
66 return blacklist_present | |
67 | |
68 | |
69 def gene_dist(f, blast, gtdb): | |
70 # get within-species prevalence of genes | |
71 # for virulence factors (VFs): uses VFDB VFs detected via ABRicate's VFDB db | |
72 # for AMR genes: uses AMR genes detected via ABRicate's ResFinder db | |
73 # for VFs and AMR genes: genes were detected via ABRicate XXX | |
74 # minimum nucleotide identity and coverage values >=80% | |
75 # total of 61,161 genomes queried | |
76 # takes VFDB or AMR gene distribution file as input (f) | |
77 # BLAST file of VFDB or AMR genes (blast) | |
78 # GTDB species (gtdb) | |
79 # create dictionaries based on gene distribution | |
80 d = {} | |
81 annd = {} | |
82 gtdbd = {} | |
83 with open(f, 'r') as fh: | |
84 for line in fh: | |
85 items = line.split('\t') | |
86 tax = items[0].strip() | |
87 tax = tax.split('s__')[1].strip() | |
88 if len(tax) == 0: | |
89 tax = '(Unknown Species)' | |
90 gene = items[1].strip() | |
91 ann = items[-1].strip() | |
92 denom = items[3].strip() | |
93 d['%s___%s' % (tax, gene)] = line | |
94 annd[gene] = ann | |
95 gtdbd[tax] = denom | |
96 # parse BLAST results | |
97 finallines = [] | |
98 for key in blast.keys(): | |
99 blastval = blast[key] | |
100 for bv in blastval: | |
101 testkey = '%s___%s' % (gtdb, key) | |
102 if testkey in d.keys() and gtdb != '(Unknown Species)': | |
103 taxval = d[testkey] | |
104 items = taxval.split('\t') | |
105 tax = items[0].strip() | |
106 tax = tax.split('s__')[1].strip() | |
107 if len(tax) == 0: | |
108 tax = '(Unknown Species)' | |
109 gene = items[1].strip() | |
110 pres = items[2].strip() | |
111 denom = items[3].strip() | |
112 perc = items[4].strip() | |
113 perc = str(round(float(perc), 2)) | |
114 ann = items[-1].strip() | |
115 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) | |
116 elif gtdb != '(Unknown Species)': | |
117 ann = annd[key] | |
118 denom = gtdbd[gtdb] | |
119 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) | |
120 else: | |
121 ann = annd[key] | |
122 freetext = 'WARNING: Genome belongs to an undescribed species. Interpret with caution!' | |
123 finalline = '%s\t%s\t%s' % (bv, ann, freetext) | |
124 finallines.append(finalline) | |
125 return finallines | |
126 | |
127 | |
128 def output_blacklist(blacklist, blacklist_output_file): | |
129 # takes detected blacklisted genes as input (blacklist) | |
130 # blacklist results | |
131 with open(blacklist_output_file, 'w') as fh: | |
132 fh.write('%s\n' % '\t'.join(BLACKLIST_HEADER)) | |
133 if len(blacklist.keys()) == 0: | |
134 # print this if no blacklisted genes are detected | |
135 fh.write('(No blacklisted genes detected)\tNA\tNot high risk\n') | |
136 else: | |
137 # print this if blacklisted genes are detected | |
138 # print a table with one row per detected blacklisted gene | |
139 for key in blacklist.keys(): | |
140 val = blacklist[key] | |
141 fh.write('%s\t%s\tHIGH RISK\n' % (key, val)) | |
142 | |
143 | |
144 def output_vfdb(vfdist, vfdb_output_file): | |
145 # takes distribution of virulence factors as input (vfdist) | |
146 # VFDB results | |
147 with open(vfdb_output_file, 'w') as fh: | |
148 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | |
149 if len(vfdist) == 0: | |
150 # print this if no VFs detected | |
151 fh.write('%s\n' % '\t'.join(['(No VFs Detected)'] * 7)) | |
152 else: | |
153 # print table of VFs if VFs detected | |
154 for vline in vfdist: | |
155 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] | |
156 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] | |
157 items = vline.split('\t') | |
158 vgene = items[0].strip() | |
159 vcontig = items[1].strip() | |
160 vid = items[2].strip() | |
161 vcov = items[-3].strip() | |
162 veval = items[-7].strip() | |
163 vann = items[-2].strip() | |
164 vnotes = items[-1].strip() | |
165 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] | |
166 vfinal = '\t'.join(vfinal).strip() | |
167 fh.write('%s\n' % vfinal) | |
168 | |
169 | |
170 def output_amr(amrdist, amr_output_file): | |
171 # takes distribution of AMR genes as input (amrdist) | |
172 # AMR results | |
173 with open(amr_output_file, 'w') as fh: | |
174 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | |
175 if len(amrdist) == 0: | |
176 # print this if no AMR genes detected | |
177 fh.write('%s\n' % '\t'.join(['(No AMR Genes Detected)'] * 7)) | |
178 else: | |
179 # print this if AMR genes detected | |
180 for aline in amrdist: | |
181 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] | |
182 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] | |
183 items = aline.split('\t') | |
184 agene = items[0].strip() | |
185 acontig = items[1].strip() | |
186 aid = items[2].strip() | |
187 acov = items[-3].strip() | |
188 aeval = items[-7].strip() | |
189 aann = items[-2].strip() | |
190 anotes = items[-1].strip() | |
191 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] | |
192 afinal = '\t'.join(afinal).strip() | |
193 fh.write('%s\n' % afinal) | |
194 | |
195 | |
196 # lrnrisk_prototype arguments | |
197 parser = argparse.ArgumentParser() | |
198 | |
199 parser.add_argument('--gtdb_file', action='store', dest='gtdb_file', help='Path to gtdbtk tsv file') | |
200 parser.add_argument('--virulence_factors_file', action='store', dest='virulence_factors_file', help='Path to tsv virulence factors file') | |
201 parser.add_argument('--amr_determinants_file', action='store', dest='amr_determinants_file', help='Path to AMR determinants tsv file') | |
202 parser.add_argument('--blacklist_file', action='store', dest='blacklist_file', help='Path to blacklisted high-risk virulence factors tsv file') | |
203 parser.add_argument('--vf_distribution_file', action='store', dest='vf_distribution_file', help='Path to virulence factor distribution tsv file') | |
204 parser.add_argument('--amr_distribution_file', action='store', dest='amr_distribution_file', help='Path to AMR determinant distribution tsv file') | |
205 parser.add_argument('--blacklist_output_file', action='store', dest='blacklist_output_file', help='Path to blacklist output file') | |
206 parser.add_argument('--vfdb_output_file', action='store', dest='vfdb_output_file', help='Path to vfdb output file') | |
207 parser.add_argument('--amr_output_file', action='store', dest='amr_output_file', help='Path to amr output file') | |
208 | |
209 # parse arguments and run pipeline | |
210 args = parser.parse_args() | |
211 | |
212 # print_output(blacklist, vf_distribution, amr_distribution, args.output, species) | |
213 virulence_genes = get_blast_genes(args.virulence_factors_file) | |
214 species = get_species_from_gtdb(args.gtdb_file) | |
215 | |
216 blacklist = get_blacklist(virulence_genes, args.blacklist_file) | |
217 output_blacklist(blacklist, args.blacklist_output_file) | |
218 | |
219 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species) | |
220 output_vfdb(vf_distribution, args.vfdb_output_file) | |
221 | |
222 amr_genes = get_blast_genes(args.amr_determinants_file) | |
223 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species) | |
224 output_amr(amr_distribution, args.amr_output_file) |