comparison lrn_risk.py @ 3:5a27ac020c9e draft

Uploaded
author greg
date Tue, 27 Jun 2023 14:02:05 +0000
parents 8dc6d4aa17ec
children 8fe04662585c
comparison
equal deleted inserted replaced
2:8dc6d4aa17ec 3:5a27ac020c9e
2 2
3 import argparse 3 import argparse
4 4
5 5
6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category'] 6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category']
7 VFDB_HEADER = ['Gene', 'Contig', '% Identity', '% Coverage', 'E-Value', 'Annotation', 'Comparison to Publicly Available Genomes'] 7 VFDB_HEADER = ['Gene', 'Contig', '%ID', '%COV', 'E', 'Annotation', 'Distribution']
8 8
9 9
10 def get_species_from_gtdb(f): 10 def get_species_from_gtdb(f):
11 # get GTDB species 11 # get GTDB species
12 # assumes there is one genome in the GTDB-Tk output file 12 # assumes there is one genome in the GTDB-Tk output file
34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen 34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen
35 d = {} 35 d = {}
36 with open(f, 'r') as fh: 36 with open(f, 'r') as fh:
37 for line in fh: 37 for line in fh:
38 try: 38 try:
39 line = line.strip()
39 items = line.split('\t') 40 items = line.split('\t')
40 gene = items[0] 41 gene = items[0]
41 # contig = items[1] 42 # contig = items[1]
42 # pid = items[2] 43 # pid = items[2]
43 alen = items[3] 44 alen = items[3]
62 bdict = {} 63 bdict = {}
63 blacklist_present = {} 64 blacklist_present = {}
64 with open(b, 'r') as fh: 65 with open(b, 'r') as fh:
65 for line in fh: 66 for line in fh:
66 try: 67 try:
68 line = line.strip()
67 items = line.split('\t') 69 items = line.split('\t')
68 gene = items[0] 70 gene = items[0]
69 val = items[1] 71 val = items[1]
70 bdict[gene] = val 72 bdict[gene] = val
71 except Exception: 73 except Exception:
90 # create dictionaries based on gene distribution 92 # create dictionaries based on gene distribution
91 d = {} 93 d = {}
92 annd = {} 94 annd = {}
93 gtdbd = {} 95 gtdbd = {}
94 finallines = [] 96 finallines = []
97 warnings = []
95 with open(f, 'r') as fh: 98 with open(f, 'r') as fh:
96 for line in fh: 99 for line in fh:
97 try: 100 try:
101 line = line.strip()
98 items = line.split('\t') 102 items = line.split('\t')
99 tax = items[0] 103 tax = items[0]
100 tax = tax.split('s__')[1] 104 tax = tax.split('s__')[1]
101 if len(tax) == 0: 105 if len(tax) == 0:
102 tax = '(Unknown Species)' 106 tax = '(Unknown Species)'
124 pres = items[2] 128 pres = items[2]
125 denom = items[3] 129 denom = items[3]
126 perc = items[4] 130 perc = items[4]
127 perc = str(round(float(perc), 2)) 131 perc = str(round(float(perc), 2))
128 ann = items[-1] 132 ann = items[-1]
129 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) 133 freetext = '{0}/{1} ({2}%)'.format(pres, denom, perc)
130 elif gtdb != '(Unknown Species)': 134 elif gtdb != '(Unknown Species)':
131 ann = annd[key] 135 ann = annd[key]
132 denom = gtdbd[gtdb] 136 denom = gtdbd[gtdb]
133 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) 137 freetext = "*WARNING"
138 warnings.append("*WARNING: This gene has never been detected in this species! Interpret with caution!")
134 else: 139 else:
135 ann = annd[key] 140 ann = annd[key]
136 freetext = 'WARNING: Genome belongs to an undescribed species. Interpret with caution!' 141 freetext = "**WARNING"
137 finalline = '%s\t%s\t%s' % (bv, ann, freetext) 142 warnings.append("**WARNING: This genome belongs to an undescribed species. Interpret with caution!")
138 finallines.append(finalline) 143 finallines.append('%s\t%s\t%s' % (bv, ann, freetext))
139 return finallines 144 return [finallines, warnings]
140 145
141 146
142 def output_blacklist(blacklist, blacklist_output_file): 147 def output_blacklist(blacklist, blacklist_output_file):
143 # takes detected blacklisted genes as input (blacklist) 148 # takes detected blacklisted genes as input (blacklist)
144 # blacklist results 149 # blacklist results
153 for key in blacklist.keys(): 158 for key in blacklist.keys():
154 val = blacklist[key] 159 val = blacklist[key]
155 fh.write('%s\t%s\tHIGH RISK\n' % (key, val)) 160 fh.write('%s\t%s\tHIGH RISK\n' % (key, val))
156 161
157 162
158 def output_vfdb(vfdist, vfdb_output_file): 163 def output_vfdb(vfdist, vfdb_output_file, vf_warnings):
159 # takes distribution of virulence factors as input (vfdist) 164 # takes distribution of virulence factors as input (vfdist)
160 # VFDB results 165 # VFDB results
161 with open(vfdb_output_file, 'w') as fh: 166 with open(vfdb_output_file, 'w') as fh:
162 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) 167 fh.write('%s\n' % '\t'.join(VFDB_HEADER))
163 if len(vfdist) == 0: 168 if len(vfdist) == 0:
175 vcov = items[-3] 180 vcov = items[-3]
176 veval = items[-7] 181 veval = items[-7]
177 vann = items[-2] 182 vann = items[-2]
178 vnotes = items[-1] 183 vnotes = items[-1]
179 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] 184 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes]
180 vfinal = '\t'.join(vfinal) 185 fh.write('%s\n' % '\t'.join(vfinal))
181 fh.write('%s\n' % vfinal) 186 for vfw in sorted(vf_warnings, key=lambda x: x.count('*')):
182 187 fh.write('%s\n' % vfw)
183 188
184 def output_amr(amrdist, amr_output_file): 189
190 def output_amr(amrdist, amr_output_file, amr_warnings):
185 # takes distribution of AMR genes as input (amrdist) 191 # takes distribution of AMR genes as input (amrdist)
186 # AMR results 192 # AMR results
187 with open(amr_output_file, 'w') as fh: 193 with open(amr_output_file, 'w') as fh:
188 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) 194 fh.write('%s\n' % '\t'.join(VFDB_HEADER))
189 if len(amrdist) == 0: 195 if len(amrdist) == 0:
201 acov = items[-3] 207 acov = items[-3]
202 aeval = items[-7] 208 aeval = items[-7]
203 aann = items[-2] 209 aann = items[-2]
204 anotes = items[-1] 210 anotes = items[-1]
205 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] 211 afinal = [agene, acontig, aid, acov, aeval, aann, anotes]
206 afinal = '\t'.join(afinal) 212 fh.write('%s\n' % '\t'.join(afinal))
207 fh.write('%s\n' % afinal) 213 for amrw in sorted(amr_warnings, key=lambda x: x.count('*')):
214 fh.write('%s\n' % amrw)
208 215
209 216
210 # lrnrisk_prototype arguments 217 # lrnrisk_prototype arguments
211 parser = argparse.ArgumentParser() 218 parser = argparse.ArgumentParser()
212 219
229 236
230 blacklist = get_blacklist(virulence_genes, args.blacklist_file) 237 blacklist = get_blacklist(virulence_genes, args.blacklist_file)
231 output_blacklist(blacklist, args.blacklist_output_file) 238 output_blacklist(blacklist, args.blacklist_output_file)
232 239
233 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species) 240 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species)
234 output_vfdb(vf_distribution, args.vfdb_output_file) 241 vf_warnings = vf_distribution[1]
242 vf_distribution = vf_distribution[0]
243 output_vfdb(vf_distribution, args.vfdb_output_file, vf_warnings)
235 244
236 amr_genes = get_blast_genes(args.amr_determinants_file) 245 amr_genes = get_blast_genes(args.amr_determinants_file)
237 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species) 246 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species)
238 output_amr(amr_distribution, args.amr_output_file) 247 amr_warnings = amr_distribution[1]
248 amr_distribution = amr_distribution[0]
249 output_amr(amr_distribution, args.amr_output_file, amr_warnings)