Mercurial > repos > greg > lrn_risk
comparison lrn_risk.py @ 3:5a27ac020c9e draft
Uploaded
author | greg |
---|---|
date | Tue, 27 Jun 2023 14:02:05 +0000 |
parents | 8dc6d4aa17ec |
children | 8fe04662585c |
comparison
equal
deleted
inserted
replaced
2:8dc6d4aa17ec | 3:5a27ac020c9e |
---|---|
2 | 2 |
3 import argparse | 3 import argparse |
4 | 4 |
5 | 5 |
6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category'] | 6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category'] |
7 VFDB_HEADER = ['Gene', 'Contig', '% Identity', '% Coverage', 'E-Value', 'Annotation', 'Comparison to Publicly Available Genomes'] | 7 VFDB_HEADER = ['Gene', 'Contig', '%ID', '%COV', 'E', 'Annotation', 'Distribution'] |
8 | 8 |
9 | 9 |
10 def get_species_from_gtdb(f): | 10 def get_species_from_gtdb(f): |
11 # get GTDB species | 11 # get GTDB species |
12 # assumes there is one genome in the GTDB-Tk output file | 12 # assumes there is one genome in the GTDB-Tk output file |
34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen | 34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen |
35 d = {} | 35 d = {} |
36 with open(f, 'r') as fh: | 36 with open(f, 'r') as fh: |
37 for line in fh: | 37 for line in fh: |
38 try: | 38 try: |
39 line = line.strip() | |
39 items = line.split('\t') | 40 items = line.split('\t') |
40 gene = items[0] | 41 gene = items[0] |
41 # contig = items[1] | 42 # contig = items[1] |
42 # pid = items[2] | 43 # pid = items[2] |
43 alen = items[3] | 44 alen = items[3] |
62 bdict = {} | 63 bdict = {} |
63 blacklist_present = {} | 64 blacklist_present = {} |
64 with open(b, 'r') as fh: | 65 with open(b, 'r') as fh: |
65 for line in fh: | 66 for line in fh: |
66 try: | 67 try: |
68 line = line.strip() | |
67 items = line.split('\t') | 69 items = line.split('\t') |
68 gene = items[0] | 70 gene = items[0] |
69 val = items[1] | 71 val = items[1] |
70 bdict[gene] = val | 72 bdict[gene] = val |
71 except Exception: | 73 except Exception: |
90 # create dictionaries based on gene distribution | 92 # create dictionaries based on gene distribution |
91 d = {} | 93 d = {} |
92 annd = {} | 94 annd = {} |
93 gtdbd = {} | 95 gtdbd = {} |
94 finallines = [] | 96 finallines = [] |
97 warnings = [] | |
95 with open(f, 'r') as fh: | 98 with open(f, 'r') as fh: |
96 for line in fh: | 99 for line in fh: |
97 try: | 100 try: |
101 line = line.strip() | |
98 items = line.split('\t') | 102 items = line.split('\t') |
99 tax = items[0] | 103 tax = items[0] |
100 tax = tax.split('s__')[1] | 104 tax = tax.split('s__')[1] |
101 if len(tax) == 0: | 105 if len(tax) == 0: |
102 tax = '(Unknown Species)' | 106 tax = '(Unknown Species)' |
124 pres = items[2] | 128 pres = items[2] |
125 denom = items[3] | 129 denom = items[3] |
126 perc = items[4] | 130 perc = items[4] |
127 perc = str(round(float(perc), 2)) | 131 perc = str(round(float(perc), 2)) |
128 ann = items[-1] | 132 ann = items[-1] |
129 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) | 133 freetext = '{0}/{1} ({2}%)'.format(pres, denom, perc) |
130 elif gtdb != '(Unknown Species)': | 134 elif gtdb != '(Unknown Species)': |
131 ann = annd[key] | 135 ann = annd[key] |
132 denom = gtdbd[gtdb] | 136 denom = gtdbd[gtdb] |
133 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) | 137 freetext = "*WARNING" |
138 warnings.append("*WARNING: This gene has never been detected in this species! Interpret with caution!") | |
134 else: | 139 else: |
135 ann = annd[key] | 140 ann = annd[key] |
136 freetext = 'WARNING: Genome belongs to an undescribed species. Interpret with caution!' | 141 freetext = "**WARNING" |
137 finalline = '%s\t%s\t%s' % (bv, ann, freetext) | 142 warnings.append("**WARNING: This genome belongs to an undescribed species. Interpret with caution!") |
138 finallines.append(finalline) | 143 finallines.append('%s\t%s\t%s' % (bv, ann, freetext)) |
139 return finallines | 144 return [finallines, warnings] |
140 | 145 |
141 | 146 |
142 def output_blacklist(blacklist, blacklist_output_file): | 147 def output_blacklist(blacklist, blacklist_output_file): |
143 # takes detected blacklisted genes as input (blacklist) | 148 # takes detected blacklisted genes as input (blacklist) |
144 # blacklist results | 149 # blacklist results |
153 for key in blacklist.keys(): | 158 for key in blacklist.keys(): |
154 val = blacklist[key] | 159 val = blacklist[key] |
155 fh.write('%s\t%s\tHIGH RISK\n' % (key, val)) | 160 fh.write('%s\t%s\tHIGH RISK\n' % (key, val)) |
156 | 161 |
157 | 162 |
158 def output_vfdb(vfdist, vfdb_output_file): | 163 def output_vfdb(vfdist, vfdb_output_file, vf_warnings): |
159 # takes distribution of virulence factors as input (vfdist) | 164 # takes distribution of virulence factors as input (vfdist) |
160 # VFDB results | 165 # VFDB results |
161 with open(vfdb_output_file, 'w') as fh: | 166 with open(vfdb_output_file, 'w') as fh: |
162 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | 167 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) |
163 if len(vfdist) == 0: | 168 if len(vfdist) == 0: |
175 vcov = items[-3] | 180 vcov = items[-3] |
176 veval = items[-7] | 181 veval = items[-7] |
177 vann = items[-2] | 182 vann = items[-2] |
178 vnotes = items[-1] | 183 vnotes = items[-1] |
179 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] | 184 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] |
180 vfinal = '\t'.join(vfinal) | 185 fh.write('%s\n' % '\t'.join(vfinal)) |
181 fh.write('%s\n' % vfinal) | 186 for vfw in sorted(vf_warnings, key=lambda x: x.count('*')): |
182 | 187 fh.write('%s\n' % vfw) |
183 | 188 |
184 def output_amr(amrdist, amr_output_file): | 189 |
190 def output_amr(amrdist, amr_output_file, amr_warnings): | |
185 # takes distribution of AMR genes as input (amrdist) | 191 # takes distribution of AMR genes as input (amrdist) |
186 # AMR results | 192 # AMR results |
187 with open(amr_output_file, 'w') as fh: | 193 with open(amr_output_file, 'w') as fh: |
188 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | 194 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) |
189 if len(amrdist) == 0: | 195 if len(amrdist) == 0: |
201 acov = items[-3] | 207 acov = items[-3] |
202 aeval = items[-7] | 208 aeval = items[-7] |
203 aann = items[-2] | 209 aann = items[-2] |
204 anotes = items[-1] | 210 anotes = items[-1] |
205 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] | 211 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] |
206 afinal = '\t'.join(afinal) | 212 fh.write('%s\n' % '\t'.join(afinal)) |
207 fh.write('%s\n' % afinal) | 213 for amrw in sorted(amr_warnings, key=lambda x: x.count('*')): |
214 fh.write('%s\n' % amrw) | |
208 | 215 |
209 | 216 |
210 # lrnrisk_prototype arguments | 217 # lrnrisk_prototype arguments |
211 parser = argparse.ArgumentParser() | 218 parser = argparse.ArgumentParser() |
212 | 219 |
229 | 236 |
230 blacklist = get_blacklist(virulence_genes, args.blacklist_file) | 237 blacklist = get_blacklist(virulence_genes, args.blacklist_file) |
231 output_blacklist(blacklist, args.blacklist_output_file) | 238 output_blacklist(blacklist, args.blacklist_output_file) |
232 | 239 |
233 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species) | 240 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species) |
234 output_vfdb(vf_distribution, args.vfdb_output_file) | 241 vf_warnings = vf_distribution[1] |
242 vf_distribution = vf_distribution[0] | |
243 output_vfdb(vf_distribution, args.vfdb_output_file, vf_warnings) | |
235 | 244 |
236 amr_genes = get_blast_genes(args.amr_determinants_file) | 245 amr_genes = get_blast_genes(args.amr_determinants_file) |
237 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species) | 246 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species) |
238 output_amr(amr_distribution, args.amr_output_file) | 247 amr_warnings = amr_distribution[1] |
248 amr_distribution = amr_distribution[0] | |
249 output_amr(amr_distribution, args.amr_output_file, amr_warnings) |