# HG changeset patch # User greg # Date 1687874525 0 # Node ID 5a27ac020c9ed0f00a325e8da4394aded4845171 # Parent 8dc6d4aa17ec1c5f339d692d7da5cd5ca0328f36 Uploaded diff -r 8dc6d4aa17ec -r 5a27ac020c9e lrn_risk.py --- a/lrn_risk.py Fri Apr 28 17:03:46 2023 +0000 +++ b/lrn_risk.py Tue Jun 27 14:02:05 2023 +0000 @@ -4,7 +4,7 @@ BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category'] -VFDB_HEADER = ['Gene', 'Contig', '% Identity', '% Coverage', 'E-Value', 'Annotation', 'Comparison to Publicly Available Genomes'] +VFDB_HEADER = ['Gene', 'Contig', '%ID', '%COV', 'E', 'Annotation', 'Distribution'] def get_species_from_gtdb(f): @@ -36,6 +36,7 @@ with open(f, 'r') as fh: for line in fh: try: + line = line.strip() items = line.split('\t') gene = items[0] # contig = items[1] @@ -64,6 +65,7 @@ with open(b, 'r') as fh: for line in fh: try: + line = line.strip() items = line.split('\t') gene = items[0] val = items[1] @@ -92,9 +94,11 @@ annd = {} gtdbd = {} finallines = [] + warnings = [] with open(f, 'r') as fh: for line in fh: try: + line = line.strip() items = line.split('\t') tax = items[0] tax = tax.split('s__')[1] @@ -126,17 +130,18 @@ perc = items[4] perc = str(round(float(perc), 2)) ann = items[-1] - freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) + freetext = '{0}/{1} ({2}%)'.format(pres, denom, perc) elif gtdb != '(Unknown Species)': ann = annd[key] denom = gtdbd[gtdb] - freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) + freetext = "*WARNING" + warnings.append("*WARNING: This gene has never been detected in this species! Interpret with caution!") else: ann = annd[key] - freetext = 'WARNING: Genome belongs to an undescribed species. Interpret with caution!' - finalline = '%s\t%s\t%s' % (bv, ann, freetext) - finallines.append(finalline) - return finallines + freetext = "**WARNING" + warnings.append("**WARNING: This genome belongs to an undescribed species. Interpret with caution!") + finallines.append('%s\t%s\t%s' % (bv, ann, freetext)) + return [finallines, warnings] def output_blacklist(blacklist, blacklist_output_file): @@ -155,7 +160,7 @@ fh.write('%s\t%s\tHIGH RISK\n' % (key, val)) -def output_vfdb(vfdist, vfdb_output_file): +def output_vfdb(vfdist, vfdb_output_file, vf_warnings): # takes distribution of virulence factors as input (vfdist) # VFDB results with open(vfdb_output_file, 'w') as fh: @@ -177,11 +182,12 @@ vann = items[-2] vnotes = items[-1] vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] - vfinal = '\t'.join(vfinal) - fh.write('%s\n' % vfinal) + fh.write('%s\n' % '\t'.join(vfinal)) + for vfw in sorted(vf_warnings, key=lambda x: x.count('*')): + fh.write('%s\n' % vfw) -def output_amr(amrdist, amr_output_file): +def output_amr(amrdist, amr_output_file, amr_warnings): # takes distribution of AMR genes as input (amrdist) # AMR results with open(amr_output_file, 'w') as fh: @@ -203,8 +209,9 @@ aann = items[-2] anotes = items[-1] afinal = [agene, acontig, aid, acov, aeval, aann, anotes] - afinal = '\t'.join(afinal) - fh.write('%s\n' % afinal) + fh.write('%s\n' % '\t'.join(afinal)) + for amrw in sorted(amr_warnings, key=lambda x: x.count('*')): + fh.write('%s\n' % amrw) # lrnrisk_prototype arguments @@ -231,8 +238,12 @@ output_blacklist(blacklist, args.blacklist_output_file) vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species) -output_vfdb(vf_distribution, args.vfdb_output_file) +vf_warnings = vf_distribution[1] +vf_distribution = vf_distribution[0] +output_vfdb(vf_distribution, args.vfdb_output_file, vf_warnings) amr_genes = get_blast_genes(args.amr_determinants_file) amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species) -output_amr(amr_distribution, args.amr_output_file) +amr_warnings = amr_distribution[1] +amr_distribution = amr_distribution[0] +output_amr(amr_distribution, args.amr_output_file, amr_warnings)