lrn_risk: lrn_risk.py comparison

comparison lrn_risk.py @ 3:5a27ac020c9e draft

Uploaded

author	greg
date	Tue, 27 Jun 2023 14:02:05 +0000
parents	8dc6d4aa17ec
children	8fe04662585c

comparison

equal deleted inserted replaced

-:8dc6d4aa17ec
+:5a27ac020c9e
 import argparse
 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category']
-VFDB_HEADER = ['Gene', 'Contig', '% Identity', '% Coverage', 'E-Value', 'Annotation', 'Comparison to Publicly Available Genomes']
+VFDB_HEADER = ['Gene', 'Contig', '%ID', '%COV', 'E', 'Annotation', 'Distribution']
 def get_species_from_gtdb(f):
 # get GTDB species
 # assumes there is one genome in the GTDB-Tk output file
 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen
 d = {}
 with open(f, 'r') as fh:
 for line in fh:
 try:
+line = line.strip()
 items = line.split('\t')
 gene = items[0]
 # contig = items[1]
 # pid = items[2]
 alen = items[3]
 bdict = {}
 blacklist_present = {}
 with open(b, 'r') as fh:
 for line in fh:
 try:
+line = line.strip()
 items = line.split('\t')
 gene = items[0]
 val = items[1]
 bdict[gene] = val
 except Exception:
 # create dictionaries based on gene distribution
 d = {}
 annd = {}
 gtdbd = {}
 finallines = []
+warnings = []
 with open(f, 'r') as fh:
 for line in fh:
 try:
+line = line.strip()
 items = line.split('\t')
 tax = items[0]
 tax = tax.split('s__')[1]
 if len(tax) == 0:
 tax = '(Unknown Species)'
 pres = items[2]
 denom = items[3]
 perc = items[4]
 perc = str(round(float(perc), 2))
 ann = items[-1]
-freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom)
+freetext = '{0}/{1} ({2}%)'.format(pres, denom, perc)
 elif gtdb != '(Unknown Species)':
 ann = annd[key]
 denom = gtdbd[gtdb]
-freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom)
+freetext = "*WARNING"
+warnings.append("*WARNING: This gene has never been detected in this species! Interpret with caution!")
 else:
 ann = annd[key]
-freetext = 'WARNING: Genome belongs to an undescribed species. Interpret with caution!'
+freetext = "**WARNING"
-finalline = '%s\t%s\t%s' % (bv, ann, freetext)
+warnings.append("**WARNING: This genome belongs to an undescribed species. Interpret with caution!")
-finallines.append(finalline)
+finallines.append('%s\t%s\t%s' % (bv, ann, freetext))
-return finallines
+return [finallines, warnings]
 def output_blacklist(blacklist, blacklist_output_file):
 # takes detected blacklisted genes as input (blacklist)
 # blacklist results
 for key in blacklist.keys():
 val = blacklist[key]
 fh.write('%s\t%s\tHIGH RISK\n' % (key, val))
-def output_vfdb(vfdist, vfdb_output_file):
+def output_vfdb(vfdist, vfdb_output_file, vf_warnings):
 # takes distribution of virulence factors as input (vfdist)
 # VFDB results
 with open(vfdb_output_file, 'w') as fh:
 fh.write('%s\n' % '\t'.join(VFDB_HEADER))
 if len(vfdist) == 0:
 vcov = items[-3]
 veval = items[-7]
 vann = items[-2]
 vnotes = items[-1]
 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes]
-vfinal = '\t'.join(vfinal)
+fh.write('%s\n' % '\t'.join(vfinal))
-fh.write('%s\n' % vfinal)
+for vfw in sorted(vf_warnings, key=lambda x: x.count('*')):
+fh.write('%s\n' % vfw)
-def output_amr(amrdist, amr_output_file):
+def output_amr(amrdist, amr_output_file, amr_warnings):
 # takes distribution of AMR genes as input (amrdist)
 # AMR results
 with open(amr_output_file, 'w') as fh:
 fh.write('%s\n' % '\t'.join(VFDB_HEADER))
 if len(amrdist) == 0:
 acov = items[-3]
 aeval = items[-7]
 aann = items[-2]
 anotes = items[-1]
 afinal = [agene, acontig, aid, acov, aeval, aann, anotes]
-afinal = '\t'.join(afinal)
+fh.write('%s\n' % '\t'.join(afinal))
-fh.write('%s\n' % afinal)
+for amrw in sorted(amr_warnings, key=lambda x: x.count('*')):
+fh.write('%s\n' % amrw)
 # lrnrisk_prototype arguments
 parser = argparse.ArgumentParser()
 blacklist = get_blacklist(virulence_genes, args.blacklist_file)
 output_blacklist(blacklist, args.blacklist_output_file)
 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species)
-output_vfdb(vf_distribution, args.vfdb_output_file)
+vf_warnings = vf_distribution[1]
+vf_distribution = vf_distribution[0]
+output_vfdb(vf_distribution, args.vfdb_output_file, vf_warnings)
 amr_genes = get_blast_genes(args.amr_determinants_file)
 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species)
-output_amr(amr_distribution, args.amr_output_file)
+amr_warnings = amr_distribution[1]
+amr_distribution = amr_distribution[0]
+output_amr(amr_distribution, args.amr_output_file, amr_warnings)

Mercurial > repos > greg > lrn_risk

comparison lrn_risk.py @ 3:5a27ac020c9e draft