| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 import argparse
 | 
| 
 | 
     4 
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category']
 | 
| 
3
 | 
     7 VFDB_HEADER = ['Gene', 'Contig', '%ID', '%COV', 'E', 'Annotation', 'Distribution']
 | 
| 
0
 | 
     8 
 | 
| 
 | 
     9 
 | 
| 
 | 
    10 def get_species_from_gtdb(f):
 | 
| 
 | 
    11     # get GTDB species
 | 
| 
 | 
    12     # assumes there is one genome in the GTDB-Tk output file
 | 
| 
 | 
    13     with open(f, 'r') as fh:
 | 
| 
1
 | 
    14         for i, line in enumerate(fh):
 | 
| 
 | 
    15             if i == 0:
 | 
| 
 | 
    16                 # Skip header.
 | 
| 
 | 
    17                 continue
 | 
| 
2
 | 
    18             try:
 | 
| 
 | 
    19                 items = line.split('\t')
 | 
| 
 | 
    20                 tax = items[1]
 | 
| 
 | 
    21                 tax = tax.split(';')[-1]
 | 
| 
 | 
    22                 # split on GTDB species tag
 | 
| 
 | 
    23                 tax = tax.split('s__')[1]
 | 
| 
 | 
    24             except Exception:
 | 
| 
 | 
    25                 return '(Unknown Species)'
 | 
| 
1
 | 
    26             if len(tax) == 0:
 | 
| 
2
 | 
    27                 return '(Unknown Species)'
 | 
| 
0
 | 
    28     return tax
 | 
| 
 | 
    29 
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 def get_blast_genes(f):
 | 
| 
 | 
    32     # reads genes detected via BLAST
 | 
| 
 | 
    33     # BLAST header is as follows:
 | 
| 
 | 
    34     # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen
 | 
| 
 | 
    35     d = {}
 | 
| 
 | 
    36     with open(f, 'r') as fh:
 | 
| 
 | 
    37         for line in fh:
 | 
| 
2
 | 
    38             try:
 | 
| 
3
 | 
    39                 line = line.strip()
 | 
| 
2
 | 
    40                 items = line.split('\t')
 | 
| 
 | 
    41                 gene = items[0]
 | 
| 
 | 
    42                 # contig = items[1]
 | 
| 
 | 
    43                 # pid = items[2]
 | 
| 
 | 
    44                 alen = items[3]
 | 
| 
 | 
    45                 # e = items[-4]
 | 
| 
 | 
    46                 qlen = items[-1]
 | 
| 
 | 
    47                 # calculate query coverage by dividing alignment length by query length
 | 
| 
 | 
    48                 qcov = round(float(alen) / float(qlen) * 100.0, 2)
 | 
| 
 | 
    49                 if gene not in d.keys():
 | 
| 
 | 
    50                     d[gene] = []
 | 
| 
 | 
    51                 d[gene].append('%s\t%s' % (line, str(qcov)))
 | 
| 
 | 
    52             except Exception:
 | 
| 
 | 
    53                 return d
 | 
| 
0
 | 
    54     return d
 | 
| 
 | 
    55 
 | 
| 
 | 
    56 
 | 
| 
 | 
    57 def get_blacklist(v, b):
 | 
| 
 | 
    58     # identify high-risk isolates based on blacklisted genes
 | 
| 
 | 
    59     # blacklisted genes file contains two columns:
 | 
| 
 | 
    60     # column 0=the gene name as it appears in the gene database
 | 
| 
 | 
    61     # column 1=the reason why the gene was blacklisted, which will be reported
 | 
| 
 | 
    62     # e.g., 'ANTHRAX TOXIN'
 | 
| 
 | 
    63     bdict = {}
 | 
| 
2
 | 
    64     blacklist_present = {}
 | 
| 
0
 | 
    65     with open(b, 'r') as fh:
 | 
| 
 | 
    66         for line in fh:
 | 
| 
2
 | 
    67             try:
 | 
| 
3
 | 
    68                 line = line.strip()
 | 
| 
2
 | 
    69                 items = line.split('\t')
 | 
| 
 | 
    70                 gene = items[0]
 | 
| 
 | 
    71                 val = items[1]
 | 
| 
 | 
    72                 bdict[gene] = val
 | 
| 
 | 
    73             except Exception:
 | 
| 
 | 
    74                 return blacklist_present
 | 
| 
0
 | 
    75     for key in v.keys():
 | 
| 
 | 
    76         if key in bdict.keys():
 | 
| 
 | 
    77             val = bdict[key]
 | 
| 
 | 
    78             blacklist_present[key] = val
 | 
| 
 | 
    79     return blacklist_present
 | 
| 
 | 
    80 
 | 
| 
 | 
    81 
 | 
| 
 | 
    82 def gene_dist(f, blast, gtdb):
 | 
| 
 | 
    83     # get within-species prevalence of genes
 | 
| 
 | 
    84     # for virulence factors (VFs): uses VFDB VFs detected via ABRicate's VFDB db
 | 
| 
4
 | 
    85     # for AMR genes: uses AMR genes detected via ABRicate + PIMA db
 | 
| 
0
 | 
    86     # for VFs and AMR genes: genes were detected via ABRicate XXX
 | 
| 
 | 
    87     # minimum nucleotide identity and coverage values >=80%
 | 
| 
 | 
    88     # total of 61,161 genomes queried
 | 
| 
 | 
    89     # takes VFDB or AMR gene distribution file as input (f)
 | 
| 
 | 
    90     # BLAST file of VFDB or AMR genes (blast)
 | 
| 
 | 
    91     # GTDB species (gtdb)
 | 
| 
 | 
    92     # create dictionaries based on gene distribution
 | 
| 
 | 
    93     d = {}
 | 
| 
 | 
    94     annd = {}
 | 
| 
 | 
    95     gtdbd = {}
 | 
| 
6
 | 
    96     finallines = [] 
 | 
| 
0
 | 
    97     with open(f, 'r') as fh:
 | 
| 
 | 
    98         for line in fh:
 | 
| 
2
 | 
    99             try:
 | 
| 
3
 | 
   100                 line = line.strip()
 | 
| 
2
 | 
   101                 items = line.split('\t')
 | 
| 
 | 
   102                 tax = items[0]
 | 
| 
 | 
   103                 tax = tax.split('s__')[1]
 | 
| 
 | 
   104                 if len(tax) == 0:
 | 
| 
 | 
   105                     tax = '(Unknown Species)'
 | 
| 
 | 
   106                 gene = items[1]
 | 
| 
 | 
   107                 ann = items[-1]
 | 
| 
 | 
   108                 denom = items[3]
 | 
| 
 | 
   109                 d['%s___%s' % (tax, gene)] = line
 | 
| 
 | 
   110                 annd[gene] = ann
 | 
| 
 | 
   111                 gtdbd[tax] = denom
 | 
| 
 | 
   112             except Exception:
 | 
| 
 | 
   113                 return finallines
 | 
| 
0
 | 
   114     # parse BLAST results
 | 
| 
 | 
   115     for key in blast.keys():
 | 
| 
 | 
   116         blastval = blast[key]
 | 
| 
 | 
   117         for bv in blastval:
 | 
| 
 | 
   118             testkey = '%s___%s' % (gtdb, key)
 | 
| 
 | 
   119             if testkey in d.keys() and gtdb != '(Unknown Species)':
 | 
| 
 | 
   120                 taxval = d[testkey]
 | 
| 
 | 
   121                 items = taxval.split('\t')
 | 
| 
2
 | 
   122                 tax = items[0]
 | 
| 
 | 
   123                 tax = tax.split('s__')[1]
 | 
| 
0
 | 
   124                 if len(tax) == 0:
 | 
| 
 | 
   125                     tax = '(Unknown Species)'
 | 
| 
2
 | 
   126                 gene = items[1]
 | 
| 
 | 
   127                 pres = items[2]
 | 
| 
 | 
   128                 denom = items[3]
 | 
| 
 | 
   129                 perc = items[4]
 | 
| 
0
 | 
   130                 perc = str(round(float(perc), 2))
 | 
| 
2
 | 
   131                 ann = items[-1]
 | 
| 
3
 | 
   132                 freetext = '{0}/{1} ({2}%)'.format(pres, denom, perc)
 | 
| 
0
 | 
   133             elif gtdb != '(Unknown Species)':
 | 
| 
4
 | 
   134                 ann = 'NA'
 | 
| 
5
 | 
   135                 if gtdb in gtdbd.keys():
 | 
| 
 | 
   136                     denom = gtdbd[gtdb]
 | 
| 
 | 
   137                 else:
 | 
| 
 | 
   138                     denom = 'NA' 
 | 
| 
6
 | 
   139                 freetext = "NA" 
 | 
| 
0
 | 
   140             else:
 | 
| 
4
 | 
   141                 ann = 'NA'
 | 
| 
5
 | 
   142                 denom = 'NA'
 | 
| 
6
 | 
   143                 freetext = "NA" 
 | 
| 
3
 | 
   144             finallines.append('%s\t%s\t%s' % (bv, ann, freetext))
 | 
| 
6
 | 
   145     return [finallines]
 | 
| 
0
 | 
   146 
 | 
| 
 | 
   147 
 | 
| 
 | 
   148 def output_blacklist(blacklist, blacklist_output_file):
 | 
| 
 | 
   149     # takes detected blacklisted genes as input (blacklist)
 | 
| 
 | 
   150     # blacklist results
 | 
| 
 | 
   151     with open(blacklist_output_file, 'w') as fh:
 | 
| 
 | 
   152         fh.write('%s\n' % '\t'.join(BLACKLIST_HEADER))
 | 
| 
 | 
   153         if len(blacklist.keys()) == 0:
 | 
| 
 | 
   154             # print this if no blacklisted genes are detected
 | 
| 
 | 
   155             fh.write('(No blacklisted genes detected)\tNA\tNot high risk\n')
 | 
| 
 | 
   156         else:
 | 
| 
 | 
   157             # print this if blacklisted genes are detected
 | 
| 
 | 
   158             # print a table with one row per detected blacklisted gene
 | 
| 
 | 
   159             for key in blacklist.keys():
 | 
| 
 | 
   160                 val = blacklist[key]
 | 
| 
 | 
   161                 fh.write('%s\t%s\tHIGH RISK\n' % (key, val))
 | 
| 
 | 
   162 
 | 
| 
 | 
   163 
 | 
| 
6
 | 
   164 def output_vfdb(vfdist, vfdb_output_file):
 | 
| 
0
 | 
   165     # takes distribution of virulence factors as input (vfdist)
 | 
| 
 | 
   166     # VFDB results
 | 
| 
 | 
   167     with open(vfdb_output_file, 'w') as fh:
 | 
| 
 | 
   168         fh.write('%s\n' % '\t'.join(VFDB_HEADER))
 | 
| 
 | 
   169         if len(vfdist) == 0:
 | 
| 
 | 
   170             # print this if no VFs detected
 | 
| 
 | 
   171             fh.write('%s\n' % '\t'.join(['(No VFs Detected)'] * 7))
 | 
| 
 | 
   172         else:
 | 
| 
 | 
   173             # print table of VFs if VFs detected
 | 
| 
 | 
   174             for vline in vfdist:
 | 
| 
 | 
   175                 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score',  'Identical Matches', 'Query Length']
 | 
| 
 | 
   176                 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
 | 
| 
 | 
   177                 items = vline.split('\t')
 | 
| 
2
 | 
   178                 vgene = items[0]
 | 
| 
 | 
   179                 vcontig = items[1]
 | 
| 
 | 
   180                 vid = items[2]
 | 
| 
 | 
   181                 vcov = items[-3]
 | 
| 
 | 
   182                 veval = items[-7]
 | 
| 
 | 
   183                 vann = items[-2]
 | 
| 
 | 
   184                 vnotes = items[-1]
 | 
| 
0
 | 
   185                 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes]
 | 
| 
3
 | 
   186                 fh.write('%s\n' % '\t'.join(vfinal))
 | 
| 
0
 | 
   187 
 | 
| 
6
 | 
   188 def output_amr(amrdist, amr_output_file):
 | 
| 
0
 | 
   189     # takes distribution of AMR genes as input (amrdist)
 | 
| 
 | 
   190     # AMR results
 | 
| 
 | 
   191     with open(amr_output_file, 'w') as fh:
 | 
| 
 | 
   192         fh.write('%s\n' % '\t'.join(VFDB_HEADER))
 | 
| 
 | 
   193         if len(amrdist) == 0:
 | 
| 
 | 
   194             # print this if no AMR genes detected
 | 
| 
 | 
   195             fh.write('%s\n' % '\t'.join(['(No AMR Genes Detected)'] * 7))
 | 
| 
 | 
   196         else:
 | 
| 
 | 
   197             # print this if AMR genes detected
 | 
| 
 | 
   198             for aline in amrdist:
 | 
| 
 | 
   199                 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score',  'Identical Matches', 'Query Length']
 | 
| 
 | 
   200                 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
 | 
| 
 | 
   201                 items = aline.split('\t')
 | 
| 
2
 | 
   202                 agene = items[0]
 | 
| 
 | 
   203                 acontig = items[1]
 | 
| 
 | 
   204                 aid = items[2]
 | 
| 
 | 
   205                 acov = items[-3]
 | 
| 
 | 
   206                 aeval = items[-7]
 | 
| 
 | 
   207                 aann = items[-2]
 | 
| 
 | 
   208                 anotes = items[-1]
 | 
| 
0
 | 
   209                 afinal = [agene, acontig, aid, acov, aeval, aann, anotes]
 | 
| 
3
 | 
   210                 fh.write('%s\n' % '\t'.join(afinal))
 | 
| 
0
 | 
   211 
 | 
| 
 | 
   212 # lrnrisk_prototype arguments
 | 
| 
 | 
   213 parser = argparse.ArgumentParser()
 | 
| 
 | 
   214 
 | 
| 
 | 
   215 parser.add_argument('--gtdb_file', action='store', dest='gtdb_file', help='Path to gtdbtk tsv file')
 | 
| 
 | 
   216 parser.add_argument('--virulence_factors_file', action='store', dest='virulence_factors_file', help='Path to tsv virulence factors file')
 | 
| 
 | 
   217 parser.add_argument('--amr_determinants_file', action='store', dest='amr_determinants_file', help='Path to AMR determinants tsv file')
 | 
| 
 | 
   218 parser.add_argument('--blacklist_file', action='store', dest='blacklist_file', help='Path to blacklisted high-risk virulence factors tsv file')
 | 
| 
 | 
   219 parser.add_argument('--vf_distribution_file', action='store', dest='vf_distribution_file', help='Path to virulence factor distribution tsv file')
 | 
| 
 | 
   220 parser.add_argument('--amr_distribution_file', action='store', dest='amr_distribution_file', help='Path to AMR determinant distribution tsv file')
 | 
| 
 | 
   221 parser.add_argument('--blacklist_output_file', action='store', dest='blacklist_output_file', help='Path to blacklist output file')
 | 
| 
 | 
   222 parser.add_argument('--vfdb_output_file', action='store', dest='vfdb_output_file', help='Path to vfdb output file')
 | 
| 
 | 
   223 parser.add_argument('--amr_output_file', action='store', dest='amr_output_file', help='Path to amr output file')
 | 
| 
 | 
   224 
 | 
| 
 | 
   225 # parse arguments and run pipeline
 | 
| 
 | 
   226 args = parser.parse_args()
 | 
| 
 | 
   227 
 | 
| 
 | 
   228 # print_output(blacklist, vf_distribution, amr_distribution, args.output, species)
 | 
| 
 | 
   229 virulence_genes = get_blast_genes(args.virulence_factors_file)
 | 
| 
 | 
   230 species = get_species_from_gtdb(args.gtdb_file)
 | 
| 
 | 
   231 
 | 
| 
 | 
   232 blacklist = get_blacklist(virulence_genes, args.blacklist_file)
 | 
| 
 | 
   233 output_blacklist(blacklist, args.blacklist_output_file)
 | 
| 
 | 
   234 
 | 
| 
 | 
   235 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species)
 | 
| 
3
 | 
   236 vf_distribution = vf_distribution[0]
 | 
| 
6
 | 
   237 output_vfdb(vf_distribution, args.vfdb_output_file)
 | 
| 
0
 | 
   238 
 | 
| 
 | 
   239 amr_genes = get_blast_genes(args.amr_determinants_file)
 | 
| 
 | 
   240 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species)
 | 
| 
3
 | 
   241 amr_distribution = amr_distribution[0]
 | 
| 
6
 | 
   242 output_amr(amr_distribution, args.amr_output_file)
 |