comparison draw_amr_matrix.py @ 4:33a0ea992043 draft

Uploaded
author greg
date Tue, 21 Feb 2023 19:55:42 +0000
parents 7d7884f2d921
children caf554e039b2
comparison
equal deleted inserted replaced
3:7d7884f2d921 4:33a0ea992043
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse 3 import argparse
4 import csv 4 import csv
5 import os 5 import os
6 6 import subprocess
7 import sys
8 import tempfile
9
10 import Bio.SeqIO
7 import numpy 11 import numpy
8 import pandas 12 import pandas
9 import matplotlib.pyplot as pyplot 13 import matplotlib.pyplot as pyplot
10 14
11 15
14 if k.lower().find('amr') >= 0: 18 if k.lower().find('amr') >= 0:
15 return amr_feature_hits[k] 19 return amr_feature_hits[k]
16 return None 20 return None
17 21
18 22
19 def draw_amr_matrix(amr_feature_hits_files, amr_deletions_file, amr_mutations_file, amr_gene_drug_file, output_dir): 23 def load_fasta(fasta_file):
24 sequence = pandas.Series(dtype=object)
25 for contig in Bio.SeqIO.parse(fasta_file, 'fasta'):
26 sequence[contig.id] = contig
27 return sequence
28
29
30 def run_command(cmd):
31 try:
32 tmp_name = tempfile.NamedTemporaryFile(dir=".").name
33 tmp_stderr = open(tmp_name, 'wb')
34 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
35 returncode = proc.wait()
36 tmp_stderr.close()
37 if returncode != 0:
38 # Get stderr, allowing for case where it's very large.
39 tmp_stderr = open(tmp_name, 'rb')
40 stderr = ''
41 buffsize = 1048576
42 try:
43 while True:
44 stderr += tmp_stderr.read(buffsize)
45 if not stderr or len(stderr) % buffsize != 0:
46 break
47 except OverflowError:
48 pass
49 tmp_stderr.close()
50 os.remove(tmp_name)
51 stop_err(stderr)
52 except Exception as e:
53 stop_err('Command:\n%s\n\nended with error:\n%s\n\n' % (cmd, str(e)))
54
55
56 def stop_err(msg):
57 sys.stderr.write(msg)
58 sys.exit(1)
59
60
61 def draw_amr_matrix(amr_feature_hits_files, amr_deletions_file, amr_mutations_file, amr_mutation_regions_file, amr_gene_drug_file, reference, reference_size, region_mutations_output_file, mutations_dir, output_dir):
20 ofh = open('process_log', 'w') 62 ofh = open('process_log', 'w')
21 63
22 # Read amr_feature_hits_files. 64 # Read amr_feature_hits_files.
23 amr_feature_hits = pandas.Series(dtype=object) 65 amr_feature_hits = pandas.Series(dtype=object)
24 for amr_feature_hits_file in amr_feature_hits_files: 66 for amr_feature_hits_file in amr_feature_hits_files:
40 # Read amr_drug_gene_file. 82 # Read amr_drug_gene_file.
41 amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None) 83 amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None)
42 ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug)) 84 ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug))
43 85
44 # Roll up AMR gene hits. 86 # Roll up AMR gene hits.
45 ofh.write("\namr_hits.shape[0]:%s\n" % str(amr_hits.shape[0])) 87 ofh.write("\namr_hits.shape[0]: %s\n" % str(amr_hits.shape[0]))
46 if amr_hits.shape[0] > 0: 88 if amr_hits.shape[0] > 0:
47 for gene_idx, gene in amr_hits.iterrows(): 89 for gene_idx, gene in amr_hits.iterrows():
48 ofh.write("gene_idx:%s\n" % str(gene_idx)) 90 ofh.write("gene_idx: %s\n" % str(gene_idx))
49 ofh.write("gene:%s\n" % str(gene)) 91 ofh.write("gene: %s\n" % str(gene))
50 gene_name = gene[3] 92 gene_name = gene[3]
51 ofh.write("gene_name: %s\n" % str(gene_name)) 93 ofh.write("gene_name: %s\n" % str(gene_name))
52 ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0])) 94 ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0]))
53 drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1] 95 drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1]
54 ofh.write("drugs:%s\n" % str(drugs)) 96 ofh.write("drugs: %s\n" % str(drugs))
55 for drug in drugs: 97 for drug in drugs:
56 amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) 98 amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
57 ofh.write("\amr_to_draw:%s\n" % str(amr_to_draw)) 99 ofh.write("\amr_to_draw: %s\n" % str(amr_to_draw))
58 100
59 if amr_mutations_file is not None: 101 ofh.write("\namr_mutations_file si None: %s\n" % str(amr_mutations_file == 'None'))
60 # TODO: So far, no samples have produced mutations, so we haven't been able 102 if amr_mutations_file not in [None, 'None'] and os.path.getsize(amr_mutations_file) > 0:
61 # to produce a populated VarScan VCF file of mutations - https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2923. 103 amr_mutations = pandas.Series(dtype=object)
62 # The call_amr_mutations Galaxy tool will currently produce this VarScan VCF file, but we need a sample that 104 if amr_mutation_regions_file is not None:
63 # will produce a populated file. After we find one, we'll need to figure out how to implement this loop 105 mutation_regions = pandas.read_csv(amr_mutation_regions_file, header=0, sep='\t', index_col=False)
64 # https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2925 in a Galaxy tool so that the VarScan VCF 106 if mutation_regions.shape[1] != 7:
65 # file will be converted to the TSV amr_mutations_file that thsi tool expects. 107 ofh.write("\nMutation regions should be a six column file.\n")
66 amr_mutations = pandas.DataFrame() 108 elif mutation_regions.shape[0] == 0:
109 ofh.write("\nNo rows in mutation regions file.\n")
110 else:
111 # Make sure the positions in the BED file fall within the chromosomes provided in the reference sequence.
112 for mutation_region in range(mutation_regions.shape[0]):
113 mutation_region = mutation_regions.iloc[mutation_region, :]
114 if not (mutation_region[0] in reference):
115 ofh.write("\nMutation region :%s not found in reference genome.\n" % ' '.join(mutation_region.astype(str)))
116 continue
117 if not isinstance(mutation_region[1], numpy.int64):
118 ofh.write("\nNon-integer found in mutation region start (column 2): %s.\n" % str(mutation_region[1]))
119 break
120 elif not isinstance(mutation_region[2], numpy.int64):
121 ofh.write("\nNon-integer found in mutation region start (column 3): %s.\n" % str(mutation_region[2]))
122 break
123 if mutation_region[1] <= 0 or mutation_region[2] <= 0:
124 ofh.write("\nMutation region %s starts before the reference sequence.\n" % ' '.join(mutation_region.astype(str)))
125 if mutation_region[1] > len(reference[mutation_region[0]].seq) or mutation_region[2] > len(reference[mutation_region[0]].seq):
126 ofh.write("\nMutation region %s ends after the reference sequence.\n" % ' '.join(mutation_region.astype(str)))
127 for region_i in range(mutation_regions.shape[0]):
128 region = mutation_regions.iloc[region_i, :]
129 if not region.get('type', default='No Type') in ['snp', 'small-indel', 'any']:
130 continue
131 ofh.write("\nFinding AMR mutations for %s.\n" % str(region['name']))
132 region_dir = os.path.join(mutations_dir, 'region_' + str(region_i))
133 os.mkdir(region_dir)
134 region_bed = os.path.join(region_dir, 'region.bed')
135 mutation_regions.loc[[region_i], ].to_csv(path_or_buf=region_bed, sep='\t', header=False, index=False)
136 cmd = "bedtools intersect -nonamecheck -wb -a '%s' -b '%s' | awk '{BEGIN{getline < \"%s\" ;printf $0\"\t\";getline < \"%s\"; getline < \"%s\";print $0}{print}' > %s" % (region_bed, amr_mutations_file, amr_mutation_regions_file, amr_mutations_file, amr_mutations_file, region_mutations_output_file)
137 run_command(cmd)
138 try:
139 region_mutations = pandas.read_csv(region_mutations_output_file, sep='\t', header=0, index_col=False)
140 except Exception:
141 region_mutations = pandas.DataFrame()
142 if region_mutations.shape[0] == 0:
143 continue
144 # Figure out what kind of mutations are in this region.
145 region_mutation_types = pandas.Series(['snp'] * region_mutations.shape[0], name='TYPE', index=region_mutations.index)
146 region_mutation_types[region_mutations['REF'].str.len() != region_mutations['ALT'].str.len()] = 'small-indel'
147 region_mutation_drugs = pandas.Series(region['drug'] * region_mutations.shape[0], name='DRUG', index=region_mutations.index)
148 region_notes = pandas.Series(region['note'] * region_mutations.shape[0], name='NOTE', index=region_mutations.index)
149 region_mutations = pandas.concat([region_mutations, region_mutation_types, region_mutation_drugs, region_notes], axis=1)
150 region_mutations = region_mutations[['#CHROM', 'POS', 'TYPE', 'REF', 'ALT', 'DRUG', 'NOTE']]
151 amr_mutations[region['name']] = region_mutations
152 else:
153 ofh.write("\nMutation region BED not received.\n")
67 # Roll up potentially resistance conferring mutations. 154 # Roll up potentially resistance conferring mutations.
68 for mutation_region, mutation_hits in amr_mutations.iteritems(): 155 for mutation_region, mutation_hits in amr_mutations.iteritems():
69 for mutation_idx, mutation_hit in mutation_hits.iterrows(): 156 for mutation_idx, mutation_hit in mutation_hits.iterrows():
70 mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT'] 157 mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT']
71 amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) 158 amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
72 159
73 if amr_deletions_file is not None: 160 if amr_deletions_file not in [None, 'None'] and os.path.getsize(amr_deletions_file) > 0:
74 # TODO: So far, no samples have produced deletions, but we do have all the pices in place 161 # TODO: So far, no samples have produced deletions, but we do have all the pices in place
75 # within the workflow to receive the amr_deletions_file here, although it is currently 162 # within the workflow to receive the amr_deletions_file here, although it is currently
76 # always empty... 163 # always empty...
77 # Roll up deletions that might confer resistance. 164 # Roll up deletions that might confer resistance.
78 try: 165 try:
113 parser = argparse.ArgumentParser() 200 parser = argparse.ArgumentParser()
114 201
115 parser.add_argument('--amr_feature_hits_dir', action='store', dest='amr_feature_hits_dir', help='Directory of tabular files containing feature hits') 202 parser.add_argument('--amr_feature_hits_dir', action='store', dest='amr_feature_hits_dir', help='Directory of tabular files containing feature hits')
116 parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', default=None, help='AMR deletions BED file') 203 parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', default=None, help='AMR deletions BED file')
117 parser.add_argument('--amr_mutations_file', action='store', dest='amr_mutations_file', default=None, help='AMR mutations TSV file') 204 parser.add_argument('--amr_mutations_file', action='store', dest='amr_mutations_file', default=None, help='AMR mutations TSV file')
205 parser.add_argument('--amr_mutation_regions_file', action='store', dest='amr_mutation_regions_file', default=None, help='AMR mutation regions BED file')
118 parser.add_argument('--amr_gene_drug_file', action='store', dest='amr_gene_drug_file', help='AMR_gene_drugs tsv file') 206 parser.add_argument('--amr_gene_drug_file', action='store', dest='amr_gene_drug_file', help='AMR_gene_drugs tsv file')
207 parser.add_argument('--reference_genome', action='store', dest='reference_genome', help='Reference genome fasta file')
208 parser.add_argument('--region_mutations_output_file', action='store', dest='region_mutations_output_file', default=None, help='Region mutations TSV output file')
209 parser.add_argument('--mutations_dir', action='store', dest='mutations_dir', help='Mutations directory')
119 parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory') 210 parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory')
120 211
121 args = parser.parse_args() 212 args = parser.parse_args()
122 213
123 # Get thge collection of feature hits files. The collection 214 # Get the collection of feature hits files. The collection
124 # will be sorted alphabetically and will contain 2 files 215 # will be sorted alphabetically and will contain 2 files
125 # named something like AMR_CDS_311_2022_12_20.fasta and 216 # named something like AMR_CDS_311_2022_12_20.fasta and
126 # Incompatibility_Groups_2023_01_01.fasta. 217 # Incompatibility_Groups_2023_01_01.fasta.
127 amr_feature_hits_files = [] 218 amr_feature_hits_files = []
128 for file_name in sorted(os.listdir(args.amr_feature_hits_dir)): 219 for file_name in sorted(os.listdir(args.amr_feature_hits_dir)):
129 file_path = os.path.abspath(os.path.join(args.amr_feature_hits_dir, file_name)) 220 file_path = os.path.abspath(os.path.join(args.amr_feature_hits_dir, file_name))
130 amr_feature_hits_files.append(file_path) 221 amr_feature_hits_files.append(file_path)
131 222
132 draw_amr_matrix(amr_feature_hits_files, args.amr_deletions_file, args.amr_mutations_file, args.amr_gene_drug_file, args.output_dir) 223 # Load the reference genome into memory.
224 reference = load_fasta(args.reference_genome)
225 reference_size = 0
226 for i in reference:
227 reference_size += len(i.seq)
228
229 draw_amr_matrix(amr_feature_hits_files, args.amr_deletions_file, args.amr_mutations_file, args.amr_mutation_regions_file, args.amr_gene_drug_file, reference, reference_size, args.region_mutations_output_file, args.mutations_dir, args.output_dir)