comparison draw_amr_matrix.py @ 17:eb86b4bf140e draft default tip

Uploaded
author greg
date Wed, 03 May 2023 15:13:12 +0000
parents 7bd48449ee79
children
comparison
equal deleted inserted replaced
16:7bd48449ee79 17:eb86b4bf140e
9 9
10 import Bio.SeqIO 10 import Bio.SeqIO
11 import numpy 11 import numpy
12 import pandas 12 import pandas
13 import matplotlib.pyplot as pyplot 13 import matplotlib.pyplot as pyplot
14
15
16 def get_amr_in_feature_hits(amr_feature_hits):
17 for k in amr_feature_hits.keys():
18 if k.lower().find('amr') >= 0:
19 return amr_feature_hits[k]
20 return None
21 14
22 15
23 def load_fasta(fasta_file): 16 def load_fasta(fasta_file):
24 sequence = pandas.Series(dtype=object) 17 sequence = pandas.Series(dtype=object)
25 for contig in Bio.SeqIO.parse(fasta_file, 'fasta'): 18 for contig in Bio.SeqIO.parse(fasta_file, 'fasta'):
75 else: 68 else:
76 ofh.write("\nEmpty feature file %s will NOT be processed\n" % os.path.basename(amr_feature_hits_file)) 69 ofh.write("\nEmpty feature file %s will NOT be processed\n" % os.path.basename(amr_feature_hits_file))
77 best_hits = None 70 best_hits = None
78 amr_feature_hits[feature_name] = best_hits 71 amr_feature_hits[feature_name] = best_hits
79 72
80 amr_hits = get_amr_in_feature_hits(amr_feature_hits) 73 # Process populated feature hits.
81 ofh.write("\namr_hits:\n%s\n" % str(amr_hits)) 74 for k in amr_feature_hits.keys():
82 if amr_hits is not None: 75 if amr_feature_hits[k] is not None:
83 amr_to_draw = pandas.DataFrame(columns=['gene', 'drug']) 76 amr_hits = amr_feature_hits[k]
84 ofh.write("\namr_to_draw:\n%s\n" % str(amr_to_draw)) 77 ofh.write("\namr_hits:\n%s\n" % str(amr_hits))
85 # Read amr_drug_gene_file. 78 amr_to_draw = pandas.DataFrame(columns=['gene', 'drug'])
86 amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None) 79 ofh.write("\namr_to_draw:\n%s\n" % str(amr_to_draw))
87 ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug)) 80 # Read amr_drug_gene_file.
88 81 amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None)
89 # Roll up AMR gene hits. 82 ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug))
90 ofh.write("\namr_hits.shape[0]: %s\n" % str(amr_hits.shape[0])) 83
91 if amr_hits.shape[0] > 0: 84 # Roll up AMR gene hits.
92 for gene_idx, gene in amr_hits.iterrows(): 85 ofh.write("\namr_hits.shape[0]: %s\n" % str(amr_hits.shape[0]))
93 ofh.write("gene_idx: %s\n" % str(gene_idx)) 86 if amr_hits.shape[0] > 0:
94 ofh.write("gene: %s\n" % str(gene)) 87 for gene_idx, gene in amr_hits.iterrows():
95 gene_name = gene[3] 88 ofh.write("gene_idx: %s\n" % str(gene_idx))
96 ofh.write("gene_name: %s\n" % str(gene_name)) 89 ofh.write("gene: %s\n" % str(gene))
97 ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0])) 90 gene_name = gene[3]
98 drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1] 91 ofh.write("gene_name: %s\n" % str(gene_name))
99 ofh.write("drugs: %s\n" % str(drugs)) 92 ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0]))
100 for drug in drugs: 93 drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1]
101 amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) 94 ofh.write("drugs: %s\n" % str(drugs))
102 ofh.write("\amr_to_draw: %s\n" % str(amr_to_draw)) 95 for drug in drugs:
103 96 amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
104 ofh.write("\nvarscan_vcf_file is None: %s\n" % str(varscan_vcf_file == 'None')) 97 ofh.write("\amr_to_draw: %s\n" % str(amr_to_draw))
105 if varscan_vcf_file not in [None, 'None'] and os.path.getsize(varscan_vcf_file) > 0: 98
106 amr_mutations = pandas.Series(dtype=object) 99 ofh.write("\nvarscan_vcf_file is None: %s\n" % str(varscan_vcf_file == 'None'))
107 if amr_mutation_regions_bed_file is not None: 100 if varscan_vcf_file not in [None, 'None'] and os.path.getsize(varscan_vcf_file) > 0:
108 mutation_regions = pandas.read_csv(amr_mutation_regions_bed_file, header=0, sep='\t', index_col=False) 101 amr_mutations = pandas.Series(dtype=object)
109 # Validate mutation regions. 102 if amr_mutation_regions_bed_file is not None:
110 if mutation_regions.shape[1] != 7: 103 mutation_regions = pandas.read_csv(amr_mutation_regions_bed_file, header=0, sep='\t', index_col=False)
111 efh.write("The selected mutations regions BED file is invalid, it should be a six column file.\n") 104 # Validate mutation regions.
112 elif mutation_regions.shape[0] == 0: 105 if mutation_regions.shape[1] != 7:
113 efh.write("There are no rows in the selected mutation regions file.\n") 106 efh.write("The selected mutations regions BED file is invalid, it should be a six column file.\n")
107 elif mutation_regions.shape[0] == 0:
108 efh.write("There are no rows in the selected mutation regions file.\n")
109 else:
110 for region_i in range(mutation_regions.shape[0]):
111 region = mutation_regions.iloc[region_i, :]
112 if region[0] not in reference:
113 efh.write("Mutation region '%s' not found in reference genome.\n" % str(region))
114 break
115 if not isinstance(region[1], numpy.int64):
116 efh.write("Non-integer found in mutation region start (column 2): %s.\n" % str(region[1]))
117 break
118 if not isinstance(region[2], numpy.int64):
119 efh.write("Non-integer found in mutation region start (column 3): %s.\n" % str(region[2]))
120 break
121 if region[1] <= 0 or region[2] <= 0:
122 efh.write("Mutation region '%s' starts before the reference sequence.\n" % str(region))
123 if region[1] > len(reference[region[0]].seq) or region[2] > len(reference[region[0]].seq):
124 efh.write("Mutation region '%s' ends after the reference sequence.\n" % str(region))
125 if not region.get('type', default='No Type') in ['snp', 'small-indel', 'any']:
126 ofh.write("\n\nSkipping mutation region '%s' with invalid type '%s', valid types are 'snp', 'small-indel', 'any'.\n\n" % (str(region), str(region.get('type', default='No Type'))))
127 continue
128 ofh.write("\nFinding AMR mutations for %s.\n" % str(region['name']))
129 region_bed = 'region_%s.bed' % region_i
130 ofh.write("region_bed: %s\n" % str(region_bed))
131 mutation_regions.loc[[region_i], ].to_csv(path_or_buf=region_bed, sep='\t', header=False, index=False)
132 ofh.write("mutation_regions.loc[[region_i], ]:\n%s\n" % str(mutation_regions.loc[[region_i], ]))
133 region_mutations_tsv = os.path.join(mutation_regions_dir, 'region_%s_mutations.tsv' % region_i)
134 ofh.write("region_mutations_tsv: %s\n" % str(region_mutations_tsv))
135 cmd = ' '.join(['bedtools intersect',
136 '-nonamecheck',
137 '-wb',
138 '-a', region_bed,
139 '-b', varscan_vcf_file,
140 ' | awk \'BEGIN{getline < "' + amr_mutation_regions_bed_file + '";printf $0"\\t";',
141 'getline < "' + varscan_vcf_file + '"; getline < "' + varscan_vcf_file + '";print $0}{print}\'',
142 '1>' + region_mutations_tsv])
143 ofh.write("\ncmd:\n%s\n" % cmd)
144 run_command(cmd)
145 try:
146 ofh.write("After running command, os.path.getsize((region_mutations_tsv): %s\n" % str(os.path.getsize(region_mutations_tsv)))
147 region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False)
148 ofh.write("\nregion_mutations: %s\n" % region_mutations)
149 except Exception:
150 continue
151 # Figure out what kind of mutations are in this region.
152 region_mutation_types = pandas.Series(['snp'] * region_mutations.shape[0], name='TYPE', index=region_mutations.index)
153 region_mutation_types[region_mutations['REF'].str.len() != region_mutations['ALT'].str.len()] = 'small-indel'
154 region_mutation_drugs = pandas.Series(region['drug'] * region_mutations.shape[0], name='DRUG', index=region_mutations.index)
155 region_notes = pandas.Series(region['note'] * region_mutations.shape[0], name='NOTE', index=region_mutations.index)
156 region_mutations = pandas.concat([region_mutations, region_mutation_types, region_mutation_drugs, region_notes], axis=1)
157 region_mutations = region_mutations[['#CHROM', 'POS', 'TYPE', 'REF', 'ALT', 'DRUG', 'NOTE']]
158 amr_mutations[region['name']] = region_mutations
114 else: 159 else:
115 for region_i in range(mutation_regions.shape[0]): 160 ofh.write("\nMutation region BED file not received.\n")
116 region = mutation_regions.iloc[region_i, :] 161 ofh.write("\nAfter processing mutations, amr_mutations: %s\n" % str(amr_mutations))
117 if region[0] not in reference: 162 # Roll up potentially resistance conferring mutations.
118 efh.write("Mutation region '%s' not found in reference genome.\n" % str(region)) 163 ofh.write("\n##### Rolling up potentially resistance conferring mutations..\n")
119 break 164 for mutation_region, mutation_hits in amr_mutations.iteritems():
120 if not isinstance(region[1], numpy.int64): 165 ofh.write("mutation_region: %s\n" % str(mutation_region))
121 efh.write("Non-integer found in mutation region start (column 2): %s.\n" % str(region[1])) 166 ofh.write("mutation_hits: %s\n" % str(mutation_hits))
122 break 167 for mutation_idx, mutation_hit in mutation_hits.iterrows():
123 if not isinstance(region[2], numpy.int64): 168 ofh.write("mutation_idx: %s\n" % str(mutation_idx))
124 efh.write("Non-integer found in mutation region start (column 3): %s.\n" % str(region[2])) 169 ofh.write("mutation_hit: %s\n" % str(mutation_hit))
125 break 170 mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT']
126 if region[1] <= 0 or region[2] <= 0: 171 ofh.write("mutation_name: %s\n" % str(mutation_name))
127 efh.write("Mutation region '%s' starts before the reference sequence.\n" % str(region)) 172 amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
128 if region[1] > len(reference[region[0]].seq) or region[2] > len(reference[region[0]].seq): 173 ofh.write("\nAfter processing mutations, amr_to_draw: %s\n" % str(amr_to_draw))
129 efh.write("Mutation region '%s' ends after the reference sequence.\n" % str(region)) 174 ofh.write("\nAfter processing mutations, amr_to_draw.shape[0]: %s\n" % str(amr_to_draw.shape[0]))
130 if not region.get('type', default='No Type') in ['snp', 'small-indel', 'any']: 175
131 ofh.write("\n\nSkipping mutation region '%s' with invalid type '%s', valid types are 'snp', 'small-indel', 'any'.\n\n" % (str(region), str(region.get('type', default='No Type')))) 176 if amr_deletions_file not in [None, 'None'] and os.path.getsize(amr_deletions_file) > 0:
132 continue 177 # Roll up deletions that might confer resistance.
133 ofh.write("\nFinding AMR mutations for %s.\n" % str(region['name'])) 178 try:
134 region_bed = 'region_%s.bed' % region_i 179 amr_deletions = pandas.read_csv(filepath_or_buffer=amr_deletions_file, sep='\t', header=None)
135 ofh.write("region_bed: %s\n" % str(region_bed)) 180 except Exception:
136 mutation_regions.loc[[region_i], ].to_csv(path_or_buf=region_bed, sep='\t', header=False, index=False) 181 amr_deletions = pandas.DataFrame()
137 ofh.write("mutation_regions.loc[[region_i], ]:\n%s\n" % str(mutation_regions.loc[[region_i], ])) 182 if amr_deletions.shape[0] > 0:
138 region_mutations_tsv = os.path.join(mutation_regions_dir, 'region_%s_mutations.tsv' % region_i) 183 amr_deletions.columns = ['contig', 'start', 'stop', 'name', 'type', 'drug', 'note']
139 ofh.write("region_mutations_tsv: %s\n" % str(region_mutations_tsv)) 184 amr_deletions = amr_deletions.loc[amr_deletions['type'].isin(['large-deletion', 'any']), :]
140 cmd = ' '.join(['bedtools intersect', 185 for deletion_idx, deleted_gene in amr_deletions.iterrows():
141 '-nonamecheck', 186 amr_to_draw = amr_to_draw.append(pandas.Series(['\u0394' + deleted_gene[3], deleted_gene[5]], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
142 '-wb', 187 ofh.write("\nAfter processing deletions, amr_to_draw: %s\n" % str(amr_to_draw))
143 '-a', region_bed, 188
144 '-b', varscan_vcf_file, 189 ofh.write("\namr_to_draw.shape[0]: %s\n" % str(amr_to_draw.shape[0]))
145 ' | awk \'BEGIN{getline < "' + amr_mutation_regions_bed_file + '";printf $0"\\t";', 190 # I have no idea why, but when running functional tests with planemo
146 'getline < "' + varscan_vcf_file + '"; getline < "' + varscan_vcf_file + '";print $0}{print}\'', 191 # the value of amr_to_draw.shape[0] is 1 even though the tests use the
147 '1>' + region_mutations_tsv]) 192 # exact inputs when running outside of planeo that result in the value
148 ofh.write("\ncmd:\n%s\n" % cmd) 193 # being 2. So we cannot test with planemo unless we incorporate a hack
149 run_command(cmd) 194 # like a hidden in_test_mode parameter.
150 try: 195 if amr_to_draw.shape[0] > 1:
151 ofh.write("After running command, os.path.getsize((region_mutations_tsv): %s\n" % str(os.path.getsize(region_mutations_tsv))) 196 ofh.write("\nDrawing AMR matrix...\n")
152 region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False) 197 present_genes = amr_to_draw['gene'].unique()
153 ofh.write("\nregion_mutations: %s\n" % region_mutations) 198 present_drugs = amr_to_draw['drug'].unique()
154 except Exception: 199 amr_matrix = pandas.DataFrame(0, index=present_genes, columns=present_drugs)
155 continue 200 for hit_idx, hit in amr_to_draw.iterrows():
156 # Figure out what kind of mutations are in this region. 201 amr_matrix.loc[hit[0], hit[1]] = 1
157 region_mutation_types = pandas.Series(['snp'] * region_mutations.shape[0], name='TYPE', index=region_mutations.index) 202 amr_matrix_png = os.path.join(amr_matrix_png_dir, 'amr_matrix.png')
158 region_mutation_types[region_mutations['REF'].str.len() != region_mutations['ALT'].str.len()] = 'small-indel' 203 int_matrix = amr_matrix[amr_matrix.columns].astype(int)
159 region_mutation_drugs = pandas.Series(region['drug'] * region_mutations.shape[0], name='DRUG', index=region_mutations.index) 204 figure, axis = pyplot.subplots()
160 region_notes = pandas.Series(region['note'] * region_mutations.shape[0], name='NOTE', index=region_mutations.index) 205 heatmap = axis.pcolor(int_matrix, cmap=pyplot.cm.Blues, linewidth=0)
161 region_mutations = pandas.concat([region_mutations, region_mutation_types, region_mutation_drugs, region_notes], axis=1) 206 axis.invert_yaxis()
162 region_mutations = region_mutations[['#CHROM', 'POS', 'TYPE', 'REF', 'ALT', 'DRUG', 'NOTE']] 207 axis.set_yticks(numpy.arange(0.5, len(amr_matrix.index)), minor=False)
163 amr_mutations[region['name']] = region_mutations 208 axis.set_yticklabels(int_matrix.index.values)
209 axis.set_xticks(numpy.arange(0.5, len(amr_matrix.columns)), minor=False)
210 axis.set_xticklabels(amr_matrix.columns.values, rotation=90)
211 axis.xaxis.tick_top()
212 axis.xaxis.set_label_position('top')
213 pyplot.tight_layout()
214 pyplot.savefig(amr_matrix_png, dpi=300)
164 else: 215 else:
165 ofh.write("\nMutation region BED file not received.\n") 216 ofh.write("\nEmpty AMR matrix, nothing to draw...\n")
166 ofh.write("\nAfter processing mutations, amr_mutations: %s\n" % str(amr_mutations))
167 # Roll up potentially resistance conferring mutations.
168 ofh.write("\n##### Rolling up potentially resistance conferring mutations..\n")
169 for mutation_region, mutation_hits in amr_mutations.iteritems():
170 ofh.write("mutation_region: %s\n" % str(mutation_region))
171 ofh.write("mutation_hits: %s\n" % str(mutation_hits))
172 for mutation_idx, mutation_hit in mutation_hits.iterrows():
173 ofh.write("mutation_idx: %s\n" % str(mutation_idx))
174 ofh.write("mutation_hit: %s\n" % str(mutation_hit))
175 mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT']
176 ofh.write("mutation_name: %s\n" % str(mutation_name))
177 amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
178 ofh.write("\nAfter processing mutations, amr_to_draw: %s\n" % str(amr_to_draw))
179 ofh.write("\nAfter processing mutations, amr_to_draw.shape[0]: %s\n" % str(amr_to_draw.shape[0]))
180
181 if amr_deletions_file not in [None, 'None'] and os.path.getsize(amr_deletions_file) > 0:
182 # Roll up deletions that might confer resistance.
183 try:
184 amr_deletions = pandas.read_csv(filepath_or_buffer=amr_deletions_file, sep='\t', header=None)
185 except Exception:
186 amr_deletions = pandas.DataFrame()
187 if amr_deletions.shape[0] > 0:
188 amr_deletions.columns = ['contig', 'start', 'stop', 'name', 'type', 'drug', 'note']
189 amr_deletions = amr_deletions.loc[amr_deletions['type'].isin(['large-deletion', 'any']), :]
190 for deletion_idx, deleted_gene in amr_deletions.iterrows():
191 amr_to_draw = amr_to_draw.append(pandas.Series(['\u0394' + deleted_gene[3], deleted_gene[5]], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
192 ofh.write("\nAfter processing deletions, amr_to_draw: %s\n" % str(amr_to_draw))
193
194 ofh.write("\namr_to_draw.shape[0]: %s\n" % str(amr_to_draw.shape[0]))
195 # I have no idea why, but when running functional tests with planemo
196 # the value of amr_to_draw.shape[0] is 1 even though the tests use the
197 # exact inputs when running outside of planeo that result in the value
198 # being 2. So we cannot test with planemo unless we incorporate a hack
199 # like a hidden in_test_mode parameter.
200 if amr_to_draw.shape[0] > 1:
201 ofh.write("\nDrawing AMR matrix...\n")
202 present_genes = amr_to_draw['gene'].unique()
203 present_drugs = amr_to_draw['drug'].unique()
204 amr_matrix = pandas.DataFrame(0, index=present_genes, columns=present_drugs)
205 for hit_idx, hit in amr_to_draw.iterrows():
206 amr_matrix.loc[hit[0], hit[1]] = 1
207 amr_matrix_png = os.path.join(amr_matrix_png_dir, 'amr_matrix.png')
208 int_matrix = amr_matrix[amr_matrix.columns].astype(int)
209 figure, axis = pyplot.subplots()
210 heatmap = axis.pcolor(int_matrix, cmap=pyplot.cm.Blues, linewidth=0)
211 axis.invert_yaxis()
212 axis.set_yticks(numpy.arange(0.5, len(amr_matrix.index)), minor=False)
213 axis.set_yticklabels(int_matrix.index.values)
214 axis.set_xticks(numpy.arange(0.5, len(amr_matrix.columns)), minor=False)
215 axis.set_xticklabels(amr_matrix.columns.values, rotation=90)
216 axis.xaxis.tick_top()
217 axis.xaxis.set_label_position('top')
218 pyplot.tight_layout()
219 pyplot.savefig(amr_matrix_png, dpi=300)
220 else:
221 ofh.write("\nEmpty AMR matrix, nothing to draw...\n")
222 efh.close() 217 efh.close()
223 ofh.close() 218 ofh.close()
224 219
225 220
226 if __name__ == '__main__': 221 if __name__ == '__main__':