diff draw_amr_matrix.py @ 17:eb86b4bf140e draft default tip

Uploaded
author greg
date Wed, 03 May 2023 15:13:12 +0000
parents 7bd48449ee79
children
line wrap: on
line diff
--- a/draw_amr_matrix.py	Mon May 01 18:52:58 2023 +0000
+++ b/draw_amr_matrix.py	Wed May 03 15:13:12 2023 +0000
@@ -13,13 +13,6 @@
 import matplotlib.pyplot as pyplot
 
 
-def get_amr_in_feature_hits(amr_feature_hits):
-    for k in amr_feature_hits.keys():
-        if k.lower().find('amr') >= 0:
-            return amr_feature_hits[k]
-    return None
-
-
 def load_fasta(fasta_file):
     sequence = pandas.Series(dtype=object)
     for contig in Bio.SeqIO.parse(fasta_file, 'fasta'):
@@ -77,148 +70,150 @@
             best_hits = None
         amr_feature_hits[feature_name] = best_hits
 
-    amr_hits = get_amr_in_feature_hits(amr_feature_hits)
-    ofh.write("\namr_hits:\n%s\n" % str(amr_hits))
-    if amr_hits is not None:
-        amr_to_draw = pandas.DataFrame(columns=['gene', 'drug'])
-        ofh.write("\namr_to_draw:\n%s\n" % str(amr_to_draw))
-        # Read amr_drug_gene_file.
-        amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None)
-        ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug))
+    # Process populated feature hits.
+    for k in amr_feature_hits.keys():
+        if amr_feature_hits[k] is not None:
+            amr_hits = amr_feature_hits[k]
+            ofh.write("\namr_hits:\n%s\n" % str(amr_hits))
+            amr_to_draw = pandas.DataFrame(columns=['gene', 'drug'])
+            ofh.write("\namr_to_draw:\n%s\n" % str(amr_to_draw))
+            # Read amr_drug_gene_file.
+            amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None)
+            ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug))
+
+            # Roll up AMR gene hits.
+            ofh.write("\namr_hits.shape[0]: %s\n" % str(amr_hits.shape[0]))
+            if amr_hits.shape[0] > 0:
+                for gene_idx, gene in amr_hits.iterrows():
+                    ofh.write("gene_idx: %s\n" % str(gene_idx))
+                    ofh.write("gene: %s\n" % str(gene))
+                    gene_name = gene[3]
+                    ofh.write("gene_name: %s\n" % str(gene_name))
+                    ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0]))
+                    drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1]
+                    ofh.write("drugs: %s\n" % str(drugs))
+                    for drug in drugs:
+                        amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
+                        ofh.write("\amr_to_draw: %s\n" % str(amr_to_draw))
 
-        # Roll up AMR gene hits.
-        ofh.write("\namr_hits.shape[0]: %s\n" % str(amr_hits.shape[0]))
-        if amr_hits.shape[0] > 0:
-            for gene_idx, gene in amr_hits.iterrows():
-                ofh.write("gene_idx: %s\n" % str(gene_idx))
-                ofh.write("gene: %s\n" % str(gene))
-                gene_name = gene[3]
-                ofh.write("gene_name: %s\n" % str(gene_name))
-                ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0]))
-                drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1]
-                ofh.write("drugs: %s\n" % str(drugs))
-                for drug in drugs:
-                    amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
-                    ofh.write("\amr_to_draw: %s\n" % str(amr_to_draw))
-
-        ofh.write("\nvarscan_vcf_file is None: %s\n" % str(varscan_vcf_file == 'None'))
-        if varscan_vcf_file not in [None, 'None'] and os.path.getsize(varscan_vcf_file) > 0:
-            amr_mutations = pandas.Series(dtype=object)
-            if amr_mutation_regions_bed_file is not None:
-                mutation_regions = pandas.read_csv(amr_mutation_regions_bed_file, header=0, sep='\t', index_col=False)
-                # Validate mutation regions.
-                if mutation_regions.shape[1] != 7:
-                    efh.write("The selected mutations regions BED file is invalid, it should be a six column file.\n")
-                elif mutation_regions.shape[0] == 0:
-                    efh.write("There are no rows in the selected mutation regions file.\n")
+            ofh.write("\nvarscan_vcf_file is None: %s\n" % str(varscan_vcf_file == 'None'))
+            if varscan_vcf_file not in [None, 'None'] and os.path.getsize(varscan_vcf_file) > 0:
+                amr_mutations = pandas.Series(dtype=object)
+                if amr_mutation_regions_bed_file is not None:
+                    mutation_regions = pandas.read_csv(amr_mutation_regions_bed_file, header=0, sep='\t', index_col=False)
+                    # Validate mutation regions.
+                    if mutation_regions.shape[1] != 7:
+                        efh.write("The selected mutations regions BED file is invalid, it should be a six column file.\n")
+                    elif mutation_regions.shape[0] == 0:
+                        efh.write("There are no rows in the selected mutation regions file.\n")
+                    else:
+                        for region_i in range(mutation_regions.shape[0]):
+                            region = mutation_regions.iloc[region_i, :]
+                            if region[0] not in reference:
+                                efh.write("Mutation region '%s' not found in reference genome.\n" % str(region))
+                                break
+                            if not isinstance(region[1], numpy.int64):
+                                efh.write("Non-integer found in mutation region start (column 2): %s.\n" % str(region[1]))
+                                break
+                            if not isinstance(region[2], numpy.int64):
+                                efh.write("Non-integer found in mutation region start (column 3): %s.\n" % str(region[2]))
+                                break
+                            if region[1] <= 0 or region[2] <= 0:
+                                efh.write("Mutation region '%s' starts before the reference sequence.\n" % str(region))
+                            if region[1] > len(reference[region[0]].seq) or region[2] > len(reference[region[0]].seq):
+                                efh.write("Mutation region '%s' ends after the reference sequence.\n" % str(region))
+                            if not region.get('type', default='No Type') in ['snp', 'small-indel', 'any']:
+                                ofh.write("\n\nSkipping mutation region '%s' with invalid type '%s', valid types are 'snp', 'small-indel', 'any'.\n\n" % (str(region), str(region.get('type', default='No Type'))))
+                                continue
+                            ofh.write("\nFinding AMR mutations for %s.\n" % str(region['name']))
+                            region_bed = 'region_%s.bed' % region_i
+                            ofh.write("region_bed: %s\n" % str(region_bed))
+                            mutation_regions.loc[[region_i], ].to_csv(path_or_buf=region_bed, sep='\t', header=False, index=False)
+                            ofh.write("mutation_regions.loc[[region_i], ]:\n%s\n" % str(mutation_regions.loc[[region_i], ]))
+                            region_mutations_tsv = os.path.join(mutation_regions_dir, 'region_%s_mutations.tsv' % region_i)
+                            ofh.write("region_mutations_tsv: %s\n" % str(region_mutations_tsv))
+                            cmd = ' '.join(['bedtools intersect',
+                                            '-nonamecheck',
+                                            '-wb',
+                                            '-a', region_bed,
+                                            '-b', varscan_vcf_file,
+                                            ' | awk \'BEGIN{getline < "' + amr_mutation_regions_bed_file + '";printf $0"\\t";',
+                                            'getline < "' + varscan_vcf_file + '"; getline < "' + varscan_vcf_file + '";print $0}{print}\'',
+                                            '1>' + region_mutations_tsv])
+                            ofh.write("\ncmd:\n%s\n" % cmd)
+                            run_command(cmd)
+                            try:
+                                ofh.write("After running command, os.path.getsize((region_mutations_tsv): %s\n" % str(os.path.getsize(region_mutations_tsv)))
+                                region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False)
+                                ofh.write("\nregion_mutations: %s\n" % region_mutations)
+                            except Exception:
+                                continue
+                            # Figure out what kind of mutations are in this region.
+                            region_mutation_types = pandas.Series(['snp'] * region_mutations.shape[0], name='TYPE', index=region_mutations.index)
+                            region_mutation_types[region_mutations['REF'].str.len() != region_mutations['ALT'].str.len()] = 'small-indel'
+                            region_mutation_drugs = pandas.Series(region['drug'] * region_mutations.shape[0], name='DRUG', index=region_mutations.index)
+                            region_notes = pandas.Series(region['note'] * region_mutations.shape[0], name='NOTE', index=region_mutations.index)
+                            region_mutations = pandas.concat([region_mutations, region_mutation_types, region_mutation_drugs, region_notes], axis=1)
+                            region_mutations = region_mutations[['#CHROM', 'POS', 'TYPE', 'REF', 'ALT', 'DRUG', 'NOTE']]
+                            amr_mutations[region['name']] = region_mutations
                 else:
-                    for region_i in range(mutation_regions.shape[0]):
-                        region = mutation_regions.iloc[region_i, :]
-                        if region[0] not in reference:
-                            efh.write("Mutation region '%s' not found in reference genome.\n" % str(region))
-                            break
-                        if not isinstance(region[1], numpy.int64):
-                            efh.write("Non-integer found in mutation region start (column 2): %s.\n" % str(region[1]))
-                            break
-                        if not isinstance(region[2], numpy.int64):
-                            efh.write("Non-integer found in mutation region start (column 3): %s.\n" % str(region[2]))
-                            break
-                        if region[1] <= 0 or region[2] <= 0:
-                            efh.write("Mutation region '%s' starts before the reference sequence.\n" % str(region))
-                        if region[1] > len(reference[region[0]].seq) or region[2] > len(reference[region[0]].seq):
-                            efh.write("Mutation region '%s' ends after the reference sequence.\n" % str(region))
-                        if not region.get('type', default='No Type') in ['snp', 'small-indel', 'any']:
-                            ofh.write("\n\nSkipping mutation region '%s' with invalid type '%s', valid types are 'snp', 'small-indel', 'any'.\n\n" % (str(region), str(region.get('type', default='No Type'))))
-                            continue
-                        ofh.write("\nFinding AMR mutations for %s.\n" % str(region['name']))
-                        region_bed = 'region_%s.bed' % region_i
-                        ofh.write("region_bed: %s\n" % str(region_bed))
-                        mutation_regions.loc[[region_i], ].to_csv(path_or_buf=region_bed, sep='\t', header=False, index=False)
-                        ofh.write("mutation_regions.loc[[region_i], ]:\n%s\n" % str(mutation_regions.loc[[region_i], ]))
-                        region_mutations_tsv = os.path.join(mutation_regions_dir, 'region_%s_mutations.tsv' % region_i)
-                        ofh.write("region_mutations_tsv: %s\n" % str(region_mutations_tsv))
-                        cmd = ' '.join(['bedtools intersect',
-                                        '-nonamecheck',
-                                        '-wb',
-                                        '-a', region_bed,
-                                        '-b', varscan_vcf_file,
-                                        ' | awk \'BEGIN{getline < "' + amr_mutation_regions_bed_file + '";printf $0"\\t";',
-                                        'getline < "' + varscan_vcf_file + '"; getline < "' + varscan_vcf_file + '";print $0}{print}\'',
-                                        '1>' + region_mutations_tsv])
-                        ofh.write("\ncmd:\n%s\n" % cmd)
-                        run_command(cmd)
-                        try:
-                            ofh.write("After running command, os.path.getsize((region_mutations_tsv): %s\n" % str(os.path.getsize(region_mutations_tsv)))
-                            region_mutations = pandas.read_csv(region_mutations_tsv, sep='\t', header=0, index_col=False)
-                            ofh.write("\nregion_mutations: %s\n" % region_mutations)
-                        except Exception:
-                            continue
-                        # Figure out what kind of mutations are in this region.
-                        region_mutation_types = pandas.Series(['snp'] * region_mutations.shape[0], name='TYPE', index=region_mutations.index)
-                        region_mutation_types[region_mutations['REF'].str.len() != region_mutations['ALT'].str.len()] = 'small-indel'
-                        region_mutation_drugs = pandas.Series(region['drug'] * region_mutations.shape[0], name='DRUG', index=region_mutations.index)
-                        region_notes = pandas.Series(region['note'] * region_mutations.shape[0], name='NOTE', index=region_mutations.index)
-                        region_mutations = pandas.concat([region_mutations, region_mutation_types, region_mutation_drugs, region_notes], axis=1)
-                        region_mutations = region_mutations[['#CHROM', 'POS', 'TYPE', 'REF', 'ALT', 'DRUG', 'NOTE']]
-                        amr_mutations[region['name']] = region_mutations
+                    ofh.write("\nMutation region BED file not received.\n")
+                ofh.write("\nAfter processing mutations, amr_mutations: %s\n" % str(amr_mutations))
+                # Roll up potentially resistance conferring mutations.
+                ofh.write("\n##### Rolling up potentially resistance conferring mutations..\n")
+                for mutation_region, mutation_hits in amr_mutations.iteritems():
+                    ofh.write("mutation_region: %s\n" % str(mutation_region))
+                    ofh.write("mutation_hits: %s\n" % str(mutation_hits))
+                    for mutation_idx, mutation_hit in mutation_hits.iterrows():
+                        ofh.write("mutation_idx: %s\n" % str(mutation_idx))
+                        ofh.write("mutation_hit: %s\n" % str(mutation_hit))
+                        mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT']
+                        ofh.write("mutation_name: %s\n" % str(mutation_name))
+                        amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
+                ofh.write("\nAfter processing mutations, amr_to_draw: %s\n" % str(amr_to_draw))
+                ofh.write("\nAfter processing mutations, amr_to_draw.shape[0]: %s\n" % str(amr_to_draw.shape[0]))
+
+            if amr_deletions_file not in [None, 'None'] and os.path.getsize(amr_deletions_file) > 0:
+                # Roll up deletions that might confer resistance.
+                try:
+                    amr_deletions = pandas.read_csv(filepath_or_buffer=amr_deletions_file, sep='\t', header=None)
+                except Exception:
+                    amr_deletions = pandas.DataFrame()
+                if amr_deletions.shape[0] > 0:
+                    amr_deletions.columns = ['contig', 'start', 'stop', 'name', 'type', 'drug', 'note']
+                    amr_deletions = amr_deletions.loc[amr_deletions['type'].isin(['large-deletion', 'any']), :]
+                    for deletion_idx, deleted_gene in amr_deletions.iterrows():
+                        amr_to_draw = amr_to_draw.append(pandas.Series(['\u0394' + deleted_gene[3], deleted_gene[5]], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
+                ofh.write("\nAfter processing deletions, amr_to_draw: %s\n" % str(amr_to_draw))
+
+            ofh.write("\namr_to_draw.shape[0]: %s\n" % str(amr_to_draw.shape[0]))
+            # I have no idea why, but when running functional tests with planemo
+            # the value of amr_to_draw.shape[0] is 1 even though the tests use the
+            # exact inputs when running outside of planeo that result in the value
+            # being 2.  So we cannot test with planemo unless we incorporate a hack
+            # like a hidden in_test_mode parameter.
+            if amr_to_draw.shape[0] > 1:
+                ofh.write("\nDrawing AMR matrix...\n")
+                present_genes = amr_to_draw['gene'].unique()
+                present_drugs = amr_to_draw['drug'].unique()
+                amr_matrix = pandas.DataFrame(0, index=present_genes, columns=present_drugs)
+                for hit_idx, hit in amr_to_draw.iterrows():
+                    amr_matrix.loc[hit[0], hit[1]] = 1
+                amr_matrix_png = os.path.join(amr_matrix_png_dir, 'amr_matrix.png')
+                int_matrix = amr_matrix[amr_matrix.columns].astype(int)
+                figure, axis = pyplot.subplots()
+                heatmap = axis.pcolor(int_matrix, cmap=pyplot.cm.Blues, linewidth=0)
+                axis.invert_yaxis()
+                axis.set_yticks(numpy.arange(0.5, len(amr_matrix.index)), minor=False)
+                axis.set_yticklabels(int_matrix.index.values)
+                axis.set_xticks(numpy.arange(0.5, len(amr_matrix.columns)), minor=False)
+                axis.set_xticklabels(amr_matrix.columns.values, rotation=90)
+                axis.xaxis.tick_top()
+                axis.xaxis.set_label_position('top')
+                pyplot.tight_layout()
+                pyplot.savefig(amr_matrix_png, dpi=300)
             else:
-                ofh.write("\nMutation region BED file not received.\n")
-            ofh.write("\nAfter processing mutations, amr_mutations: %s\n" % str(amr_mutations))
-            # Roll up potentially resistance conferring mutations.
-            ofh.write("\n##### Rolling up potentially resistance conferring mutations..\n")
-            for mutation_region, mutation_hits in amr_mutations.iteritems():
-                ofh.write("mutation_region: %s\n" % str(mutation_region))
-                ofh.write("mutation_hits: %s\n" % str(mutation_hits))
-                for mutation_idx, mutation_hit in mutation_hits.iterrows():
-                    ofh.write("mutation_idx: %s\n" % str(mutation_idx))
-                    ofh.write("mutation_hit: %s\n" % str(mutation_hit))
-                    mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT']
-                    ofh.write("mutation_name: %s\n" % str(mutation_name))
-                    amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
-            ofh.write("\nAfter processing mutations, amr_to_draw: %s\n" % str(amr_to_draw))
-            ofh.write("\nAfter processing mutations, amr_to_draw.shape[0]: %s\n" % str(amr_to_draw.shape[0]))
-
-        if amr_deletions_file not in [None, 'None'] and os.path.getsize(amr_deletions_file) > 0:
-            # Roll up deletions that might confer resistance.
-            try:
-                amr_deletions = pandas.read_csv(filepath_or_buffer=amr_deletions_file, sep='\t', header=None)
-            except Exception:
-                amr_deletions = pandas.DataFrame()
-            if amr_deletions.shape[0] > 0:
-                amr_deletions.columns = ['contig', 'start', 'stop', 'name', 'type', 'drug', 'note']
-                amr_deletions = amr_deletions.loc[amr_deletions['type'].isin(['large-deletion', 'any']), :]
-                for deletion_idx, deleted_gene in amr_deletions.iterrows():
-                    amr_to_draw = amr_to_draw.append(pandas.Series(['\u0394' + deleted_gene[3], deleted_gene[5]], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
-            ofh.write("\nAfter processing deletions, amr_to_draw: %s\n" % str(amr_to_draw))
-
-        ofh.write("\namr_to_draw.shape[0]: %s\n" % str(amr_to_draw.shape[0]))
-        # I have no idea why, but when running functional tests with planemo
-        # the value of amr_to_draw.shape[0] is 1 even though the tests use the
-        # exact inputs when running outside of planeo that result in the value
-        # being 2.  So we cannot test with planemo unless we incorporate a hack
-        # like a hidden in_test_mode parameter.
-        if amr_to_draw.shape[0] > 1:
-            ofh.write("\nDrawing AMR matrix...\n")
-            present_genes = amr_to_draw['gene'].unique()
-            present_drugs = amr_to_draw['drug'].unique()
-            amr_matrix = pandas.DataFrame(0, index=present_genes, columns=present_drugs)
-            for hit_idx, hit in amr_to_draw.iterrows():
-                amr_matrix.loc[hit[0], hit[1]] = 1
-            amr_matrix_png = os.path.join(amr_matrix_png_dir, 'amr_matrix.png')
-            int_matrix = amr_matrix[amr_matrix.columns].astype(int)
-            figure, axis = pyplot.subplots()
-            heatmap = axis.pcolor(int_matrix, cmap=pyplot.cm.Blues, linewidth=0)
-            axis.invert_yaxis()
-            axis.set_yticks(numpy.arange(0.5, len(amr_matrix.index)), minor=False)
-            axis.set_yticklabels(int_matrix.index.values)
-            axis.set_xticks(numpy.arange(0.5, len(amr_matrix.columns)), minor=False)
-            axis.set_xticklabels(amr_matrix.columns.values, rotation=90)
-            axis.xaxis.tick_top()
-            axis.xaxis.set_label_position('top')
-            pyplot.tight_layout()
-            pyplot.savefig(amr_matrix_png, dpi=300)
-        else:
-            ofh.write("\nEmpty AMR matrix, nothing to draw...\n")
+                ofh.write("\nEmpty AMR matrix, nothing to draw...\n")
     efh.close()
     ofh.close()