# HG changeset patch # User greg # Date 1676050033 0 # Node ID 5c923c77cf5fe8d9c8d9b484cc03381b6875a2f6 # Parent 6044ca44e9f6318225febec690fa2be6b15384ee Uploaded diff -r 6044ca44e9f6 -r 5c923c77cf5f draw_amr_matrix.py --- a/draw_amr_matrix.py Tue Feb 07 22:13:57 2023 +0000 +++ b/draw_amr_matrix.py Fri Feb 10 17:27:13 2023 +0000 @@ -9,30 +9,30 @@ import matplotlib.pyplot as pyplot -def get_amr_in_feature_hits(feature_hits): - for k in feature_hits.keys(): +def get_amr_in_feature_hits(amr_feature_hits): + for k in amr_feature_hits.keys(): if k.lower().find('amr') >= 0: - return feature_hits[k] + return amr_feature_hits[k] return None -def draw_amr_matrix(feature_hits_files, amr_gene_drug_file, output_dir): +def draw_amr_matrix(amr_feature_hits_files, amr_deletions_file, amr_mutations_file, amr_gene_drug_file, output_dir): ofh = open('process_log', 'w') - # Read feature_hits_files. - feature_hits = pandas.Series(dtype=object) - for feature_hits_file in feature_hits_files: - feature_name = os.path.basename(feature_hits_file) + # Read amr_feature_hits_files. + amr_feature_hits = pandas.Series(dtype=object) + for amr_feature_hits_file in amr_feature_hits_files: + feature_name = os.path.basename(amr_feature_hits_file) # Make sure the file is not empty. - if os.path.isfile(feature_hits_file) and os.path.getsize(feature_hits_file) > 0: - best_hits = pandas.read_csv(filepath_or_buffer=feature_hits_file, sep='\t', header=None) - ofh.write("\nFeature file %s will be processed\n" % os.path.basename(feature_hits_file)) + if os.path.isfile(amr_feature_hits_file) and os.path.getsize(amr_feature_hits_file) > 0: + best_hits = pandas.read_csv(filepath_or_buffer=amr_feature_hits_file, sep='\t', header=None) + ofh.write("\nFeature file %s will be processed\n" % os.path.basename(amr_feature_hits_file)) else: - ofh.write("\nEmpty feature file %s will NOT be processed\n" % os.path.basename(feature_hits_file)) + ofh.write("\nEmpty feature file %s will NOT be processed\n" % os.path.basename(amr_feature_hits_file)) best_hits = None - feature_hits[feature_name] = best_hits + amr_feature_hits[feature_name] = best_hits - amr_hits = get_amr_in_feature_hits(feature_hits) + amr_hits = get_amr_in_feature_hits(amr_feature_hits) ofh.write("\namr_hits:\n%s\n" % str(amr_hits)) if amr_hits is not None: amr_to_draw = pandas.DataFrame(columns=['gene', 'drug']) @@ -56,21 +56,30 @@ amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) ofh.write("\amr_to_draw:%s\n" % str(amr_to_draw)) - amr_mutations = pandas.Series(dtype=object) - if amr_mutations.shape[0] > 0: + if amr_mutations_file is not None: + # TODO: So far, no samples have produced mutations, so we haven't been able + # to produce a populated VarScan VCF file of mutations - https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2923. + # The call_amr_mutations Galaxy tool will currently produce this VarScan VCF file, but we need a sample that + # will produce a populated file. After we find one, we'll need to figure out how to implement this loop + # https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2925 in a Galaxy tool so that the VarScan VCF + # file will be converted to the TSV amr_mutations_file that thsi tool expects. # Roll up potentially resistance conferring mutations. - # TODO: may need to populate this if we need to use call_amr_mutations. for mutation_region, mutation_hits in amr_mutations.iteritems(): for mutation_idx, mutation_hit in mutation_hits.iterrows(): mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT'] amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) - amr_deletions = pandas.DataFrame() - if amr_deletions.shape[0] > 0: + if amr_deletions_file is not None: + # TODO: So far, no samples have produced deletions, but we do have all the pices in place + # within the workflow to receive the amr_deletions_file here, although it is currently + # always empty... # Roll up deletions that might confer resistance. - # TODO: may need to populate this if we need to use call_insertions. - for deletion_idx, deleted_gene in amr_deletions.iterrows(): - amr_to_draw = amr_to_draw.append(pandas.Series(['\u0394' + deleted_gene[3], deleted_gene[5]], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) + amr_deletions = pandas.read_csv(filepath_or_buffer=amr_deletions_file, sep='\t', header=None) + if amr_deletions.shape[0] > 0: + amr_deletions.columns = ['contig', 'start', 'stop', 'name', 'type', 'drug', 'note'] + amr_deletions = amr_deletions.loc[amr_deletions['type'].isin(['large-deletion', 'any']), :] + for deletion_idx, deleted_gene in amr_deletions.iterrows(): + amr_to_draw = amr_to_draw.append(pandas.Series(['\u0394' + deleted_gene[3], deleted_gene[5]], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) if amr_to_draw.shape[0] > 1: ofh.write("\nDrawing AMR matrix...\n") @@ -99,8 +108,10 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() + parser.add_argument('--amr_feature_hits_dir', action='store', dest='amr_feature_hits_dir', help='Directory of tabular files containing feature hits') + parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', default=None, help='AMR deletions BED file') + parser.add_argument('--amr_mutations_file', action='store', dest='amr_mutations_file', default=None, help='AMR mutations TSV file') parser.add_argument('--amr_gene_drug_file', action='store', dest='amr_gene_drug_file', help='AMR_gene_drugs tsv file') - parser.add_argument('--feature_hits_dir', action='store', dest='feature_hits_dir', help='Directory of tabular files containing feature hits') parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory') args = parser.parse_args() @@ -109,9 +120,9 @@ # will be sorted alphabetically and will contain 2 files # named something like AMR_CDS_311_2022_12_20.fasta and # Incompatibility_Groups_2023_01_01.fasta. - feature_hits_files = [] - for file_name in sorted(os.listdir(args.feature_hits_dir)): - file_path = os.path.abspath(os.path.join(args.feature_hits_dir, file_name)) - feature_hits_files.append(file_path) + amr_feature_hits_files = [] + for file_name in sorted(os.listdir(args.amr_feature_hits_dir)): + file_path = os.path.abspath(os.path.join(args.amr_feature_hits_dir, file_name)) + amr_feature_hits_files.append(file_path) - draw_amr_matrix(feature_hits_files, args.amr_gene_drug_file, args.output_dir) + draw_amr_matrix(amr_feature_hits_files, args.amr_deletions_file, args.amr_mutations_file, args.amr_gene_drug_file, args.output_dir)