changeset 1:5c923c77cf5f draft

Uploaded
author greg
date Fri, 10 Feb 2023 17:27:13 +0000
parents 6044ca44e9f6
children 9fcc1ffd7526
files draw_amr_matrix.py
diffstat 1 files changed, 39 insertions(+), 28 deletions(-) [+]
line wrap: on
line diff
--- a/draw_amr_matrix.py	Tue Feb 07 22:13:57 2023 +0000
+++ b/draw_amr_matrix.py	Fri Feb 10 17:27:13 2023 +0000
@@ -9,30 +9,30 @@
 import matplotlib.pyplot as pyplot
 
 
-def get_amr_in_feature_hits(feature_hits):
-    for k in feature_hits.keys():
+def get_amr_in_feature_hits(amr_feature_hits):
+    for k in amr_feature_hits.keys():
         if k.lower().find('amr') >= 0:
-            return feature_hits[k]
+            return amr_feature_hits[k]
     return None
 
 
-def draw_amr_matrix(feature_hits_files, amr_gene_drug_file, output_dir):
+def draw_amr_matrix(amr_feature_hits_files, amr_deletions_file, amr_mutations_file, amr_gene_drug_file, output_dir):
     ofh = open('process_log', 'w')
 
-    # Read feature_hits_files.
-    feature_hits = pandas.Series(dtype=object)
-    for feature_hits_file in feature_hits_files:
-        feature_name = os.path.basename(feature_hits_file)
+    # Read amr_feature_hits_files.
+    amr_feature_hits = pandas.Series(dtype=object)
+    for amr_feature_hits_file in amr_feature_hits_files:
+        feature_name = os.path.basename(amr_feature_hits_file)
         # Make sure the file is not empty.
-        if os.path.isfile(feature_hits_file) and os.path.getsize(feature_hits_file) > 0:
-            best_hits = pandas.read_csv(filepath_or_buffer=feature_hits_file, sep='\t', header=None)
-            ofh.write("\nFeature file %s will be processed\n" % os.path.basename(feature_hits_file))
+        if os.path.isfile(amr_feature_hits_file) and os.path.getsize(amr_feature_hits_file) > 0:
+            best_hits = pandas.read_csv(filepath_or_buffer=amr_feature_hits_file, sep='\t', header=None)
+            ofh.write("\nFeature file %s will be processed\n" % os.path.basename(amr_feature_hits_file))
         else:
-            ofh.write("\nEmpty feature file %s will NOT be processed\n" % os.path.basename(feature_hits_file))
+            ofh.write("\nEmpty feature file %s will NOT be processed\n" % os.path.basename(amr_feature_hits_file))
             best_hits = None
-        feature_hits[feature_name] = best_hits
+        amr_feature_hits[feature_name] = best_hits
 
-    amr_hits = get_amr_in_feature_hits(feature_hits)
+    amr_hits = get_amr_in_feature_hits(amr_feature_hits)
     ofh.write("\namr_hits:\n%s\n" % str(amr_hits))
     if amr_hits is not None:
         amr_to_draw = pandas.DataFrame(columns=['gene', 'drug'])
@@ -56,21 +56,30 @@
                     amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
                     ofh.write("\amr_to_draw:%s\n" % str(amr_to_draw))
 
-        amr_mutations = pandas.Series(dtype=object)
-        if amr_mutations.shape[0] > 0:
+        if amr_mutations_file is not None:
+            # TODO: So far, no samples have produced mutations, so we haven't been able
+            # to produce a populated VarScan VCF file of mutations - https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2923.
+            # The call_amr_mutations Galaxy tool will currently produce this VarScan VCF file, but we need a sample that
+            # will produce a populated file.  After we find one, we'll need to figure out how to implement this loop
+            # https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2925 in a Galaxy tool so that the VarScan VCF
+            # file will be converted to the TSV amr_mutations_file that thsi tool expects. 
             # Roll up potentially resistance conferring mutations.
-            # TODO: may need to populate this if we need to use call_amr_mutations.
             for mutation_region, mutation_hits in amr_mutations.iteritems():
                 for mutation_idx, mutation_hit in mutation_hits.iterrows():
                     mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT']
                     amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
 
-        amr_deletions = pandas.DataFrame()
-        if amr_deletions.shape[0] > 0:
+        if amr_deletions_file is not None:
+            # TODO: So far, no samples have produced deletions, but we do have all the pices in place
+            # within the workflow to receive the amr_deletions_file here, although it is currently
+            # always empty...
             # Roll up deletions that might confer resistance.
-            # TODO: may need to populate this if we need to use call_insertions.
-            for deletion_idx, deleted_gene in amr_deletions.iterrows():
-                amr_to_draw = amr_to_draw.append(pandas.Series(['\u0394' + deleted_gene[3], deleted_gene[5]], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
+            amr_deletions = pandas.read_csv(filepath_or_buffer=amr_deletions_file, sep='\t', header=None)
+            if amr_deletions.shape[0] > 0:
+                amr_deletions.columns = ['contig', 'start', 'stop', 'name', 'type', 'drug', 'note']
+                amr_deletions = amr_deletions.loc[amr_deletions['type'].isin(['large-deletion', 'any']), :]
+                for deletion_idx, deleted_gene in amr_deletions.iterrows():
+                    amr_to_draw = amr_to_draw.append(pandas.Series(['\u0394' + deleted_gene[3], deleted_gene[5]], name=amr_to_draw.shape[0], index=amr_to_draw.columns))
 
         if amr_to_draw.shape[0] > 1:
             ofh.write("\nDrawing AMR matrix...\n")
@@ -99,8 +108,10 @@
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
 
+    parser.add_argument('--amr_feature_hits_dir', action='store', dest='amr_feature_hits_dir', help='Directory of tabular files containing feature hits')
+    parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', default=None, help='AMR deletions BED file')
+    parser.add_argument('--amr_mutations_file', action='store', dest='amr_mutations_file', default=None, help='AMR mutations TSV file')
     parser.add_argument('--amr_gene_drug_file', action='store', dest='amr_gene_drug_file', help='AMR_gene_drugs tsv file')
-    parser.add_argument('--feature_hits_dir', action='store', dest='feature_hits_dir', help='Directory of tabular files containing feature hits')
     parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory')
 
     args = parser.parse_args()
@@ -109,9 +120,9 @@
     # will be sorted alphabetically and will contain 2 files
     # named something like AMR_CDS_311_2022_12_20.fasta and
     # Incompatibility_Groups_2023_01_01.fasta.
-    feature_hits_files = []
-    for file_name in sorted(os.listdir(args.feature_hits_dir)):
-        file_path = os.path.abspath(os.path.join(args.feature_hits_dir, file_name))
-        feature_hits_files.append(file_path)
+    amr_feature_hits_files = []
+    for file_name in sorted(os.listdir(args.amr_feature_hits_dir)):
+        file_path = os.path.abspath(os.path.join(args.amr_feature_hits_dir, file_name))
+        amr_feature_hits_files.append(file_path)
 
-    draw_amr_matrix(feature_hits_files, args.amr_gene_drug_file, args.output_dir)
+    draw_amr_matrix(amr_feature_hits_files, args.amr_deletions_file, args.amr_mutations_file, args.amr_gene_drug_file, args.output_dir)