Mercurial > repos > greg > draw_amr_matrix
comparison draw_amr_matrix.py @ 4:33a0ea992043 draft
Uploaded
author | greg |
---|---|
date | Tue, 21 Feb 2023 19:55:42 +0000 |
parents | 7d7884f2d921 |
children | caf554e039b2 |
comparison
equal
deleted
inserted
replaced
3:7d7884f2d921 | 4:33a0ea992043 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse | 3 import argparse |
4 import csv | 4 import csv |
5 import os | 5 import os |
6 | 6 import subprocess |
7 import sys | |
8 import tempfile | |
9 | |
10 import Bio.SeqIO | |
7 import numpy | 11 import numpy |
8 import pandas | 12 import pandas |
9 import matplotlib.pyplot as pyplot | 13 import matplotlib.pyplot as pyplot |
10 | 14 |
11 | 15 |
14 if k.lower().find('amr') >= 0: | 18 if k.lower().find('amr') >= 0: |
15 return amr_feature_hits[k] | 19 return amr_feature_hits[k] |
16 return None | 20 return None |
17 | 21 |
18 | 22 |
19 def draw_amr_matrix(amr_feature_hits_files, amr_deletions_file, amr_mutations_file, amr_gene_drug_file, output_dir): | 23 def load_fasta(fasta_file): |
24 sequence = pandas.Series(dtype=object) | |
25 for contig in Bio.SeqIO.parse(fasta_file, 'fasta'): | |
26 sequence[contig.id] = contig | |
27 return sequence | |
28 | |
29 | |
30 def run_command(cmd): | |
31 try: | |
32 tmp_name = tempfile.NamedTemporaryFile(dir=".").name | |
33 tmp_stderr = open(tmp_name, 'wb') | |
34 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno()) | |
35 returncode = proc.wait() | |
36 tmp_stderr.close() | |
37 if returncode != 0: | |
38 # Get stderr, allowing for case where it's very large. | |
39 tmp_stderr = open(tmp_name, 'rb') | |
40 stderr = '' | |
41 buffsize = 1048576 | |
42 try: | |
43 while True: | |
44 stderr += tmp_stderr.read(buffsize) | |
45 if not stderr or len(stderr) % buffsize != 0: | |
46 break | |
47 except OverflowError: | |
48 pass | |
49 tmp_stderr.close() | |
50 os.remove(tmp_name) | |
51 stop_err(stderr) | |
52 except Exception as e: | |
53 stop_err('Command:\n%s\n\nended with error:\n%s\n\n' % (cmd, str(e))) | |
54 | |
55 | |
56 def stop_err(msg): | |
57 sys.stderr.write(msg) | |
58 sys.exit(1) | |
59 | |
60 | |
61 def draw_amr_matrix(amr_feature_hits_files, amr_deletions_file, amr_mutations_file, amr_mutation_regions_file, amr_gene_drug_file, reference, reference_size, region_mutations_output_file, mutations_dir, output_dir): | |
20 ofh = open('process_log', 'w') | 62 ofh = open('process_log', 'w') |
21 | 63 |
22 # Read amr_feature_hits_files. | 64 # Read amr_feature_hits_files. |
23 amr_feature_hits = pandas.Series(dtype=object) | 65 amr_feature_hits = pandas.Series(dtype=object) |
24 for amr_feature_hits_file in amr_feature_hits_files: | 66 for amr_feature_hits_file in amr_feature_hits_files: |
40 # Read amr_drug_gene_file. | 82 # Read amr_drug_gene_file. |
41 amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None) | 83 amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None) |
42 ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug)) | 84 ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug)) |
43 | 85 |
44 # Roll up AMR gene hits. | 86 # Roll up AMR gene hits. |
45 ofh.write("\namr_hits.shape[0]:%s\n" % str(amr_hits.shape[0])) | 87 ofh.write("\namr_hits.shape[0]: %s\n" % str(amr_hits.shape[0])) |
46 if amr_hits.shape[0] > 0: | 88 if amr_hits.shape[0] > 0: |
47 for gene_idx, gene in amr_hits.iterrows(): | 89 for gene_idx, gene in amr_hits.iterrows(): |
48 ofh.write("gene_idx:%s\n" % str(gene_idx)) | 90 ofh.write("gene_idx: %s\n" % str(gene_idx)) |
49 ofh.write("gene:%s\n" % str(gene)) | 91 ofh.write("gene: %s\n" % str(gene)) |
50 gene_name = gene[3] | 92 gene_name = gene[3] |
51 ofh.write("gene_name: %s\n" % str(gene_name)) | 93 ofh.write("gene_name: %s\n" % str(gene_name)) |
52 ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0])) | 94 ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0])) |
53 drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1] | 95 drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1] |
54 ofh.write("drugs:%s\n" % str(drugs)) | 96 ofh.write("drugs: %s\n" % str(drugs)) |
55 for drug in drugs: | 97 for drug in drugs: |
56 amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) | 98 amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) |
57 ofh.write("\amr_to_draw:%s\n" % str(amr_to_draw)) | 99 ofh.write("\amr_to_draw: %s\n" % str(amr_to_draw)) |
58 | 100 |
59 if amr_mutations_file is not None: | 101 ofh.write("\namr_mutations_file si None: %s\n" % str(amr_mutations_file == 'None')) |
60 # TODO: So far, no samples have produced mutations, so we haven't been able | 102 if amr_mutations_file not in [None, 'None'] and os.path.getsize(amr_mutations_file) > 0: |
61 # to produce a populated VarScan VCF file of mutations - https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2923. | 103 amr_mutations = pandas.Series(dtype=object) |
62 # The call_amr_mutations Galaxy tool will currently produce this VarScan VCF file, but we need a sample that | 104 if amr_mutation_regions_file is not None: |
63 # will produce a populated file. After we find one, we'll need to figure out how to implement this loop | 105 mutation_regions = pandas.read_csv(amr_mutation_regions_file, header=0, sep='\t', index_col=False) |
64 # https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2925 in a Galaxy tool so that the VarScan VCF | 106 if mutation_regions.shape[1] != 7: |
65 # file will be converted to the TSV amr_mutations_file that thsi tool expects. | 107 ofh.write("\nMutation regions should be a six column file.\n") |
66 amr_mutations = pandas.DataFrame() | 108 elif mutation_regions.shape[0] == 0: |
109 ofh.write("\nNo rows in mutation regions file.\n") | |
110 else: | |
111 # Make sure the positions in the BED file fall within the chromosomes provided in the reference sequence. | |
112 for mutation_region in range(mutation_regions.shape[0]): | |
113 mutation_region = mutation_regions.iloc[mutation_region, :] | |
114 if not (mutation_region[0] in reference): | |
115 ofh.write("\nMutation region :%s not found in reference genome.\n" % ' '.join(mutation_region.astype(str))) | |
116 continue | |
117 if not isinstance(mutation_region[1], numpy.int64): | |
118 ofh.write("\nNon-integer found in mutation region start (column 2): %s.\n" % str(mutation_region[1])) | |
119 break | |
120 elif not isinstance(mutation_region[2], numpy.int64): | |
121 ofh.write("\nNon-integer found in mutation region start (column 3): %s.\n" % str(mutation_region[2])) | |
122 break | |
123 if mutation_region[1] <= 0 or mutation_region[2] <= 0: | |
124 ofh.write("\nMutation region %s starts before the reference sequence.\n" % ' '.join(mutation_region.astype(str))) | |
125 if mutation_region[1] > len(reference[mutation_region[0]].seq) or mutation_region[2] > len(reference[mutation_region[0]].seq): | |
126 ofh.write("\nMutation region %s ends after the reference sequence.\n" % ' '.join(mutation_region.astype(str))) | |
127 for region_i in range(mutation_regions.shape[0]): | |
128 region = mutation_regions.iloc[region_i, :] | |
129 if not region.get('type', default='No Type') in ['snp', 'small-indel', 'any']: | |
130 continue | |
131 ofh.write("\nFinding AMR mutations for %s.\n" % str(region['name'])) | |
132 region_dir = os.path.join(mutations_dir, 'region_' + str(region_i)) | |
133 os.mkdir(region_dir) | |
134 region_bed = os.path.join(region_dir, 'region.bed') | |
135 mutation_regions.loc[[region_i], ].to_csv(path_or_buf=region_bed, sep='\t', header=False, index=False) | |
136 cmd = "bedtools intersect -nonamecheck -wb -a '%s' -b '%s' | awk '{BEGIN{getline < \"%s\" ;printf $0\"\t\";getline < \"%s\"; getline < \"%s\";print $0}{print}' > %s" % (region_bed, amr_mutations_file, amr_mutation_regions_file, amr_mutations_file, amr_mutations_file, region_mutations_output_file) | |
137 run_command(cmd) | |
138 try: | |
139 region_mutations = pandas.read_csv(region_mutations_output_file, sep='\t', header=0, index_col=False) | |
140 except Exception: | |
141 region_mutations = pandas.DataFrame() | |
142 if region_mutations.shape[0] == 0: | |
143 continue | |
144 # Figure out what kind of mutations are in this region. | |
145 region_mutation_types = pandas.Series(['snp'] * region_mutations.shape[0], name='TYPE', index=region_mutations.index) | |
146 region_mutation_types[region_mutations['REF'].str.len() != region_mutations['ALT'].str.len()] = 'small-indel' | |
147 region_mutation_drugs = pandas.Series(region['drug'] * region_mutations.shape[0], name='DRUG', index=region_mutations.index) | |
148 region_notes = pandas.Series(region['note'] * region_mutations.shape[0], name='NOTE', index=region_mutations.index) | |
149 region_mutations = pandas.concat([region_mutations, region_mutation_types, region_mutation_drugs, region_notes], axis=1) | |
150 region_mutations = region_mutations[['#CHROM', 'POS', 'TYPE', 'REF', 'ALT', 'DRUG', 'NOTE']] | |
151 amr_mutations[region['name']] = region_mutations | |
152 else: | |
153 ofh.write("\nMutation region BED not received.\n") | |
67 # Roll up potentially resistance conferring mutations. | 154 # Roll up potentially resistance conferring mutations. |
68 for mutation_region, mutation_hits in amr_mutations.iteritems(): | 155 for mutation_region, mutation_hits in amr_mutations.iteritems(): |
69 for mutation_idx, mutation_hit in mutation_hits.iterrows(): | 156 for mutation_idx, mutation_hit in mutation_hits.iterrows(): |
70 mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT'] | 157 mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT'] |
71 amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) | 158 amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) |
72 | 159 |
73 if amr_deletions_file is not None: | 160 if amr_deletions_file not in [None, 'None'] and os.path.getsize(amr_deletions_file) > 0: |
74 # TODO: So far, no samples have produced deletions, but we do have all the pices in place | 161 # TODO: So far, no samples have produced deletions, but we do have all the pices in place |
75 # within the workflow to receive the amr_deletions_file here, although it is currently | 162 # within the workflow to receive the amr_deletions_file here, although it is currently |
76 # always empty... | 163 # always empty... |
77 # Roll up deletions that might confer resistance. | 164 # Roll up deletions that might confer resistance. |
78 try: | 165 try: |
113 parser = argparse.ArgumentParser() | 200 parser = argparse.ArgumentParser() |
114 | 201 |
115 parser.add_argument('--amr_feature_hits_dir', action='store', dest='amr_feature_hits_dir', help='Directory of tabular files containing feature hits') | 202 parser.add_argument('--amr_feature_hits_dir', action='store', dest='amr_feature_hits_dir', help='Directory of tabular files containing feature hits') |
116 parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', default=None, help='AMR deletions BED file') | 203 parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', default=None, help='AMR deletions BED file') |
117 parser.add_argument('--amr_mutations_file', action='store', dest='amr_mutations_file', default=None, help='AMR mutations TSV file') | 204 parser.add_argument('--amr_mutations_file', action='store', dest='amr_mutations_file', default=None, help='AMR mutations TSV file') |
205 parser.add_argument('--amr_mutation_regions_file', action='store', dest='amr_mutation_regions_file', default=None, help='AMR mutation regions BED file') | |
118 parser.add_argument('--amr_gene_drug_file', action='store', dest='amr_gene_drug_file', help='AMR_gene_drugs tsv file') | 206 parser.add_argument('--amr_gene_drug_file', action='store', dest='amr_gene_drug_file', help='AMR_gene_drugs tsv file') |
207 parser.add_argument('--reference_genome', action='store', dest='reference_genome', help='Reference genome fasta file') | |
208 parser.add_argument('--region_mutations_output_file', action='store', dest='region_mutations_output_file', default=None, help='Region mutations TSV output file') | |
209 parser.add_argument('--mutations_dir', action='store', dest='mutations_dir', help='Mutations directory') | |
119 parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory') | 210 parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory') |
120 | 211 |
121 args = parser.parse_args() | 212 args = parser.parse_args() |
122 | 213 |
123 # Get thge collection of feature hits files. The collection | 214 # Get the collection of feature hits files. The collection |
124 # will be sorted alphabetically and will contain 2 files | 215 # will be sorted alphabetically and will contain 2 files |
125 # named something like AMR_CDS_311_2022_12_20.fasta and | 216 # named something like AMR_CDS_311_2022_12_20.fasta and |
126 # Incompatibility_Groups_2023_01_01.fasta. | 217 # Incompatibility_Groups_2023_01_01.fasta. |
127 amr_feature_hits_files = [] | 218 amr_feature_hits_files = [] |
128 for file_name in sorted(os.listdir(args.amr_feature_hits_dir)): | 219 for file_name in sorted(os.listdir(args.amr_feature_hits_dir)): |
129 file_path = os.path.abspath(os.path.join(args.amr_feature_hits_dir, file_name)) | 220 file_path = os.path.abspath(os.path.join(args.amr_feature_hits_dir, file_name)) |
130 amr_feature_hits_files.append(file_path) | 221 amr_feature_hits_files.append(file_path) |
131 | 222 |
132 draw_amr_matrix(amr_feature_hits_files, args.amr_deletions_file, args.amr_mutations_file, args.amr_gene_drug_file, args.output_dir) | 223 # Load the reference genome into memory. |
224 reference = load_fasta(args.reference_genome) | |
225 reference_size = 0 | |
226 for i in reference: | |
227 reference_size += len(i.seq) | |
228 | |
229 draw_amr_matrix(amr_feature_hits_files, args.amr_deletions_file, args.amr_mutations_file, args.amr_mutation_regions_file, args.amr_gene_drug_file, reference, reference_size, args.region_mutations_output_file, args.mutations_dir, args.output_dir) |