Mercurial > repos > greg > draw_amr_matrix
comparison draw_amr_matrix.py @ 4:33a0ea992043 draft
Uploaded
| author | greg |
|---|---|
| date | Tue, 21 Feb 2023 19:55:42 +0000 |
| parents | 7d7884f2d921 |
| children | caf554e039b2 |
comparison
equal
deleted
inserted
replaced
| 3:7d7884f2d921 | 4:33a0ea992043 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 import argparse | 3 import argparse |
| 4 import csv | 4 import csv |
| 5 import os | 5 import os |
| 6 | 6 import subprocess |
| 7 import sys | |
| 8 import tempfile | |
| 9 | |
| 10 import Bio.SeqIO | |
| 7 import numpy | 11 import numpy |
| 8 import pandas | 12 import pandas |
| 9 import matplotlib.pyplot as pyplot | 13 import matplotlib.pyplot as pyplot |
| 10 | 14 |
| 11 | 15 |
| 14 if k.lower().find('amr') >= 0: | 18 if k.lower().find('amr') >= 0: |
| 15 return amr_feature_hits[k] | 19 return amr_feature_hits[k] |
| 16 return None | 20 return None |
| 17 | 21 |
| 18 | 22 |
| 19 def draw_amr_matrix(amr_feature_hits_files, amr_deletions_file, amr_mutations_file, amr_gene_drug_file, output_dir): | 23 def load_fasta(fasta_file): |
| 24 sequence = pandas.Series(dtype=object) | |
| 25 for contig in Bio.SeqIO.parse(fasta_file, 'fasta'): | |
| 26 sequence[contig.id] = contig | |
| 27 return sequence | |
| 28 | |
| 29 | |
| 30 def run_command(cmd): | |
| 31 try: | |
| 32 tmp_name = tempfile.NamedTemporaryFile(dir=".").name | |
| 33 tmp_stderr = open(tmp_name, 'wb') | |
| 34 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno()) | |
| 35 returncode = proc.wait() | |
| 36 tmp_stderr.close() | |
| 37 if returncode != 0: | |
| 38 # Get stderr, allowing for case where it's very large. | |
| 39 tmp_stderr = open(tmp_name, 'rb') | |
| 40 stderr = '' | |
| 41 buffsize = 1048576 | |
| 42 try: | |
| 43 while True: | |
| 44 stderr += tmp_stderr.read(buffsize) | |
| 45 if not stderr or len(stderr) % buffsize != 0: | |
| 46 break | |
| 47 except OverflowError: | |
| 48 pass | |
| 49 tmp_stderr.close() | |
| 50 os.remove(tmp_name) | |
| 51 stop_err(stderr) | |
| 52 except Exception as e: | |
| 53 stop_err('Command:\n%s\n\nended with error:\n%s\n\n' % (cmd, str(e))) | |
| 54 | |
| 55 | |
| 56 def stop_err(msg): | |
| 57 sys.stderr.write(msg) | |
| 58 sys.exit(1) | |
| 59 | |
| 60 | |
| 61 def draw_amr_matrix(amr_feature_hits_files, amr_deletions_file, amr_mutations_file, amr_mutation_regions_file, amr_gene_drug_file, reference, reference_size, region_mutations_output_file, mutations_dir, output_dir): | |
| 20 ofh = open('process_log', 'w') | 62 ofh = open('process_log', 'w') |
| 21 | 63 |
| 22 # Read amr_feature_hits_files. | 64 # Read amr_feature_hits_files. |
| 23 amr_feature_hits = pandas.Series(dtype=object) | 65 amr_feature_hits = pandas.Series(dtype=object) |
| 24 for amr_feature_hits_file in amr_feature_hits_files: | 66 for amr_feature_hits_file in amr_feature_hits_files: |
| 40 # Read amr_drug_gene_file. | 82 # Read amr_drug_gene_file. |
| 41 amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None) | 83 amr_gene_drug = pandas.read_csv(amr_gene_drug_file, index_col=None, sep='\t', quoting=csv.QUOTE_NONE, header=None) |
| 42 ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug)) | 84 ofh.write("\namr_gene_drug:\n%s\n" % str(amr_gene_drug)) |
| 43 | 85 |
| 44 # Roll up AMR gene hits. | 86 # Roll up AMR gene hits. |
| 45 ofh.write("\namr_hits.shape[0]:%s\n" % str(amr_hits.shape[0])) | 87 ofh.write("\namr_hits.shape[0]: %s\n" % str(amr_hits.shape[0])) |
| 46 if amr_hits.shape[0] > 0: | 88 if amr_hits.shape[0] > 0: |
| 47 for gene_idx, gene in amr_hits.iterrows(): | 89 for gene_idx, gene in amr_hits.iterrows(): |
| 48 ofh.write("gene_idx:%s\n" % str(gene_idx)) | 90 ofh.write("gene_idx: %s\n" % str(gene_idx)) |
| 49 ofh.write("gene:%s\n" % str(gene)) | 91 ofh.write("gene: %s\n" % str(gene)) |
| 50 gene_name = gene[3] | 92 gene_name = gene[3] |
| 51 ofh.write("gene_name: %s\n" % str(gene_name)) | 93 ofh.write("gene_name: %s\n" % str(gene_name)) |
| 52 ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0])) | 94 ofh.write("amr_gene_drug[0]: %s\n" % str(amr_gene_drug[0])) |
| 53 drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1] | 95 drugs = amr_gene_drug.loc[amr_gene_drug[0] == gene_name, :][1] |
| 54 ofh.write("drugs:%s\n" % str(drugs)) | 96 ofh.write("drugs: %s\n" % str(drugs)) |
| 55 for drug in drugs: | 97 for drug in drugs: |
| 56 amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) | 98 amr_to_draw = amr_to_draw.append(pandas.Series([gene_name, drug], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) |
| 57 ofh.write("\amr_to_draw:%s\n" % str(amr_to_draw)) | 99 ofh.write("\amr_to_draw: %s\n" % str(amr_to_draw)) |
| 58 | 100 |
| 59 if amr_mutations_file is not None: | 101 ofh.write("\namr_mutations_file si None: %s\n" % str(amr_mutations_file == 'None')) |
| 60 # TODO: So far, no samples have produced mutations, so we haven't been able | 102 if amr_mutations_file not in [None, 'None'] and os.path.getsize(amr_mutations_file) > 0: |
| 61 # to produce a populated VarScan VCF file of mutations - https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2923. | 103 amr_mutations = pandas.Series(dtype=object) |
| 62 # The call_amr_mutations Galaxy tool will currently produce this VarScan VCF file, but we need a sample that | 104 if amr_mutation_regions_file is not None: |
| 63 # will produce a populated file. After we find one, we'll need to figure out how to implement this loop | 105 mutation_regions = pandas.read_csv(amr_mutation_regions_file, header=0, sep='\t', index_col=False) |
| 64 # https://github.com/appliedbinf/pima_md/blob/main/pima.py#L2925 in a Galaxy tool so that the VarScan VCF | 106 if mutation_regions.shape[1] != 7: |
| 65 # file will be converted to the TSV amr_mutations_file that thsi tool expects. | 107 ofh.write("\nMutation regions should be a six column file.\n") |
| 66 amr_mutations = pandas.DataFrame() | 108 elif mutation_regions.shape[0] == 0: |
| 109 ofh.write("\nNo rows in mutation regions file.\n") | |
| 110 else: | |
| 111 # Make sure the positions in the BED file fall within the chromosomes provided in the reference sequence. | |
| 112 for mutation_region in range(mutation_regions.shape[0]): | |
| 113 mutation_region = mutation_regions.iloc[mutation_region, :] | |
| 114 if not (mutation_region[0] in reference): | |
| 115 ofh.write("\nMutation region :%s not found in reference genome.\n" % ' '.join(mutation_region.astype(str))) | |
| 116 continue | |
| 117 if not isinstance(mutation_region[1], numpy.int64): | |
| 118 ofh.write("\nNon-integer found in mutation region start (column 2): %s.\n" % str(mutation_region[1])) | |
| 119 break | |
| 120 elif not isinstance(mutation_region[2], numpy.int64): | |
| 121 ofh.write("\nNon-integer found in mutation region start (column 3): %s.\n" % str(mutation_region[2])) | |
| 122 break | |
| 123 if mutation_region[1] <= 0 or mutation_region[2] <= 0: | |
| 124 ofh.write("\nMutation region %s starts before the reference sequence.\n" % ' '.join(mutation_region.astype(str))) | |
| 125 if mutation_region[1] > len(reference[mutation_region[0]].seq) or mutation_region[2] > len(reference[mutation_region[0]].seq): | |
| 126 ofh.write("\nMutation region %s ends after the reference sequence.\n" % ' '.join(mutation_region.astype(str))) | |
| 127 for region_i in range(mutation_regions.shape[0]): | |
| 128 region = mutation_regions.iloc[region_i, :] | |
| 129 if not region.get('type', default='No Type') in ['snp', 'small-indel', 'any']: | |
| 130 continue | |
| 131 ofh.write("\nFinding AMR mutations for %s.\n" % str(region['name'])) | |
| 132 region_dir = os.path.join(mutations_dir, 'region_' + str(region_i)) | |
| 133 os.mkdir(region_dir) | |
| 134 region_bed = os.path.join(region_dir, 'region.bed') | |
| 135 mutation_regions.loc[[region_i], ].to_csv(path_or_buf=region_bed, sep='\t', header=False, index=False) | |
| 136 cmd = "bedtools intersect -nonamecheck -wb -a '%s' -b '%s' | awk '{BEGIN{getline < \"%s\" ;printf $0\"\t\";getline < \"%s\"; getline < \"%s\";print $0}{print}' > %s" % (region_bed, amr_mutations_file, amr_mutation_regions_file, amr_mutations_file, amr_mutations_file, region_mutations_output_file) | |
| 137 run_command(cmd) | |
| 138 try: | |
| 139 region_mutations = pandas.read_csv(region_mutations_output_file, sep='\t', header=0, index_col=False) | |
| 140 except Exception: | |
| 141 region_mutations = pandas.DataFrame() | |
| 142 if region_mutations.shape[0] == 0: | |
| 143 continue | |
| 144 # Figure out what kind of mutations are in this region. | |
| 145 region_mutation_types = pandas.Series(['snp'] * region_mutations.shape[0], name='TYPE', index=region_mutations.index) | |
| 146 region_mutation_types[region_mutations['REF'].str.len() != region_mutations['ALT'].str.len()] = 'small-indel' | |
| 147 region_mutation_drugs = pandas.Series(region['drug'] * region_mutations.shape[0], name='DRUG', index=region_mutations.index) | |
| 148 region_notes = pandas.Series(region['note'] * region_mutations.shape[0], name='NOTE', index=region_mutations.index) | |
| 149 region_mutations = pandas.concat([region_mutations, region_mutation_types, region_mutation_drugs, region_notes], axis=1) | |
| 150 region_mutations = region_mutations[['#CHROM', 'POS', 'TYPE', 'REF', 'ALT', 'DRUG', 'NOTE']] | |
| 151 amr_mutations[region['name']] = region_mutations | |
| 152 else: | |
| 153 ofh.write("\nMutation region BED not received.\n") | |
| 67 # Roll up potentially resistance conferring mutations. | 154 # Roll up potentially resistance conferring mutations. |
| 68 for mutation_region, mutation_hits in amr_mutations.iteritems(): | 155 for mutation_region, mutation_hits in amr_mutations.iteritems(): |
| 69 for mutation_idx, mutation_hit in mutation_hits.iterrows(): | 156 for mutation_idx, mutation_hit in mutation_hits.iterrows(): |
| 70 mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT'] | 157 mutation_name = mutation_region + ' ' + mutation_hit['REF'] + '->' + mutation_hit['ALT'] |
| 71 amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) | 158 amr_to_draw = amr_to_draw.append(pandas.Series([mutation_name, mutation_hit['DRUG']], name=amr_to_draw.shape[0], index=amr_to_draw.columns)) |
| 72 | 159 |
| 73 if amr_deletions_file is not None: | 160 if amr_deletions_file not in [None, 'None'] and os.path.getsize(amr_deletions_file) > 0: |
| 74 # TODO: So far, no samples have produced deletions, but we do have all the pices in place | 161 # TODO: So far, no samples have produced deletions, but we do have all the pices in place |
| 75 # within the workflow to receive the amr_deletions_file here, although it is currently | 162 # within the workflow to receive the amr_deletions_file here, although it is currently |
| 76 # always empty... | 163 # always empty... |
| 77 # Roll up deletions that might confer resistance. | 164 # Roll up deletions that might confer resistance. |
| 78 try: | 165 try: |
| 113 parser = argparse.ArgumentParser() | 200 parser = argparse.ArgumentParser() |
| 114 | 201 |
| 115 parser.add_argument('--amr_feature_hits_dir', action='store', dest='amr_feature_hits_dir', help='Directory of tabular files containing feature hits') | 202 parser.add_argument('--amr_feature_hits_dir', action='store', dest='amr_feature_hits_dir', help='Directory of tabular files containing feature hits') |
| 116 parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', default=None, help='AMR deletions BED file') | 203 parser.add_argument('--amr_deletions_file', action='store', dest='amr_deletions_file', default=None, help='AMR deletions BED file') |
| 117 parser.add_argument('--amr_mutations_file', action='store', dest='amr_mutations_file', default=None, help='AMR mutations TSV file') | 204 parser.add_argument('--amr_mutations_file', action='store', dest='amr_mutations_file', default=None, help='AMR mutations TSV file') |
| 205 parser.add_argument('--amr_mutation_regions_file', action='store', dest='amr_mutation_regions_file', default=None, help='AMR mutation regions BED file') | |
| 118 parser.add_argument('--amr_gene_drug_file', action='store', dest='amr_gene_drug_file', help='AMR_gene_drugs tsv file') | 206 parser.add_argument('--amr_gene_drug_file', action='store', dest='amr_gene_drug_file', help='AMR_gene_drugs tsv file') |
| 207 parser.add_argument('--reference_genome', action='store', dest='reference_genome', help='Reference genome fasta file') | |
| 208 parser.add_argument('--region_mutations_output_file', action='store', dest='region_mutations_output_file', default=None, help='Region mutations TSV output file') | |
| 209 parser.add_argument('--mutations_dir', action='store', dest='mutations_dir', help='Mutations directory') | |
| 119 parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory') | 210 parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory') |
| 120 | 211 |
| 121 args = parser.parse_args() | 212 args = parser.parse_args() |
| 122 | 213 |
| 123 # Get thge collection of feature hits files. The collection | 214 # Get the collection of feature hits files. The collection |
| 124 # will be sorted alphabetically and will contain 2 files | 215 # will be sorted alphabetically and will contain 2 files |
| 125 # named something like AMR_CDS_311_2022_12_20.fasta and | 216 # named something like AMR_CDS_311_2022_12_20.fasta and |
| 126 # Incompatibility_Groups_2023_01_01.fasta. | 217 # Incompatibility_Groups_2023_01_01.fasta. |
| 127 amr_feature_hits_files = [] | 218 amr_feature_hits_files = [] |
| 128 for file_name in sorted(os.listdir(args.amr_feature_hits_dir)): | 219 for file_name in sorted(os.listdir(args.amr_feature_hits_dir)): |
| 129 file_path = os.path.abspath(os.path.join(args.amr_feature_hits_dir, file_name)) | 220 file_path = os.path.abspath(os.path.join(args.amr_feature_hits_dir, file_name)) |
| 130 amr_feature_hits_files.append(file_path) | 221 amr_feature_hits_files.append(file_path) |
| 131 | 222 |
| 132 draw_amr_matrix(amr_feature_hits_files, args.amr_deletions_file, args.amr_mutations_file, args.amr_gene_drug_file, args.output_dir) | 223 # Load the reference genome into memory. |
| 224 reference = load_fasta(args.reference_genome) | |
| 225 reference_size = 0 | |
| 226 for i in reference: | |
| 227 reference_size += len(i.seq) | |
| 228 | |
| 229 draw_amr_matrix(amr_feature_hits_files, args.amr_deletions_file, args.amr_mutations_file, args.amr_mutation_regions_file, args.amr_gene_drug_file, reference, reference_size, args.region_mutations_output_file, args.mutations_dir, args.output_dir) |
