Mercurial > repos > mheinzl > variant_analyzer2
comparison mut2sscs.py @ 0:e5953c54cfb5 draft
planemo upload for repository https://github.com/gpovysil/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
| author | mheinzl |
|---|---|
| date | Sun, 04 Oct 2020 17:19:39 +0000 |
| parents | |
| children | 11a2a34f8a2b |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e5953c54cfb5 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """mut2sscs.py | |
| 4 | |
| 5 Author -- Gundula Povysil | |
| 6 Contact -- povysil@bioinf.jku.at | |
| 7 | |
| 8 Takes a tabular file with mutations from DCS and a BAM file of SSCS as input | |
| 9 and extracts all tags of reads that carry the mutation. | |
| 10 Calculates statistics about number of ab/ba/duplex per mutation. | |
| 11 | |
| 12 ======= ========== ================= ================================ | |
| 13 Version Date Author Description | |
| 14 0.2.1 2019-10-27 Gundula Povysil - | |
| 15 ======= ========== ================= ================================ | |
| 16 | |
| 17 USAGE: python mut2sscs.py --mutFile DCS_Mutations.tabular --bamFile SSCS.bam --outputJson SSCS_counts.json | |
| 18 | |
| 19 """ | |
| 20 | |
| 21 from __future__ import division | |
| 22 | |
| 23 import argparse | |
| 24 import json | |
| 25 import os | |
| 26 import sys | |
| 27 | |
| 28 import numpy as np | |
| 29 import pysam | |
| 30 | |
| 31 | |
def make_argparser():
    """Build the command-line parser used by ``mut2sscs``.

    The parser accepts three optional string options: ``--mutFile``,
    ``--bamFile`` and ``--outputJson``. None of them is marked required,
    so each is ``None`` when omitted.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.')

    # (flag, help text) pairs, registered in the order shown in --help.
    option_table = (
        ('--mutFile', 'TABULAR file with DCS mutations.'),
        ('--bamFile', 'BAM file with aligned SSCS reads.'),
        ('--outputJson', 'Output JSON file to store SSCS counts.'),
    )
    for flag, help_text in option_table:
        cli.add_argument(flag, help=help_text)

    return cli
| 41 | |
| 42 | |
def _count_tag(pos_dict, chrom_stop_pos, abba):
    """Increment the counter for strand tag ``abba`` at ``chrom_stop_pos``."""
    per_position = pos_dict.setdefault(chrom_stop_pos, {})
    per_position[abba] = per_position.get(abba, 0) + 1


def mut2sscs(argv):
    """Count SSCS reads per strand tag that carry the alt or ref base of each mutation.

    Reads the mutation table given by ``--mutFile`` (tab-separated, first
    line skipped as a header, ``#`` lines treated as comments) and, for each
    row, piles up the BAM file given by ``--bamFile`` at the mutation
    position. Column 1 is used as the contig name, column 2 as the position,
    column 9 as the reference base and column 10 as the alternate base
    (0-based column indices). For every read covering the position, the last
    two characters of the read name (presumably "ab"/"ba" -- confirm against
    the SSCS naming scheme) are used as the key under which the read is
    counted, either in the alt dictionary or in the ref dictionary.

    The two dictionaries, keyed by ``"<chrom>#<pos>"``, are written as a JSON
    pair ``[mut_counts, ref_counts]`` to the path given by ``--outputJson``.
    Progress and per-position coverage are printed to stdout.

    Args:
        argv: Full argument vector including the program name at index 0.

    Returns:
        None.

    Raises:
        SystemExit: If the mutation file or the BAM file does not exist.
    """
    parser = make_argparser()
    args = parser.parse_args(argv[1:])

    file1 = args.mutFile
    file2 = args.bamFile
    sscs_counts_json = args.outputJson

    if os.path.isfile(file1) is False:
        sys.exit("Error: Could not find '{}'".format(file1))

    if os.path.isfile(file2) is False:
        sys.exit("Error: Could not find '{}'".format(file2))

    # 1. read mut file
    with open(file1, 'r') as mut:
        mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)

    # genfromtxt returns a 1-D array when the file holds a single data row
    # (and an empty 1-D array when it holds none); normalise to 2-D so the
    # row/column indexing below works for every input size.
    if mut_array.ndim < 2:
        if mut_array.size == 0:
            mut_array = mut_array.reshape((0, 0))
        else:
            mut_array = mut_array.reshape((1, -1))
    n_mutations = mut_array.shape[0]

    # 2 read SSCS bam file
    # pysam.index(file2)
    bam = pysam.AlignmentFile(file2, "rb")

    # get tags
    mut_pos_dict = {}
    ref_pos_dict = {}
    try:
        for m in range(n_mutations):
            print(str(m + 1) + " of " + str(n_mutations))
            # Convert numpy string scalars to plain Python values once;
            # pysam expects a str contig name, not raw array bytes.
            chrom = str(mut_array[m, 1])
            stop_pos = int(mut_array[m, 2])
            chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
            ref = str(mut_array[m, 9])
            alt = str(mut_array[m, 10])

            # Clamp the window start so a mutation at position 1 does not
            # request a negative coordinate.
            window_start = max(0, stop_pos - 2)
            for pileupcolumn in bam.pileup(chrom, window_start, stop_pos, max_depth=1000000000):
                # Only the column at the mutation itself is of interest
                # (reference_pos is compared against stop_pos - 1).
                if pileupcolumn.reference_pos != stop_pos - 1:
                    continue

                count_alt = 0
                count_ref = 0
                count_indel = 0
                print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
                      "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
                for pileupread in pileupcolumn.pileups:
                    # query position is None if is_del or is_refskip is set.
                    if pileupread.is_del or pileupread.is_refskip:
                        continue
                    tag = pileupread.alignment.query_name
                    abba = tag[-2:]
                    base = pileupread.alignment.query_sequence[pileupread.query_position]
                    if base == alt:
                        count_alt += 1
                        _count_tag(mut_pos_dict, chrom_stop_pos, abba)
                    elif base == ref:
                        count_ref += 1
                        _count_tag(ref_pos_dict, chrom_stop_pos, abba)
                    else:
                        # Base is neither ref nor alt; reported under the
                        # "indel" label in the summary line below.
                        count_indel += 1

                print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" %
                      (pileupcolumn.reference_pos, count_ref + count_alt, count_ref, count_alt, count_indel))

            # if mutation is in DCS file but not in SSCS, then set counts to 0
            # NOTE(review): this also resets any reference counts collected
            # above for this position; kept as-is because consumers of the
            # JSON may depend on it -- confirm whether that is intended.
            if chrom_stop_pos not in mut_pos_dict:
                mut_pos_dict[chrom_stop_pos] = {"ab": 0, "ba": 0}
                ref_pos_dict[chrom_stop_pos] = {"ab": 0, "ba": 0}
    finally:
        # Release the BAM handle even if a row fails to parse or pile up.
        bam.close()

    # save counts
    with open(sscs_counts_json, "w") as f:
        json.dump((mut_pos_dict, ref_pos_dict), f)
| 130 | |
| 131 | |
if __name__ == '__main__':
    # Script entry point: run the tool on the process arguments and hand
    # its return value to sys.exit as the exit status.
    exit_status = mut2sscs(sys.argv)
    sys.exit(exit_status)
