Mercurial > repos > iuc > variant_analyzer
diff mut2sscs.py @ 2:3f1dbd2c59bf draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/variant_analyzer commit f492e9717cb946f0eb5689cd7b6eb8067abf6468"
author | iuc |
---|---|
date | Tue, 10 Nov 2020 12:55:29 +0000 |
parents | 3556001ff2db |
children |
line wrap: on
line diff
--- a/mut2sscs.py Wed Dec 04 16:21:17 2019 -0500 +++ b/mut2sscs.py Tue Nov 10 12:55:29 2020 +0000 @@ -11,7 +11,7 @@ ======= ========== ================= ================================ Version Date Author Description -0.2.1 2019-10-27 Gundula Povysil - +2.0.0 2020-10-30 Gundula Povysil - ======= ========== ================= ================================ USAGE: python mut2sscs.py DCS_Mutations.tabular SSCS.bam SSCS_counts.json @@ -25,14 +25,14 @@ import os import sys -import numpy as np import pysam +from cyvcf2 import VCF def make_argparser(): - parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.') + parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file.') parser.add_argument('--mutFile', - help='TABULAR file with DCS mutations.') + help='VCR file with DCS mutations.') parser.add_argument('--bamFile', help='BAM file with aligned SSCS reads.') parser.add_argument('--outputJson', @@ -54,74 +54,73 @@ if os.path.isfile(file2) is False: sys.exit("Error: Could not find '{}'".format(file2)) - # 1. read mut file - with open(file1, 'r') as mut: - mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str) - - # 2 read SSCS bam file - # pysam.index(file2) + # read SSCS bam file bam = pysam.AlignmentFile(file2, "rb") # get tags mut_pos_dict = {} ref_pos_dict = {} - if len(mut_array) == 13: - mut_array = mut_array.reshape((1, len(mut_array))) - for m in range(0, len(mut_array[:, 0])): - print(str(m + 1) + " of " + str(len(mut_array[:, 0]))) - chrom = mut_array[m, 1] - stop_pos = mut_array[m, 2].astype(int) + for variant in VCF(file1): + chrom = variant.CHROM + stop_pos = variant.start chrom_stop_pos = str(chrom) + "#" + str(stop_pos) - ref = mut_array[m, 9] - alt = mut_array[m, 10] + ref = variant.REF + alt = variant.ALT[0] - for pileupcolumn in bam.pileup(chrom, stop_pos - 2, stop_pos, max_depth=1000000000): - if pileupcolumn.reference_pos == stop_pos - 1: - count_alt = 0 - count_ref = 0 - count_indel = 0 - print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), - "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) - for pileupread in pileupcolumn.pileups: - if not pileupread.is_del and not pileupread.is_refskip: - tag = pileupread.alignment.query_name - abba = tag[-2:] - # query position is None if is_del or is_refskip is set. - if pileupread.alignment.query_sequence[pileupread.query_position] == alt: - count_alt += 1 - if chrom_stop_pos in mut_pos_dict: - if abba in mut_pos_dict[chrom_stop_pos]: - mut_pos_dict[chrom_stop_pos][abba] += 1 + if len(ref) == len(alt): + for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): + if pileupcolumn.reference_pos == stop_pos: + count_alt = 0 + count_ref = 0 + count_indel = 0 + print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), + "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) + for pileupread in pileupcolumn.pileups: + if not pileupread.is_del and not pileupread.is_refskip: + tag = pileupread.alignment.query_name + abba = tag[-2:] + # query position is None if is_del or is_refskip is set. + if pileupread.alignment.query_sequence[pileupread.query_position] == alt: + count_alt += 1 + if chrom_stop_pos in mut_pos_dict: + if abba in mut_pos_dict[chrom_stop_pos]: + mut_pos_dict[chrom_stop_pos][abba] += 1 + else: + mut_pos_dict[chrom_stop_pos][abba] = 1 else: + mut_pos_dict[chrom_stop_pos] = {} mut_pos_dict[chrom_stop_pos][abba] = 1 - else: - mut_pos_dict[chrom_stop_pos] = {} - mut_pos_dict[chrom_stop_pos][abba] = 1 - elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: - count_ref += 1 - if chrom_stop_pos in ref_pos_dict: - if abba in ref_pos_dict[chrom_stop_pos]: - ref_pos_dict[chrom_stop_pos][abba] += 1 + if chrom_stop_pos not in ref_pos_dict: + ref_pos_dict[chrom_stop_pos] = {} + ref_pos_dict[chrom_stop_pos][abba] = 0 + + elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: + count_ref += 1 + if chrom_stop_pos in ref_pos_dict: + if abba in ref_pos_dict[chrom_stop_pos]: + ref_pos_dict[chrom_stop_pos][abba] += 1 + else: + ref_pos_dict[chrom_stop_pos][abba] = 1 else: + ref_pos_dict[chrom_stop_pos] = {} ref_pos_dict[chrom_stop_pos][abba] = 1 else: - ref_pos_dict[chrom_stop_pos] = {} - ref_pos_dict[chrom_stop_pos][abba] = 1 - else: - count_indel += 1 + count_indel += 1 - print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % - (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) + print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % + (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) - # if mutation is in DCS file but not in SSCS, then set counts to NA - if chrom_stop_pos not in mut_pos_dict.keys(): - mut_pos_dict[chrom_stop_pos] = {} - mut_pos_dict[chrom_stop_pos]["ab"] = 0 - mut_pos_dict[chrom_stop_pos]["ba"] = 0 - ref_pos_dict[chrom_stop_pos] = {} - ref_pos_dict[chrom_stop_pos]["ab"] = 0 - ref_pos_dict[chrom_stop_pos]["ba"] = 0 + # if mutation is in DCS file but not in SSCS, then set counts to NA + if chrom_stop_pos not in mut_pos_dict.keys(): + mut_pos_dict[chrom_stop_pos] = {} + mut_pos_dict[chrom_stop_pos]["ab"] = 0 + mut_pos_dict[chrom_stop_pos]["ba"] = 0 + ref_pos_dict[chrom_stop_pos] = {} + ref_pos_dict[chrom_stop_pos]["ab"] = 0 + ref_pos_dict[chrom_stop_pos]["ba"] = 0 + else: + print("indels are currently not evaluated") bam.close() # save counts