Mercurial > repos > mheinzl > variant_analyzer2
comparison mut2read.py @ 6:11a2a34f8a2b draft
planemo upload for repository https://github.com/gpovysil/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
| author | mheinzl |
|---|---|
| date | Mon, 18 Jan 2021 09:49:15 +0000 |
| parents | e5953c54cfb5 |
| children | ded0dc6a20d3 |
comparison
equal
deleted
inserted
replaced
| 5:d9cbf833624e | 6:11a2a34f8a2b |
|---|---|
| 12 ======= ========== ================= ================================ | 12 ======= ========== ================= ================================ |
| 13 Version Date Author Description | 13 Version Date Author Description |
| 14 0.2.1 2019-10-27 Gundula Povysil - | 14 0.2.1 2019-10-27 Gundula Povysil - |
| 15 ======= ========== ================= ================================ | 15 ======= ========== ================= ================================ |
| 16 | 16 |
| 17 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq | 17 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq tag_count_dict.json |
| 18 tag_count_dict.json | |
| 19 """ | 18 """ |
| 20 | 19 |
| 21 import argparse | 20 import argparse |
| 22 import json | 21 import json |
| 23 import os | 22 import os |
| 24 import sys | 23 import sys |
| 25 | 24 |
| 26 import numpy as np | 25 import numpy as np |
| 27 import pysam | 26 import pysam |
| 27 from cyvcf2 import VCF | |
| 28 | 28 |
| 29 | 29 |
| 30 def make_argparser(): | 30 def make_argparser(): |
| 31 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.') | 31 parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.') |
| 32 parser.add_argument('--mutFile', | 32 parser.add_argument('--mutFile', |
| 33 help='TABULAR file with DCS mutations.') | 33 help='VCF file with DCS mutations.') |
| 34 parser.add_argument('--bamFile', | 34 parser.add_argument('--bamFile', |
| 35 help='BAM file with aligned DCS reads.') | 35 help='BAM file with aligned DCS reads.') |
| 36 parser.add_argument('--familiesFile', | 36 parser.add_argument('--familiesFile', |
| 37 help='TABULAR file with aligned families.') | 37 help='TABULAR file with aligned families.') |
| 38 parser.add_argument('--outputFastq', | 38 parser.add_argument('--outputFastq', |
| 59 sys.exit("Error: Could not find '{}'".format(file2)) | 59 sys.exit("Error: Could not find '{}'".format(file2)) |
| 60 | 60 |
| 61 if os.path.isfile(file3) is False: | 61 if os.path.isfile(file3) is False: |
| 62 sys.exit("Error: Could not find '{}'".format(file3)) | 62 sys.exit("Error: Could not find '{}'".format(file3)) |
| 63 | 63 |
| 64 # read mut file | |
| 65 with open(file1, 'r') as mut: | |
| 66 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str) | |
| 67 | |
| 68 # read dcs bam file | 64 # read dcs bam file |
| 69 # pysam.index(file2) | 65 # pysam.index(file2) |
| 70 bam = pysam.AlignmentFile(file2, "rb") | 66 bam = pysam.AlignmentFile(file2, "rb") |
| 71 | 67 |
| 72 # get tags | 68 # get tags |
| 73 tag_dict = {} | 69 tag_dict = {} |
| 74 cvrg_dict = {} | 70 cvrg_dict = {} |
| 75 | 71 |
| 76 if mut_array.shape == (1,13): | 72 for variant in VCF(file1): |
| 77 mut_array = mut_array.reshape((1, len(mut_array))) | 73 chrom = variant.CHROM |
| 78 | 74 stop_pos = variant.start |
| 79 for m in range(len(mut_array[:, 0])): | |
| 80 print(str(m + 1) + " of " + str(len(mut_array[:, 0]))) | |
| 81 chrom = mut_array[m, 1] | |
| 82 stop_pos = mut_array[m, 2].astype(int) | |
| 83 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) | 75 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) |
| 84 ref = mut_array[m, 9] | 76 ref = variant.REF |
| 85 alt = mut_array[m, 10] | 77 alt = variant.ALT[0] |
| 78 # nc = variant.format('NC') | |
| 79 ad = variant.format('AD') | |
| 86 | 80 |
| 87 dcs_len = [] | 81 dcs_len = [] |
| 82 if len(ref) == len(alt): | |
| 83 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): | |
| 88 | 84 |
| 89 for pileupcolumn in bam.pileup(chrom.tostring(), stop_pos - 2, stop_pos, max_depth=100000000): | 85 if pileupcolumn.reference_pos == stop_pos: |
| 86 count_alt = 0 | |
| 87 count_ref = 0 | |
| 88 count_indel = 0 | |
| 89 count_n = 0 | |
| 90 count_other = 0 | |
| 91 count_lowq = 0 | |
| 92 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | |
| 93 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | |
| 94 for pileupread in pileupcolumn.pileups: | |
| 95 if not pileupread.is_del and not pileupread.is_refskip: | |
| 96 # query position is None if is_del or is_refskip is set. | |
| 97 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | |
| 98 dcs_len.append(len(pileupread.alignment.query_sequence)) | |
| 99 if nuc == alt: | |
| 100 count_alt += 1 | |
| 101 tag = pileupread.alignment.query_name | |
| 102 if tag in tag_dict: | |
| 103 tag_dict[tag][chrom_stop_pos] = alt | |
| 104 else: | |
| 105 tag_dict[tag] = {} | |
| 106 tag_dict[tag][chrom_stop_pos] = alt | |
| 107 elif nuc == ref: | |
| 108 count_ref += 1 | |
| 109 elif nuc == "N": | |
| 110 count_n += 1 | |
| 111 elif nuc == "lowQ": | |
| 112 count_lowq += 1 | |
| 113 else: | |
| 114 count_other += 1 | |
| 115 else: | |
| 116 count_indel += 1 | |
| 117 dcs_median = np.median(np.array(dcs_len)) | |
| 118 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) | |
| 90 | 119 |
| 91 if pileupcolumn.reference_pos == stop_pos - 1: | 120 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % |
| 92 count_alt = 0 | 121 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, |
| 93 count_ref = 0 | 122 count_indel, count_lowq, dcs_median)) |
| 94 count_indel = 0 | 123 else: |
| 95 count_n = 0 | 124 print("indels are currently not evaluated") |
| 96 count_other = 0 | |
| 97 count_lowq = 0 | |
| 98 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | |
| 99 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | |
| 100 for pileupread in pileupcolumn.pileups: | |
| 101 if not pileupread.is_del and not pileupread.is_refskip: | |
| 102 # query position is None if is_del or is_refskip is set. | |
| 103 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | |
| 104 dcs_len.append(len(pileupread.alignment.query_sequence)) | |
| 105 if nuc == alt: | |
| 106 count_alt += 1 | |
| 107 tag = pileupread.alignment.query_name | |
| 108 if tag in tag_dict: | |
| 109 tag_dict[tag][chrom_stop_pos] = alt | |
| 110 else: | |
| 111 tag_dict[tag] = {} | |
| 112 tag_dict[tag][chrom_stop_pos] = alt | |
| 113 elif nuc == ref: | |
| 114 count_ref += 1 | |
| 115 elif nuc == "N": | |
| 116 count_n += 1 | |
| 117 elif nuc == "lowQ": | |
| 118 count_lowq += 1 | |
| 119 else: | |
| 120 count_other += 1 | |
| 121 else: | |
| 122 count_indel += 1 | |
| 123 dcs_median = np.median(np.array(dcs_len)) | |
| 124 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) | |
| 125 | |
| 126 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % | |
| 127 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, | |
| 128 count_indel, count_lowq, dcs_median)) | |
| 129 bam.close() | 125 bam.close() |
| 130 | 126 |
| 131 with open(json_file, "w") as f: | 127 with open(json_file, "w") as f: |
| 132 json.dump((tag_dict, cvrg_dict), f) | 128 json.dump((tag_dict, cvrg_dict), f) |
| 133 | 129 |
| 151 out.write(curr_qual + "\n") | 147 out.write(curr_qual + "\n") |
| 152 | 148 |
| 153 | 149 |
| 154 if __name__ == '__main__': | 150 if __name__ == '__main__': |
| 155 sys.exit(mut2read(sys.argv)) | 151 sys.exit(mut2read(sys.argv)) |
| 152 |
