Mercurial > repos > iuc > variant_analyzer
comparison mut2read.py @ 2:3f1dbd2c59bf draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/variant_analyzer commit f492e9717cb946f0eb5689cd7b6eb8067abf6468"
author | iuc |
---|---|
date | Tue, 10 Nov 2020 12:55:29 +0000 |
parents | 3556001ff2db |
children |
comparison
equal
deleted
inserted
replaced
1:3556001ff2db | 2:3f1dbd2c59bf |
---|---|
9 all tags of reads that carry the mutation to a user specified output file. | 9 all tags of reads that carry the mutation to a user specified output file. |
10 Creates fastq file of reads of tags with mutation. | 10 Creates fastq file of reads of tags with mutation. |
11 | 11 |
12 ======= ========== ================= ================================ | 12 ======= ========== ================= ================================ |
13 Version Date Author Description | 13 Version Date Author Description |
14 0.2.1 2019-10-27 Gundula Povysil - | 14 2.0.0 2020-10-30 Gundula Povysil - |
15 ======= ========== ================= ================================ | 15 ======= ========== ================= ================================ |
16 | 16 |
17 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq | 17 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq tag_count_dict.json |
18 tag_count_dict.json | |
19 """ | 18 """ |
20 | 19 |
21 import argparse | 20 import argparse |
22 import json | 21 import json |
23 import os | 22 import os |
24 import sys | 23 import sys |
25 | 24 |
26 import numpy as np | 25 import numpy as np |
27 import pysam | 26 import pysam |
27 from cyvcf2 import VCF | |
28 | 28 |
29 | 29 |
30 def make_argparser(): | 30 def make_argparser(): |
31 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.') | 31 parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.') |
32 parser.add_argument('--mutFile', | 32 parser.add_argument('--mutFile', |
33 help='TABULAR file with DCS mutations.') | 33 help='VCF file with DCS mutations.') |
34 parser.add_argument('--bamFile', | 34 parser.add_argument('--bamFile', |
35 help='BAM file with aligned DCS reads.') | 35 help='BAM file with aligned DCS reads.') |
36 parser.add_argument('--familiesFile', | 36 parser.add_argument('--familiesFile', |
37 help='TABULAR file with aligned families.') | 37 help='TABULAR file with aligned families.') |
38 parser.add_argument('--outputFastq', | 38 parser.add_argument('--outputFastq', |
59 sys.exit("Error: Could not find '{}'".format(file2)) | 59 sys.exit("Error: Could not find '{}'".format(file2)) |
60 | 60 |
61 if os.path.isfile(file3) is False: | 61 if os.path.isfile(file3) is False: |
62 sys.exit("Error: Could not find '{}'".format(file3)) | 62 sys.exit("Error: Could not find '{}'".format(file3)) |
63 | 63 |
64 # read mut file | |
65 with open(file1, 'r') as mut: | |
66 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str) | |
67 | |
68 # read dcs bam file | 64 # read dcs bam file |
69 # pysam.index(file2) | |
70 bam = pysam.AlignmentFile(file2, "rb") | 65 bam = pysam.AlignmentFile(file2, "rb") |
71 | 66 |
72 # get tags | 67 # get tags |
73 tag_dict = {} | 68 tag_dict = {} |
74 cvrg_dict = {} | 69 cvrg_dict = {} |
75 | 70 |
76 if len(mut_array) == 13: | 71 for variant in VCF(file1): |
77 mut_array = mut_array.reshape((1, len(mut_array))) | 72 chrom = variant.CHROM |
73 stop_pos = variant.start | |
74 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) | |
75 ref = variant.REF | |
76 alt = variant.ALT[0] | |
77 dcs_len = [] | |
78 if len(ref) == len(alt): | |
79 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): | |
80 if pileupcolumn.reference_pos == stop_pos: | |
81 count_alt = 0 | |
82 count_ref = 0 | |
83 count_indel = 0 | |
84 count_n = 0 | |
85 count_other = 0 | |
86 count_lowq = 0 | |
87 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | |
88 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | |
89 for pileupread in pileupcolumn.pileups: | |
90 if not pileupread.is_del and not pileupread.is_refskip: | |
91 # query position is None if is_del or is_refskip is set. | |
92 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | |
93 dcs_len.append(len(pileupread.alignment.query_sequence)) | |
94 if nuc == alt: | |
95 count_alt += 1 | |
96 tag = pileupread.alignment.query_name | |
97 if tag in tag_dict: | |
98 tag_dict[tag][chrom_stop_pos] = alt | |
99 else: | |
100 tag_dict[tag] = {} | |
101 tag_dict[tag][chrom_stop_pos] = alt | |
102 elif nuc == ref: | |
103 count_ref += 1 | |
104 elif nuc == "N": | |
105 count_n += 1 | |
106 elif nuc == "lowQ": | |
107 count_lowq += 1 | |
108 else: | |
109 count_other += 1 | |
110 else: | |
111 count_indel += 1 | |
112 dcs_median = np.median(np.array(dcs_len)) | |
113 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) | |
78 | 114 |
79 for m in range(len(mut_array[:, 0])): | 115 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % |
80 print(str(m + 1) + " of " + str(len(mut_array[:, 0]))) | 116 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, |
81 chrom = mut_array[m, 1] | 117 count_indel, count_lowq, dcs_median)) |
82 stop_pos = mut_array[m, 2].astype(int) | 118 else: |
83 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) | 119 print("indels are currently not evaluated") |
84 ref = mut_array[m, 9] | |
85 alt = mut_array[m, 10] | |
86 | |
87 dcs_len = [] | |
88 | |
89 for pileupcolumn in bam.pileup(chrom, stop_pos - 2, stop_pos, max_depth=100000000): | |
90 | |
91 if pileupcolumn.reference_pos == stop_pos - 1: | |
92 count_alt = 0 | |
93 count_ref = 0 | |
94 count_indel = 0 | |
95 count_n = 0 | |
96 count_other = 0 | |
97 count_lowq = 0 | |
98 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | |
99 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | |
100 for pileupread in pileupcolumn.pileups: | |
101 if not pileupread.is_del and not pileupread.is_refskip: | |
102 # query position is None if is_del or is_refskip is set. | |
103 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | |
104 dcs_len.append(len(pileupread.alignment.query_sequence)) | |
105 if nuc == alt: | |
106 count_alt += 1 | |
107 tag = pileupread.alignment.query_name | |
108 if tag in tag_dict: | |
109 tag_dict[tag][chrom_stop_pos] = alt | |
110 else: | |
111 tag_dict[tag] = {} | |
112 tag_dict[tag][chrom_stop_pos] = alt | |
113 elif nuc == ref: | |
114 count_ref += 1 | |
115 elif nuc == "N": | |
116 count_n += 1 | |
117 elif nuc == "lowQ": | |
118 count_lowq += 1 | |
119 else: | |
120 count_other += 1 | |
121 else: | |
122 count_indel += 1 | |
123 dcs_median = np.median(np.array(dcs_len)) | |
124 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) | |
125 | |
126 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % | |
127 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, | |
128 count_indel, count_lowq, dcs_median)) | |
129 bam.close() | 120 bam.close() |
130 | 121 |
131 with open(json_file, "w") as f: | 122 with open(json_file, "w") as f: |
132 json.dump((tag_dict, cvrg_dict), f) | 123 json.dump((tag_dict, cvrg_dict), f) |
133 | 124 |