comparison mut2read.py @ 2:3f1dbd2c59bf draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/variant_analyzer commit f492e9717cb946f0eb5689cd7b6eb8067abf6468"
author iuc
date Tue, 10 Nov 2020 12:55:29 +0000
parents 3556001ff2db
children
comparison
equal deleted inserted replaced
1:3556001ff2db 2:3f1dbd2c59bf
9 all tags of reads that carry the mutation to a user specified output file. 9 all tags of reads that carry the mutation to a user specified output file.
10 Creates fastq file of reads of tags with mutation. 10 Creates fastq file of reads of tags with mutation.
11 11
12 ======= ========== ================= ================================ 12 ======= ========== ================= ================================
13 Version Date Author Description 13 Version Date Author Description
14 0.2.1 2019-10-27 Gundula Povysil - 14 2.0.0 2020-10-30 Gundula Povysil -
15 ======= ========== ================= ================================ 15 ======= ========== ================= ================================
16 16
17 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq 17 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq tag_count_dict.json
18 tag_count_dict.json
19 """ 18 """
20 19
21 import argparse 20 import argparse
22 import json 21 import json
23 import os 22 import os
24 import sys 23 import sys
25 24
26 import numpy as np 25 import numpy as np
27 import pysam 26 import pysam
27 from cyvcf2 import VCF
28 28
29 29
30 def make_argparser(): 30 def make_argparser():
31 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.') 31 parser = argparse.ArgumentParser(description='Takes a vcf file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.')
32 parser.add_argument('--mutFile', 32 parser.add_argument('--mutFile',
33 help='TABULAR file with DCS mutations.') 33 help='VCF file with DCS mutations.')
34 parser.add_argument('--bamFile', 34 parser.add_argument('--bamFile',
35 help='BAM file with aligned DCS reads.') 35 help='BAM file with aligned DCS reads.')
36 parser.add_argument('--familiesFile', 36 parser.add_argument('--familiesFile',
37 help='TABULAR file with aligned families.') 37 help='TABULAR file with aligned families.')
38 parser.add_argument('--outputFastq', 38 parser.add_argument('--outputFastq',
59 sys.exit("Error: Could not find '{}'".format(file2)) 59 sys.exit("Error: Could not find '{}'".format(file2))
60 60
61 if os.path.isfile(file3) is False: 61 if os.path.isfile(file3) is False:
62 sys.exit("Error: Could not find '{}'".format(file3)) 62 sys.exit("Error: Could not find '{}'".format(file3))
63 63
64 # read mut file
65 with open(file1, 'r') as mut:
66 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype=str)
67
68 # read dcs bam file 64 # read dcs bam file
69 # pysam.index(file2)
70 bam = pysam.AlignmentFile(file2, "rb") 65 bam = pysam.AlignmentFile(file2, "rb")
71 66
72 # get tags 67 # get tags
73 tag_dict = {} 68 tag_dict = {}
74 cvrg_dict = {} 69 cvrg_dict = {}
75 70
76 if len(mut_array) == 13: 71 for variant in VCF(file1):
77 mut_array = mut_array.reshape((1, len(mut_array))) 72 chrom = variant.CHROM
73 stop_pos = variant.start
74 chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
75 ref = variant.REF
76 alt = variant.ALT[0]
77 dcs_len = []
78 if len(ref) == len(alt):
79 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
80 if pileupcolumn.reference_pos == stop_pos:
81 count_alt = 0
82 count_ref = 0
83 count_indel = 0
84 count_n = 0
85 count_other = 0
86 count_lowq = 0
87 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
88 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
89 for pileupread in pileupcolumn.pileups:
90 if not pileupread.is_del and not pileupread.is_refskip:
91 # query position is None if is_del or is_refskip is set.
92 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
93 dcs_len.append(len(pileupread.alignment.query_sequence))
94 if nuc == alt:
95 count_alt += 1
96 tag = pileupread.alignment.query_name
97 if tag in tag_dict:
98 tag_dict[tag][chrom_stop_pos] = alt
99 else:
100 tag_dict[tag] = {}
101 tag_dict[tag][chrom_stop_pos] = alt
102 elif nuc == ref:
103 count_ref += 1
104 elif nuc == "N":
105 count_n += 1
106 elif nuc == "lowQ":
107 count_lowq += 1
108 else:
109 count_other += 1
110 else:
111 count_indel += 1
112 dcs_median = np.median(np.array(dcs_len))
113 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
78 114
79 for m in range(len(mut_array[:, 0])): 115 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
80 print(str(m + 1) + " of " + str(len(mut_array[:, 0]))) 116 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
81 chrom = mut_array[m, 1] 117 count_indel, count_lowq, dcs_median))
82 stop_pos = mut_array[m, 2].astype(int) 118 else:
83 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) 119 print("indels are currently not evaluated")
84 ref = mut_array[m, 9]
85 alt = mut_array[m, 10]
86
87 dcs_len = []
88
89 for pileupcolumn in bam.pileup(chrom, stop_pos - 2, stop_pos, max_depth=100000000):
90
91 if pileupcolumn.reference_pos == stop_pos - 1:
92 count_alt = 0
93 count_ref = 0
94 count_indel = 0
95 count_n = 0
96 count_other = 0
97 count_lowq = 0
98 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
99 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
100 for pileupread in pileupcolumn.pileups:
101 if not pileupread.is_del and not pileupread.is_refskip:
102 # query position is None if is_del or is_refskip is set.
103 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
104 dcs_len.append(len(pileupread.alignment.query_sequence))
105 if nuc == alt:
106 count_alt += 1
107 tag = pileupread.alignment.query_name
108 if tag in tag_dict:
109 tag_dict[tag][chrom_stop_pos] = alt
110 else:
111 tag_dict[tag] = {}
112 tag_dict[tag][chrom_stop_pos] = alt
113 elif nuc == ref:
114 count_ref += 1
115 elif nuc == "N":
116 count_n += 1
117 elif nuc == "lowQ":
118 count_lowq += 1
119 else:
120 count_other += 1
121 else:
122 count_indel += 1
123 dcs_median = np.median(np.array(dcs_len))
124 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
125
126 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
127 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
128 count_indel, count_lowq, dcs_median))
129 bam.close() 120 bam.close()
130 121
131 with open(json_file, "w") as f: 122 with open(json_file, "w") as f:
132 json.dump((tag_dict, cvrg_dict), f) 123 json.dump((tag_dict, cvrg_dict), f)
133 124