comparison mut2read.py @ 0:8d29173d49a9 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/variant_analyzer commit 5a438f76d0ecb6478f82dae6b9596bc7f5a4f4e8"
author iuc
date Wed, 20 Nov 2019 17:47:35 -0500
parents
children 3556001ff2db
comparison
equal deleted inserted replaced
-1:000000000000 0:8d29173d49a9
1 #!/usr/bin/env python
2
3 """mut2read.py
4
5 Author -- Gundula Povysil
6 Contact -- povysil@bioinf.jku.at
7
8 Takes a tabular file with mutations and a BAM file as input and prints
9 all tags of reads that carry the mutation to a user specified output file.
10 Creates fastq file of reads of tags with mutation.
11
12 ======= ========== ================= ================================
13 Version Date Author Description
14 0.2.1 2019-10-27 Gundula Povysil -
15 ======= ========== ================= ================================
16
17 USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq
18 tag_count_dict.json
19 """
20
21 import argparse
22 import json
23 import os
24 import sys
25
26 import numpy as np
27 import pysam
28
29
30 def make_argparser():
31 parser = argparse.ArgumentParser(description='Takes a tabular file with mutations and a BAM file as input and prints all tags of reads that carry the mutation to a user specified output file and creates a fastq file of reads of tags with mutation.')
32 parser.add_argument('--mutFile',
33 help='TABULAR file with DCS mutations.')
34 parser.add_argument('--bamFile',
35 help='BAM file with aligned DCS reads.')
36 parser.add_argument('--familiesFile',
37 help='TABULAR file with aligned families.')
38 parser.add_argument('--outputFastq',
39 help='Output FASTQ file of reads with mutations.')
40 parser.add_argument('--outputJson',
41 help='Output JSON file to store collected data.')
42 return parser
43
44
45 def mut2read(argv):
46 parser = make_argparser()
47 args = parser.parse_args(argv[1:])
48
49 file1 = args.mutFile
50 file2 = args.bamFile
51 file3 = args.familiesFile
52 outfile = args.outputFastq
53 json_file = args.outputJson
54
55 if os.path.isfile(file1) is False:
56 sys.exit("Error: Could not find '{}'".format(file1))
57
58 if os.path.isfile(file2) is False:
59 sys.exit("Error: Could not find '{}'".format(file2))
60
61 if os.path.isfile(file3) is False:
62 sys.exit("Error: Could not find '{}'".format(file3))
63
64 # read mut file
65 with open(file1, 'r') as mut:
66 mut_array = np.genfromtxt(mut, skip_header=1, delimiter='\t', comments='#', dtype='string')
67
68 # read dcs bam file
69 # pysam.index(file2)
70 bam = pysam.AlignmentFile(file2, "rb")
71
72 # get tags
73 tag_dict = {}
74 cvrg_dict = {}
75
76 if len(mut_array) == 13:
77 mut_array = mut_array.reshape((1, len(mut_array)))
78
79 for m in range(len(mut_array[:, 0])):
80 print(str(m + 1) + " of " + str(len(mut_array[:, 0])))
81 chrom = mut_array[m, 1]
82 stop_pos = mut_array[m, 2].astype(int)
83 chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
84 ref = mut_array[m, 9]
85 alt = mut_array[m, 10]
86
87 dcs_len = []
88
89 for pileupcolumn in bam.pileup(chrom.tobytes(), stop_pos - 2, stop_pos, max_depth=100000000):
90
91 if pileupcolumn.reference_pos == stop_pos - 1:
92 count_alt = 0
93 count_ref = 0
94 count_indel = 0
95 count_n = 0
96 count_other = 0
97 count_lowq = 0
98 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
99 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
100 for pileupread in pileupcolumn.pileups:
101 if not pileupread.is_del and not pileupread.is_refskip:
102 # query position is None if is_del or is_refskip is set.
103 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
104 dcs_len.append(len(pileupread.alignment.query_sequence))
105 if nuc == alt:
106 count_alt += 1
107 tag = pileupread.alignment.query_name
108 if tag in tag_dict:
109 tag_dict[tag][chrom_stop_pos] = alt
110 else:
111 tag_dict[tag] = {}
112 tag_dict[tag][chrom_stop_pos] = alt
113 elif nuc == ref:
114 count_ref += 1
115 elif nuc == "N":
116 count_n += 1
117 elif nuc == "lowQ":
118 count_lowq += 1
119 else:
120 count_other += 1
121 else:
122 count_indel += 1
123 dcs_median = np.median(np.array(dcs_len))
124 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
125
126 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
127 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
128 count_indel, count_lowq, dcs_median))
129 bam.close()
130
131 with open(json_file, "w") as f:
132 json.dump((tag_dict, cvrg_dict), f)
133
134 # create fastq from aligned reads
135 with open(outfile, 'w') as out:
136 with open(file3, 'r') as families:
137 for line in families:
138 line = line.rstrip('\n')
139 splits = line.split('\t')
140 tag = splits[0]
141
142 if tag in tag_dict:
143 str1 = splits[4]
144 curr_seq = str1.replace("-", "")
145 str2 = splits[5]
146 curr_qual = str2.replace(" ", "")
147
148 out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n")
149 out.write(curr_seq + "\n")
150 out.write("+" + "\n")
151 out.write(curr_qual + "\n")
152
153
154 if __name__ == '__main__':
155 sys.exit(mut2read(sys.argv))