comparison fsd_regions.py @ 11:37db9decb5d0 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 2aea9e30f5ed4fd3db3fb44ddb8aacb48a62eccc
author mheinzl
date Mon, 26 Nov 2018 04:51:11 -0500
parents eabfdc012d7b
children 63432e6f6a61
comparison
equal deleted inserted replaced
10:04c544617b44 11:37db9decb5d0
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 # Family size distribution of tags which were aligned to the reference genome 3 # Family size distribution of tags which were aligned to the reference genome
4 # 4 #
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) 5 # Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria)
6 # Contact: monika.heinzl@edumail.at 6 # Contact: monika.heinzl@edumail.at
7 # 7 #
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS 8 # Takes at least one TABULAR file with tags before the alignment to the SSCS,
9 # and a TXT with tags of reads that overlap the regions of the reference genome as input. 9 # a BAM file with tags of reads that overlap the regions of the reference genome and
10 # an optional BED file with chromosome, start and stop position of the regions as input.
10 # The program produces a plot which shows the distribution of family sizes of the tags from the input files and 11 # The program produces a plot which shows the distribution of family sizes of the tags from the input files and
11 # a tabular file with the data of the plot. 12 # a tabular file with the data of the plot.
12 13
13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf 14 # USAGE: python FSD_regions.py --inputFile filenameSSCS --inputName1 filenameSSCS
15 # --bamFile DCSbamFile --rangesFile BEDfile --output_tabular outptufile_name_tabular
16 # --output_pdf outputfile_name_pdf
14 17
15 import argparse 18 import argparse
19 import collections
16 import re 20 import re
17 import sys 21 import sys
18 from collections import OrderedDict
19 22
20 import matplotlib.pyplot as plt 23 import matplotlib.pyplot as plt
21 import numpy 24 import numpy as np
25 import pysam
22 from matplotlib.backends.backend_pdf import PdfPages 26 from matplotlib.backends.backend_pdf import PdfPages
23 27
24 plt.switch_backend('agg') 28 plt.switch_backend('agg')
25 29
26 30
27 def readFileReferenceFree(file, delim): 31 def readFileReferenceFree(file, delim):
28 with open(file, 'r') as dest_f: 32 with open(file, 'r') as dest_f:
29 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') 33 data_array = np.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
30 return(data_array) 34 return(data_array)
31 35
32 36
33 def make_argparser(): 37 def make_argparser():
34 parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome') 38 parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome')
35 parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.') 39 parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.')
36 parser.add_argument('--inputName1') 40 parser.add_argument('--inputName1')
37 parser.add_argument('--ref_genome', help='TXT File with tags of reads that overlap the region.') 41 parser.add_argument('--bamFile', help='BAM file with aligned reads.')
42 parser.add_argument('--rangesFile', default=None, help='BED file with chromosome, start and stop positions.')
38 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf and tabular file.') 43 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf and tabular file.')
39 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the pdf and tabular file.') 44 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the pdf and tabular file.')
40 return parser 45 return parser
41 46
42 47
45 args = parser.parse_args(argv[1:]) 50 args = parser.parse_args(argv[1:])
46 51
47 firstFile = args.inputFile 52 firstFile = args.inputFile
48 name1 = args.inputName1 53 name1 = args.inputName1
49 name1 = name1.split(".tabular")[0] 54 name1 = name1.split(".tabular")[0]
50 refGenome = args.ref_genome 55 bamFile = args.bamFile
56
57 rangesFile = args.rangesFile
51 title_file = args.output_pdf 58 title_file = args.output_pdf
52 title_file2 = args.output_tabular 59 title_file2 = args.output_tabular
53 sep = "\t" 60 sep = "\t"
54 61
55 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: 62 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
56 data_array = readFileReferenceFree(firstFile, "\t") 63 data_array = readFileReferenceFree(firstFile, "\t")
57 64 pysam.index(bamFile)
58 mut_array = readFileReferenceFree(refGenome, " ") 65
59 group = numpy.array(mut_array[:, 0]) 66 bam = pysam.AlignmentFile(bamFile, "rb")
60 seq_mut = numpy.array(mut_array[:, 1]) 67 qname_dict = collections.OrderedDict()
61 68
62 seq = numpy.array(data_array[:, 1]) 69 if rangesFile != str(None):
63 tags = numpy.array(data_array[:, 2]) 70 with open(rangesFile, 'r') as regs:
64 quant = numpy.array(data_array[:, 0]).astype(int) 71 range_array = np.genfromtxt(regs, skip_header=0, delimiter='\t', comments='#', dtype='string')
65 72
66 all_ab = seq[numpy.where(tags == "ab")[0]] 73 if range_array.ndim == 0:
67 all_ba = seq[numpy.where(tags == "ba")[0]] 74 print("Error: file has 0 lines")
68 quant_ab = quant[numpy.where(tags == "ab")[0]] 75 exit(2)
69 quant_ba = quant[numpy.where(tags == "ba")[0]] 76
77 if range_array.ndim == 1:
78 chrList = range_array[0]
79 start_posList = range_array[1].astype(int)
80 stop_posList = range_array[2].astype(int)
81 chrList = [chrList.tolist()]
82 start_posList = [start_posList.tolist()]
83 stop_posList = [stop_posList.tolist()]
84 else:
85 chrList = range_array[:, 0]
86 start_posList = range_array[:, 1].astype(int)
87 stop_posList = range_array[:, 2].astype(int)
88
89 if len(start_posList) != len(stop_posList):
90 print("start_positions and end_positions do not have the same length")
91 exit(3)
92
93 chrList = np.array(chrList)
94 start_posList = np.array(start_posList).astype(int)
95 stop_posList = np.array(stop_posList).astype(int)
96 for chr, start_pos, stop_pos in zip(chrList, start_posList, stop_posList):
97 chr_start_stop = "{}_{}_{}".format(chr, start_pos, stop_pos)
98 qname_dict[chr_start_stop] = []
99 for read in bam.fetch(chr.tobytes(), start_pos, stop_pos):
100 if not read.is_unmapped:
101 if re.search('_', read.query_name):
102 tags = re.split('_', read.query_name)[0]
103 else:
104 tags = read.query_name
105 qname_dict[chr_start_stop].append(tags)
106
107 else:
108 for read in bam.fetch():
109 if not read.is_unmapped:
110 if re.search(r'_', read.query_name):
111 tags = re.split('_', read.query_name)[0]
112 else:
113 tags = read.query_name
114
115 if read.reference_name not in qname_dict.keys():
116 qname_dict[read.reference_name] = [tags]
117 else:
118 qname_dict[read.reference_name].append(tags)
119
120 seq = np.array(data_array[:, 1])
121 tags = np.array(data_array[:, 2])
122 quant = np.array(data_array[:, 0]).astype(int)
123 group = np.array(qname_dict.keys())
124
125 all_ab = seq[np.where(tags == "ab")[0]]
126 all_ba = seq[np.where(tags == "ba")[0]]
127 quant_ab = quant[np.where(tags == "ab")[0]]
128 quant_ba = quant[np.where(tags == "ba")[0]]
70 129
71 seqDic_ab = dict(zip(all_ab, quant_ab)) 130 seqDic_ab = dict(zip(all_ab, quant_ab))
72 seqDic_ba = dict(zip(all_ba, quant_ba)) 131 seqDic_ba = dict(zip(all_ba, quant_ba))
73 132
74 if re.search('_(\d)+_(\d)+$', str(mut_array[0,0])) is None:
75 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)
76 group = mut_array[seqMut_index,0]
77 mut_array = mut_array[seqMut_index,:]
78 length_regions = len(seq_mut)*2
79
80 groupUnique, group_index = numpy.unique(group, return_index=True)
81 groupUnique = groupUnique[numpy.argsort(group_index)]
82
83 lst_ab = [] 133 lst_ab = []
84 lst_ba = [] 134 lst_ba = []
85 for i in seq_mut:
86 lst_ab.append(seqDic_ab.get(i))
87 lst_ba.append(seqDic_ba.get(i))
88
89 quant_ab = numpy.array(lst_ab)
90 quant_ba = numpy.array(lst_ba)
91
92 quantAfterRegion = [] 135 quantAfterRegion = []
93 136 length_regions = 0
94 for i in groupUnique: 137 for i in group:
95 dataAB = quant_ab[numpy.where(group == i)[0]] 138 lst_ab_r = []
96 dataBA = quant_ba[numpy.where(group == i)[0]] 139 lst_ba_r = []
97 bigFamilies = numpy.where(dataAB > 20)[0] 140 seq_mut = qname_dict[i]
141 if rangesFile == str(None):
142 seq_mut, seqMut_index = np.unique(np.array(seq_mut), return_index=True)
143 length_regions = length_regions + len(seq_mut) * 2
144 for r in seq_mut:
145 count_ab = seqDic_ab.get(r)
146 count_ba = seqDic_ba.get(r)
147 lst_ab_r.append(count_ab)
148 lst_ab.append(count_ab)
149 lst_ba_r.append(count_ba)
150 lst_ba.append(count_ba)
151
152 dataAB = np.array(lst_ab_r)
153 dataBA = np.array(lst_ba_r)
154 bigFamilies = np.where(dataAB > 20)[0]
98 dataAB[bigFamilies] = 22 155 dataAB[bigFamilies] = 22
99 bigFamilies = numpy.where(dataBA > 20)[0] 156 bigFamilies = np.where(dataBA > 20)[0]
100 dataBA[bigFamilies] = 22 157 dataBA[bigFamilies] = 22
101 158
102 quantAll = numpy.concatenate((dataAB, dataBA)) 159 quantAll = np.concatenate((dataAB, dataBA))
103 quantAfterRegion.append(quantAll) 160 quantAfterRegion.append(quantAll)
104 161
105 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) 162 quant_ab = np.array(lst_ab)
106 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) 163 quant_ba = np.array(lst_ba)
164
165 maximumX = np.amax(np.concatenate(quantAfterRegion))
166 minimumX = np.amin(np.concatenate(quantAfterRegion))
107 167
108 # PLOT 168 # PLOT
109 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 169 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
110 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color 170 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color
111 plt.rcParams['xtick.labelsize'] = 14 171 plt.rcParams['xtick.labelsize'] = 14
115 plt.subplots_adjust(bottom=0.3) 175 plt.subplots_adjust(bottom=0.3)
116 176
117 colors = ["#6E6E6E", "#0431B4", "#5FB404", "#B40431", "#F4FA58", "#DF7401", "#81DAF5"] 177 colors = ["#6E6E6E", "#0431B4", "#5FB404", "#B40431", "#F4FA58", "#DF7401", "#81DAF5"]
118 178
119 col = [] 179 col = []
120 for i in range(0, len(groupUnique)): 180 for i in range(0, len(group)):
121 col.append(colors[i]) 181 col.append(colors[i])
122 182
123 counts = plt.hist(quantAfterRegion, bins=range(minimumX, maximumX + 1), stacked=False, label=groupUnique, 183 counts = plt.hist(quantAfterRegion, bins=range(minimumX, maximumX + 1), stacked=False, label=group,
124 align="left", alpha=1, color=col, edgecolor="black", linewidth=1) 184 align="left", alpha=1, color=col, edgecolor="black", linewidth=1)
125 ticks = numpy.arange(minimumX - 1, maximumX, 1) 185 ticks = np.arange(minimumX - 1, maximumX, 1)
126 186
127 ticks1 = map(str, ticks) 187 ticks1 = map(str, ticks)
128 ticks1[len(ticks1) - 1] = ">20" 188 ticks1[len(ticks1) - 1] = ">20"
129 plt.xticks(numpy.array(ticks), ticks1) 189 plt.xticks(np.array(ticks), ticks1)
130 count = numpy.bincount(map(int, quant_ab)) # original counts 190 count = np.bincount(map(int, quant_ab)) # original counts
131 191
132 legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads=" 192 legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)"
133 plt.text(0.15, 0.105, legend, size=11, transform=plt.gcf().transFigure) 193 plt.text(0.15, 0.085, legend, size=11, transform=plt.gcf().transFigure)
134 194
135 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \ 195 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}".format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), sum(np.array(data_array[:, 0]).astype(int)))
136 .format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count),
137 sum(numpy.array(data_array[:, 0]).astype(int)))
138 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure) 196 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
139 197
140 count2 = numpy.bincount(map(int, quant_ba)) # original counts 198 count2 = np.bincount(map(int, quant_ba)) # original counts
141 199
142 legend = "BA\n{}\n{}\n{:.5f}" \ 200 legend = "BA\n{}\n{}\n{:.5f}" \
143 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) 201 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
144 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) 202 plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)
145 203
146 plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure) 204 plt.text(0.55, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure)
147 plt.text(0.75, 0.22, "{:,} ({:,})".format(length_regions, length_regions/2), size=11, transform=plt.gcf().transFigure) 205 plt.text(0.8, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11,
148 206 transform=plt.gcf().transFigure)
149 # legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.' 207
150 # plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) 208 legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n"
151 209 plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure)
152 plt.text(0.75, 0.18, "total nr. of tags per region", size=11, transform=plt.gcf().transFigure) 210
153 #space = numpy.arange(0, len(groupUnique), 0.02) 211 space = 0
154 s = 0 212 for i, count in zip(group, quantAfterRegion):
155 index_array = 0 213 plt.text(0.55, 0.15 - space, "{}:\n".format(i), size=11, transform=plt.gcf().transFigure)
156 for i, count in zip(groupUnique, quantAfterRegion): 214 plt.text(0.8, 0.15 - space, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)
157 index_of_current_region = numpy.where(group == i)[0] 215 space = space + 0.02
158 plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure)
159 if re.search('_(\d)+_(\d)+$', str(mut_array[0, 0])) is None:
160 nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1]))
161 else:
162 nr_tags_ab = len(mut_array[index_of_current_region, 1])
163 plt.text(0.75, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure)
164 s = s + 0.02
165 216
166 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) 217 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
167 plt.xlabel("Family size", fontsize=14) 218 plt.xlabel("Family size", fontsize=14)
168 plt.ylabel("Absolute Frequency", fontsize=14) 219 plt.ylabel("Absolute Frequency", fontsize=14)
169 plt.grid(b=True, which="major", color="#424242", linestyle=":") 220 plt.grid(b=True, which="major", color="#424242", linestyle=":")
175 output_file.write("Dataset:{}{}\n".format(sep, name1)) 226 output_file.write("Dataset:{}{}\n".format(sep, name1))
176 output_file.write("{}AB{}BA\n".format(sep, sep)) 227 output_file.write("{}AB{}BA\n".format(sep, sep))
177 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) 228 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))
178 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) 229 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
179 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) 230 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
180 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) 231 output_file.write("total nr. of reads{}{}\n".format(sep, sum(np.array(data_array[:, 0]).astype(int))))
181 output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions/2)) 232 output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions / 2))
182
183 output_file.write("\n\nValues from family size distribution\n") 233 output_file.write("\n\nValues from family size distribution\n")
184 output_file.write("{}".format(sep)) 234 output_file.write("{}".format(sep))
185 for i in groupUnique: 235 for i in group:
186 output_file.write("{}{}".format(i, sep)) 236 output_file.write("{}{}".format(i, sep))
187 output_file.write("\n") 237 output_file.write("\n")
238
188 j = 0 239 j = 0
189 for fs in counts[1][0:len(counts[1]) - 1]: 240 for fs in counts[1][0:len(counts[1]) - 1]:
190 if fs == 21: 241 if fs == 21:
191 fs = ">20" 242 fs = ">20"
192 else: 243 else:
193 fs = "={}".format(fs) 244 fs = "={}".format(fs)
194 output_file.write("FS{}{}".format(fs, sep)) 245 output_file.write("FS{}{}".format(fs, sep))
195 246 if len(group) == 1:
196 if len(groupUnique) == 1:
197 output_file.write("{}{}".format(int(counts[0][j]), sep)) 247 output_file.write("{}{}".format(int(counts[0][j]), sep))
198 else: 248 else:
199 for n in range(len(groupUnique)): 249 for n in range(len(group)):
200 output_file.write("{}{}".format(int(counts[0][n][j]), sep)) 250 output_file.write("{}{}".format(int(counts[0][n][j]), sep))
201
202 output_file.write("\n") 251 output_file.write("\n")
203 j += 1 252 j += 1
204 output_file.write("sum{}".format(sep)) 253 output_file.write("sum{}".format(sep))
205 if len(groupUnique) == 1: 254
255 if len(group) == 1:
206 output_file.write("{}{}".format(int(sum(counts[0])), sep)) 256 output_file.write("{}{}".format(int(sum(counts[0])), sep))
207 else: 257 else:
208 for i in counts[0]: 258 for i in counts[0]:
209 output_file.write("{}{}".format(int(sum(i)), sep)) 259 output_file.write("{}{}".format(int(sum(i)), sep))
210 output_file.write("\n") 260 output_file.write("\n")
211 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the count of the tags per region.\n") 261 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n")
212 output_file.write("\n\nRegion{}total nr. of tags per region\n".format(sep, sep)) 262 output_file.write("Region{}total nr. of tags per region\n".format(sep))
213 263 for i, count in zip(group, quantAfterRegion):
214 for i, count in zip(groupUnique, quantAfterRegion): 264 output_file.write("{}{}{}\n".format(i, sep, len(count) / 2))
215 output_file.write("{}{}{}\n".format(i,sep,len(count) / 2)) 265
216 print("Files successfully created!") 266 print("Files successfully created!")
217 267
218 268
219 if __name__ == '__main__': 269 if __name__ == '__main__':
220 sys.exit(compare_read_families_refGenome(sys.argv)) 270 sys.exit(compare_read_families_refGenome(sys.argv))