comparison fsd.py @ 45:6651e76baca1 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit 033dd7b750f68e8aa68f327d7d72bd311ddbee4e-dirty
author mheinzl
date Tue, 27 Aug 2019 07:36:53 -0400
parents a76af7fd9fca
children 901827154779
comparison of 44:a76af7fd9fca with 45:6651e76baca1
@@ -39,10 +39,11 @@
 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
 parser.add_argument('--inputName3')
 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.')
 parser.add_argument('--inputName4')
 parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.')
+parser.add_argument('--rel_freq', action="store_false", help='If False, the relative frequencies are displayed.')
 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.')
 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.')
 return parser
 
 
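
Note on the new option: --rel_freq is declared with action="store_false", the same pattern already used for --log_axis, so the parsed attribute defaults to True and is switched off when the flag is passed. A minimal, self-contained sketch of that argparse behaviour (illustration only, not code from fsd.py):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--rel_freq', action="store_false",
                        help='If False, the relative frequencies are displayed.')
    print(parser.parse_args([]).rel_freq)              # True: flag absent
    print(parser.parse_args(['--rel_freq']).rel_freq)  # False: flag given
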
@@ -59,10 +60,11 @@
 thirdFile = args.inputFile3
 name3 = args.inputName3
 fourthFile = args.inputFile4
 name4 = args.inputName4
 log_axis = args.log_axis
+rel_freq = args.rel_freq
 
 title_file = args.output_tabular
 title_file2 = args.output_pdf
 
 sep = "\t"
@@ -76,11 +78,11 @@
 list_to_plot = []
 label = []
 data_array_list = []
 list_to_plot_original = []
 colors = []
 bins = numpy.arange(1, 22)
 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
     fig = plt.figure()
     fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
     fig2 = plt.figure()
     fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0)
@@ -96,33 +98,38 @@
 # data1 = numpy.array(file1[:, 0]).astype(int)
 # bigFamilies = numpy.where(data1 > 20)[0]
 # data1[bigFamilies] = 22
 data1 = numpy.clip(integers, bins[0], bins[-1])
 name1 = name1.split(".tabular")[0]
+if len(name1) > 40:
+    name1 = name1[:40]
 list_to_plot.append(data1)
 label.append(name1)
 data_array_list.append(file1)
 
 legend = "\n\n\n{}".format(name1)
 fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure)
 
-legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1))
+legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1],
+                                                           float(numpy.bincount(data1)[1]) / len(data1))
 fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure)
 
-legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers))
+legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1],
+                                            float(numpy.bincount(data1)[1]) / sum(integers))
 fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure)
 
 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(len(integers[integers > 20]),
-                                                                 float(sum(integers[integers > 20]))
-                                                                 / sum(integers))
+                                                                 float(len(integers[integers > 20]))
+                                                                 / len(integers))
 fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure)
 
-legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers))
+legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]),
+                                           float(sum(integers[integers > 20])) / sum(integers))
 fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure)
 
 legend6 = "total nr. of\ntags\n{:,}".format(len(data1))
 fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure)
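
Note on the changed ratio above: the "family size > 20" legend now divides the number of large families by the number of tags (len/len) instead of dividing their read counts (sum/sum); the read-based fraction is still reported separately as legend5. A small sketch with made-up family sizes to show the difference between the two ratios (illustration only):

    import numpy

    integers = numpy.array([1, 1, 2, 5, 25, 40])  # hypothetical family size per tag
    big = integers[integers > 20]
    print(float(len(big)) / len(integers))  # ~0.333, fraction of tags with FS > 20
    print(float(sum(big)) / sum(integers))  # ~0.878, fraction of PE reads in those tags
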
@@ -143,10 +150,12 @@
 # data2[bigFamilies2] = 22
 
 data2 = numpy.clip(integers2, bins[0], bins[-1])
 list_to_plot.append(data2)
 name2 = name2.split(".tabular")[0]
+if len(name2) > 40:
+    name2 = name2[:40]
 label.append(name2)
 data_array_list.append(file2)
 
 fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure)
@@ -157,15 +166,17 @@
 
 legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2))
 fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure)
 
-legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2))
+legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]),
+                                 float(len(integers2[integers2 > 20])) / len(integers2))
 fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure)
 
-legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2))
+legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]),
+                                 float(sum(integers2[integers2 > 20])) / sum(integers2))
 fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
 
 legend6 = "{:,}".format(len(data2))
 fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure)
@@ -186,25 +197,29 @@
 # data3[bigFamilies3] = 22
 
 data3 = numpy.clip(integers3, bins[0], bins[-1])
 list_to_plot.append(data3)
 name3 = name3.split(".tabular")[0]
+if len(name3) > 40:
+    name3 = name3[:40]
 label.append(name3)
 data_array_list.append(file3)
 
 fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure)
 
 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3))
 fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure)
 
-legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3))
+legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1],
+                                  float(numpy.bincount(data3)[1]) / sum(integers3))
 fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure)
 
-legend4 = "{:,} ({:.3f})".format(len(integers3[integers3 > 20]), float(sum(integers3[integers3 > 20])) / sum(integers3))
+legend4 = "{:,} ({:.3f})".format(len(integers3[integers3 > 20]),
+                                 float(len(integers3[integers3 > 20])) / len(integers3))
 fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure)
 
 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]),
                                  float(sum(integers3[integers3 > 20])) / sum(integers3))
@@ -229,25 +244,29 @@
 # bigFamilies4 = numpy.where(data4 > 20)[0]
 # data4[bigFamilies4] = 22
 data4 = numpy.clip(integers4, bins[0], bins[-1])
 list_to_plot.append(data4)
 name4 = name4.split(".tabular")[0]
+if len(name4) > 40:
+    name4 = name4[:40]
 label.append(name4)
 data_array_list.append(file4)
 
 fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure)
 
 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4))
 fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure)
 
-legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4))
+legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1],
+                                  float(numpy.bincount(data4)[1]) / sum(integers4))
 fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure)
 
-legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]), float(sum(integers4[integers4 > 20])) / sum(integers4))
+legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]),
+                                 float(len(integers4[integers4 > 20])) / len(integers4))
 fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
 fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure)
 
 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]),
                                  float(sum(integers4[integers4 > 20])) / sum(integers4))
@@ -263,141 +282,133 @@
 fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure)
 
 maximumX = numpy.amax(numpy.concatenate(list_to_plot))
 minimumX = numpy.amin(numpy.concatenate(list_to_plot))
 list_to_plot2 = list_to_plot
-to_plot = ["Absolute frequencies", "Relative frequencies"]
-plt.xticks([], [])
-plt.yticks([], [])
-fig.suptitle('Family Size Distribution (tags)', fontsize=14)
-
-for l in range(len(to_plot)):
-    ax = fig.add_subplot(2, 1, l+1)
-    ticks = numpy.arange(1, 22, 1)
-    ticks1 = map(str, ticks)
-    ticks1[len(ticks1) - 1] = ">20"
-
-    if to_plot[l] == "Relative frequencies":
-        w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2]
-        counts_rel = ax.hist(list_to_plot2, weights=w,
-                             bins=numpy.arange(1, 23), stacked=False, edgecolor="black",
-                             linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
-        ax.set_ylim(0, 1.07)
-    else:
-        counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8, color=colors)
-        ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
-
-    ax.set_xticks(numpy.array(ticks))
-    ax.set_xticklabels(ticks1)
-
-    ax.set_ylabel(to_plot[l], fontsize=14)
-    ax.set_xlabel("Family size", fontsize=14)
-    if log_axis:
-        ax.set_yscale('log')
-    ax.grid(b=True, which="major", color="#424242", linestyle=":")
-    ax.margins(0.01, None)
-pdf.savefig(fig)
-plt.close()
-
-fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14)
-for l in range(len(to_plot)):
-    ax = fig2.add_subplot(2, 1, l + 1)
-    ticks = numpy.arange(minimumX, maximumX + 1)
-    ticks = numpy.arange(1, 22)
-    ticks1 = map(str, ticks)
-    ticks1[len(ticks1) - 1] = ">20"
-    reads = []
-    reads_rel = []
-
-    barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot)+1)
-
-    for i in range(len(list_to_plot2)):
-        x = list(numpy.arange(1, 22).astype(float))
-        unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
-        y = unique * c
-        if sum(list_to_plot_original[i] > 20) > 0:
-            y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
-        y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
-        reads.append(y)
-        reads_rel.append(list(numpy.float_(y)) / sum(y))
-        #x = [xi + barWidth for xi in x]
-
-        if len(list_to_plot2) == 1:
-            x = [xi * 0.5 for xi in x]
-            w = 0.4
-        else:
-            x = [xi + barWidth for xi in x]
-            w = 1./(len(list_to_plot) + 1)
-        if to_plot[l] == "Relative frequencies":
-            counts2_rel = ax.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w,
-                                 edgecolor="black", label=label[i],linewidth=1, alpha=0.7, color=colors[i])
-            ax.set_ylim(0, 1.07)
-        else:
-            #y = list(y.reshape((len(y))))
-            counts2 = ax.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1,
-                             alpha=0.7, color=colors[i])
-        if i == len(list_to_plot2)-1:
-            barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
-        else:
-            barWidth += 1. / (len(list_to_plot) + 1)
-
-    if to_plot[l] == "Absolute frequencies":
-        ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
-    else:
-        ax.set_xlabel("Family size", fontsize=14)
-
-    if len(list_to_plot2) == 1:
-        ax.set_xticks(numpy.array([xi + 0.2 for xi in x]))
-    else:
-        ax.set_xticks(numpy.array(ticks))
-    ax.set_xticklabels(ticks1)
-    ax.set_ylabel(to_plot[l], fontsize=14)
-    if log_axis:
-        ax.set_yscale('log')
-    ax.grid(b=True, which="major", color="#424242", linestyle=":")
-    ax.margins(0.01, None)
+
+if rel_freq:
+    ylab = "Relative Frequency"
+else:
+    ylab = "Absolute Frequency"
+
+# PLOT FSD based on tags
+fig.suptitle('Family Size Distribution (FSD) based on families', fontsize=14)
+ax = fig.add_subplot(1, 1, 1)
+ticks = numpy.arange(1, 22, 1)
+ticks1 = map(str, ticks)
+ticks1[len(ticks1) - 1] = ">20"
+ax.set_xticks([], [])
+if rel_freq:
+    w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2]
+    counts = ax.hist(list_to_plot2, weights=w,
+                     bins=numpy.arange(1, 23), stacked=False, edgecolor="black", color=colors,
+                     linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
+    ax.set_ylim(0, 1.07)
+else:
+    counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1,
+                     label=label, align="left", alpha=0.7, rwidth=0.8, color=colors)
+ax.set_xticks(numpy.array(ticks))
+ax.set_xticklabels(ticks1)
+ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+ax.set_ylabel(ylab, fontsize=14)
+ax.set_xlabel("Family size", fontsize=14)
+if log_axis:
+    ax.set_yscale('log')
+ax.grid(b=True, which="major", color="#424242", linestyle=":")
+ax.margins(0.01, None)
+pdf.savefig(fig)
+#plt.close()
+
+# PLOT FSD based on PE reads
+fig2.suptitle('Family Size Distribution (FSD) based on PE reads', fontsize=14)
+ax2 = fig2.add_subplot(1, 1, 1)
+ticks = numpy.arange(1, 22)
+ticks1 = map(str, ticks)
+ticks1[len(ticks1) - 1] = ">20"
+reads = []
+reads_rel = []
+
+barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1)
+ax2.set_xticks([], [])
+
+for i in range(len(list_to_plot2)):
+    x = list(numpy.arange(1, 22).astype(float))
+    unique, c = numpy.unique(list_to_plot2[i], return_counts=True)
+    y = unique * c
+    if sum(list_to_plot_original[i] > 20) > 0:
+        y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20])
+    y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
+    reads.append(y)
+    reads_rel.append(list(numpy.float_(y)) / sum(y))
+
+    if len(list_to_plot2) == 1:
+        x = [xi * 0.5 for xi in x]
+        w = 0.4
+    else:
+        x = [xi + barWidth for xi in x]
+        w = 1. / (len(list_to_plot) + 1)
+    if rel_freq:
+        counts2_rel = ax2.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w,
+                              edgecolor="black", label=label[i], linewidth=1, alpha=0.7, color=colors[i])
+        ax2.set_ylim(0, 1.07)
+    else:
+        counts2 = ax2.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1,
+                          alpha=0.7, color=colors[i])
+
+    if i == len(list_to_plot2) - 1:
+        barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1)
+    else:
+        barWidth += 1. / (len(list_to_plot) + 1)
+
+ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+if len(list_to_plot2) == 1:
+    ax2.set_xticks(numpy.array([xi + 0.2 for xi in x]))
+else:
+    ax2.set_xticks(numpy.array(ticks))
+ax2.set_xticklabels(ticks1)
+ax2.set_xlabel("Family size", fontsize=14)
+ax2.set_ylabel(ylab, fontsize=14)
+if log_axis:
+    ax2.set_yscale('log')
+ax2.grid(b=True, which="major", color="#424242", linestyle=":")
+ax2.margins(0.01, None)
 
 pdf.savefig(fig2)
 plt.close()
 
 # write data to CSV file tags
-output_file.write("Values from family size distribution with all datasets (tags)\n")
+counts = [numpy.bincount(d, minlength=22)[1:] for d in list_to_plot2]  # original counts of family sizes
+output_file.write("Values from family size distribution with all datasets based on families\n")
 output_file.write("\nFamily size")
 for i in label:
     output_file.write("{}{}".format(sep, i))
-# output_file.write("{}sum".format(sep))
 output_file.write("\n")
 j = 0
-for fs in counts[1][0:len(counts[1]) - 1]:
+for fs in bins:
     if fs == 21:
         fs = ">20"
    else:
         fs = "={}".format(fs)
     output_file.write("FS{}{}".format(fs, sep))
-    if len(label) == 1:
-        output_file.write("{}{}".format(int(counts[0][j]), sep))
-    else:
-        for n in range(len(label)):
-            output_file.write("{}{}".format(int(counts[0][n][j]), sep))
+    for n in range(len(label)):
+        output_file.write("{}{}".format(int(counts[n][j]), sep))
     output_file.write("\n")
     j += 1
 output_file.write("sum{}".format(sep))
-if len(label) == 1:
-    output_file.write("{}{}".format(int(sum(counts[0])), sep))
-else:
-    for i in counts[0]:
-        output_file.write("{}{}".format(int(sum(i)), sep))
+for i in counts:
    output_file.write("{}{}".format(int(sum(i)), sep))
 
 # write data to CSV file PE reads
-output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n")
+output_file.write("\n\nValues from family size distribution with all datasets based on PE reads\n")
 output_file.write("\nFamily size")
 for i in label:
     output_file.write("{}{}".format(sep, i))
-# output_file.write("{}sum".format(sep))
 output_file.write("\n")
 j = 0
 
 for fs in bins:
     if fs == 21:
         fs = ">20"
     else:
         fs = "={}".format(fs)
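
Note on the rel_freq branch above: the histogram is converted to relative frequencies by weighting every observation of a dataset with 1/len(dataset), so each dataset's bars sum to 1 regardless of its size. A minimal, self-contained matplotlib sketch of the same weighting technique (illustrative data and file name, not the tool's):

    import numpy
    import matplotlib.pyplot as plt

    datasets = [numpy.random.randint(1, 22, size=1000),  # stand-ins for clipped family sizes
                numpy.random.randint(1, 22, size=500)]
    weights = [numpy.zeros_like(d, dtype=float) + 1. / len(d) for d in datasets]
    plt.hist(datasets, bins=numpy.arange(1, 23), weights=weights,
             align="left", rwidth=0.8, label=["dataset 1", "dataset 2"])
    plt.ylim(0, 1.07)  # each dataset's bars now sum to 1
    plt.legend()
    plt.savefig("relative_fsd_sketch.pdf")
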
@@ -408,11 +419,11 @@
 for n in range(len(label)):
     output_file.write("{}{}".format(int(reads[n][j]), sep))
 output_file.write("\n")
 j += 1
 output_file.write("sum{}".format(sep))
 if len(label) == 1:
     output_file.write("{}{}".format(int(sum(numpy.concatenate(reads))), sep))
 else:
     for i in reads:
         output_file.write("{}{}".format(int(sum(i)), sep))
 output_file.write("\n")
@@ -459,10 +470,11 @@
 
 dataBA = ba[numpy.in1d(baSeq, d, invert=True)]
 dataBA_o = ba_o[numpy.in1d(baSeq, d, invert=True)]
 
 list1 = [duplTags_double, dataAB, dataBA]  # list for plotting
+list1_o = [duplTags_double_o, dataAB_o, dataBA_o]  # list for plotting
 
 # information for family size >= 3
 dataAB_FS3 = dataAB[dataAB >= 3]
 dataAB_FS3_o = dataAB_o[dataAB_o >= 3]
 dataBA_FS3 = dataBA[dataBA >= 3]
@@ -481,58 +493,84 @@
 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)]  # ba+ab with FS>=3
 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o)  # both ab and ba strands with FS>=3
 
 fig = plt.figure()
 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
-counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"],
-                  edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
-                  rwidth=0.8)
+
+if rel_freq:
+    w = [numpy.zeros_like(d) + 1. / len(numpy.concatenate(list1)) for d in list1]
+    counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], weights=w,
+                      edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
+                      rwidth=0.8)
+    plt.ylim(0, 1.07)
+else:
+    counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"],
+                      edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"],
+                      rwidth=0.8)
+
 # tick labels of x axis
 ticks = numpy.arange(1, 22, 1)
 ticks1 = map(str, ticks)
 ticks1[len(ticks1) - 1] = ">20"
 plt.xticks(numpy.array(ticks), ticks1)
-singl = counts[0][2][0]  # singletons
-last = counts[0][2][len(counts[0][0]) - 1]  # large families
+# singl = counts[0][2][0]  # singletons
+singl = len(data_o[data_o == 1])
+last = len(data_o[data_o > 20])  # large families
 if log_axis:
     plt.yscale('log')
 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
-plt.title(name_file, fontsize=14)
+plt.title("{}: FSD based on families".format(name_file), fontsize=14)
 plt.xlabel("Family size", fontsize=14)
 plt.ylabel("Absolute Frequency", fontsize=14)
 plt.margins(0.01, None)
 plt.grid(b=True, which="major", color="#424242", linestyle=":")
 
 # extra information beneath the plot
 legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags="
 plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
-legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags)), (len(ab) + len(ba)))
+legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA),
+                                                                      len(duplTags), len(duplTags_double), (
+                                                                          len(dataAB) + len(
+                                                                      dataBA) + len(duplTags)),
+                                                                      (len(ab) + len(ba)))
 plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
-legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o), sum(duplTags_o), sum(duplTags_double_o), (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), (sum(ab_o) + sum(ba_o)))
+legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o),
+                                                                    sum(duplTags_o), sum(duplTags_double_o),
+                                                                    (sum(dataAB_o) + sum(dataBA_o) + sum(
+                                                                        duplTags_o)),
+                                                                    (sum(ab_o) + sum(ba_o)))
 plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
 
-legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), (len(dataAB) + len(dataBA) + len(duplTags)))
+legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+    float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    (len(dataAB) + len(dataBA) + len(duplTags)))
 plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
-legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)), float(len(dataBA)) / (len(ab) + len(ba)), float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), (len(ab) + len(ba)))
+legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)),
+                                                               float(len(dataBA)) / (len(ab) + len(ba)),
+                                                               float(len(duplTags)) / (len(ab) + len(ba)),
+                                                               float(len(duplTags_double)) / (
+                                                                   len(ab) + len(ba)),
+                                                               (len(ab) + len(ba)))
 plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
 legend1 = "\nsingletons:\nfamily size > 20:"
 plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure)
 
-legend4 = "{:,}\n{:,}".format(singl.astype(int), last.astype(int))
+legend4 = "{:,}\n{:,}".format(singl, last)
 plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure)
-
-legend3 = "{:.3f}\n{:.3f}".format(singl / len(data), last / len(data))
+legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data))
 plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
 
 legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20]))
 plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
 
-legend3 = "{:.3f}\n{:.3f}".format(float(singl)/sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
+legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
 plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
 
 legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
     float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
     float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
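
Note on the singleton/large-family counts above: they are now taken directly from the uncapped family sizes (data_o) by boolean indexing instead of being read back out of the histogram's return value. A tiny sketch with made-up numbers (illustration only):

    import numpy

    data_o = numpy.array([1, 1, 3, 21, 50])  # hypothetical uncapped family sizes
    singl = len(data_o[data_o == 1])         # 2 tags are singletons
    last = len(data_o[data_o > 20])          # 2 tags have family size > 20
    print(singl, last)
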
@@ -548,51 +586,201 @@
 plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure)
 
 pdf.savefig(fig)
 plt.close()
 
+# PLOT FSD based on PE reads
+fig3 = plt.figure()
+plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0)
+
+fig3.suptitle("{}: FSD based on PE reads".format(name_file), fontsize=14)
+ax2 = fig3.add_subplot(1, 1, 1)
+ticks = numpy.arange(1, 22)
+ticks1 = map(str, ticks)
+ticks1[len(ticks1) - 1] = ">20"
+reads = []
+reads_rel = []
+
+#barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1)
+ax2.set_xticks([], [])
+
+list_y = []
+label = ["duplex", "ab", "ba"]
+col = ["#FF0000", "#5FB404", "#FFBF00"]
+for i in range(len(list1)):
+    x = list(numpy.arange(1, 22).astype(float))
+    unique, c = numpy.unique(list1[i], return_counts=True)
+    y = unique * c
+    if sum(list1_o[i] > 20) > 0:
+        y[len(y) - 1] = sum(list1_o[i][list1_o[i] > 20])
+    y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))]
+    reads.append(y)
+    reads_rel.append(list(numpy.float_(y)) / sum(numpy.concatenate(list1_o)))
+
+    if rel_freq:
+        y = list(numpy.float_(y)) / sum(numpy.concatenate(list1_o))
+        ax2.set_ylim(0, 1.07)
+    else:
+        y = y
+
+    list_y.append(y)
+    if i == 0:
+        counts2 = ax2.bar(x, y, align="edge", width=0.8,
+                          edgecolor="black", label=label[0],
+                          linewidth=1, alpha=1, color=col[0])
+    elif i == 1:
+        counts2 = ax2.bar(x, y, bottom=list_y[i-1], align="edge", width=0.8,
+                          edgecolor="black", label=label[1],
+                          linewidth=1, alpha=1, color=col[1])
+    elif i == 2:
+        bars = numpy.add(list_y[0], list_y[1]).tolist()
+
+        counts2 = ax2.bar(x, y, bottom=bars, align="edge", width=0.8,
+                          edgecolor="black", label=label[2],
+                          linewidth=1, alpha=1, color=col[2])
+
+ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
+
+singl = len(data_o[data_o == 1])
+last = len(data_o[data_o > 20])  # large families
+
+ax2.set_xticks(numpy.array(ticks))
+ax2.set_xticklabels(ticks1)
+ax2.set_xlabel("Family size", fontsize=14)
+ax2.set_ylabel(ylab, fontsize=14)
+if log_axis:
+    ax2.set_yscale('log')
+ax2.grid(b=True, which="major", color="#424242", linestyle=":")
+ax2.margins(0.01, None)
+
+# extra information beneath the plot
+legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags="
+plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA),
+                                                                      len(duplTags), len(duplTags_double), (
+                                                                          len(dataAB) + len(
+                                                                      dataBA) + len(duplTags)),
+                                                                      (len(ab) + len(ba)))
+plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o),
+                                                                    sum(duplTags_o), sum(duplTags_double_o),
+                                                                    (sum(dataAB_o) + sum(dataBA_o) + sum(
+                                                                        duplTags_o)),
+                                                                    (sum(ab_o) + sum(ba_o)))
+plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure)
+
+legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+    float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    (len(dataAB) + len(dataBA) + len(duplTags)))
+plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)),
+                                                               float(len(dataBA)) / (len(ab) + len(ba)),
+                                                               float(len(duplTags)) / (len(ab) + len(ba)),
+                                                               float(len(duplTags_double)) / (
+                                                                   len(ab) + len(ba)),
+                                                               (len(ab) + len(ba)))
+plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend1 = "\nsingletons:\nfamily size > 20:"
+plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure)
+
+legend4 = "{:,}\n{:,}".format(singl, last)
+plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure)
+legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data))
+plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20]))
+plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o))
+plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure)
+
+legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(
+    float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+    float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+    float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)),
+    (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)))
+plt.text(0.74, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(
+    float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)),
+    float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)),
+    float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)),
+    float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)), (sum(ab_o) + sum(ba_o)))
+plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure)
+
+pdf.savefig(fig3)
+plt.close()
+
 # write same information to a csv file
 count = numpy.bincount(data_o)  # original counts of family sizes
+
 output_file.write("\nDataset:{}{}\n".format(sep, name_file))
 output_file.write("max. family size:{}{}\n".format(sep, max(data_o)))
 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
 
 output_file.write("median family size:{}{}\n".format(sep, numpy.median(numpy.array(data_o))))
 output_file.write("mean family size:{}{}\n\n".format(sep, numpy.mean(numpy.array(data_o))))
 
-output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep))
-output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep))
+output_file.write(
+    "{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep,
+                                                                              sep, sep))
+output_file.write(
+    "{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(
+        sep, sep, sep, sep, sep, sep, sep, sep, sep))
 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format(
-    name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep,
-    last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o)))
+    name_file, sep, singl, sep, float(singl) / len(data), sep, float(singl) / sum(data_o), sep,
+    last, sep, float(last) / len(data), sep, sum(data_o[data_o > 20]), sep,
+    float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data),
+    sep, sum(data_o)))
 
 # information for FS >= 1
-output_file.write("The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n"
-                  "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n")
-output_file.write("FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep))
+output_file.write(
+    "The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n"
+    "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n")
+output_file.write(
+    "FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep,
+                                                                                                  sep, sep,
+                                                                                                  sep))
 output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep))
 output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
-    sep, len(dataAB), sep, sum(dataAB_o), sep, float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
+    sep, len(dataAB), sep, sum(dataAB_o), sep,
+    float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
     sep, float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
     float(len(dataAB)) / (len(ab) + len(ba)), sep, float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o))))
 output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
-    sep, len(dataBA), sep, sum(dataBA_o), sep, float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)),
-    sep, float(sum(dataBA_o)) / (sum(dataBA_o) + sum(dataBA_o) + sum(duplTags_o)), sep, float(len(dataBA)) / (len(ba) + len(ba)),
+    sep, len(dataBA), sep, sum(dataBA_o), sep,
+    float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)),
+    sep, float(sum(dataBA_o)) / (sum(dataBA_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+    float(len(dataBA)) / (len(ba) + len(ba)),
     sep, float(sum(dataBA_o)) / (sum(ba_o) + sum(ba_o))))
-output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
-    sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep,
-    float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep,
-    float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep,
-    float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
-    float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o))))
+output_file.write(
+    "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+        sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep,
+        float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+        float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep,
+        float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+        float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)),
+        float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o))))
 output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format(
-    sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep, (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
+    sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep,
+    (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep,
     (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba)), sep,
     (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, (sum(ab_o) + sum(ba_o))))
 # information for FS >= 3
-output_file.write("\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep))
+output_file.write(
+    "\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep,
+                                                                                                    sep,
+                                                                                                    sep,
+                                                                                                    sep,
+                                                                                                    sep))
 output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep))
 output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format(
     sep, len(dataAB_FS3), sep, sum(dataAB_FS3_o), sep,
     float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
     float(len(dataAB_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
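
Note on the new per-dataset PE-read plot above: the duplex/ab/ba series are stacked by hand, drawing the second series with bottom= set to the first and the third with bottom= set to the elementwise sum of the first two. A minimal sketch of that stacking pattern (toy values, not the tool's reads):

    import numpy
    import matplotlib.pyplot as plt

    x = numpy.arange(1, 6)
    duplex = [5, 4, 3, 2, 1]
    ab = [2, 2, 2, 2, 2]
    ba = [1, 0, 1, 0, 1]
    plt.bar(x, duplex, width=0.8, label="duplex")
    plt.bar(x, ab, bottom=duplex, width=0.8, label="ab")
    plt.bar(x, ba, bottom=numpy.add(duplex, ab).tolist(), width=0.8, label="ba")
    plt.legend()
    plt.savefig("stacked_bars_sketch.pdf")
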
@@ -602,33 +790,65 @@
     sep, len(dataBA_FS3), sep, sum(dataBA_FS3_o), sep,
     float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)),
     sep, float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
     sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
     sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
-output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
-    sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep,
-    float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
-    float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
-    float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
-    sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep,
-    float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o),
-    float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+output_file.write(
+    "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format(
+        sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep,
+        float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+        float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+        float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+        sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+        sep,
+        float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o),
+        float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
 output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format(
-    sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
-    sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
-    sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
-
-output_file.write("\nValues from family size distribution\n")
+    sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+    (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)),
+    sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
+    (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3),
+    sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep,
+    (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o)))
+
+counts = [numpy.bincount(d, minlength=22)[1:] for d in list1]  # original counts of family sizes
+output_file.write("\nValues from family size distribution based on families\n")
 output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
-for dx, ab, ba, fs in zip(counts[0][0], counts[0][1], counts[0][2], counts[1]):
+
+j = 0
+for fs in bins:
     if fs == 21:
         fs = ">20"
     else:
         fs = "={}".format(fs)
-    ab1 = ab - dx
-    ba1 = ba - ab
-    output_file.write("FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(dx), sep, int(ab1), sep, int(ba1), sep, int(ba)))
+    output_file.write("FS{}{}".format(fs, sep))
+    for n in range(3):
+        output_file.write("{}{}".format(int(counts[n][j]), sep))
+    output_file.write("{}\n".format(counts[0][j] + counts[1][j] + counts[2][j]))
+    j += 1
+output_file.write("sum{}".format(sep))
+for i in counts:
+    output_file.write("{}{}".format(int(sum(i)), sep))
+output_file.write("{}\n".format(sum(counts[0] + counts[1] + counts[2])))
+
+output_file.write("\nValues from family size distribution based on PE reads\n")
+output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
+j = 0
+for fs in bins:
+    if fs == 21:
+        fs = ">20"
+    else:
+        fs = "={}".format(fs)
+    output_file.write("FS{}{}".format(fs, sep))
+    for n in range(3):
+        output_file.write("{}{}".format(int(reads[n][j]), sep))
+    output_file.write("{}\n".format(reads[0][j] + reads[1][j] + reads[2][j]))
+    j += 1
+output_file.write("sum{}".format(sep))
+for i in reads:
+    output_file.write("{}{}".format(int(sum(i)), sep))
+output_file.write("{}\n".format(sum(reads[0] + reads[1] + reads[2])))
 
 print("Files successfully created!")
 
 
 if __name__ == '__main__':