Mercurial > repos > mheinzl > fsd
comparison fsd.py @ 45:6651e76baca1 draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit 033dd7b750f68e8aa68f327d7d72bd311ddbee4e-dirty
author | mheinzl |
---|---|
date | Tue, 27 Aug 2019 07:36:53 -0400 |
parents | a76af7fd9fca |
children | 901827154779 |
comparison
equal
deleted
inserted
replaced
44:a76af7fd9fca | 45:6651e76baca1 |
---|---|
39 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') | 39 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') |
40 parser.add_argument('--inputName3') | 40 parser.add_argument('--inputName3') |
41 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') | 41 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') |
42 parser.add_argument('--inputName4') | 42 parser.add_argument('--inputName4') |
43 parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.') | 43 parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.') |
44 parser.add_argument('--rel_freq', action="store_false", help='If False, the relative frequencies are displayed.') | |
44 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') | 45 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') |
45 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') | 46 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') |
46 return parser | 47 return parser |
47 | 48 |
48 | 49 |
59 thirdFile = args.inputFile3 | 60 thirdFile = args.inputFile3 |
60 name3 = args.inputName3 | 61 name3 = args.inputName3 |
61 fourthFile = args.inputFile4 | 62 fourthFile = args.inputFile4 |
62 name4 = args.inputName4 | 63 name4 = args.inputName4 |
63 log_axis = args.log_axis | 64 log_axis = args.log_axis |
65 rel_freq = args.rel_freq | |
64 | 66 |
65 title_file = args.output_tabular | 67 title_file = args.output_tabular |
66 title_file2 = args.output_pdf | 68 title_file2 = args.output_pdf |
67 | 69 |
68 sep = "\t" | 70 sep = "\t" |
76 list_to_plot = [] | 78 list_to_plot = [] |
77 label = [] | 79 label = [] |
78 data_array_list = [] | 80 data_array_list = [] |
79 list_to_plot_original = [] | 81 list_to_plot_original = [] |
80 colors = [] | 82 colors = [] |
81 bins = numpy.arange(1, 22) | 83 bins = numpy.arange(1, 22) |
82 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: | 84 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: |
83 fig = plt.figure() | 85 fig = plt.figure() |
84 fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) | 86 fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) |
85 fig2 = plt.figure() | 87 fig2 = plt.figure() |
86 fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) | 88 fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) |
96 # data1 = numpy.array(file1[:, 0]).astype(int) | 98 # data1 = numpy.array(file1[:, 0]).astype(int) |
97 # bigFamilies = numpy.where(data1 > 20)[0] | 99 # bigFamilies = numpy.where(data1 > 20)[0] |
98 # data1[bigFamilies] = 22 | 100 # data1[bigFamilies] = 22 |
99 data1 = numpy.clip(integers, bins[0], bins[-1]) | 101 data1 = numpy.clip(integers, bins[0], bins[-1]) |
100 name1 = name1.split(".tabular")[0] | 102 name1 = name1.split(".tabular")[0] |
103 if len(name1) > 40: | |
104 name1 = name1[:40] | |
101 list_to_plot.append(data1) | 105 list_to_plot.append(data1) |
102 label.append(name1) | 106 label.append(name1) |
103 data_array_list.append(file1) | 107 data_array_list.append(file1) |
104 | 108 |
105 legend = "\n\n\n{}".format(name1) | 109 legend = "\n\n\n{}".format(name1) |
106 fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) | 110 fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) |
107 fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) | 111 fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) |
108 | 112 |
109 legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1)) | 113 legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], |
114 float(numpy.bincount(data1)[1]) / len(data1)) | |
110 fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) | 115 fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) |
111 fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) | 116 fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) |
112 | 117 |
113 legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers)) | 118 legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], |
119 float(numpy.bincount(data1)[1]) / sum(integers)) | |
114 fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) | 120 fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) |
115 fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) | 121 fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) |
116 | 122 |
117 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(len(integers[integers > 20]), | 123 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(len(integers[integers > 20]), |
118 float(sum(integers[integers > 20])) | 124 float(len(integers[integers > 20])) |
119 / sum(integers)) | 125 / len(integers)) |
120 fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) | 126 fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) |
121 fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) | 127 fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) |
122 | 128 |
123 legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) | 129 legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), |
130 float(sum(integers[integers > 20])) / sum(integers)) | |
124 fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) | 131 fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) |
125 fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) | 132 fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) |
126 | 133 |
127 legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) | 134 legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) |
128 fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) | 135 fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) |
143 # data2[bigFamilies2] = 22 | 150 # data2[bigFamilies2] = 22 |
144 | 151 |
145 data2 = numpy.clip(integers2, bins[0], bins[-1]) | 152 data2 = numpy.clip(integers2, bins[0], bins[-1]) |
146 list_to_plot.append(data2) | 153 list_to_plot.append(data2) |
147 name2 = name2.split(".tabular")[0] | 154 name2 = name2.split(".tabular")[0] |
155 if len(name2) > 40: | |
156 name2 = name2[:40] | |
148 label.append(name2) | 157 label.append(name2) |
149 data_array_list.append(file2) | 158 data_array_list.append(file2) |
150 | 159 |
151 fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) | 160 fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) |
152 fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) | 161 fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) |
157 | 166 |
158 legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2)) | 167 legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2)) |
159 fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) | 168 fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) |
160 fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) | 169 fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) |
161 | 170 |
162 legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) | 171 legend4 = "{:,} ({:.3f})".format(len(integers2[integers2 > 20]), |
172 float(len(integers2[integers2 > 20])) / len(integers2)) | |
163 fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) | 173 fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) |
164 fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) | 174 fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) |
165 | 175 |
166 legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) | 176 legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), |
177 float(sum(integers2[integers2 > 20])) / sum(integers2)) | |
167 fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) | 178 fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) |
168 fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) | 179 fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) |
169 | 180 |
170 legend6 = "{:,}".format(len(data2)) | 181 legend6 = "{:,}".format(len(data2)) |
171 fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) | 182 fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) |
186 # data3[bigFamilies3] = 22 | 197 # data3[bigFamilies3] = 22 |
187 | 198 |
188 data3 = numpy.clip(integers3, bins[0], bins[-1]) | 199 data3 = numpy.clip(integers3, bins[0], bins[-1]) |
189 list_to_plot.append(data3) | 200 list_to_plot.append(data3) |
190 name3 = name3.split(".tabular")[0] | 201 name3 = name3.split(".tabular")[0] |
202 if len(name3) > 40: | |
203 name3 = name3[:40] | |
191 label.append(name3) | 204 label.append(name3) |
192 data_array_list.append(file3) | 205 data_array_list.append(file3) |
193 | 206 |
194 fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) | 207 fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) |
195 fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) | 208 fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) |
196 | 209 |
197 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3)) | 210 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3)) |
198 fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) | 211 fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) |
199 fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) | 212 fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) |
200 | 213 |
201 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3)) | 214 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], |
215 float(numpy.bincount(data3)[1]) / sum(integers3)) | |
202 fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) | 216 fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) |
203 fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) | 217 fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) |
204 | 218 |
205 legend4 = "{:,} ({:.3f})".format(len(integers3[integers3 > 20]), float(sum(integers3[integers3 > 20])) / sum(integers3)) | 219 legend4 = "{:,} ({:.3f})".format(len(integers3[integers3 > 20]), |
220 float(len(integers3[integers3 > 20])) / len(integers3)) | |
206 fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) | 221 fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) |
207 fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) | 222 fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) |
208 | 223 |
209 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), | 224 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), |
210 float(sum(integers3[integers3 > 20])) / sum(integers3)) | 225 float(sum(integers3[integers3 > 20])) / sum(integers3)) |
229 # bigFamilies4 = numpy.where(data4 > 20)[0] | 244 # bigFamilies4 = numpy.where(data4 > 20)[0] |
230 # data4[bigFamilies4] = 22 | 245 # data4[bigFamilies4] = 22 |
231 data4 = numpy.clip(integers4, bins[0], bins[-1]) | 246 data4 = numpy.clip(integers4, bins[0], bins[-1]) |
232 list_to_plot.append(data4) | 247 list_to_plot.append(data4) |
233 name4 = name4.split(".tabular")[0] | 248 name4 = name4.split(".tabular")[0] |
249 if len(name4) > 40: | |
250 name4 = name4[:40] | |
234 label.append(name4) | 251 label.append(name4) |
235 data_array_list.append(file4) | 252 data_array_list.append(file4) |
236 | 253 |
237 fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) | 254 fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) |
238 fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) | 255 fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) |
239 | 256 |
240 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4)) | 257 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4)) |
241 fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) | 258 fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) |
242 fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) | 259 fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) |
243 | 260 |
244 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4)) | 261 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], |
262 float(numpy.bincount(data4)[1]) / sum(integers4)) | |
245 fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) | 263 fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) |
246 fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) | 264 fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) |
247 | 265 |
248 legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]), float(sum(integers4[integers4 > 20])) / sum(integers4)) | 266 legend4 = "{:,} ({:.3f})".format(len(integers4[integers4 > 20]), |
267 float(len(integers4[integers4 > 20])) / len(integers4)) | |
249 fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) | 268 fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) |
250 fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) | 269 fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) |
251 | 270 |
252 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), | 271 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), |
253 float(sum(integers4[integers4 > 20])) / sum(integers4)) | 272 float(sum(integers4[integers4 > 20])) / sum(integers4)) |
263 fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) | 282 fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) |
264 | 283 |
265 maximumX = numpy.amax(numpy.concatenate(list_to_plot)) | 284 maximumX = numpy.amax(numpy.concatenate(list_to_plot)) |
266 minimumX = numpy.amin(numpy.concatenate(list_to_plot)) | 285 minimumX = numpy.amin(numpy.concatenate(list_to_plot)) |
267 list_to_plot2 = list_to_plot | 286 list_to_plot2 = list_to_plot |
268 to_plot = ["Absolute frequencies", "Relative frequencies"] | 287 |
269 plt.xticks([], []) | 288 if rel_freq: |
270 plt.yticks([], []) | 289 ylab = "Relative Frequency" |
271 fig.suptitle('Family Size Distribution (tags)', fontsize=14) | 290 else: |
272 | 291 ylab = "Absolute Frequency" |
273 for l in range(len(to_plot)): | 292 |
274 ax = fig.add_subplot(2, 1, l+1) | 293 # PLOT FSD based on tags |
275 ticks = numpy.arange(1, 22, 1) | 294 fig.suptitle('Family Size Distribution (FSD) based on families', fontsize=14) |
276 ticks1 = map(str, ticks) | 295 ax = fig.add_subplot(1, 1, 1) |
277 ticks1[len(ticks1) - 1] = ">20" | 296 ticks = numpy.arange(1, 22, 1) |
278 | 297 ticks1 = map(str, ticks) |
279 if to_plot[l] == "Relative frequencies": | 298 ticks1[len(ticks1) - 1] = ">20" |
280 w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2] | 299 ax.set_xticks([], []) |
281 counts_rel = ax.hist(list_to_plot2, weights=w, | 300 if rel_freq: |
282 bins=numpy.arange(1, 23), stacked=False, edgecolor="black", | 301 w = [numpy.zeros_like(data) + 1. / len(data) for data in list_to_plot2] |
283 linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8) | 302 counts = ax.hist(list_to_plot2, weights=w, |
284 ax.set_ylim(0, 1.07) | 303 bins=numpy.arange(1, 23), stacked=False, edgecolor="black", color=colors, |
304 linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8) | |
305 ax.set_ylim(0, 1.07) | |
306 else: | |
307 counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1, | |
308 label=label, align="left", alpha=0.7, rwidth=0.8, color=colors) | |
309 ax.set_xticks(numpy.array(ticks)) | |
310 ax.set_xticklabels(ticks1) | |
311 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | |
312 | |
313 ax.set_ylabel(ylab, fontsize=14) | |
314 ax.set_xlabel("Family size", fontsize=14) | |
315 if log_axis: | |
316 ax.set_yscale('log') | |
317 ax.grid(b=True, which="major", color="#424242", linestyle=":") | |
318 ax.margins(0.01, None) | |
319 pdf.savefig(fig) | |
320 #plt.close() | |
321 | |
322 # PLOT FSD based on PE reads | |
323 fig2.suptitle('Family Size Distribution (FSD) based on PE reads', fontsize=14) | |
324 ax2 = fig2.add_subplot(1, 1, 1) | |
325 ticks = numpy.arange(1, 22) | |
326 ticks1 = map(str, ticks) | |
327 ticks1[len(ticks1) - 1] = ">20" | |
328 reads = [] | |
329 reads_rel = [] | |
330 | |
331 barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1) | |
332 ax2.set_xticks([], []) | |
333 | |
334 for i in range(len(list_to_plot2)): | |
335 x = list(numpy.arange(1, 22).astype(float)) | |
336 unique, c = numpy.unique(list_to_plot2[i], return_counts=True) | |
337 y = unique * c | |
338 if sum(list_to_plot_original[i] > 20) > 0: | |
339 y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20]) | |
340 y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))] | |
341 reads.append(y) | |
342 reads_rel.append(list(numpy.float_(y)) / sum(y)) | |
343 | |
344 if len(list_to_plot2) == 1: | |
345 x = [xi * 0.5 for xi in x] | |
346 w = 0.4 | |
285 else: | 347 else: |
286 counts = ax.hist(list_to_plot2, bins=numpy.arange(1, 23), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8, color=colors) | 348 x = [xi + barWidth for xi in x] |
287 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | 349 w = 1. / (len(list_to_plot) + 1) |
288 | 350 if rel_freq: |
289 ax.set_xticks(numpy.array(ticks)) | 351 counts2_rel = ax2.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w, |
290 ax.set_xticklabels(ticks1) | 352 edgecolor="black", label=label[i], linewidth=1, alpha=0.7, color=colors[i]) |
291 | 353 ax2.set_ylim(0, 1.07) |
292 ax.set_ylabel(to_plot[l], fontsize=14) | |
293 ax.set_xlabel("Family size", fontsize=14) | |
294 if log_axis: | |
295 ax.set_yscale('log') | |
296 ax.grid(b=True, which="major", color="#424242", linestyle=":") | |
297 ax.margins(0.01, None) | |
298 pdf.savefig(fig) | |
299 plt.close() | |
300 | |
301 fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14) | |
302 for l in range(len(to_plot)): | |
303 ax = fig2.add_subplot(2, 1, l + 1) | |
304 ticks = numpy.arange(minimumX, maximumX + 1) | |
305 ticks = numpy.arange(1, 22) | |
306 ticks1 = map(str, ticks) | |
307 ticks1[len(ticks1) - 1] = ">20" | |
308 reads = [] | |
309 reads_rel = [] | |
310 | |
311 barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot)+1) | |
312 | |
313 for i in range(len(list_to_plot2)): | |
314 x = list(numpy.arange(1, 22).astype(float)) | |
315 unique, c = numpy.unique(list_to_plot2[i], return_counts=True) | |
316 y = unique * c | |
317 if sum(list_to_plot_original[i] > 20) > 0: | |
318 y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20]) | |
319 y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))] | |
320 reads.append(y) | |
321 reads_rel.append(list(numpy.float_(y)) / sum(y)) | |
322 #x = [xi + barWidth for xi in x] | |
323 | |
324 if len(list_to_plot2) == 1: | |
325 x = [xi * 0.5 for xi in x] | |
326 w = 0.4 | |
327 else: | |
328 x = [xi + barWidth for xi in x] | |
329 w = 1./(len(list_to_plot) + 1) | |
330 if to_plot[l] == "Relative frequencies": | |
331 counts2_rel = ax.bar(x, list(numpy.float_(y)) / numpy.sum(y), align="edge", width=w, | |
332 edgecolor="black", label=label[i],linewidth=1, alpha=0.7, color=colors[i]) | |
333 ax.set_ylim(0, 1.07) | |
334 else: | |
335 #y = list(y.reshape((len(y)))) | |
336 counts2 = ax.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1, | |
337 alpha=0.7, color=colors[i]) | |
338 if i == len(list_to_plot2)-1: | |
339 barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1) | |
340 else: | |
341 barWidth += 1. / (len(list_to_plot) + 1) | |
342 | |
343 if to_plot[l] == "Absolute frequencies": | |
344 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | |
345 else: | 354 else: |
346 ax.set_xlabel("Family size", fontsize=14) | 355 counts2 = ax2.bar(x, y, align="edge", width=w, edgecolor="black", label=label[i], linewidth=1, |
347 | 356 alpha=0.7, color=colors[i]) |
348 if len(list_to_plot2) == 1: | 357 |
349 ax.set_xticks(numpy.array([xi + 0.2 for xi in x])) | 358 if i == len(list_to_plot2) - 1: |
359 barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1) | |
350 else: | 360 else: |
351 ax.set_xticks(numpy.array(ticks)) | 361 barWidth += 1. / (len(list_to_plot) + 1) |
352 ax.set_xticklabels(ticks1) | 362 |
353 ax.set_ylabel(to_plot[l], fontsize=14) | 363 ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) |
354 if log_axis: | 364 |
355 ax.set_yscale('log') | 365 if len(list_to_plot2) == 1: |
356 ax.grid(b=True, which="major", color="#424242", linestyle=":") | 366 ax2.set_xticks(numpy.array([xi + 0.2 for xi in x])) |
357 ax.margins(0.01, None) | 367 else: |
368 ax2.set_xticks(numpy.array(ticks)) | |
369 ax2.set_xticklabels(ticks1) | |
370 ax2.set_xlabel("Family size", fontsize=14) | |
371 ax2.set_ylabel(ylab, fontsize=14) | |
372 if log_axis: | |
373 ax2.set_yscale('log') | |
374 ax2.grid(b=True, which="major", color="#424242", linestyle=":") | |
375 ax2.margins(0.01, None) | |
358 | 376 |
359 pdf.savefig(fig2) | 377 pdf.savefig(fig2) |
360 plt.close() | 378 plt.close() |
361 | 379 |
362 # write data to CSV file tags | 380 # write data to CSV file tags |
363 output_file.write("Values from family size distribution with all datasets (tags)\n") | 381 counts = [numpy.bincount(d, minlength=22)[1:] for d in list_to_plot2] # original counts of family sizes |
382 output_file.write("Values from family size distribution with all datasets based on families\n") | |
364 output_file.write("\nFamily size") | 383 output_file.write("\nFamily size") |
365 for i in label: | 384 for i in label: |
366 output_file.write("{}{}".format(sep, i)) | 385 output_file.write("{}{}".format(sep, i)) |
367 # output_file.write("{}sum".format(sep)) | |
368 output_file.write("\n") | 386 output_file.write("\n") |
369 j = 0 | 387 j = 0 |
370 for fs in counts[1][0:len(counts[1]) - 1]: | 388 for fs in bins: |
371 if fs == 21: | 389 if fs == 21: |
372 fs = ">20" | 390 fs = ">20" |
373 else: | 391 else: |
374 fs = "={}".format(fs) | 392 fs = "={}".format(fs) |
375 output_file.write("FS{}{}".format(fs, sep)) | 393 output_file.write("FS{}{}".format(fs, sep)) |
376 if len(label) == 1: | 394 for n in range(len(label)): |
377 output_file.write("{}{}".format(int(counts[0][j]), sep)) | 395 output_file.write("{}{}".format(int(counts[n][j]), sep)) |
378 else: | |
379 for n in range(len(label)): | |
380 output_file.write("{}{}".format(int(counts[0][n][j]), sep)) | |
381 output_file.write("\n") | 396 output_file.write("\n") |
382 j += 1 | 397 j += 1 |
383 output_file.write("sum{}".format(sep)) | 398 output_file.write("sum{}".format(sep)) |
384 if len(label) == 1: | 399 for i in counts: |
385 output_file.write("{}{}".format(int(sum(counts[0])), sep)) | 400 output_file.write("{}{}".format(int(sum(i)), sep)) |
386 else: | |
387 for i in counts[0]: | |
388 output_file.write("{}{}".format(int(sum(i)), sep)) | |
389 | 401 |
390 # write data to CSV file PE reads | 402 # write data to CSV file PE reads |
391 output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n") | 403 output_file.write("\n\nValues from family size distribution with all datasets based on PE reads\n") |
392 output_file.write("\nFamily size") | 404 output_file.write("\nFamily size") |
393 for i in label: | 405 for i in label: |
394 output_file.write("{}{}".format(sep, i)) | 406 output_file.write("{}{}".format(sep, i)) |
395 # output_file.write("{}sum".format(sep)) | |
396 output_file.write("\n") | 407 output_file.write("\n") |
397 j = 0 | 408 j = 0 |
398 | 409 |
399 for fs in bins: | 410 for fs in bins: |
400 if fs == 21: | 411 if fs == 21: |
401 fs = ">20" | 412 fs = ">20" |
402 else: | 413 else: |
403 fs = "={}".format(fs) | 414 fs = "={}".format(fs) |
408 for n in range(len(label)): | 419 for n in range(len(label)): |
409 output_file.write("{}{}".format(int(reads[n][j]), sep)) | 420 output_file.write("{}{}".format(int(reads[n][j]), sep)) |
410 output_file.write("\n") | 421 output_file.write("\n") |
411 j += 1 | 422 j += 1 |
412 output_file.write("sum{}".format(sep)) | 423 output_file.write("sum{}".format(sep)) |
413 if len(label) == 1: | 424 if len(label) == 1: |
414 output_file.write("{}{}".format(int(sum(numpy.concatenate(reads))), sep)) | 425 output_file.write("{}{}".format(int(sum(numpy.concatenate(reads))), sep)) |
415 else: | 426 else: |
416 for i in reads: | 427 for i in reads: |
417 output_file.write("{}{}".format(int(sum(i)), sep)) | 428 output_file.write("{}{}".format(int(sum(i)), sep)) |
418 output_file.write("\n") | 429 output_file.write("\n") |
459 | 470 |
460 dataBA = ba[numpy.in1d(baSeq, d, invert=True)] | 471 dataBA = ba[numpy.in1d(baSeq, d, invert=True)] |
461 dataBA_o = ba_o[numpy.in1d(baSeq, d, invert=True)] | 472 dataBA_o = ba_o[numpy.in1d(baSeq, d, invert=True)] |
462 | 473 |
463 list1 = [duplTags_double, dataAB, dataBA] # list for plotting | 474 list1 = [duplTags_double, dataAB, dataBA] # list for plotting |
475 list1_o = [duplTags_double_o, dataAB_o, dataBA_o] # list for plotting | |
464 | 476 |
465 # information for family size >= 3 | 477 # information for family size >= 3 |
466 dataAB_FS3 = dataAB[dataAB >= 3] | 478 dataAB_FS3 = dataAB[dataAB >= 3] |
467 dataAB_FS3_o = dataAB_o[dataAB_o >= 3] | 479 dataAB_FS3_o = dataAB_o[dataAB_o >= 3] |
468 dataBA_FS3 = dataBA[dataBA >= 3] | 480 dataBA_FS3 = dataBA[dataBA >= 3] |
481 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 | 493 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 |
482 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 | 494 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 |
483 | 495 |
484 fig = plt.figure() | 496 fig = plt.figure() |
485 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) | 497 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) |
486 counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], | 498 |
487 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], | 499 if rel_freq: |
488 rwidth=0.8) | 500 w = [numpy.zeros_like(d) + 1. / len(numpy.concatenate(list1)) for d in list1] |
501 counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], weights=w, | |
502 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], | |
503 rwidth=0.8) | |
504 plt.ylim(0, 1.07) | |
505 else: | |
506 counts = plt.hist(list1, bins=numpy.arange(1, 23), stacked=True, label=["duplex", "ab", "ba"], | |
507 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], | |
508 rwidth=0.8) | |
509 | |
489 # tick labels of x axis | 510 # tick labels of x axis |
490 ticks = numpy.arange(1, 22, 1) | 511 ticks = numpy.arange(1, 22, 1) |
491 ticks1 = map(str, ticks) | 512 ticks1 = map(str, ticks) |
492 ticks1[len(ticks1) - 1] = ">20" | 513 ticks1[len(ticks1) - 1] = ">20" |
493 plt.xticks(numpy.array(ticks), ticks1) | 514 plt.xticks(numpy.array(ticks), ticks1) |
494 singl = counts[0][2][0] # singletons | 515 # singl = counts[0][2][0] # singletons |
495 last = counts[0][2][len(counts[0][0]) - 1] # large families | 516 singl = len(data_o[data_o == 1]) |
517 last = len(data_o[data_o > 20]) # large families | |
496 if log_axis: | 518 if log_axis: |
497 plt.yscale('log') | 519 plt.yscale('log') |
498 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | 520 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) |
499 plt.title(name_file, fontsize=14) | 521 plt.title("{}: FSD based on families".format(name_file), fontsize=14) |
500 plt.xlabel("Family size", fontsize=14) | 522 plt.xlabel("Family size", fontsize=14) |
501 plt.ylabel("Absolute Frequency", fontsize=14) | 523 plt.ylabel("Absolute Frequency", fontsize=14) |
502 plt.margins(0.01, None) | 524 plt.margins(0.01, None) |
503 plt.grid(b=True, which="major", color="#424242", linestyle=":") | 525 plt.grid(b=True, which="major", color="#424242", linestyle=":") |
504 | 526 |
505 # extra information beneath the plot | 527 # extra information beneath the plot |
506 legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags=" | 528 legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags=" |
507 plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure) | 529 plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure) |
508 | 530 |
509 legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double), (len(dataAB) + len(dataBA) + len(duplTags)), (len(ab) + len(ba))) | 531 legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA), |
532 len(duplTags), len(duplTags_double), ( | |
533 len(dataAB) + len( | |
534 dataBA) + len(duplTags)), | |
535 (len(ab) + len(ba))) | |
510 plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure) | 536 plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure) |
511 | 537 |
512 legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o), sum(duplTags_o), sum(duplTags_double_o), (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), (sum(ab_o) + sum(ba_o))) | 538 legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o), |
539 sum(duplTags_o), sum(duplTags_double_o), | |
540 (sum(dataAB_o) + sum(dataBA_o) + sum( | |
541 duplTags_o)), | |
542 (sum(ab_o) + sum(ba_o))) | |
513 plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure) | 543 plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure) |
514 | 544 |
515 legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), (len(dataAB) + len(dataBA) + len(duplTags))) | 545 legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( |
546 float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), | |
547 float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), | |
548 float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), | |
549 (len(dataAB) + len(dataBA) + len(duplTags))) | |
516 plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure) | 550 plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure) |
517 | 551 |
518 legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)), float(len(dataBA)) / (len(ab) + len(ba)), float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), (len(ab) + len(ba))) | 552 legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)), |
553 float(len(dataBA)) / (len(ab) + len(ba)), | |
554 float(len(duplTags)) / (len(ab) + len(ba)), | |
555 float(len(duplTags_double)) / ( | |
556 len(ab) + len(ba)), | |
557 (len(ab) + len(ba))) | |
519 plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure) | 558 plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure) |
520 | 559 |
521 legend1 = "\nsingletons:\nfamily size > 20:" | 560 legend1 = "\nsingletons:\nfamily size > 20:" |
522 plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure) | 561 plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure) |
523 | 562 |
524 legend4 = "{:,}\n{:,}".format(singl.astype(int), last.astype(int)) | 563 legend4 = "{:,}\n{:,}".format(singl, last) |
525 plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure) | 564 plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure) |
526 | 565 legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data)) |
527 legend3 = "{:.3f}\n{:.3f}".format(singl / len(data), last / len(data)) | |
528 plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure) | 566 plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure) |
529 | 567 |
530 legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20])) | 568 legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20])) |
531 plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure) | 569 plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure) |
532 | 570 |
533 legend3 = "{:.3f}\n{:.3f}".format(float(singl)/sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o)) | 571 legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o)) |
534 plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure) | 572 plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure) |
535 | 573 |
536 legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( | 574 legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( |
537 float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), | 575 float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), |
538 float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), | 576 float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), |
548 plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure) | 586 plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure) |
549 | 587 |
550 pdf.savefig(fig) | 588 pdf.savefig(fig) |
551 plt.close() | 589 plt.close() |
552 | 590 |
591 # PLOT FSD based on PE reads | |
592 fig3 = plt.figure() | |
593 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) | |
594 | |
595 fig3.suptitle("{}: FSD based on PE reads".format(name_file), fontsize=14) | |
596 ax2 = fig3.add_subplot(1, 1, 1) | |
597 ticks = numpy.arange(1, 22) | |
598 ticks1 = map(str, ticks) | |
599 ticks1[len(ticks1) - 1] = ">20" | |
600 reads = [] | |
601 reads_rel = [] | |
602 | |
603 #barWidth = 0 - (len(list_to_plot) + 1) / 2 * 1. / (len(list_to_plot) + 1) | |
604 ax2.set_xticks([], []) | |
605 | |
606 list_y = [] | |
607 label = ["duplex", "ab", "ba"] | |
608 col = ["#FF0000", "#5FB404", "#FFBF00"] | |
609 for i in range(len(list1)): | |
610 x = list(numpy.arange(1, 22).astype(float)) | |
611 unique, c = numpy.unique(list1[i], return_counts=True) | |
612 y = unique * c | |
613 if sum(list1_o[i] > 20) > 0: | |
614 y[len(y) - 1] = sum(list1_o[i][list1_o[i] > 20]) | |
615 y = [y[x[idx] == unique][0] if x[idx] in unique else 0 for idx in range(len(x))] | |
616 reads.append(y) | |
617 reads_rel.append(list(numpy.float_(y)) / sum(numpy.concatenate(list1_o))) | |
618 | |
619 if rel_freq: | |
620 y = list(numpy.float_(y)) / sum(numpy.concatenate(list1_o)) | |
621 ax2.set_ylim(0, 1.07) | |
622 else: | |
623 y = y | |
624 | |
625 list_y.append(y) | |
626 if i == 0: | |
627 counts2 = ax2.bar(x, y, align="edge", width=0.8, | |
628 edgecolor="black", label=label[0], | |
629 linewidth=1, alpha=1, color=col[0]) | |
630 elif i == 1: | |
631 counts2 = ax2.bar(x, y, bottom=list_y[i-1], align="edge", width=0.8, | |
632 edgecolor="black", label=label[1], | |
633 linewidth=1, alpha=1, color=col[1]) | |
634 elif i == 2: | |
635 bars = numpy.add(list_y[0], list_y[1]).tolist() | |
636 | |
637 counts2 = ax2.bar(x, y, bottom=bars, align="edge", width=0.8, | |
638 edgecolor="black", label=label[2], | |
639 linewidth=1, alpha=1, color=col[2]) | |
640 | |
641 ax2.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | |
642 | |
643 singl = len(data_o[data_o == 1]) | |
644 last = len(data_o[data_o > 20]) # large families | |
645 | |
646 ax2.set_xticks(numpy.array(ticks)) | |
647 ax2.set_xticklabels(ticks1) | |
648 ax2.set_xlabel("Family size", fontsize=14) | |
649 ax2.set_ylabel(ylab, fontsize=14) | |
650 if log_axis: | |
651 ax2.set_yscale('log') | |
652 ax2.grid(b=True, which="major", color="#424242", linestyle=":") | |
653 ax2.margins(0.01, None) | |
654 | |
655 # extra information beneath the plot | |
656 legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \ntotal nr. of tags=" | |
657 plt.text(0.1, 0.09, legend, size=10, transform=plt.gcf().transFigure) | |
658 | |
659 legend = "nr. of tags\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(len(dataAB), len(dataBA), | |
660 len(duplTags), len(duplTags_double), ( | |
661 len(dataAB) + len( | |
662 dataBA) + len(duplTags)), | |
663 (len(ab) + len(ba))) | |
664 plt.text(0.23, 0.09, legend, size=10, transform=plt.gcf().transFigure) | |
665 | |
666 legend5 = "PE reads\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,} ({:,})".format(sum(dataAB_o), sum(dataBA_o), | |
667 sum(duplTags_o), sum(duplTags_double_o), | |
668 (sum(dataAB_o) + sum(dataBA_o) + sum( | |
669 duplTags_o)), | |
670 (sum(ab_o) + sum(ba_o))) | |
671 plt.text(0.38, 0.09, legend5, size=10, transform=plt.gcf().transFigure) | |
672 | |
673 legend = "rel. freq. of tags\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( | |
674 float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), | |
675 float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)), | |
676 float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), | |
677 (len(dataAB) + len(dataBA) + len(duplTags))) | |
678 plt.text(0.54, 0.09, legend, size=10, transform=plt.gcf().transFigure) | |
679 | |
680 legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format(float(len(dataAB)) / (len(ab) + len(ba)), | |
681 float(len(dataBA)) / (len(ab) + len(ba)), | |
682 float(len(duplTags)) / (len(ab) + len(ba)), | |
683 float(len(duplTags_double)) / ( | |
684 len(ab) + len(ba)), | |
685 (len(ab) + len(ba))) | |
686 plt.text(0.64, 0.09, legend, size=10, transform=plt.gcf().transFigure) | |
687 | |
688 legend1 = "\nsingletons:\nfamily size > 20:" | |
689 plt.text(0.1, 0.03, legend1, size=10, transform=plt.gcf().transFigure) | |
690 | |
691 legend4 = "{:,}\n{:,}".format(singl, last) | |
692 plt.text(0.23, 0.03, legend4, size=10, transform=plt.gcf().transFigure) | |
693 legend3 = "{:.3f}\n{:.3f}".format(float(singl) / len(data), float(last) / len(data)) | |
694 plt.text(0.64, 0.03, legend3, size=10, transform=plt.gcf().transFigure) | |
695 | |
696 legend3 = "\n\n{:,}".format(sum(data_o[data_o > 20])) | |
697 plt.text(0.38, 0.03, legend3, size=10, transform=plt.gcf().transFigure) | |
698 | |
699 legend3 = "{:.3f}\n{:.3f}".format(float(singl) / sum(data_o), float(sum(data_o[data_o > 20])) / sum(data_o)) | |
700 plt.text(0.84, 0.03, legend3, size=10, transform=plt.gcf().transFigure) | |
701 | |
702 legend = "PE reads\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}".format( | |
703 float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), | |
704 float(sum(dataBA_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), | |
705 float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), | |
706 (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o))) | |
707 plt.text(0.74, 0.09, legend, size=10, transform=plt.gcf().transFigure) | |
708 | |
709 legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}".format( | |
710 float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)), | |
711 float(sum(dataBA_o)) / (sum(ab_o) + sum(ba_o)), | |
712 float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), | |
713 float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)), (sum(ab_o) + sum(ba_o))) | |
714 plt.text(0.84, 0.09, legend, size=10, transform=plt.gcf().transFigure) | |
715 | |
716 pdf.savefig(fig3) | |
717 plt.close() | |
718 | |
553 # write same information to a csv file | 719 # write same information to a csv file |
554 count = numpy.bincount(data_o) # original counts of family sizes | 720 count = numpy.bincount(data_o) # original counts of family sizes |
721 | |
555 output_file.write("\nDataset:{}{}\n".format(sep, name_file)) | 722 output_file.write("\nDataset:{}{}\n".format(sep, name_file)) |
556 output_file.write("max. family size:{}{}\n".format(sep, max(data_o))) | 723 output_file.write("max. family size:{}{}\n".format(sep, max(data_o))) |
557 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) | 724 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) |
558 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) | 725 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) |
559 | 726 |
560 output_file.write("median family size:{}{}\n".format(sep, numpy.median(numpy.array(data_o)))) | 727 output_file.write("median family size:{}{}\n".format(sep, numpy.median(numpy.array(data_o)))) |
561 output_file.write("mean family size:{}{}\n\n".format(sep, numpy.mean(numpy.array(data_o)))) | 728 output_file.write("mean family size:{}{}\n\n".format(sep, numpy.mean(numpy.array(data_o)))) |
562 | 729 |
563 output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) | 730 output_file.write( |
564 output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) | 731 "{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, |
732 sep, sep)) | |
733 output_file.write( | |
734 "{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format( | |
735 sep, sep, sep, sep, sep, sep, sep, sep, sep)) | |
565 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( | 736 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( |
566 name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep, | 737 name_file, sep, singl, sep, float(singl) / len(data), sep, float(singl) / sum(data_o), sep, |
567 last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o))) | 738 last, sep, float(last) / len(data), sep, sum(data_o[data_o > 20]), sep, |
739 float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), | |
740 sep, sum(data_o))) | |
568 | 741 |
569 # information for FS >= 1 | 742 # information for FS >= 1 |
570 output_file.write("The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n" | 743 output_file.write( |
571 "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n") | 744 "The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n" |
572 output_file.write("FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep)) | 745 "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n") |
746 output_file.write( | |
747 "FS >= 1{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, | |
748 sep, sep, | |
749 sep)) | |
573 output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep)) | 750 output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep)) |
574 output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( | 751 output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( |
575 sep, len(dataAB), sep, sum(dataAB_o), sep, float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), | 752 sep, len(dataAB), sep, sum(dataAB_o), sep, |
753 float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)), | |
576 sep, float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, | 754 sep, float(sum(dataAB_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, |
577 float(len(dataAB)) / (len(ab) + len(ba)), sep, float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)))) | 755 float(len(dataAB)) / (len(ab) + len(ba)), sep, float(sum(dataAB_o)) / (sum(ab_o) + sum(ba_o)))) |
578 output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( | 756 output_file.write("SSCS ba{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( |
579 sep, len(dataBA), sep, sum(dataBA_o), sep, float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)), | 757 sep, len(dataBA), sep, sum(dataBA_o), sep, |
580 sep, float(sum(dataBA_o)) / (sum(dataBA_o) + sum(dataBA_o) + sum(duplTags_o)), sep, float(len(dataBA)) / (len(ba) + len(ba)), | 758 float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)), |
759 sep, float(sum(dataBA_o)) / (sum(dataBA_o) + sum(dataBA_o) + sum(duplTags_o)), sep, | |
760 float(len(dataBA)) / (len(ba) + len(ba)), | |
581 sep, float(sum(dataBA_o)) / (sum(ba_o) + sum(ba_o)))) | 761 sep, float(sum(dataBA_o)) / (sum(ba_o) + sum(ba_o)))) |
582 output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format( | 762 output_file.write( |
583 sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep, | 763 "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format( |
584 float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep, | 764 sep, len(duplTags), len(duplTags_double), sep, sum(duplTags_o), sum(duplTags_double_o), sep, |
585 float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep, | 765 float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)), sep, |
586 float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, | 766 float(len(duplTags)) / (len(ab) + len(ba)), float(len(duplTags_double)) / (len(ab) + len(ba)), sep, |
587 float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)))) | 767 float(sum(duplTags_o)) / (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, |
768 float(sum(duplTags_o)) / (sum(ab_o) + sum(ba_o)), | |
769 float(sum(duplTags_double_o)) / (sum(ab_o) + sum(ba_o)))) | |
588 output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format( | 770 output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format( |
589 sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep, (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, | 771 sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep, |
772 (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, | |
590 (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba)), sep, | 773 (len(dataAB) + len(dataBA) + len(duplTags)), sep, (len(ab) + len(ba)), sep, |
591 (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, (sum(ab_o) + sum(ba_o)))) | 774 (sum(dataAB_o) + sum(dataBA_o) + sum(duplTags_o)), sep, (sum(ab_o) + sum(ba_o)))) |
592 # information for FS >= 3 | 775 # information for FS >= 3 |
593 output_file.write("\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, sep, sep, sep, sep)) | 776 output_file.write( |
777 "\nFS >= 3{}nr. of tags{}nr. of PE reads{}rel. freq of tags{}{}rel. freq of PE reads:\n".format(sep, | |
778 sep, | |
779 sep, | |
780 sep, | |
781 sep)) | |
594 output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep)) | 782 output_file.write("{}{}{}unique:{}total{}unique{}total:\n".format(sep, sep, sep, sep, sep, sep)) |
595 output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( | 783 output_file.write("SSCS ab{}{}{}{}{}{:.3f}{}{:.3f}{}{:.3f}{}{:.3f}\n".format( |
596 sep, len(dataAB_FS3), sep, sum(dataAB_FS3_o), sep, | 784 sep, len(dataAB_FS3), sep, sum(dataAB_FS3_o), sep, |
597 float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, | 785 float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, |
598 float(len(dataAB_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3), | 786 float(len(dataAB_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3), |
602 sep, len(dataBA_FS3), sep, sum(dataBA_FS3_o), sep, | 790 sep, len(dataBA_FS3), sep, sum(dataBA_FS3_o), sep, |
603 float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), | 791 float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), |
604 sep, float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3), | 792 sep, float(len(dataBA_FS3)) / (len(dataBA_FS3) + len(dataBA_FS3) + duplTags_double_FS3), |
605 sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), | 793 sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), |
606 sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) | 794 sep, float(sum(dataBA_FS3_o)) / (sum(dataBA_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) |
607 output_file.write("DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format( | 795 output_file.write( |
608 sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep, | 796 "DCS (total){}{} ({}){}{} ({}){}{:.3f}{}{:.3f} ({:.3f}){}{:.3f}{}{:.3f} ({:.3f})\n".format( |
609 float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, | 797 sep, len(duplTags_FS3), duplTags_double_FS3, sep, sum(duplTags_FS3_o), duplTags_double_FS3_o, sep, |
610 float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), | 798 float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, |
611 float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), | 799 float(len(duplTags_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), |
612 sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep, | 800 float(duplTags_double_FS3) / (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), |
613 float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o), | 801 sep, float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), |
614 float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) | 802 sep, |
803 float(sum(duplTags_FS3_o)) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o), | |
804 float(duplTags_double_FS3_o) / (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) | |
615 output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format( | 805 output_file.write("total nr. of tags{}{}{}{}{}{}{}{}{}{}{}{}\n".format( |
616 sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), | 806 sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, |
617 sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), | 807 (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), |
618 sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) | 808 sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep, |
619 | 809 (len(dataAB_FS3) + len(dataBA_FS3) + duplTags_double_FS3), |
620 output_file.write("\nValues from family size distribution\n") | 810 sep, (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + sum(duplTags_FS3_o)), sep, |
811 (sum(dataAB_FS3_o) + sum(dataBA_FS3_o) + duplTags_double_FS3_o))) | |
812 | |
813 counts = [numpy.bincount(d, minlength=22)[1:] for d in list1] # original counts of family sizes | |
814 output_file.write("\nValues from family size distribution based on families\n") | |
621 output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep)) | 815 output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep)) |
622 for dx, ab, ba, fs in zip(counts[0][0], counts[0][1], counts[0][2], counts[1]): | 816 |
817 j = 0 | |
818 for fs in bins: | |
623 if fs == 21: | 819 if fs == 21: |
624 fs = ">20" | 820 fs = ">20" |
625 else: | 821 else: |
626 fs = "={}".format(fs) | 822 fs = "={}".format(fs) |
627 ab1 = ab - dx | 823 output_file.write("FS{}{}".format(fs, sep)) |
628 ba1 = ba - ab | 824 for n in range(3): |
629 output_file.write("FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(dx), sep, int(ab1), sep, int(ba1), sep, int(ba))) | 825 output_file.write("{}{}".format(int(counts[n][j]), sep)) |
826 output_file.write("{}\n".format(counts[0][j] + counts[1][j] + counts[2][j])) | |
827 j += 1 | |
828 output_file.write("sum{}".format(sep)) | |
829 for i in counts: | |
830 output_file.write("{}{}".format(int(sum(i)), sep)) | |
831 output_file.write("{}\n".format(sum(counts[0] + counts[1] + counts[2]))) | |
832 | |
833 output_file.write("\nValues from family size distribution based on PE reads\n") | |
834 output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep)) | |
835 j = 0 | |
836 for fs in bins: | |
837 if fs == 21: | |
838 fs = ">20" | |
839 else: | |
840 fs = "={}".format(fs) | |
841 output_file.write("FS{}{}".format(fs, sep)) | |
842 for n in range(3): | |
843 output_file.write("{}{}".format(int(reads[n][j]), sep)) | |
844 output_file.write("{}\n".format(reads[0][j] + reads[1][j] + reads[2][j])) | |
845 j += 1 | |
846 output_file.write("sum{}".format(sep)) | |
847 for i in reads: | |
848 output_file.write("{}{}".format(int(sum(i)), sep)) | |
849 output_file.write("{}\n".format(sum(reads[0] + reads[1] + reads[2]))) | |
630 | 850 |
631 print("Files successfully created!") | 851 print("Files successfully created!") |
632 | 852 |
633 | 853 |
634 if __name__ == '__main__': | 854 if __name__ == '__main__': |