comparison fsd.py @ 0:9736b9d04a0b draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/fsd commit f674213e798956531c935e7b9eb7f444286d0a5e-dirty
author mheinzl
date Wed, 25 Apr 2018 08:59:17 -0400
parents
children 648d5df50ca8
comparison
equal deleted inserted replaced
-1:000000000000 0:9736b9d04a0b
1 #!/usr/bin/env python
2
3 # Family size distribution of SSCSs
4 #
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
6 # Contact: monika.heinzl@edumail.at
7 #
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS, but up to 4 files can be provided, as input.
9 # The program produces a plot which shows the distribution of family sizes of the all SSCSs from the input files and
10 # a CSV file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes.
11 # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced.
12 # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given.
13
14 # USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py filename --inputFile2 filename2 --inputFile3 filename3 --inputFile4 filename4 /
15 # --title_file outputFileName --sep "characterWhichSeparatesCSVFile"
16
17 import numpy
18 import matplotlib.pyplot as plt
19 from matplotlib.backends.backend_pdf import PdfPages
20 import argparse
21 import sys
22 import os
23 import re
24 from Cheetah.Template import Template
25
26 def readFileReferenceFree(file):
27 with open(file, 'r') as dest_f:
28 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype='string')
29 return(data_array)
30
31 def make_argparser():
32 parser = argparse.ArgumentParser(description='Family Size Distribution of duplex sequencing data')
33 parser.add_argument('inputFile',
34 help='Tabular File with three columns: ab or ba, tag and family size.')
35 parser.add_argument('--inputName1')
36 parser.add_argument('--inputFile2',default=None,
37 help='Tabular File with three columns: ab or ba, tag and family size.')
38 parser.add_argument('--inputName2')
39 parser.add_argument('--inputFile3',default=None,
40 help='Tabular File with three columns: ab or ba, tag and family size.')
41 parser.add_argument('--inputName3')
42 parser.add_argument('--inputFile4',default=None,
43 help='Tabular File with three columns: ab or ba, tag and family size.')
44 parser.add_argument('--inputName4')
45 parser.add_argument('--sep', default=",",
46 help='Separator in the csv file.')
47 parser.add_argument('--output_csv', default="data.csv",type=str,
48 help='Name of the pdf and csv file.')
49 parser.add_argument('--output_pdf', default="data.pdf",type=str,
50 help='Name of the pdf and csv file.')
51 return parser
52
53 def compare_read_families(argv):
54 parser = make_argparser()
55 args=parser.parse_args(argv[1:])
56
57 firstFile = args.inputFile
58 name1 = args.inputName1
59 secondFile = args.inputFile2
60 name2 = args.inputName2
61 thirdFile = args.inputFile3
62 name3 = args.inputName3
63 fourthFile = args.inputFile4
64 name4 = args.inputName4
65
66 title_file = args.output_csv
67 title_file2 = args.output_pdf
68 sep = args.sep
69
70 if type(sep) is not str or len(sep)>1:
71 print("Error: --sep must be a single character.")
72 exit(4)
73
74 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
75 plt.rcParams['patch.edgecolor'] = "black"
76 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color
77 plt.rcParams['xtick.labelsize'] = 12
78 plt.rcParams['ytick.labelsize'] = 12
79
80 list_to_plot = []
81 label = []
82 data_array_list = []
83
84 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
85 fig = plt.figure()
86 plt.subplots_adjust(bottom=0.25)
87 if firstFile != str(None):
88 file1 = readFileReferenceFree(firstFile)
89 integers = numpy.array(file1[:, 0]).astype(int) ## keep original family sizes
90
91 # for plot: replace all big family sizes by 22
92 data1 = numpy.array(file1[:, 0]).astype(int)
93 bigFamilies = numpy.where(data1 > 20)[0]
94 data1[bigFamilies] = 22
95
96 name1 = name1.split(".tabular")[0]
97 list_to_plot.append(data1)
98 label.append(name1)
99 data_array_list.append(file1)
100
101 legend = "\n\n\n{}".format(name1)
102 plt.text(0.1, 0.11, legend, size=12, transform=plt.gcf().transFigure)
103 legend1 = "singletons:\nabsolute nr.\n{:,}".format(numpy.bincount(data1)[1])
104 plt.text(0.4, 0.11, legend1, size=12, transform=plt.gcf().transFigure)
105
106 legend3 = "rel. freq\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1))
107 plt.text(0.5, 0.11, legend3, size=12, transform=plt.gcf().transFigure)
108
109 legend4 = "family size > 20:\nabsolute nr.\n{:,}".format(
110 numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int))
111 plt.text(0.6, 0.11, legend4, size=12, transform=plt.gcf().transFigure)
112
113 legend5 = "rel. freq\n{:.3f}".format(float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1))
114 plt.text(0.7, 0.11, legend5, size=12, transform=plt.gcf().transFigure)
115
116 legend6 = "total length\n{:,}".format(len(data1))
117 plt.text(0.8, 0.11, legend6, size=12, transform=plt.gcf().transFigure)
118
119 if secondFile != str(None):
120 file2 = readFileReferenceFree(secondFile)
121 data2 = numpy.asarray(file2[:, 0]).astype(int)
122 bigFamilies2 = numpy.where(data2 > 20)[0]
123 data2[bigFamilies2] = 22
124
125 list_to_plot.append(data2)
126 name2 = name2.split(".tabular")[0]
127 label.append(name2)
128 data_array_list.append(file2)
129
130 plt.text(0.1, 0.09, name2, size=12, transform=plt.gcf().transFigure)
131
132 legend1 = "{:,}".format(numpy.bincount(data2)[1])
133 plt.text(0.4, 0.09, legend1, size=12, transform=plt.gcf().transFigure)
134
135 legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2))
136 plt.text(0.5, 0.09, legend3, size=12, transform=plt.gcf().transFigure)
137
138 legend4 = "{:,}".format(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int))
139 plt.text(0.6, 0.09, legend4, size=12, transform=plt.gcf().transFigure)
140
141 legend5 = "{:.3f}".format(float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2))
142 plt.text(0.7, 0.09, legend5, size=12, transform=plt.gcf().transFigure)
143
144 legend6 = "{:,}".format(len(data2))
145 plt.text(0.8, 0.09, legend6, size=12, transform=plt.gcf().transFigure)
146
147 if thirdFile != str(None):
148 file3 = readFileReferenceFree(thirdFile)
149
150 data3 = numpy.asarray(file3[:, 0]).astype(int)
151 bigFamilies3 = numpy.where(data3 > 20)[0]
152 data3[bigFamilies3] = 22
153
154 list_to_plot.append(data3)
155 name3 = name3.split(".tabular")[0]
156 label.append(name3)
157 data_array_list.append(file3)
158
159 plt.text(0.1, 0.07, name3, size=12, transform=plt.gcf().transFigure)
160
161 legend1 = "{:,}".format(numpy.bincount(data3)[1])
162 plt.text(0.4, 0.07, legend1, size=12, transform=plt.gcf().transFigure)
163
164 legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3))
165 plt.text(0.5, 0.07, legend3, size=12, transform=plt.gcf().transFigure)
166
167 legend4 = "{:,}".format(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int))
168 plt.text(0.6, 0.07, legend4, size=12, transform=plt.gcf().transFigure)
169
170 legend5 = "{:.3f}".format(float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3))
171 plt.text(0.7, 0.07, legend5, size=12, transform=plt.gcf().transFigure)
172
173 legend6 = "{:,}".format(len(data3))
174 plt.text(0.8, 0.07, legend6, size=12, transform=plt.gcf().transFigure)
175
176 if fourthFile != str(None):
177 file4 = readFileReferenceFree(fourthFile)
178
179 data4 = numpy.asarray(file4[:, 0]).astype(int)
180 bigFamilies4 = numpy.where(data4 > 20)[0]
181 data4[bigFamilies4] = 22
182
183 list_to_plot.append(data4)
184 name4 = name4.split(".tabular")[0]
185 label.append(name4)
186 data_array_list.append(file4)
187
188 plt.text(0.1, 0.05, name4, size=12, transform=plt.gcf().transFigure)
189
190 legend1 = "{:,}".format(numpy.bincount(data4)[1])
191 plt.text(0.4, 0.05, legend1, size=12, transform=plt.gcf().transFigure)
192
193 legend4 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4))
194 plt.text(0.5, 0.05, legend4, size=12, transform=plt.gcf().transFigure)
195
196 legend4 = "{:,}".format(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int))
197 plt.text(0.6, 0.05, legend4, size=12, transform=plt.gcf().transFigure)
198
199 legend5 = "{:.3f}".format(float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4))
200 plt.text(0.7, 0.05, legend5, size=12, transform=plt.gcf().transFigure)
201
202 legend6 = "{:,}".format(len(data4))
203 plt.text(0.8, 0.05, legend6, size=12, transform=plt.gcf().transFigure)
204
205 maximumX = numpy.amax(numpy.concatenate(list_to_plot))
206 minimumX = numpy.amin(numpy.concatenate(list_to_plot))
207
208 counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black",
209 linewidth=1, label=label, align="left", alpha=0.7, rwidth=0.8)
210
211 ticks = numpy.arange(minimumX - 1, maximumX, 1)
212 ticks1 = map(str, ticks)
213 ticks1[len(ticks1) - 1] = ">20"
214 plt.xticks(numpy.array(ticks), ticks1)
215
216 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1))
217 plt.title("Family Size Distribution", fontsize=14)
218 plt.xlabel("No. of Family Members", fontsize=14)
219 plt.ylabel("Absolute Frequency", fontsize=14)
220 plt.margins(0.01, None)
221 plt.grid(b=True, which="major", color="#424242", linestyle=":")
222 pdf.savefig(fig)
223 plt.close()
224
225 # write data to CSV file
226 output_file.write("Values from family size distribution with all datasets\n")
227 output_file.write("\nFamily size")
228 for i in label:
229 output_file.write("{}{}".format(sep, i))
230 output_file.write("{}sum".format(sep))
231 output_file.write("\n")
232 j = 0
233 for fs in counts[1][0:len(counts[1]) - 1]:
234 if fs == 21:
235 fs = ">20"
236 else:
237 fs = "={}".format(fs)
238 output_file.write("FS{}{}".format(fs, sep))
239 values_of_fs = []
240 if len(label) == 1:
241 output_file.write("{}{}".format(int(counts[0][j]), sep))
242 values_of_fs.append(int(counts[0][j]))
243 else:
244 for n in range(len(label)):
245 output_file.write("{}{}".format(int(counts[0][n][j]), sep))
246 values_of_fs.append(int(counts[0][n][j]))
247 output_file.write("{}\n".format(sum(values_of_fs)))
248 j += 1
249 output_file.write("sum{}".format(sep))
250 values_for_sum = []
251 if len(label) == 1:
252 output_file.write("{}{}".format(int(sum(counts[0])), sep))
253 values_for_sum.append(int(sum(counts[0])))
254 else:
255 for i in counts[0]:
256 output_file.write("{}{}".format(int(sum(i)), sep))
257 values_for_sum.append(int(sum(i)))
258
259 output_file.write("{}\n".format(sum(values_for_sum)))
260
261 ### Family size distribution after DCS and SSCS
262 for dataset, data, name_file in zip(list_to_plot, data_array_list, label):
263 maximumX = numpy.amax(dataset)
264 minimumX = numpy.amin(dataset)
265
266 tags = numpy.array(data[:, 2])
267 seq = numpy.array(data[:, 1])
268 data = numpy.array(dataset)
269
270 # find all unique tags and get the indices for ALL tags, but only once
271 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
272 d = u[c > 1]
273
274 # get family sizes, tag for duplicates
275 duplTags_double = data[numpy.in1d(seq, d)]
276 duplTags = duplTags_double[0::2] # ab of DCS
277 duplTagsBA = duplTags_double[1::2] # ba of DCS
278
279 duplTags_double_tag = tags[numpy.in1d(seq, d)]
280 duplTags_double_seq = seq[numpy.in1d(seq, d)]
281
282 # get family sizes for SSCS with no partner
283 ab = numpy.where(tags == "ab")[0]
284 abSeq = seq[ab]
285 ab = data[ab]
286 ba = numpy.where(tags == "ba")[0]
287 baSeq = seq[ba]
288 ba = data[ba]
289
290 dataAB = ab[numpy.in1d(abSeq, d, invert=True)]
291 dataBA = ba[numpy.in1d(baSeq, d, invert=True)]
292
293 # write DCS tags to file
294 # with open("DCS information_{}.txt".format(firstFile), "w") as file:
295 # for t, s, f in zip(duplTags_double_tag, duplTags_double_seq, duplTags_double):
296 # file.write("{}\t{}\t{}\n".format(t, s, f))
297
298 list1 = [duplTags_double, dataAB, dataBA] # list for plotting
299
300 ## information for family size >= 3
301 dataAB_FS3 = dataAB[dataAB >= 3]
302 dataBA_FS3 = dataBA[dataBA >= 3]
303 ab_FS3 = ab[ab >= 3]
304 ba_FS3 = ba[ba >= 3]
305
306 duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3
307 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3
308 duplTags_double_FS3 = len(duplTags_FS3)+len(duplTags_FS3_BA) # both ab and ba strands with FS>=3
309
310 fig = plt.figure()
311
312 plt.subplots_adjust(bottom=0.3)
313 counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True,
314 label=["duplex", "ab", "ba"], edgecolor="black", linewidth=1,
315 align="left", color=["#FF0000", "#5FB404", "#FFBF00"])
316 # tick labels of x axis
317 ticks = numpy.arange(minimumX - 1, maximumX, 1)
318 ticks1 = map(str, ticks)
319 ticks1[len(ticks1) - 1] = ">20"
320 plt.xticks(numpy.array(ticks), ticks1)
321 singl = counts[0][2][0] # singletons
322 last = counts[0][2][len(counts[0][0]) - 1] # large families
323
324 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
325 plt.title(name1, fontsize=14)
326 plt.xlabel("No. of Family Members", fontsize=14)
327 plt.ylabel("Absolute Frequency", fontsize=14)
328 plt.margins(0.01, None)
329 plt.grid(b=True, which="major", color="#424242", linestyle=":")
330
331 ## extra information beneath the plot
332 legend = "SSCS ab= \nSSCS ba= \nDCS (total)= \nlength of dataset="
333 plt.text(0.1, 0.09, legend, size=12, transform=plt.gcf().transFigure)
334
335 legend = "absolute numbers\n\n{:,}\n{:,}\n{:,} ({:,})\n{:,}" \
336 .format(len(dataAB), len(dataBA), len(duplTags), len(duplTags_double),
337 (len(dataAB) + len(dataBA) + len(duplTags)))
338 plt.text(0.35, 0.09, legend, size=12, transform=plt.gcf().transFigure)
339
340 legend = "relative frequencies\nunique\n{:.3f}\n{:.3f}\n{:.3f}\n{:,}" \
341 .format(float(len(dataAB)) / (len(dataAB) + len(dataBA) + len(duplTags)),
342 float(len(dataBA)) / (len(dataAB) + len(dataBA) + len(duplTags)),
343 float(len(duplTags)) / (len(dataAB) + len(dataBA) + len(duplTags)),
344 (len(dataAB) + len(dataBA) + len(duplTags)))
345 plt.text(0.54, 0.09, legend, size=12, transform=plt.gcf().transFigure)
346
347 legend = "total\n{:.3f}\n{:.3f}\n{:.3f} ({:.3f})\n{:,}" \
348 .format(float(len(dataAB)) / (len(ab) + len(ba)), float(len(dataBA)) / (len(ab) + len(ba)),
349 float(len(duplTags)) / (len(ab) + len(ba)),
350 float(len(duplTags_double)) / (len(ab) + len(ba)), (len(ab) + len(ba)))
351 plt.text(0.64, 0.09, legend, size=12, transform=plt.gcf().transFigure)
352
353 legend1 = "\nsingletons:\nfamily size > 20:"
354 plt.text(0.1, 0.03, legend1, size=12, transform=plt.gcf().transFigure)
355
356 legend4 = "{:,}\n{:,}".format(singl.astype(int), last.astype(int))
357 plt.text(0.35, 0.03, legend4, size=12, transform=plt.gcf().transFigure)
358
359 legend3 = "{:.3f}\n{:.3f}".format(singl / len(data),last / len(data))
360 plt.text(0.54, 0.03, legend3, size=12, transform=plt.gcf().transFigure)
361
362 pdf.savefig(fig)
363 plt.close()
364
365 # write same information to a csv file
366 count = numpy.bincount(integers) # original counts of family sizes
367 output_file.write("\nDataset:{}{}\n".format(sep, name_file))
368 output_file.write("max. family size:{}{}\n".format(sep, max(integers)))
369 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
370 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))
371
372 output_file.write("{}singletons:{}{}family size > 20:\n".format(sep, sep, sep))
373 output_file.write(
374 "{}absolute nr.{}rel. freq{}absolute nr.{}rel. freq{}total length\n".format(sep, sep, sep, sep, sep))
375 output_file.write("{}{}{}{}{:.3f}{}{}{}{:.3f}{}{}\n\n".format(name_file, sep, singl.astype(int), sep,
376 singl / len(data), sep,last.astype(int), sep,
377 last / len(data), sep, len(data)))
378
379 ## information for FS >= 1
380 output_file.write(
381 "The unique frequencies were calculated from the dataset where the tags occured only once (=ab without DCS, ba without DCS)\n" \
382 "Whereas the total frequencies were calculated from the whole dataset (=including the DCS).\n\n")
383 output_file.write("FS >= 1{}{}unique:{}total:\n".format(sep, sep, sep))
384 output_file.write("nr./rel. freq of ab={}{}{}{:.3f}{}{:.3f}\n".format(sep, len(dataAB), sep,
385 float(len(dataAB)) / (len(dataAB) + len(dataBA) + len( duplTags)), sep,
386 float(len(dataAB)) / (len(ab) + len(ba))))
387 output_file.write("nr./rel. freq of ba={}{}{}{:.3f}{}{:.3f}\n".format(sep, len(dataBA), sep,
388 float(len(dataBA)) / (len(dataBA) + len(dataBA) + len(duplTags)), sep,
389 float(len(dataBA)) / (len(ba) + len(ba))))
390 output_file.write(
391 "nr./rel. freq of DCS (total)={}{} ({}){}{:.3f}{}{:.3f} ({:.3f})\n".format(sep, len(duplTags), len(duplTags_double), sep,
392 float(len(duplTags)) / ( len(dataAB) + len( dataBA) + len(duplTags)),
393 sep, float(len(duplTags)) / ( len(ab) + len(ba)),
394 float( len(duplTags_double)) / (len(ab) + len(ba))))
395 output_file.write(
396 "length of dataset={}{}{}{}{}{}\n".format(sep, (len(dataAB) + len(dataBA) + len(duplTags)), sep,
397 (len(dataAB) + len(dataBA) + len(duplTags)), sep,(len(ab) + len(ba))))
398 ## information for FS >= 3
399 output_file.write("FS >= 3{}{}unique:{}total:\n".format(sep, sep, sep))
400 output_file.write("nr./rel. freq of ab={}{}{}{:.3f}{}{:.3f}\n".format(sep, len(dataAB_FS3), sep,
401 float(len(dataAB_FS3)) / (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)),
402 sep, float(len(dataAB_FS3)) / ( len(ab_FS3) + len(ba_FS3))))
403 output_file.write("nr./rel. freq of ba={}{}{}{:.3f}{}{:.3f}\n".format(sep, len(dataBA_FS3), sep,
404 float(len(dataBA_FS3)) / ( len(dataBA_FS3) + len(dataBA_FS3) + len(duplTags_FS3)),
405 sep,float(len(dataBA_FS3)) / (len(ba_FS3) + len(ba_FS3))))
406 output_file.write(
407 "nr./rel. freq of DCS (total)={}{} ({}){}{:.3f}{}{:.3f} ({:.3f})\n".format(sep, len(duplTags_FS3),duplTags_double_FS3,
408 sep, float(len( duplTags_FS3)) / (len(dataBA_FS3) + len(duplTags_FS3)),
409 sep, float(len(duplTags_FS3)) / (len(ab_FS3) + len(ba_FS3)),
410 float(duplTags_double_FS3) / (len(ab_FS3) + len(ba_FS3))))
411 output_file.write(
412 "length of dataset={}{}{}{}{}{}\n".format(sep, (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
413 (len(dataAB_FS3) + len(dataBA_FS3) + len(duplTags_FS3)), sep,
414 (len(ab_FS3) + len(ba_FS3))))
415
416 output_file.write("\nValues from family size distribution\n")
417 output_file.write("{}duplex{}ab{}ba{}sum\n".format(sep, sep, sep, sep))
418 for dx, ab, ba, fs in zip(counts[0][0], counts[0][1], counts[0][2], counts[1]):
419 if fs == 21:
420 fs = ">20"
421 else:
422 fs = "={}".format(fs)
423 ab1 = ab - dx
424 ba1 = ba - ab
425 output_file.write(
426 "FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(dx), sep, int(ab1), sep, int(ba1), sep, int(ba)))
427
428 print("Files successfully created!")
429
430 if __name__ == '__main__':
431 sys.exit(compare_read_families(sys.argv))