comparison hd.py @ 0:239c4448a163 draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit 6055f8c5c052f528ff85fb5e0d43b4500830637a
author mheinzl
date Thu, 10 May 2018 07:30:27 -0400
parents
children 7414792e1cb8
comparison
equal deleted inserted replaced
-1:000000000000 0:239c4448a163
1 #!/usr/bin/env python
2
3 # Hamming distance analysis of SSCSs
4 #
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
6 # Contact: monika.heinzl@edumail.at
7 #
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS and optionally a second TABULAR file as input.
9 # The program produces a plot which shows a histogram of Hamming distances separated after family sizes,
10 # a family size distribution separated after Hamming distances for all (sample_size=0) or a given sample of SSCSs or SSCSs, which form a DCS.
11 # In addition, the tool produces HD and FSD plots for the difference between the HDs of both parts of the tags and for the chimeric reads
12 # and finally a CSV file with the data of the plots.
13 # It is also possible to perform the HD analysis with shortened tags with given sizes as input.
14 # The tool can run on a certain number of processors, which can be defined by the user.
15
16 # USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /
17 # --only_DCS True --FamilySize3 True --subset_tag True --nproc int --output_csv outputfile_name_csv --output_pdf outputfile_name_pdf
18
19 import numpy
20 import itertools
21 import operator
22 import matplotlib.pyplot as plt
23 import os.path
24 import cPickle as pickle
25 from multiprocessing.pool import Pool
26 from functools import partial
27 from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD
28 from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2
29 from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2
30 from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag
31 from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2
32 import argparse
33 import sys
34 import os
35 from matplotlib.backends.backend_pdf import PdfPages
36
def hamming(array1, array2):
    """Return, for each tag in array1, the minimum Hamming distance (> 0) to array2.

    array1 -- iterable of equal-length tag strings (the sample)
    array2 -- iterable of tag strings the sample is compared against

    Returns a numpy float array with one minimum distance per entry of array1.
    """
    res = 99 * numpy.ones(len(array1))  # 99 = "not yet computed" sentinel
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    for i, a in enumerate(array1):
        # character-wise mismatch count of `a` against every unique tag
        # (generator sum replaces the Python-2-only itertools.imap)
        dist = numpy.array([sum(x != y for x, y in zip(a, b)) for b in array2])
        # pick min distance greater than zero (0 would be the tag itself);
        # assumes array2 contains at least one tag different from `a`
        res[i] = numpy.amin(dist[dist > 0])
    return res
47
def hamming_difference(array1, array2, mate_b):
    """Per-half Hamming distance analysis used for chimera detection.

    For every tag in array1, one half (the first half when mate_b is False,
    the second when True) is compared against the matching halves of all
    other tags in array2; for the partner tag(s) reaching the minimum HD,
    the HD of the remaining half is computed as well.

    array1 -- tags of the sample
    array2 -- tags of the whole dataset (duplicates are removed here)
    mate_b -- which half anchors the minimum-HD search

    Returns a list of eight parallel lists:
    [0] |HD(a) - HD(b)| per minimum, [1] HD of half a, [2] HD of half b,
    [3] HD(a) + HD(b), [4] sample tag per minimum,
    [5] relative difference |HD(a)-HD(b)| / (HD(a)+HD(b)) rounded to 0.1,
    [6]/[7] absolute difference and tag restricted to cases where one half
    has HD == 0 (candidate chimeric reads).
    """
    array2 = numpy.unique(array2)  # remove duplicate sequences to decrease running time
    # split every tag into its two halves; // keeps the slice index integral
    # on Python 2 and 3 (plain / is true division under Python 3)
    array1_half = numpy.array([i[0:len(i) // 2] for i in array1])  # mate1 part1
    array1_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array1])  # mate1 part2

    array2_half = numpy.array([i[0:len(i) // 2] for i in array2])  # mate2 part1
    array2_half2 = numpy.array([i[len(i) // 2:len(i)] for i in array2])  # mate2 part2

    diff11 = []
    relativeDiffList = []
    ham1 = []
    ham2 = []
    min_valueList = []
    min_tagsList = []
    diff11_zeros = []
    min_tagsList_zeros = []

    if mate_b is False:  # HD calculation anchored on all a's
        half1_mate1 = array1_half
        half2_mate1 = array1_half2
        half1_mate2 = array2_half
        half2_mate2 = array2_half2
    else:  # HD calculation anchored on all b's
        half1_mate1 = array1_half2
        half2_mate1 = array1_half
        half1_mate2 = array2_half2
        half2_mate2 = array2_half

    for a, b, tag in zip(half1_mate1, half2_mate1, array1):
        # exclude the identical tag from array2 to prevent comparison to itself
        sameTag = numpy.where(array2 == tag)
        indexArray2 = numpy.arange(0, len(array2), 1)
        index_withoutSame = numpy.delete(indexArray2, sameTag)

        array2_half_withoutSame = half1_mate2[index_withoutSame]
        array2_half2_withoutSame = half2_mate2[index_withoutSame]

        # HD of the anchored half to all other tags' matching halves
        dist = numpy.array([sum(x != y for x, y in zip(a, c)) for c in array2_half_withoutSame])
        min_index = numpy.where(dist == dist.min())  # may hit several tags
        min_value = dist[min_index]
        # partner halves of all tags that reached the minimum HD
        min_tag_half2 = array2_half2_withoutSame[min_index]

        # HD of the other half against those partner halves
        dist2 = numpy.array([sum(x != y for x, y in zip(b, e)) for e in min_tag_half2])

        for d_1, d_2 in zip(min_value, dist2):
            # normalise so d is always the HD of half "a" and d2 of half "b"
            if mate_b is True:
                d = d_2
                d2 = d_1
            else:
                d = d_1
                d2 = d_2
            min_valueList.append(d + d2)
            min_tagsList.append(tag)
            ham1.append(d)
            ham2.append(d2)
            difference1 = abs(d - d2)
            diff11.append(difference1)
            # d + d2 > 0 always: two tags identical in both halves would be
            # one tag and were excluded above
            rel_difference = round(float(difference1) / (d + d2), 1)
            relativeDiffList.append(rel_difference)

            # tags which have identical parts (HD == 0 in one half)
            if d == 0 or d2 == 0:
                min_tagsList_zeros.append(tag)
                difference1_zeros = abs(d - d2)
                diff11_zeros.append(difference1_zeros)
    return ([diff11, ham1, ham2, min_valueList, min_tagsList, relativeDiffList, diff11_zeros, min_tagsList_zeros])
120
def readFileReferenceFree(file):
    """Read a tab-separated tag table (family size, tag, ab/ba) as strings.

    file -- path to the tabular input file

    Returns (integers, data_array): column 0 converted to ints, and the full
    table as a 2-D string array.
    """
    with open(file, 'r') as dest_f:
        # dtype=str works on both Python 2 and 3; the original dtype='string'
        # name is rejected by modern numpy releases
        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter='\t', comments='#', dtype=str)
        integers = numpy.array(data_array[:, 0]).astype(int)
        return(integers, data_array)
126
def hammingDistanceWithFS(quant, ham):
    """Group Hamming distances by family-size category.

    quant -- family sizes, ham -- Hamming distances (parallel sequences).

    Returns (groups, max(ham), min(ham)) where groups holds the HD values
    for FS == 1, 2, 3, 4, FS 5-10 and FS > 10, in that order.
    """
    quant = numpy.asarray(quant)
    maximum = max(ham)
    minimum = min(ham)
    ham = numpy.asarray(ham)

    # one boolean selector per family-size bin, in plotting order
    selectors = [
        quant == 1,
        quant == 2,
        quant == 3,
        quant == 4,
        (quant >= 5) & (quant <= 10),
        quant > 10,
    ]
    binned = [ham[numpy.where(sel)[0]] for sel in selectors]
    return (binned, maximum, minimum)
153
def familySizeDistributionWithHD(fs, ham, diff=False, rel=True):
    """Group family sizes by Hamming-distance category.

    fs   -- family sizes, capped in place at 20 for the plot bins
    ham  -- Hamming distances (absolute ints, or relative values rounded
            to one decimal when rel is True)
    diff -- prepend an extra group for HD == 0 (delta-HD plots)
    rel  -- use the relative-difference bin edges (0.1 .. 0.8)

    Returns (groups, unique HDs, max(fs), min(fs)).
    """
    hammingDistances = numpy.unique(ham)
    fs = numpy.asarray(fs)
    ham = numpy.asarray(ham)

    # cap family sizes so everything above 19 lands in one ">= 20" bin
    capped = numpy.where(fs > 19)[0]
    if len(capped) != 0:
        fs[capped] = 20
    maximum = max(fs)
    minimum = min(fs)

    # exact literals matter here: relative differences were produced by
    # round(..., 1), so 0.1/0.2/... compare equal while 3 * 0.1 would not
    if rel is True:
        exact_levels = (0.1, 0.2, 0.3, 0.4)
        mid_mask = (ham >= 0.5) & (ham <= 0.8)
        high_mask = ham > 0.8
    else:
        exact_levels = (1, 2, 3, 4)
        mid_mask = (ham >= 5) & (ham <= 8)
        high_mask = ham > 8

    list1 = [fs[numpy.where(ham == level)[0]] for level in exact_levels]
    list1.append(fs[numpy.where(mid_mask)[0]])
    list1.append(fs[numpy.where(high_mask)[0]])

    if diff is True:
        # delta-HD plots get the HD == 0 group in front
        list1 = [fs[numpy.where(ham == 0)[0]]] + list1

    return (list1, hammingDistances, maximum, minimum)
210
def make_argparser():
    """Build the command-line parser for the HD analysis tool."""
    parser = argparse.ArgumentParser(description='Hamming distance analysis of duplex sequencing data')
    # (flag, keyword arguments) in the order they appear in --help
    specs = [
        ('--inputFile', dict(help='Tabular File with three columns: ab or ba, tag and family size.')),
        ('--inputName1', dict()),
        ('--inputFile2', dict(default=None, help='Tabular File with three columns: ab or ba, tag and family size.')),
        ('--inputName2', dict()),
        ('--sample_size', dict(default=1000, type=int, help='Sample size of Hamming distance analysis.')),
        ('--sep', dict(default=",", help='Separator in the csv file.')),
        ('--subset_tag', dict(default=0, type=int, help='The tag is shortened to the given number.')),
        ('--nproc', dict(default=4, type=int, help='The tool runs with the given number of processors.')),
        # store_false: the option defaults to True and passing --only_DCS
        # switches it OFF -- kept as-is to preserve the existing CLI contract
        ('--only_DCS', dict(action="store_false", help='Only tags of the DCSs are included in the HD analysis')),
        ('--minFS', dict(default=1, type=int, help='Only tags, which have a family size greater or equal than specified, are included in the HD analysis')),
        ('--maxFS', dict(default=0, type=int, help='Only tags, which have a family size smaller or equal than specified, are included in the HD analysis')),
        ('--output_csv', dict(default="data.csv", type=str, help='Name of the csv file.')),
        ('--output_pdf', dict(default="data.pdf", type=str, help='Name of the pdf file.')),
        ('--output_pdf2', dict(default="data2.pdf", type=str, help='Name of the pdf file.')),
        ('--output_csv2', dict(default="data2.csv", type=str, help='Name of the csv file.')),
    ]
    for flag, kwargs in specs:
        parser.add_argument(flag, **kwargs)
    return parser
245
def Hamming_Distance_Analysis(argv):
    """Entry point: run the Hamming-distance analysis for one or two tag files.

    Reads the tabular tag file(s), optionally restricts them to DCS tags
    and/or a shortened tag subset, samples tags, computes Hamming distances
    (whole tag and per half, in parallel worker pools), renders all plots
    into one PDF per file and writes the underlying tables into one CSV per
    file. Exits the process with a non-zero status on invalid CLI input.
    """
    parser = make_argparser()
    args = parser.parse_args(argv[1:])

    # input files and their display names
    file1 = args.inputFile
    name1 = args.inputName1

    file2 = args.inputFile2
    name2 = args.inputName2

    # analysis parameters and output targets
    index_size = args.sample_size
    title_savedFile_pdf = args.output_pdf
    title_savedFile_pdf2 = args.output_pdf2

    title_savedFile_csv = args.output_csv
    title_savedFile_csv2 = args.output_csv2

    sep = args.sep
    onlyDuplicates = args.only_DCS
    minFS = args.minFS
    maxFS = args.maxFS

    subset = args.subset_tag
    nproc = args.nproc

    ### input checks
    if index_size < 0:
        print("index_size is a negative integer.")
        exit(2)

    if nproc <= 0:
        print("nproc is smaller or equal zero")
        exit(3)

    if type(sep) is not str or len(sep)>1:
        print("sep must be a single character.")
        exit(4)

    if subset < 0:
        print("subset_tag is smaller or equal zero.")
        exit(5)

    ### PLOT ###
    # global matplotlib style shared by every figure written to the PDF
    plt.rcParams['axes.facecolor'] = "E0E0E0"  # grey background color
    plt.rcParams['xtick.labelsize'] = 12
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['patch.edgecolor'] = "#000000"
    plt.rc('figure', figsize=(11.69, 8.27))  # A4 format

    # NOTE(review): a missing second file apparently arrives as the literal
    # string "None" (hence the comparison against str(None)) -- presumably
    # the Galaxy wrapper passes parameters as text; verify against caller
    if file2 != str(None):
        files = [file1, file2]
        name1 = name1.split(".tabular")[0]
        name2 = name2.split(".tabular")[0]
        names = [name1, name2]
        pdf_files = [title_savedFile_pdf, title_savedFile_pdf2]
        csv_files = [title_savedFile_csv, title_savedFile_csv2]
    else:
        files = [file1]
        name1 = name1.split(".tabular")[0]
        names = [name1]
        pdf_files = [title_savedFile_pdf]
        csv_files = [title_savedFile_csv]

    print(type(onlyDuplicates))
    print(onlyDuplicates)

    # process each input file independently: one PDF and one CSV per file
    for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files):
        with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf:
            print("dataset: ", name_file)
            integers, data_array = readFileReferenceFree(f)
            data_array = numpy.array(data_array)
            # filter by minimum family size (column 0 of the table)
            int_f = numpy.array(data_array[:, 0]).astype(int)
            data_array = data_array[numpy.where(int_f >= minFS)]
            integers = integers[integers >= minFS]

            # select family size for tags
            if maxFS > 0:
                int_f2 = numpy.array(data_array[:, 0]).astype(int)
                data_array = data_array[numpy.where(int_f2 <= maxFS)]
                integers = integers[integers <= maxFS]

            print("min FS", min(integers))
            print("max FS", max(integers))

            tags = data_array[:, 2]
            seq = data_array[:, 1]

            if onlyDuplicates is True:
                # find all unique tags and get the indices for ALL tags, but only once
                u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
                d = u[c > 1]

                # get family sizes, tag for duplicates
                duplTags_double = integers[numpy.in1d(seq, d)]
                duplTags = duplTags_double[0::2]  # ab of DCS
                duplTagsBA = duplTags_double[1::2]  # ba of DCS

                duplTags_tag = tags[numpy.in1d(seq, d)][0::2]  # ab
                duplTags_seq = seq[numpy.in1d(seq, d)][0::2]  # ab - tags

                # rebuild the table so it only holds the ab strand of each DCS
                data_array = numpy.column_stack((duplTags, duplTags_seq))
                data_array = numpy.column_stack((data_array, duplTags_tag))
                integers = numpy.array(data_array[:, 0]).astype(int)
                print("DCS in whole dataset", len(data_array))

            ## HD analysis for a subset of the tag
            if subset > 0:
                # split each tag into its two halves
                # NOTE(review): "/" here is Python-2 integer division -- this
                # branch breaks under Python 3 (float slice index)
                tag1 = numpy.array([i[0:(len(i)) / 2] for i in data_array[:, 1]])
                tag2 = numpy.array([i[len(i) / 2:len(i)] for i in data_array[:, 1]])

                # trim an equal flanking region from both ends of each half
                # so that `subset` bases remain
                flanking_region_float = float((len(tag1[0]) - subset)) / 2
                flanking_region = int(flanking_region_float)
                if flanking_region_float % 2 == 0:
                    tag1_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag1])
                    tag2_shorten = numpy.array([i[flanking_region:len(i) - flanking_region] for i in tag2])
                else:
                    # odd leftover: trim one base less at the end of the half
                    flanking_region_rounded = int(round(flanking_region, 1))
                    flanking_region_rounded_end = len(tag1[0]) - subset - flanking_region_rounded
                    tag1_shorten = numpy.array(
                        [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag1])
                    tag2_shorten = numpy.array(
                        [i[flanking_region:len(i) - flanking_region_rounded_end] for i in tag2])

                data_array_tag = numpy.array([i + j for i, j in zip(tag1_shorten, tag2_shorten)])
                data_array = numpy.column_stack((data_array[:, 0], data_array_tag, data_array[:, 2]))

            print("length of tag= ", len(data_array[0, 1]))
            # select sample: if no size given --> all vs. all comparison
            if index_size == 0:
                result = numpy.arange(0, len(data_array), 1)
            else:
                result = numpy.random.choice(len(integers), size=index_size,
                                             replace=False)  # array of random sequences of size=index.size

            # with open("index_result1_{}.pkl".format(app_f), "wb") as o:
            #     pickle.dump(result, o, pickle.HIGHEST_PROTOCOL)

            # comparison random tags to whole dataset
            result1 = data_array[result, 1]  # random tags
            result2 = data_array[:, 1]  # all tags
            print("size of the whole dataset= ", len(result2))
            print("sample size= ", len(result1))

            # HD analysis of whole tag: the sample is split into nproc chunks
            # and each worker compares its chunk against the full dataset
            proc_pool = Pool(nproc)
            chunks_sample = numpy.array_split(result1, nproc)
            ham = proc_pool.map(partial(hamming, array2=result2), chunks_sample)
            proc_pool.close()
            proc_pool.join()
            ham = numpy.concatenate(ham).astype(int)
            # with open("HD_whole dataset_{}.txt".format(app_f), "w") as output_file1:
            #     for h, tag in zip(ham, result1):
            #         output_file1.write("{}\t{}\n".format(tag, h))

            # HD analysis for chimeric reads: per-half HDs, once anchored on
            # the a half (mate_b=False) and once on the b half (mate_b=True)
            proc_pool_b = Pool(nproc)
            diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
            diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
            proc_pool_b.close()
            proc_pool_b.join()
            # merge per-chunk results of both orientations; indices refer to
            # the 8-element result list of hamming_difference:
            # 0=abs diff, 1=HD half1, 2=HD half2, 3=HD sum, 4=tags,
            # 5=rel diff, 6=abs diff (HD=0 cases), 7=tags (HD=0 cases)
            diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]),
                                      numpy.concatenate([item_b[0] for item_b in diff_list_b]))).astype(int)
            HDhalf1 = numpy.concatenate((numpy.concatenate([item[1] for item in diff_list_a]),
                                         numpy.concatenate([item_b[1] for item_b in diff_list_b]))).astype(int)
            HDhalf2 = numpy.concatenate((numpy.concatenate([item[2] for item in diff_list_a]),
                                         numpy.concatenate([item_b[2] for item_b in diff_list_b]))).astype(int)
            minHDs = numpy.concatenate((numpy.concatenate([item[3] for item in diff_list_a]),
                                        numpy.concatenate([item_b[3] for item_b in diff_list_b]))).astype(int)
            minHD_tags = numpy.concatenate((numpy.concatenate([item[4] for item in diff_list_a]),
                                            numpy.concatenate([item_b[4] for item_b in diff_list_b])))
            rel_Diff = numpy.concatenate((numpy.concatenate([item[5] for item in diff_list_a]),
                                          numpy.concatenate([item_b[5] for item_b in diff_list_b])))
            diff_zeros = numpy.concatenate((numpy.concatenate([item[6] for item in diff_list_a]),
                                            numpy.concatenate([item_b[6] for item_b in diff_list_b]))).astype(int)
            minHD_tags_zeros = numpy.concatenate((numpy.concatenate([item[7] for item in diff_list_a]),
                                                  numpy.concatenate([item_b[7] for item_b in diff_list_b])))

            # with open("HD_within tag_{}.txt".format(app_f), "w") as output_file2:
            #     for d, s1, s2, hd, rel_d, tag in zip(diff, HDhalf1, HDhalf2, minHDs, rel_Diff, minHD_tags):
            #         output_file2.write(
            #             "{}\t{}\t{}\t{}\t{}\t{}\n".format(tag, hd, s1, s2, d, rel_d))

            lenTags = len(data_array)

            quant = numpy.array(data_array[result, 0]).astype(int)  # family size for sample of tags
            seq = numpy.array(data_array[result, 1])  # tags of sample
            ham = numpy.asarray(ham)  # HD for sample of tags

            if onlyDuplicates is True:  # ab and ba strands of DCSs
                quant = numpy.concatenate((quant, duplTagsBA[result]))
                seq = numpy.tile(seq, 2)
                ham = numpy.tile(ham, 2)

            # prepare data for different kinds of plots
            list1, maximumX, minimumX = hammingDistanceWithFS(quant, ham)  # histogram of HDs separated after FS
            # distribution of FSs separated after HD
            familySizeList1, hammingDistances, maximumXFS, minimumXFS = familySizeDistributionWithHD(quant, ham,
                                                                                                     rel=False)

            ## get FS for all tags with min HD of analysis of chimeric reads
            # there are more tags than sample size in the plot, because one tag can have multiple minimas
            seqDic = dict(zip(seq, quant))
            lst_minHD_tags = []
            for i in minHD_tags:
                lst_minHD_tags.append(seqDic.get(i))

            # histogram with absolute and relative difference between HDs of both parts of the tag
            listDifference1, maximumXDifference, minimumXDifference = hammingDistanceWithFS(lst_minHD_tags, diff)
            listRelDifference1, maximumXRelDifference, minimumXRelDifference = hammingDistanceWithFS(lst_minHD_tags,
                                                                                                     rel_Diff)

            # family size distribution separated after the difference between HDs of both parts of the tag
            familySizeList1_diff, hammingDistances_diff, maximumXFS_diff, minimumXFS_diff = familySizeDistributionWithHD(
                lst_minHD_tags, diff, diff=True, rel=False)
            familySizeList1_reldiff, hammingDistances_reldiff, maximumXFS_reldiff, minimumXFS_reldiff = familySizeDistributionWithHD(
                lst_minHD_tags, rel_Diff, diff=True, rel=True)

            # chimeric read analysis: tags which have HD=0 in one of the halfs
            if len(minHD_tags_zeros) != 0:
                lst_minHD_tags_zeros = []
                for i in minHD_tags_zeros:
                    lst_minHD_tags_zeros.append(seqDic.get(i))  # get family size for tags of chimeric reads

                # histogram with HD of non-identical half
                listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros = hammingDistanceWithFS(
                    lst_minHD_tags_zeros, diff_zeros)
                # family size distribution of non-identical half
                familySizeList1_diff_zeros, hammingDistances_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros = familySizeDistributionWithHD(
                    lst_minHD_tags_zeros, diff_zeros, diff=False, rel=False)

            ########################## Plot HD within tags ###############################
            plotHDwithinSeq_Sum2(HDhalf1, HDhalf2, minHDs, pdf=pdf, lenTags=lenTags, title_file1=name_file)

            ########## plot Hamming Distance with Family size distribution ###############
            plotHDwithFSD(list1=list1, maximumX=maximumX, minimumX=minimumX, pdf=pdf,
                          subtitle="Overall hamming distance with separation after family size", title_file1=name_file,
                          lenTags=lenTags,xlabel="Hamming distance")

            ################# Plot FSD with separation after HD ##########################
            plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS,
                           quant=quant, subtitle="Family size distribution with separation after hamming distance",
                           pdf=pdf,relative=False, title_file1=name_file, diff=False)

            ########## Plot difference between HD's separated after FSD ##################
            plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
                          subtitle="Delta Hamming distances within tags with separation after family size",
                          title_file1=name_file, lenTags=lenTags,
                          xlabel="absolute delta Hamming distance", relative=False)

            plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
                          subtitle="Relative delta Hamming distances within tags with separation after family size",
                          title_file1=name_file, lenTags=lenTags,
                          xlabel="relative delta Hamming distance", relative=True)

            ######### Plot FSD separated after difference between HD's ###################
            plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff,
                           subtitle="Family size distribution with separation after delta Hamming distances within the tags",
                           pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant)

            plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf,
                           subtitle="Family size distribution with separation after delta Hamming distances within the tags",
                           relative=True, diff=True, title_file1=name_file)

            # plots for chimeric reads
            if len(minHD_tags_zeros) != 0:
                ## HD
                plotHDwithFSD(listDifference1_zeros, maximumXDifference_zeros, minimumXDifference_zeros, pdf=pdf,
                              subtitle="Hamming Distance of the non-identical half with separation after family size"
                                       "\n(at least one half is identical with the half of the min. tag)\n",
                              title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False)

                ## FSD
                plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros,
                               quant=quant, pdf=pdf,
                               subtitle="Family size distribution with separation after hamming distances from the non-identical half\n"
                                        "(at least one half is identical with the half of the min. tag)\n",
                               relative=False, diff=False, title_file1=name_file)

            ### print all data to a CSV file
            #### HD ####
            summary, sumCol = createTableHD(list1, "HD=")
            overallSum = sum(sumCol)  # sum of columns in table

            #### FSD ####
            summary5, sumCol5 = createTableFSD2(familySizeList1, diff=False)
            overallSum5 = sum(sumCol5)

            ### HD of both parts of the tag ####
            summary9, sumCol9 = createTableHDwithTags([numpy.array(minHDs), HDhalf1, HDhalf2])
            overallSum9 = sum(sumCol9)

            ## HD
            # absolute difference
            summary11, sumCol11 = createTableHD(listDifference1, "diff=")
            overallSum11 = sum(sumCol11)
            # relative difference and all tags
            summary13, sumCol13 = createTableHD(listRelDifference1, "diff=")
            overallSum13 = sum(sumCol13)

            ## FSD
            # absolute difference
            summary19, sumCol19 = createTableFSD2(familySizeList1_diff)
            overallSum19 = sum(sumCol19)
            # relative difference
            summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff)
            overallSum21 = sum(sumCol21)

            # chimeric reads
            if len(minHD_tags_zeros) != 0:
                # absolute difference and tags where at least one half has HD=0
                summary15, sumCol15 = createTableHD(listDifference1_zeros, "diff=")
                overallSum15 = sum(sumCol15)
                # family sizes of tags where at least one half has HD=0
                summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False)
                overallSum23 = sum(sumCol23)

            output_file.write("{}\n".format(f))
            output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\n\n".format(sep, len(
                numpy.concatenate(list1)), lenTags, lenTags))

            ### HD ###
            createFileHD(summary, sumCol, overallSum, output_file,
                         "Hamming distance with separation after family size: file1", sep)
            ### FSD ###
            createFileFSD2(summary5, sumCol5, overallSum5, output_file,
                           "Family size distribution with separation after hamming distances: file1", sep,
                           diff=False)

            count = numpy.bincount(quant)
            output_file.write("{}{}\n".format(sep, f))
            output_file.write("max. family size:{}{}\n".format(sep, max(quant)))
            output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1]))
            output_file.write(
                "relative frequency:{}{}\n\n".format(sep, float(count[len(count) - 1]) / sum(count)))

            ### HD within tags ###
            output_file.write(
                "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\n"
                "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\n")
            output_file.write(
                "file 1: actual number of tags with min HD = {:,} (sample size by user = {:,})\n".format(
                    len(numpy.concatenate(listDifference1)), len(numpy.concatenate(list1))))
            output_file.write("length of one part of the tag = {}\n\n".format(len(data_array[0, 1]) / 2))

            createFileHDwithinTag(summary9, sumCol9, overallSum9, output_file,
                                  "Hamming distance of each half in the tag: file1", sep)
            createFileHD(summary11, sumCol11, overallSum11, output_file,
                         "Absolute delta Hamming distances within the tag: file1", sep)
            createFileHD(summary13, sumCol13, overallSum13, output_file,
                         "Relative delta Hamming distances within the tag: file1", sep)

            createFileFSD2(summary19, sumCol19, overallSum19, output_file,
                           "Family size distribution with separation after absolute delta Hamming distances: file1",
                           sep)
            createFileFSD2(summary21, sumCol21, overallSum21, output_file,
                           "Family size distribution with separation after relative delta Hamming distances: file1",
                           sep, rel=True)

            if len(minHD_tags_zeros) != 0:
                output_file.write(
                    "All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\nSo the hamming distance of the non-identical half is compared.\n")
                createFileHD(summary15, sumCol15, overallSum15, output_file,
                             "Hamming distances of non-zero half: file1", sep)
                createFileFSD2(summary23, sumCol23, overallSum23, output_file,
                               "Family size distribution with separation after Hamming distances of non-zero half: file1",
                               sep, diff=False)
            output_file.write("\n")
622
if __name__ == '__main__':
    # propagate the analysis result as the process exit status
    sys.exit(Hamming_Distance_Analysis(sys.argv))