comparison fsd_regions.py @ 6:26014c24323a draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty
author mheinzl
date Fri, 26 Oct 2018 07:54:03 -0400
parents 52454637bc45
children 3b8a0e462021
comparison
equal deleted inserted replaced
5:52454637bc45 6:26014c24323a
54 54
55 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: 55 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
56 data_array = readFileReferenceFree(firstFile, "\t") 56 data_array = readFileReferenceFree(firstFile, "\t")
57 57
58 mut_array = readFileReferenceFree(refGenome, " ") 58 mut_array = readFileReferenceFree(refGenome, " ")
59 length_regions = len(mut_array)
60 group = numpy.array(mut_array[:, 0]) 59 group = numpy.array(mut_array[:, 0])
61 seq_mut = numpy.array(mut_array[:, 1]) 60 seq_mut = numpy.array(mut_array[:, 1])
62 alt_group = numpy.array(mut_array[:, 2])
63 61
64 seq = numpy.array(data_array[:, 1]) 62 seq = numpy.array(data_array[:, 1])
65 tags = numpy.array(data_array[:, 2]) 63 tags = numpy.array(data_array[:, 2])
66 quant = numpy.array(data_array[:, 0]).astype(int) 64 quant = numpy.array(data_array[:, 0]).astype(int)
67 65
71 quant_ba = quant[numpy.where(tags == "ba")[0]] 69 quant_ba = quant[numpy.where(tags == "ba")[0]]
72 70
73 seqDic_ab = dict(zip(all_ab, quant_ab)) 71 seqDic_ab = dict(zip(all_ab, quant_ab))
74 seqDic_ba = dict(zip(all_ba, quant_ba)) 72 seqDic_ba = dict(zip(all_ba, quant_ba))
75 73
76 if re.search('^(\d)+_(\d)+', str(mut_array[0,0])) is None: 74 if re.search('_(\d)+_(\d)+$', str(mut_array[0,0])) is None:
77 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) 75 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)
78 group = mut_array[seqMut_index,0] 76 group = mut_array[seqMut_index,0]
79 alt_group = mut_array[seqMut_index,2]
80 mut_array = mut_array[seqMut_index,:] 77 mut_array = mut_array[seqMut_index,:]
78 length_regions = len(seq_mut)*2
81 79
82 groupUnique, group_index = numpy.unique(group, return_index=True) 80 groupUnique, group_index = numpy.unique(group, return_index=True)
83 groupUnique = groupUnique[numpy.argsort(group_index)] 81 groupUnique = groupUnique[numpy.argsort(group_index)]
82
84 lst_ab = [] 83 lst_ab = []
85 lst_ba = [] 84 lst_ba = []
86 for i in seq_mut: 85 for i in seq_mut:
87 lst_ab.append(seqDic_ab.get(i)) 86 lst_ab.append(seqDic_ab.get(i))
88 lst_ba.append(seqDic_ba.get(i)) 87 lst_ba.append(seqDic_ba.get(i))
89 88
90 quant_ab = numpy.array(lst_ab) 89 quant_ab = numpy.array(lst_ab)
91 quant_ba = numpy.array(lst_ba) 90 quant_ba = numpy.array(lst_ba)
92 91
93 quantAfterRegion = OrderedDict() 92 quantAfterRegion = []
94 for key in groupUnique:
95 quantAfterRegion[key] = []
96 93
97 for i in groupUnique: 94 for i in groupUnique:
98 index_of_current_region = numpy.where(group == i)[0] 95 dataAB = quant_ab[numpy.where(group == i)[0]]
99 quant_ba_i = quant_ba[index_of_current_region] 96 dataBA = quant_ba[numpy.where(group == i)[0]]
100 alt_group_i = alt_group[index_of_current_region]
101 index_alternative_refs = numpy.where(alt_group_i != "=")[0]
102
103 dataAB = quant_ab[index_of_current_region]
104 bigFamilies = numpy.where(dataAB > 20)[0] 97 bigFamilies = numpy.where(dataAB > 20)[0]
105 dataAB[bigFamilies] = 22 98 dataAB[bigFamilies] = 22
106 for el in dataAB: 99 bigFamilies = numpy.where(dataBA > 20)[0]
107 quantAfterRegion[i].append(el) 100 dataBA[bigFamilies] = 22
108 101
109 if len(index_alternative_refs) == 0: 102 quantAll = numpy.concatenate((dataAB, dataBA))
110 dataBA = quant_ba_i 103 quantAfterRegion.append(quantAll)
111 bigFamilies = numpy.where(dataBA > 20)[0] 104
112 dataBA[bigFamilies] = 22
113 for el2 in dataBA:
114 quantAfterRegion[i].append(el2)
115 else: # get tags where 2nd mate is aligned to a different ref genome
116 unique_alt = numpy.unique(alt_group_i[index_alternative_refs])
117 for alt in unique_alt:
118 ind_alt_tags = numpy.where(alt_group_i == alt)[0]
119 dataBA = quant_ba_i[ind_alt_tags]
120
121 bigFamilies = numpy.where(dataBA > 20)[0]
122 if len(bigFamilies) != 0:
123 if len(bigFamilies) == 1 and type(dataBA) != list:
124 dataBA = 22
125 quantAfterRegion[alt].append(dataBA)
126 else:
127 dataBA[bigFamilies] = 22
128 for el3 in dataBA:
129 quantAfterRegion[alt].append(el3)
130
131 index_inverse = [x for x in range(0, len(index_of_current_region)) if x not in index_alternative_refs]
132 data_BA_other = quant_ba_i[index_inverse]
133 bigFamilies_other = numpy.where(data_BA_other > 20)[0]
134
135 if len(bigFamilies_other) != 0:
136 if len(bigFamilies_other) == 1 and type(data_BA_other) != list:
137 data_BA_other = 22
138 quantAfterRegion[i].append(data_BA_other)
139 else:
140 data_BA_other[bigFamilies_other] = 22
141 for el3 in data_BA_other:
142 quantAfterRegion[i].append(el3)
143
144 quantAfterRegion = quantAfterRegion.values()
145 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) 105 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion))
146 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) 106 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion))
147 107
148 # PLOT 108 # PLOT
149 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 109 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
182 legend = "BA\n{}\n{}\n{:.5f}" \ 142 legend = "BA\n{}\n{}\n{:.5f}" \
183 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) 143 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
184 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) 144 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure)
185 145
186 plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure) 146 plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure)
187 plt.text(0.7, 0.22, "{:,}".format(length_regions), size=11, transform=plt.gcf().transFigure) 147 plt.text(0.75, 0.22, "{:,} ({:,})".format(length_regions, length_regions/2), size=11, transform=plt.gcf().transFigure)
188 148
189 legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.' 149 # legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.'
190 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) 150 # plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
191 151
192 space = numpy.arange(0, len(groupUnique), 0.02) 152 plt.text(0.75, 0.18, "total nr. of tags per region", size=11, transform=plt.gcf().transFigure)
193 plt.text(0.7, 0.18, "total number of *\nab", size=11, transform=plt.gcf().transFigure) 153 #space = numpy.arange(0, len(groupUnique), 0.02)
194 plt.text(0.78, 0.18, "ba tags", size=11, transform=plt.gcf().transFigure) 154 s = 0
195 lengths_array_ab = []
196 lengths_array_ba = []
197
198 index_array = 0 155 index_array = 0
199 for i, s, count in zip(groupUnique, space, quantAfterRegion): 156 for i, count in zip(groupUnique, quantAfterRegion):
200 index_of_current_region = numpy.where(group == i)[0] 157 index_of_current_region = numpy.where(group == i)[0]
201
202 plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) 158 plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure)
203 if re.search('^(\d)+_(\d)+', str(mut_array[0, 0])) is None: 159 if re.search('_(\d)+_(\d)+$', str(mut_array[0, 0])) is None:
204 nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1])) 160 nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1]))
205 else: 161 else:
206 nr_tags_ab = len(mut_array[index_of_current_region, 1]) 162 nr_tags_ab = len(mut_array[index_of_current_region, 1])
207 163 plt.text(0.75, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure)
208 plt.text(0.7, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure) 164 s = s + 0.02
209
210 alt_group_i = alt_group[index_of_current_region]
211 alternative = numpy.where(alt_group_i != "=")[0]
212 unique_alt = numpy.unique(alt_group_i[alternative])
213 lengths_of_alt_aligned_tags = []
214 if len(alternative) != 0:
215 for alt in unique_alt:
216 ind_alt_tags = numpy.where(alt_group_i == alt)[0]
217 name = "{:,} to {}".format(len(ind_alt_tags), alt)
218 lengths_of_alt_aligned_tags.append(name)
219 ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0]
220 name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i)
221 lengths_of_alt_aligned_tags.append(name_inverse)
222 plt.text(0.78, 0.14 - s, "{}\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure)
223 lengths_array_ab.append(nr_tags_ab)
224 lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags))
225 else:
226 plt.text(0.78, 0.14 - s, "=\n", size=11,transform=plt.gcf().transFigure)
227 lengths_array_ab.append(nr_tags_ab)
228 lengths_array_ba.append(nr_tags_ab)
229 index_array += 1
230 165
231 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) 166 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
232 plt.xlabel("Family size", fontsize=14) 167 plt.xlabel("Family size", fontsize=14)
233 plt.ylabel("Absolute Frequency", fontsize=14) 168 plt.ylabel("Absolute Frequency", fontsize=14)
234 plt.grid(b=True, which="major", color="#424242", linestyle=":") 169 plt.grid(b=True, which="major", color="#424242", linestyle=":")
241 output_file.write("{}AB{}BA\n".format(sep, sep)) 176 output_file.write("{}AB{}BA\n".format(sep, sep))
242 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) 177 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))
243 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) 178 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
244 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) 179 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
245 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) 180 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
246 output_file.write("total nr. of tags{}{}\n".format(sep, length_regions)) 181 output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions/2))
247 182
248 output_file.write("\n\nValues from family size distribution\n") 183 output_file.write("\n\nValues from family size distribution\n")
249 output_file.write("{}".format(sep)) 184 output_file.write("{}".format(sep))
250 for i in groupUnique: 185 for i in groupUnique:
251 output_file.write("{}{}".format(i, sep)) 186 output_file.write("{}{}".format(i, sep))
271 output_file.write("{}{}".format(int(sum(counts[0])), sep)) 206 output_file.write("{}{}".format(int(sum(counts[0])), sep))
272 else: 207 else:
273 for i in counts[0]: 208 for i in counts[0]:
274 output_file.write("{}{}".format(int(sum(i)), sep)) 209 output_file.write("{}{}".format(int(sum(i)), sep))
275 output_file.write("\n") 210 output_file.write("\n")
276 output_file.write("\n\nRegion{}total nr. of ab{}ba tags\n".format(sep, sep)) 211 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the count of the tags per region.\n")
277 212 output_file.write("\n\nRegion{}total nr. of tags per region\n".format(sep, sep))
278 for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique): 213
279 output_file.write("{}{}{}{}{}\n".format(i, sep, ab, sep, ba)) 214 for i, count in zip(groupUnique, quantAfterRegion):
280 215 output_file.write("{}{}{}\n".format(i,sep,len(count) / 2))
281 print("Files successfully created!") 216 print("Files successfully created!")
282 217
283 218
284 if __name__ == '__main__': 219 if __name__ == '__main__':
285 sys.exit(compare_read_families_refGenome(sys.argv)) 220 sys.exit(compare_read_families_refGenome(sys.argv))