Mercurial > repos > mheinzl > fsd_regions
comparison fsd_regions.py @ 6:26014c24323a draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty
author | mheinzl |
---|---|
date | Fri, 26 Oct 2018 07:54:03 -0400 |
parents | 52454637bc45 |
children | 3b8a0e462021 |
comparison
equal
deleted
inserted
replaced
5:52454637bc45 | 6:26014c24323a |
---|---|
54 | 54 |
55 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: | 55 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: |
56 data_array = readFileReferenceFree(firstFile, "\t") | 56 data_array = readFileReferenceFree(firstFile, "\t") |
57 | 57 |
58 mut_array = readFileReferenceFree(refGenome, " ") | 58 mut_array = readFileReferenceFree(refGenome, " ") |
59 length_regions = len(mut_array) | |
60 group = numpy.array(mut_array[:, 0]) | 59 group = numpy.array(mut_array[:, 0]) |
61 seq_mut = numpy.array(mut_array[:, 1]) | 60 seq_mut = numpy.array(mut_array[:, 1]) |
62 alt_group = numpy.array(mut_array[:, 2]) | |
63 | 61 |
64 seq = numpy.array(data_array[:, 1]) | 62 seq = numpy.array(data_array[:, 1]) |
65 tags = numpy.array(data_array[:, 2]) | 63 tags = numpy.array(data_array[:, 2]) |
66 quant = numpy.array(data_array[:, 0]).astype(int) | 64 quant = numpy.array(data_array[:, 0]).astype(int) |
67 | 65 |
71 quant_ba = quant[numpy.where(tags == "ba")[0]] | 69 quant_ba = quant[numpy.where(tags == "ba")[0]] |
72 | 70 |
73 seqDic_ab = dict(zip(all_ab, quant_ab)) | 71 seqDic_ab = dict(zip(all_ab, quant_ab)) |
74 seqDic_ba = dict(zip(all_ba, quant_ba)) | 72 seqDic_ba = dict(zip(all_ba, quant_ba)) |
75 | 73 |
76 if re.search('^(\d)+_(\d)+', str(mut_array[0,0])) is None: | 74 if re.search('_(\d)+_(\d)+$', str(mut_array[0,0])) is None: |
77 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) | 75 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) |
78 group = mut_array[seqMut_index,0] | 76 group = mut_array[seqMut_index,0] |
79 alt_group = mut_array[seqMut_index,2] | |
80 mut_array = mut_array[seqMut_index,:] | 77 mut_array = mut_array[seqMut_index,:] |
78 length_regions = len(seq_mut)*2 | |
81 | 79 |
82 groupUnique, group_index = numpy.unique(group, return_index=True) | 80 groupUnique, group_index = numpy.unique(group, return_index=True) |
83 groupUnique = groupUnique[numpy.argsort(group_index)] | 81 groupUnique = groupUnique[numpy.argsort(group_index)] |
82 | |
84 lst_ab = [] | 83 lst_ab = [] |
85 lst_ba = [] | 84 lst_ba = [] |
86 for i in seq_mut: | 85 for i in seq_mut: |
87 lst_ab.append(seqDic_ab.get(i)) | 86 lst_ab.append(seqDic_ab.get(i)) |
88 lst_ba.append(seqDic_ba.get(i)) | 87 lst_ba.append(seqDic_ba.get(i)) |
89 | 88 |
90 quant_ab = numpy.array(lst_ab) | 89 quant_ab = numpy.array(lst_ab) |
91 quant_ba = numpy.array(lst_ba) | 90 quant_ba = numpy.array(lst_ba) |
92 | 91 |
93 quantAfterRegion = OrderedDict() | 92 quantAfterRegion = [] |
94 for key in groupUnique: | |
95 quantAfterRegion[key] = [] | |
96 | 93 |
97 for i in groupUnique: | 94 for i in groupUnique: |
98 index_of_current_region = numpy.where(group == i)[0] | 95 dataAB = quant_ab[numpy.where(group == i)[0]] |
99 quant_ba_i = quant_ba[index_of_current_region] | 96 dataBA = quant_ba[numpy.where(group == i)[0]] |
100 alt_group_i = alt_group[index_of_current_region] | |
101 index_alternative_refs = numpy.where(alt_group_i != "=")[0] | |
102 | |
103 dataAB = quant_ab[index_of_current_region] | |
104 bigFamilies = numpy.where(dataAB > 20)[0] | 97 bigFamilies = numpy.where(dataAB > 20)[0] |
105 dataAB[bigFamilies] = 22 | 98 dataAB[bigFamilies] = 22 |
106 for el in dataAB: | 99 bigFamilies = numpy.where(dataBA > 20)[0] |
107 quantAfterRegion[i].append(el) | 100 dataBA[bigFamilies] = 22 |
108 | 101 |
109 if len(index_alternative_refs) == 0: | 102 quantAll = numpy.concatenate((dataAB, dataBA)) |
110 dataBA = quant_ba_i | 103 quantAfterRegion.append(quantAll) |
111 bigFamilies = numpy.where(dataBA > 20)[0] | 104 |
112 dataBA[bigFamilies] = 22 | |
113 for el2 in dataBA: | |
114 quantAfterRegion[i].append(el2) | |
115 else: # get tags where 2nd mate is aligned to a different ref genome | |
116 unique_alt = numpy.unique(alt_group_i[index_alternative_refs]) | |
117 for alt in unique_alt: | |
118 ind_alt_tags = numpy.where(alt_group_i == alt)[0] | |
119 dataBA = quant_ba_i[ind_alt_tags] | |
120 | |
121 bigFamilies = numpy.where(dataBA > 20)[0] | |
122 if len(bigFamilies) != 0: | |
123 if len(bigFamilies) == 1 and type(dataBA) != list: | |
124 dataBA = 22 | |
125 quantAfterRegion[alt].append(dataBA) | |
126 else: | |
127 dataBA[bigFamilies] = 22 | |
128 for el3 in dataBA: | |
129 quantAfterRegion[alt].append(el3) | |
130 | |
131 index_inverse = [x for x in range(0, len(index_of_current_region)) if x not in index_alternative_refs] | |
132 data_BA_other = quant_ba_i[index_inverse] | |
133 bigFamilies_other = numpy.where(data_BA_other > 20)[0] | |
134 | |
135 if len(bigFamilies_other) != 0: | |
136 if len(bigFamilies_other) == 1 and type(data_BA_other) != list: | |
137 data_BA_other = 22 | |
138 quantAfterRegion[i].append(data_BA_other) | |
139 else: | |
140 data_BA_other[bigFamilies_other] = 22 | |
141 for el3 in data_BA_other: | |
142 quantAfterRegion[i].append(el3) | |
143 | |
144 quantAfterRegion = quantAfterRegion.values() | |
145 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) | 105 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) |
146 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) | 106 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) |
147 | 107 |
148 # PLOT | 108 # PLOT |
149 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | 109 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format |
182 legend = "BA\n{}\n{}\n{:.5f}" \ | 142 legend = "BA\n{}\n{}\n{:.5f}" \ |
183 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) | 143 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) |
184 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) | 144 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) |
185 | 145 |
186 plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure) | 146 plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure) |
187 plt.text(0.7, 0.22, "{:,}".format(length_regions), size=11, transform=plt.gcf().transFigure) | 147 plt.text(0.75, 0.22, "{:,} ({:,})".format(length_regions, length_regions/2), size=11, transform=plt.gcf().transFigure) |
188 | 148 |
189 legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.' | 149 # legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.' |
190 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) | 150 # plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) |
191 | 151 |
192 space = numpy.arange(0, len(groupUnique), 0.02) | 152 plt.text(0.75, 0.18, "total nr. of tags per region", size=11, transform=plt.gcf().transFigure) |
193 plt.text(0.7, 0.18, "total number of *\nab", size=11, transform=plt.gcf().transFigure) | 153 #space = numpy.arange(0, len(groupUnique), 0.02) |
194 plt.text(0.78, 0.18, "ba tags", size=11, transform=plt.gcf().transFigure) | 154 s = 0 |
195 lengths_array_ab = [] | |
196 lengths_array_ba = [] | |
197 | |
198 index_array = 0 | 155 index_array = 0 |
199 for i, s, count in zip(groupUnique, space, quantAfterRegion): | 156 for i, count in zip(groupUnique, quantAfterRegion): |
200 index_of_current_region = numpy.where(group == i)[0] | 157 index_of_current_region = numpy.where(group == i)[0] |
201 | |
202 plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) | 158 plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) |
203 if re.search('^(\d)+_(\d)+', str(mut_array[0, 0])) is None: | 159 if re.search('_(\d)+_(\d)+$', str(mut_array[0, 0])) is None: |
204 nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1])) | 160 nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1])) |
205 else: | 161 else: |
206 nr_tags_ab = len(mut_array[index_of_current_region, 1]) | 162 nr_tags_ab = len(mut_array[index_of_current_region, 1]) |
207 | 163 plt.text(0.75, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure) |
208 plt.text(0.7, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure) | 164 s = s + 0.02 |
209 | |
210 alt_group_i = alt_group[index_of_current_region] | |
211 alternative = numpy.where(alt_group_i != "=")[0] | |
212 unique_alt = numpy.unique(alt_group_i[alternative]) | |
213 lengths_of_alt_aligned_tags = [] | |
214 if len(alternative) != 0: | |
215 for alt in unique_alt: | |
216 ind_alt_tags = numpy.where(alt_group_i == alt)[0] | |
217 name = "{:,} to {}".format(len(ind_alt_tags), alt) | |
218 lengths_of_alt_aligned_tags.append(name) | |
219 ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0] | |
220 name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i) | |
221 lengths_of_alt_aligned_tags.append(name_inverse) | |
222 plt.text(0.78, 0.14 - s, "{}\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure) | |
223 lengths_array_ab.append(nr_tags_ab) | |
224 lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags)) | |
225 else: | |
226 plt.text(0.78, 0.14 - s, "=\n", size=11,transform=plt.gcf().transFigure) | |
227 lengths_array_ab.append(nr_tags_ab) | |
228 lengths_array_ba.append(nr_tags_ab) | |
229 index_array += 1 | |
230 | 165 |
231 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | 166 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) |
232 plt.xlabel("Family size", fontsize=14) | 167 plt.xlabel("Family size", fontsize=14) |
233 plt.ylabel("Absolute Frequency", fontsize=14) | 168 plt.ylabel("Absolute Frequency", fontsize=14) |
234 plt.grid(b=True, which="major", color="#424242", linestyle=":") | 169 plt.grid(b=True, which="major", color="#424242", linestyle=":") |
241 output_file.write("{}AB{}BA\n".format(sep, sep)) | 176 output_file.write("{}AB{}BA\n".format(sep, sep)) |
242 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) | 177 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) |
243 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) | 178 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) |
244 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) | 179 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) |
245 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) | 180 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) |
246 output_file.write("total nr. of tags{}{}\n".format(sep, length_regions)) | 181 output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions/2)) |
247 | 182 |
248 output_file.write("\n\nValues from family size distribution\n") | 183 output_file.write("\n\nValues from family size distribution\n") |
249 output_file.write("{}".format(sep)) | 184 output_file.write("{}".format(sep)) |
250 for i in groupUnique: | 185 for i in groupUnique: |
251 output_file.write("{}{}".format(i, sep)) | 186 output_file.write("{}{}".format(i, sep)) |
271 output_file.write("{}{}".format(int(sum(counts[0])), sep)) | 206 output_file.write("{}{}".format(int(sum(counts[0])), sep)) |
272 else: | 207 else: |
273 for i in counts[0]: | 208 for i in counts[0]: |
274 output_file.write("{}{}".format(int(sum(i)), sep)) | 209 output_file.write("{}{}".format(int(sum(i)), sep)) |
275 output_file.write("\n") | 210 output_file.write("\n") |
276 output_file.write("\n\nRegion{}total nr. of ab{}ba tags\n".format(sep, sep)) | 211 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the count of the tags per region.\n") |
277 | 212 output_file.write("\n\nRegion{}total nr. of tags per region\n".format(sep, sep)) |
278 for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique): | 213 |
279 output_file.write("{}{}{}{}{}\n".format(i, sep, ab, sep, ba)) | 214 for i, count in zip(groupUnique, quantAfterRegion): |
280 | 215 output_file.write("{}{}{}\n".format(i,sep,len(count) / 2)) |
281 print("Files successfully created!") | 216 print("Files successfully created!") |
282 | 217 |
283 | 218 |
284 if __name__ == '__main__': | 219 if __name__ == '__main__': |
285 sys.exit(compare_read_families_refGenome(sys.argv)) | 220 sys.exit(compare_read_families_refGenome(sys.argv)) |