comparison fsd_regions.py @ 14:6879295d3f11 draft default tip

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author mheinzl
date Tue, 08 Jan 2019 09:50:01 -0500
parents 63432e6f6a61
children
comparison
equal deleted inserted replaced
13:63432e6f6a61 14:6879295d3f11
131 seqDic_ba = dict(zip(all_ba, quant_ba)) 131 seqDic_ba = dict(zip(all_ba, quant_ba))
132 132
133 lst_ab = [] 133 lst_ab = []
134 lst_ba = [] 134 lst_ba = []
135 quantAfterRegion = [] 135 quantAfterRegion = []
136 length_regions = 0
137 for i in group: 136 for i in group:
138 lst_ab_r = [] 137 lst_ab_r = []
139 lst_ba_r = [] 138 lst_ba_r = []
140 seq_mut = qname_dict[i] 139 seq_mut = qname_dict[i]
141 if rangesFile == str(None): 140 if rangesFile == str(None):
142 seq_mut, seqMut_index = np.unique(np.array(seq_mut), return_index=True) 141 seq_mut, seqMut_index = np.unique(np.array(seq_mut), return_index=True)
143 length_regions = length_regions + len(seq_mut) * 2
144 for r in seq_mut: 142 for r in seq_mut:
145 count_ab = seqDic_ab.get(r) 143 if re.search('\.', r): # BAM file with SSCS tags
146 count_ba = seqDic_ba.get(r) 144 splitted_tag = re.split('\.', r)[0]
147 lst_ab_r.append(count_ab) 145 direction = re.split('\.', r)[1]
148 lst_ab.append(count_ab) 146
149 lst_ba_r.append(count_ba) 147 if direction == "ab":
150 lst_ba.append(count_ba) 148 count_ab = seqDic_ab.get(splitted_tag)
149 lst_ab_r.append(count_ab)
150 lst_ab.append(count_ab)
151
152 else:
153 count_ba = seqDic_ba.get(splitted_tag)
154 lst_ba_r.append(count_ba)
155 lst_ba.append(count_ba)
156 else: # BAM file with DCS tags
157 count_ab = seqDic_ab.get(r)
158 lst_ab_r.append(count_ab)
159 lst_ab.append(count_ab)
160 count_ba = seqDic_ba.get(r)
161 lst_ba_r.append(count_ba)
162 lst_ba.append(count_ba)
151 163
152 dataAB = np.array(lst_ab_r) 164 dataAB = np.array(lst_ab_r)
153 dataBA = np.array(lst_ba_r) 165 dataBA = np.array(lst_ba_r)
154 bigFamilies = np.where(dataAB > 20)[0] 166 bigFamilies = np.where(dataAB > 20)[0]
155 dataAB[bigFamilies] = 22 167 dataAB[bigFamilies] = 22
159 quantAll = np.concatenate((dataAB, dataBA)) 171 quantAll = np.concatenate((dataAB, dataBA))
160 quantAfterRegion.append(quantAll) 172 quantAfterRegion.append(quantAll)
161 173
162 quant_ab = np.array(lst_ab) 174 quant_ab = np.array(lst_ab)
163 quant_ba = np.array(lst_ba) 175 quant_ba = np.array(lst_ba)
176 length_regions = len(np.concatenate(quantAfterRegion))
164 177
165 maximumX = np.amax(np.concatenate(quantAfterRegion)) 178 maximumX = np.amax(np.concatenate(quantAfterRegion))
166 minimumX = np.amin(np.concatenate(quantAfterRegion)) 179 minimumX = np.amin(np.concatenate(quantAfterRegion))
167 180
168 # PLOT 181 # PLOT
187 ticks1 = map(str, ticks) 200 ticks1 = map(str, ticks)
188 ticks1[len(ticks1) - 1] = ">20" 201 ticks1[len(ticks1) - 1] = ">20"
189 plt.xticks(np.array(ticks), ticks1) 202 plt.xticks(np.array(ticks), ticks1)
190 count = np.bincount(map(int, quant_ab)) # original counts 203 count = np.bincount(map(int, quant_ab)) # original counts
191 204
192 legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)" 205 legend = "max. family size:\nabsolute frequency:" \
206 "\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)"
193 plt.text(0.15, 0.085, legend, size=11, transform=plt.gcf().transFigure) 207 plt.text(0.15, 0.085, legend, size=11, transform=plt.gcf().transFigure)
194 208
195 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}".format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), sum(np.array(data_array[:, 0]).astype(int))) 209 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}".format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), sum(np.array(data_array[:, 0]).astype(int)))
196 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure) 210 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
197 211
200 legend = "BA\n{}\n{}\n{:.5f}" \ 214 legend = "BA\n{}\n{}\n{:.5f}" \
201 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) 215 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
202 plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure) 216 plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)
203 217
204 plt.text(0.53, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure) 218 plt.text(0.53, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure)
205 plt.text(0.85, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11, 219 if re.search('\.', r): # BAM file with SSCS tags
220 plt.text(0.85, 0.2125, "{:,}".format(length_regions), size=11,
221 transform=plt.gcf().transFigure)
222 else:
223 plt.text(0.85, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11,
206 transform=plt.gcf().transFigure) 224 transform=plt.gcf().transFigure)
207 225 legend4 = "* In the plot, both family sizes of the ab and ba strands were used." \
208 legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n" 226 "\nWhereas the total numbers indicate only the single count of the tags per region.\n"
209 plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure) 227 plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure)
210 228
211 space = 0 229 space = 0
212 for i, count in zip(group, quantAfterRegion): 230 for i, count in zip(group, quantAfterRegion):
213 plt.text(0.53, 0.15 - space, "{}:\n".format(i), size=11, transform=plt.gcf().transFigure) 231 plt.text(0.53, 0.15 - space, "{}:\n".format(i), size=11, transform=plt.gcf().transFigure)
214 plt.text(0.85, 0.15 - space, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) 232 if re.search('\.', r): # BAM file with SSCS tags
233 plt.text(0.85, 0.15 - space, "{:,}\n".format(len(count)), size=11, transform=plt.gcf().transFigure)
234 else:
235 plt.text(0.85, 0.15 - space, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)
215 space = space + 0.02 236 space = space + 0.02
216 237
217 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) 238 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
218 plt.xlabel("Family size", fontsize=14) 239 plt.xlabel("Family size", fontsize=14)
219 plt.ylabel("Absolute Frequency", fontsize=14) 240 plt.ylabel("Absolute Frequency", fontsize=14)
227 output_file.write("{}AB{}BA\n".format(sep, sep)) 248 output_file.write("{}AB{}BA\n".format(sep, sep))
228 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) 249 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))
229 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) 250 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
230 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) 251 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
231 output_file.write("total nr. of reads{}{}\n".format(sep, sum(np.array(data_array[:, 0]).astype(int)))) 252 output_file.write("total nr. of reads{}{}\n".format(sep, sum(np.array(data_array[:, 0]).astype(int))))
232 output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions / 2)) 253 if re.search('\.', r): # BAM file with SSCS tags
254 output_file.write("total nr. of tags{}{}\n".format(sep, length_regions))
255 else:
256 output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions / 2))
233 output_file.write("\n\nValues from family size distribution\n") 257 output_file.write("\n\nValues from family size distribution\n")
234 output_file.write("{}".format(sep)) 258 output_file.write("{}".format(sep))
235 for i in group: 259 for i in group:
236 output_file.write("{}{}".format(i, sep)) 260 output_file.write("{}{}".format(i, sep))
237 output_file.write("\n") 261 output_file.write("\n")
256 output_file.write("{}{}".format(int(sum(counts[0])), sep)) 280 output_file.write("{}{}".format(int(sum(counts[0])), sep))
257 else: 281 else:
258 for i in counts[0]: 282 for i in counts[0]:
259 output_file.write("{}{}".format(int(sum(i)), sep)) 283 output_file.write("{}{}".format(int(sum(i)), sep))
260 output_file.write("\n") 284 output_file.write("\n")
261 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") 285 if re.search('\.', r): # BAM file with SSCS tags
286 output_file.write("\n")
287 else:
288 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used."
289 "\nWhereas the total numbers indicate only the single count of the tags per region.\n")
262 output_file.write("Region{}total nr. of tags per region\n".format(sep)) 290 output_file.write("Region{}total nr. of tags per region\n".format(sep))
263 for i, count in zip(group, quantAfterRegion): 291 for i, count in zip(group, quantAfterRegion):
264 output_file.write("{}{}{}\n".format(i, sep, len(count) / 2)) 292 if re.search('\.', r): # BAM file with SSCS tags
293 output_file.write("{}{}{}\n".format(i, sep, len(count)))
294 else:
295 output_file.write("{}{}{}\n".format(i, sep, len(count) / 2))
265 296
266 print("Files successfully created!") 297 print("Files successfully created!")
267 298
268 299
269 if __name__ == '__main__': 300 if __name__ == '__main__':