comparison fsd_beforevsafter.py @ 5:1eae0524b285 draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/fsd_beforevsafter commit f4eb0a7cd4fd5baaa9afe0c931afb57ac6abc0c1
author mheinzl
date Wed, 23 May 2018 14:59:33 -0400
parents 327c40a821ed
children 238a71241876
comparison
equal deleted inserted replaced
4:2c6cff101f49 5:1eae0524b285
119 119
120 # get family sizes, tag for the duplicates 120 # get family sizes, tag for the duplicates
121 duplTags_double = quant[numpy.in1d(seq, d)] 121 duplTags_double = quant[numpy.in1d(seq, d)]
122 list1.append(duplTags_double) 122 list1.append(duplTags_double)
123 colors.append("#0000FF") 123 colors.append("#0000FF")
124 labels.append("before alignment\nto SSCS") 124 labels.append("before SSCS building")
125 125
126 duplTags = duplTags_double[0::2] # ab of DCS 126 duplTags = duplTags_double[0::2] # ab of DCS
127 duplTagsBA = duplTags_double[1::2] # ba of DCS 127 duplTagsBA = duplTags_double[1::2] # ba of DCS
128 128
129 d2 = d[(duplTags >= 3) & (duplTagsBA >= 3)] # ab and ba FS>=3 129 d2 = d[(duplTags >= 3) & (duplTagsBA >= 3)] # ab and ba FS>=3
131 # all SSCSs FS>=3 131 # all SSCSs FS>=3
132 seq_unique, seqUnique_index = numpy.unique(seq, return_index=True) 132 seq_unique, seqUnique_index = numpy.unique(seq, return_index=True)
133 seq_unique_FS = quant[seqUnique_index] 133 seq_unique_FS = quant[seqUnique_index]
134 seq_unique_FS3 = seq_unique_FS[seq_unique_FS >= 3] 134 seq_unique_FS3 = seq_unique_FS[seq_unique_FS >= 3]
135 135
136 legend1 = "\ntotal nr. of tags (unique, FS>=1):\nDCS (before alignment to SSCS, FS>=1):\ntotal nr. of tags (unique, FS>=3):\nDCS (before alignment to SSCS, FS>=3):" 136 legend1 = "\ntotal nr. of tags (unique, FS>=1):\nDCS (before SSCS building, FS>=1):\ntotal nr. of tags (unique, FS>=3):\nDCS (before SSCS building, FS>=3):"
137 legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags), 137 legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags),
138 len(seq_unique_FS3), len(d2)) 138 len(seq_unique_FS3), len(d2))
139 plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure) 139 plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure)
140 plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure) 140 plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure)
141 141
144 ### group large family sizes in the plot of fasta files 144 ### group large family sizes in the plot of fasta files
145 bigFamilies = numpy.where(fs_consensus > 20)[0] 145 bigFamilies = numpy.where(fs_consensus > 20)[0]
146 fs_consensus[bigFamilies] = 22 146 fs_consensus[bigFamilies] = 22
147 list1.append(fs_consensus) 147 list1.append(fs_consensus)
148 colors.append("#298A08") 148 colors.append("#298A08")
149 labels.append("make DCS") 149 labels.append("after DCS building")
150 legend3 = "make DCS:" 150 legend3 = "after DCS building:"
151 legend4 = "{:,}".format(len(tag_consensus)) 151 legend4 = "{:,}".format(len(tag_consensus))
152 plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure) 152 plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure)
153 plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure) 153 plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure)
154 154
155 ### data after trimming 155 ### data after trimming
185 quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref)) 185 quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref))
186 bigFamilies = numpy.where(quant_all_ref > 20)[0] # group large family sizes 186 bigFamilies = numpy.where(quant_all_ref > 20)[0] # group large family sizes
187 quant_all_ref[bigFamilies] = 22 187 quant_all_ref[bigFamilies] = 22
188 list1.append(quant_all_ref) 188 list1.append(quant_all_ref)
189 colors.append("#04cec7") 189 colors.append("#04cec7")
190 labels.append("after alignment\nto reference genome") 190 labels.append("after alignment\nto reference")
191 legend7 = "after alignment to reference genome:" 191 legend7 = "after alignment to reference:"
192 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome 192 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome
193 legend8 = "{:,}".format(length_DCS_ref) 193 legend8 = "{:,}".format(length_DCS_ref)
194 plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure) 194 plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure)
195 plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure) 195 plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure)
196 196
247 output_file.write("\n\nValues from family size distribution\n") 247 output_file.write("\n\nValues from family size distribution\n")
248 248
249 if afterTrimming == str(None) and ref_genome == str(None): 249 if afterTrimming == str(None) and ref_genome == str(None):
250 if afterTrimming == str(None): 250 if afterTrimming == str(None):
251 output_file.write( 251 output_file.write(
252 "{}before alignment to SSCS{}make DCS\n".format(sep, sep)) 252 "{}before SSCS buidling{}after DCS building\n".format(sep, sep))
253 elif ref_genome == str(None): 253 elif ref_genome == str(None):
254 output_file.write( 254 output_file.write(
255 "{}before alignment to SSCS{}make DCS\n".format(sep, sep)) 255 "{}before SSCS building{}atfer DCS building\n".format(sep, sep))
256 256
257 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], 257 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
258 counts[0][1][2:len(counts[0][1])]): 258 counts[0][1][2:len(counts[0][1])]):
259 if fs == 21: 259 if fs == 21:
260 fs = ">20" 260 fs = ">20"
266 "sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])))) 266 "sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
267 267
268 elif afterTrimming == str(None) or ref_genome == str(None): 268 elif afterTrimming == str(None) or ref_genome == str(None):
269 if afterTrimming == str(None): 269 if afterTrimming == str(None):
270 output_file.write( 270 output_file.write(
271 "{}before alignment to SSCS{}make DCS{}after alignment to reference genome\n".format(sep, sep, sep)) 271 "{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
272 elif ref_genome == str(None): 272 elif ref_genome == str(None):
273 output_file.write( 273 output_file.write(
274 "{}before alignment to SSCS{}make DCS{}after trimming\n".format(sep, sep, sep)) 274 "{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep))
275 275
276 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], 276 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
277 counts[0][1][2:len(counts[0][1])],counts[0][2][2:len(counts[0][2])]): 277 counts[0][1][2:len(counts[0][1])],counts[0][2][2:len(counts[0][2])]):
278 if fs == 21: 278 if fs == 21:
279 fs = ">20" 279 fs = ">20"
284 output_file.write( 284 output_file.write(
285 "sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2])))) 285 "sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2]))))
286 286
287 else: 287 else:
288 output_file.write( 288 output_file.write(
289 "{}before alignment to SSCS{}make DCS{}after trimming{}after alignment to reference genome\n".format( 289 "{}before SSCS building{}after DCS building{}after trimming{}after alignment to reference\n".format(
290 sep, sep, sep, sep)) 290 sep, sep, sep, sep))
291 for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], 291 for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
292 counts[0][1][2:len(counts[0][1])], 292 counts[0][1][2:len(counts[0][1])],
293 counts[0][2][2:len(counts[0][2])], 293 counts[0][2][2:len(counts[0][2])],
294 counts[0][3][2:len(counts[0][3])]): 294 counts[0][3][2:len(counts[0][3])]):
303 "sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, 303 "sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep,
304 int(sum(counts[0][2])), sep, int(sum(counts[0][3])))) 304 int(sum(counts[0][2])), sep, int(sum(counts[0][3]))))
305 305
306 output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n") 306 output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n")
307 output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS))) 307 output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS)))
308 output_file.write("DCS (before alignment to SSCS, FS>=1){}{}\n".format(sep, len(duplTags))) 308 output_file.write("DCS (before SSCS building, FS>=1){}{}\n".format(sep, len(duplTags)))
309 output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3))) 309 output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3)))
310 output_file.write("DCS (before alignment to SSCS, FS>=3){}{}\n".format(sep, len(d2))) 310 output_file.write("DCS (before SSCS building, FS>=3){}{}\n".format(sep, len(d2)))
311 output_file.write("make DCS{}{}\n".format(sep, len(tag_consensus))) 311 output_file.write("after DCS building{}{}\n".format(sep, len(tag_consensus)))
312 if afterTrimming != str(None): 312 if afterTrimming != str(None):
313 output_file.write("after trimming{}{}\n".format(sep, len(tag_trimming))) 313 output_file.write("after trimming{}{}\n".format(sep, len(tag_trimming)))
314 if ref_genome != str(None): 314 if ref_genome != str(None):
315 output_file.write("after alignment to reference genome{}{}\n".format(sep, length_DCS_ref)) 315 output_file.write("after alignment to reference{}{}\n".format(sep, length_DCS_ref))
316 316
317 print("Files successfully created!") 317 print("Files successfully created!")
318 318
319 if __name__ == '__main__': 319 if __name__ == '__main__':
320 sys.exit(compare_read_families_read_loss(sys.argv)) 320 sys.exit(compare_read_families_read_loss(sys.argv))