Mercurial > repos > mheinzl > fsd
comparison fsd.py @ 18:c825a29a7d9f draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
| author | mheinzl |
|---|---|
| date | Wed, 08 May 2019 07:03:39 -0400 |
| parents | 2e517a54eedc |
| children | b7bccbbee4a7 |
comparison
equal
deleted
inserted
replaced
| 17:2e517a54eedc | 18:c825a29a7d9f |
|---|---|
| 9 # The program produces a plot which shows the distribution of family sizes of the all SSCSs from the input files and | 9 # The program produces a plot which shows the distribution of family sizes of the all SSCSs from the input files and |
| 10 # a tabular file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes. | 10 # a tabular file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes. |
| 11 # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced. | 11 # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced. |
| 12 # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given. | 12 # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given. |
| 13 | 13 |
| 14 # USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf | 14 # USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf |
| 15 | 15 |
| 16 import argparse | 16 import argparse |
| 17 import sys | 17 import sys |
| 18 import os | |
| 18 | 19 |
| 19 import matplotlib.pyplot as plt | 20 import matplotlib.pyplot as plt |
| 20 import numpy | 21 import numpy |
| 21 from matplotlib.backends.backend_pdf import PdfPages | 22 from matplotlib.backends.backend_pdf import PdfPages |
| 22 | 23 |
| 37 parser.add_argument('--inputName2') | 38 parser.add_argument('--inputName2') |
| 38 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') | 39 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') |
| 39 parser.add_argument('--inputName3') | 40 parser.add_argument('--inputName3') |
| 40 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') | 41 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') |
| 41 parser.add_argument('--inputName4') | 42 parser.add_argument('--inputName4') |
| 43 parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.') | |
| 42 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') | 44 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') |
| 43 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') | 45 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') |
| 44 return parser | 46 return parser |
| 45 | 47 |
| 46 | 48 |
| 47 def compare_read_families(argv): | 49 def compare_read_families(argv): |
| 48 | 50 |
| 49 parser = make_argparser() | 51 parser = make_argparser() |
| 50 args = parser.parse_args(argv[1:]) | 52 args = parser.parse_args(argv[1:]) |
| 53 | |
| 51 firstFile = args.inputFile1 | 54 firstFile = args.inputFile1 |
| 52 name1 = args.inputName1 | 55 name1 = args.inputName1 |
| 56 | |
| 53 secondFile = args.inputFile2 | 57 secondFile = args.inputFile2 |
| 54 name2 = args.inputName2 | 58 name2 = args.inputName2 |
| 55 thirdFile = args.inputFile3 | 59 thirdFile = args.inputFile3 |
| 56 name3 = args.inputName3 | 60 name3 = args.inputName3 |
| 57 fourthFile = args.inputFile4 | 61 fourthFile = args.inputFile4 |
| 58 name4 = args.inputName4 | 62 name4 = args.inputName4 |
| 63 log_axis = args.log_axis | |
| 64 | |
| 59 title_file = args.output_tabular | 65 title_file = args.output_tabular |
| 60 title_file2 = args.output_pdf | 66 title_file2 = args.output_pdf |
| 61 | 67 |
| 62 sep = "\t" | 68 sep = "\t" |
| 63 | 69 |
| 68 plt.rcParams['ytick.labelsize'] = 14 | 74 plt.rcParams['ytick.labelsize'] = 14 |
| 69 | 75 |
| 70 list_to_plot = [] | 76 list_to_plot = [] |
| 71 label = [] | 77 label = [] |
| 72 data_array_list = [] | 78 data_array_list = [] |
| 79 list_to_plot_original = [] | |
| 80 colors = [] | |
| 81 | |
| 73 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: | 82 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: |
| 74 fig = plt.figure() | 83 fig = plt.figure() |
| 75 plt.subplots_adjust(bottom=0.25) | 84 fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) |
| 85 fig2 = plt.figure() | |
| 86 fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) | |
| 87 | |
| 88 # plt.subplots_adjust(bottom=0.25) | |
| 76 if firstFile != str(None): | 89 if firstFile != str(None): |
| 77 file1 = readFileReferenceFree(firstFile) | 90 file1 = readFileReferenceFree(firstFile) |
| 78 integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes | 91 integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes |
| 92 list_to_plot_original.append(integers) | |
| 93 colors.append("#0000FF") | |
| 79 | 94 |
| 80 # for plot: replace all big family sizes by 22 | 95 # for plot: replace all big family sizes by 22 |
| 81 data1 = numpy.array(file1[:, 0]).astype(int) | 96 # data1 = numpy.array(file1[:, 0]).astype(int) |
| 82 bigFamilies = numpy.where(data1 > 20)[0] | 97 # bigFamilies = numpy.where(data1 > 20)[0] |
| 83 data1[bigFamilies] = 22 | 98 # data1[bigFamilies] = 22 |
| 84 | 99 if numpy.amax(integers) > 20: |
| 100 bins = numpy.arange(numpy.amin(integers), numpy.amax(integers) + 1) | |
| 101 data1 = numpy.clip(integers, bins[0], bins[-1]) | |
| 102 else: | |
| 103 data1 = integers | |
| 85 name1 = name1.split(".tabular")[0] | 104 name1 = name1.split(".tabular")[0] |
| 86 list_to_plot.append(data1) | 105 list_to_plot.append(data1) |
| 87 label.append(name1) | 106 label.append(name1) |
| 88 data_array_list.append(file1) | 107 data_array_list.append(file1) |
| 89 | 108 |
| 90 legend = "\n\n\n{}".format(name1) | 109 legend = "\n\n\n{}".format(name1) |
| 91 plt.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) | 110 fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) |
| 92 legend1 = "singletons:\nnr. of tags\n{:,}".format(numpy.bincount(data1)[1]) | 111 fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) |
| 93 plt.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) | 112 |
| 94 | 113 legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1)) |
| 95 legend3 = "freq. of tags\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1)) | 114 fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) |
| 96 plt.text(0.41, 0.11, legend3, size=10, transform=plt.gcf().transFigure) | 115 fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) |
| 97 | 116 |
| 98 legend3b = "PE reads\n{:.3f}".format(float(numpy.bincount(data1)[1]) / sum(integers)) | 117 legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers)) |
| 99 plt.text(0.5, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) | 118 fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) |
| 119 fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) | |
| 100 | 120 |
| 101 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) | 121 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) |
| 102 plt.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) | 122 fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) |
| 123 fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) | |
| 103 | 124 |
| 104 legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) | 125 legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) |
| 105 plt.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) | 126 fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) |
| 127 fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) | |
| 106 | 128 |
| 107 legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) | 129 legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) |
| 108 plt.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) | 130 fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) |
| 131 fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) | |
| 109 | 132 |
| 110 legend6b = "PE reads\n{:,}".format(sum(integers)) | 133 legend6b = "PE reads\n{:,}".format(sum(integers)) |
| 111 plt.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) | 134 fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) |
| 135 fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) | |
| 112 | 136 |
| 113 if secondFile != str(None): | 137 if secondFile != str(None): |
| 114 file2 = readFileReferenceFree(secondFile) | 138 file2 = readFileReferenceFree(secondFile) |
| 115 integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes | 139 integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes |
| 116 | 140 list_to_plot_original.append(integers2) |
| 117 data2 = numpy.asarray(file2[:, 0]).astype(int) | 141 colors.append("#298A08") |
| 118 bigFamilies2 = numpy.where(data2 > 20)[0] | 142 |
| 119 data2[bigFamilies2] = 22 | 143 # data2 = numpy.asarray(file2[:, 0]).astype(int) |
| 120 | 144 # bigFamilies2 = numpy.where(data2 > 20)[0] |
| 145 # data2[bigFamilies2] = 22 | |
| 146 | |
| 147 if numpy.amax(integers) > 20: | |
| 148 bins = numpy.arange(numpy.amin(integers2), numpy.amax(integers2) + 1) | |
| 149 data2 = numpy.clip(integers2, bins[0], bins[-1]) | |
| 150 else: | |
| 151 data2 = integers2 | |
| 121 list_to_plot.append(data2) | 152 list_to_plot.append(data2) |
| 122 name2 = name2.split(".tabular")[0] | 153 name2 = name2.split(".tabular")[0] |
| 123 label.append(name2) | 154 label.append(name2) |
| 124 data_array_list.append(file2) | 155 data_array_list.append(file2) |
| 125 | 156 |
| 126 plt.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) | 157 fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) |
| 127 | 158 fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) |
| 128 legend1 = "{:,}".format(numpy.bincount(data2)[1]) | 159 |
| 129 plt.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) | 160 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2)) |
| 130 | 161 fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) |
| 131 legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2)) | 162 fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) |
| 132 plt.text(0.41, 0.09, legend3, size=10, transform=plt.gcf().transFigure) | 163 |
| 133 | 164 legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2)) |
| 134 legend3b = "{:.3f}".format(float(numpy.bincount(data2)[1]) / sum(integers2)) | 165 fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) |
| 135 plt.text(0.5, 0.09, legend3b, size=10, transform=plt.gcf().transFigure) | 166 fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) |
| 136 | 167 |
| 137 legend4 = "{:,} ({:.3f})".format( | 168 legend4 = "{:,} ({:.3f})".format( |
| 138 numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int), | 169 numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int), |
| 139 float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) | 170 float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) |
| 140 plt.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) | 171 fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) |
| 172 fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) | |
| 141 | 173 |
| 142 legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) | 174 legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) |
| 143 plt.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) | 175 fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) |
| 176 fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) | |
| 144 | 177 |
| 145 legend6 = "{:,}".format(len(data2)) | 178 legend6 = "{:,}".format(len(data2)) |
| 146 plt.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) | 179 fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) |
| 180 fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) | |
| 147 | 181 |
| 148 legend6b = "{:,}".format(sum(integers2)) | 182 legend6b = "{:,}".format(sum(integers2)) |
| 149 plt.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) | 183 fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) |
| 184 fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) | |
| 150 | 185 |
| 151 if thirdFile != str(None): | 186 if thirdFile != str(None): |
| 152 file3 = readFileReferenceFree(thirdFile) | 187 file3 = readFileReferenceFree(thirdFile) |
| 153 integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes | 188 integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes |
| 154 | 189 list_to_plot_original.append(integers3) |
| 155 data3 = numpy.asarray(file3[:, 0]).astype(int) | 190 colors.append("#DF0101") |
| 156 bigFamilies3 = numpy.where(data3 > 20)[0] | 191 |
| 157 data3[bigFamilies3] = 22 | 192 # data3 = numpy.asarray(file3[:, 0]).astype(int) |
| 158 | 193 # bigFamilies3 = numpy.where(data3 > 20)[0] |
| 194 # data3[bigFamilies3] = 22 | |
| 195 | |
| 196 if numpy.amax(integers3) > 20: | |
| 197 bins = numpy.arange(numpy.amin(integers3), numpy.amax(integers3) + 1) | |
| 198 data3 = numpy.clip(integers3, bins[0], bins[-1]) | |
| 199 else: | |
| 200 data3 = integers3 | |
| 159 list_to_plot.append(data3) | 201 list_to_plot.append(data3) |
| 160 name3 = name3.split(".tabular")[0] | 202 name3 = name3.split(".tabular")[0] |
| 161 label.append(name3) | 203 label.append(name3) |
| 162 data_array_list.append(file3) | 204 data_array_list.append(file3) |
| 163 | 205 |
| 164 plt.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) | 206 fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) |
| 165 | 207 fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) |
| 166 legend1 = "{:,}".format(numpy.bincount(data3)[1]) | 208 |
| 167 plt.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) | 209 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3)) |
| 168 | 210 fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) |
| 169 legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3)) | 211 fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) |
| 170 plt.text(0.41, 0.07, legend3, size=10, transform=plt.gcf().transFigure) | 212 |
| 171 | 213 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3)) |
| 172 legend3b = "{:.3f}".format(float(numpy.bincount(data3)[1]) / sum(integers3)) | 214 fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) |
| 173 plt.text(0.5, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) | 215 fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) |
| 174 | 216 |
| 175 legend4 = "{:,} ({:.3f})".format( | 217 legend4 = "{:,} ({:.3f})".format( |
| 176 numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int), | 218 numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int), |
| 177 float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) | 219 float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) |
| 178 plt.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) | 220 fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) |
| 221 fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) | |
| 179 | 222 |
| 180 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), | 223 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), |
| 181 float(sum(integers3[integers3 > 20])) / sum(integers3)) | 224 float(sum(integers3[integers3 > 20])) / sum(integers3)) |
| 182 plt.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) | 225 fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) |
| 226 fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) | |
| 183 | 227 |
| 184 legend6 = "{:,}".format(len(data3)) | 228 legend6 = "{:,}".format(len(data3)) |
| 185 plt.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) | 229 fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) |
| 230 fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) | |
| 186 | 231 |
| 187 legend6b = "{:,}".format(sum(integers3)) | 232 legend6b = "{:,}".format(sum(integers3)) |
| 188 plt.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) | 233 fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) |
| 234 fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) | |
| 189 | 235 |
| 190 if fourthFile != str(None): | 236 if fourthFile != str(None): |
| 191 file4 = readFileReferenceFree(fourthFile) | 237 file4 = readFileReferenceFree(fourthFile) |
| 192 integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes | 238 integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes |
| 193 | 239 list_to_plot_original.append(integers4) |
| 194 data4 = numpy.asarray(file4[:, 0]).astype(int) | 240 colors.append("#04cec7") |
| 195 | 241 |
| 196 bigFamilies4 = numpy.where(data4 > 20)[0] | 242 # data4 = numpy.asarray(file4[:, 0]).astype(int) |
| 197 data4[bigFamilies4] = 22 | 243 # bigFamilies4 = numpy.where(data4 > 20)[0] |
| 198 | 244 # data4[bigFamilies4] = 22 |
| 245 if numpy.amax(integers4) > 20: | |
| 246 bins = numpy.arange(numpy.amin(integers4), numpy.amax(integers4) + 1) | |
| 247 data4 = numpy.clip(integers4, bins[0], bins[-1]) | |
| 248 else: | |
| 249 data4 = integers4 | |
| 199 list_to_plot.append(data4) | 250 list_to_plot.append(data4) |
| 200 name4 = name4.split(".tabular")[0] | 251 name4 = name4.split(".tabular")[0] |
| 201 label.append(name4) | 252 label.append(name4) |
| 202 data_array_list.append(file4) | 253 data_array_list.append(file4) |
| 203 | 254 |
| 204 plt.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) | 255 fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) |
| 205 | 256 fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) |
| 206 legend1 = "{:,}".format(numpy.bincount(data4)[1]) | 257 |
| 207 plt.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) | 258 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4)) |
| 208 | 259 fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) |
| 209 legend3 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4)) | 260 fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) |
| 210 plt.text(0.41, 0.05, legend3, size=10, transform=plt.gcf().transFigure) | 261 |
| 211 | 262 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4)) |
| 212 legend3b = "{:.3f}".format(float(numpy.bincount(data4)[1]) / sum(integers4)) | 263 fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) |
| 213 plt.text(0.5, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) | 264 fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) |
| 214 | 265 |
| 215 legend4 = "{:,} ({:.3f})".format( | 266 legend4 = "{:,} ({:.3f})".format( |
| 216 numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int), | 267 numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int), |
| 217 float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) | 268 float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) |
| 218 plt.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) | 269 fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) |
| 270 fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) | |
| 219 | 271 |
| 220 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), | 272 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), |
| 221 float(sum(integers4[integers4 > 20])) / sum(integers4)) | 273 float(sum(integers4[integers4 > 20])) / sum(integers4)) |
| 222 plt.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) | 274 fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) |
| 275 fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) | |
| 223 | 276 |
| 224 legend6 = "{:,}".format(len(data4)) | 277 legend6 = "{:,}".format(len(data4)) |
| 225 plt.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) | 278 fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) |
| 279 fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) | |
| 226 | 280 |
| 227 legend6b = "{:,}".format(sum(integers4)) | 281 legend6b = "{:,}".format(sum(integers4)) |
| 228 plt.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) | 282 fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) |
| 283 fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) | |
| 229 | 284 |
| 230 maximumX = numpy.amax(numpy.concatenate(list_to_plot)) | 285 maximumX = numpy.amax(numpy.concatenate(list_to_plot)) |
| 231 minimumX = numpy.amin(numpy.concatenate(list_to_plot)) | 286 minimumX = numpy.amin(numpy.concatenate(list_to_plot)) |
| 232 | 287 bins = numpy.arange(minimumX, maximumX + 1) |
| 233 counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black", | 288 list_to_plot2 = list_to_plot |
| 234 linewidth=1, label=label, align="left", rwidth=0.8, alpha=0.7) | 289 to_plot = ["Absolute frequencies", "Relative frequencies"] |
| 235 | 290 plt.xticks([], []) |
| 236 ticks = numpy.arange(minimumX - 1, maximumX, 1) | 291 plt.yticks([], []) |
| 237 ticks1 = map(str, ticks) | 292 fig.suptitle('Family Size Distribution (tags)', fontsize=14) |
| 238 ticks1[len(ticks1) - 1] = ">20" | 293 |
| 239 plt.xticks(numpy.array(ticks), ticks1) | 294 for l in range(len(to_plot)): |
| 240 | 295 ax = fig.add_subplot(2, 1, l+1) |
| 241 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | 296 ticks = numpy.arange(1, 22, 1) |
| 242 # plt.title("Family Size Distribution", fontsize=14) | 297 ticks1 = map(str, ticks) |
| 243 plt.xlabel("Family size", fontsize=14) | 298 if maximumX > 20: |
| 244 plt.ylabel("Absolute Frequency", fontsize=14) | 299 ticks1[len(ticks1) - 1] = ">20" |
| 245 plt.margins(0.01, None) | 300 |
| 246 plt.grid(b=True, which="major", color="#424242", linestyle=":") | 301 if to_plot[l] == "Relative frequencies": |
| 302 counts_rel = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8, normed=True) | |
| 303 else: | |
| 304 counts = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8) | |
| 305 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | |
| 306 | |
| 307 ax.set_xticks(numpy.array(ticks)) | |
| 308 ax.set_xticklabels(ticks1) | |
| 309 | |
| 310 ax.set_ylabel(to_plot[l], fontsize=14) | |
| 311 ax.set_xlabel("Family size", fontsize=14) | |
| 312 if log_axis: | |
| 313 ax.set_yscale('log') | |
| 314 ax.grid(b=True, which="major", color="#424242", linestyle=":") | |
| 315 ax.margins(0.01, None) | |
| 247 pdf.savefig(fig) | 316 pdf.savefig(fig) |
| 248 plt.close() | 317 plt.close() |
| 249 | 318 |
| 250 # write data to CSV file | 319 fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14) |
| 251 output_file.write("Values from family size distribution with all datasets\n") | 320 for l in range(len(to_plot)): |
| 321 ax = fig2.add_subplot(2, 1, l + 1) | |
| 322 ticks = numpy.arange(minimumX, maximumX + 1) | |
| 323 ticks1 = map(str, ticks) | |
| 324 if maximumX > 20: | |
| 325 ticks1[len(ticks1) - 1] = ">20" | |
| 326 reads = [] | |
| 327 reads_rel = [] | |
| 328 | |
| 329 barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot) + 1) | |
| 330 | |
| 331 for i in range(len(list_to_plot2)): | |
| 332 unique, c = numpy.unique(list_to_plot2[i], return_counts=True) | |
| 333 new_c = [] | |
| 334 new_unique = [] | |
| 335 | |
| 336 for t in ticks: | |
| 337 if t not in unique: | |
| 338 new_c.append(0) # add zero count of not occuring | |
| 339 new_unique.append(t) | |
| 340 else: | |
| 341 c_idx = numpy.where(t == unique)[0] | |
| 342 new_c.append(c[c_idx]) | |
| 343 new_unique.append(unique[c_idx]) | |
| 344 y = numpy.array(new_unique) * numpy.array(new_c) | |
| 345 if len([list_to_plot_original > 20]) > 0: | |
| 346 y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20]) | |
| 347 reads.append(y) | |
| 348 reads_rel.append(list(numpy.float_(y)) / sum(y)) | |
| 349 | |
| 350 x = list(numpy.arange(numpy.amin(unique), numpy.amax(unique) + 1).astype(float)) | |
| 351 x = [xi + barWidth for xi in x] | |
| 352 | |
| 353 if to_plot[l] == "Relative frequencies": | |
| 354 counts2_rel = ax.bar(x, list(numpy.float_(y)) / sum(y), align="edge", width=1./(len(list_to_plot) + 1), | |
| 355 edgecolor="black", label=label[i], alpha=1, linewidth=1, color=colors[i]) | |
| 356 else: | |
| 357 counts2 = ax.bar(x, y, align="edge", width=1./len(list_to_plot), edgecolor="black", label=label[i], | |
| 358 alpha=1, linewidth=1, color=colors[i]) | |
| 359 if i == len(list_to_plot2): | |
| 360 barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1) | |
| 361 else: | |
| 362 barWidth += 1. / (len(list_to_plot) + 1) | |
| 363 | |
| 364 if to_plot[l] == "Absolute frequencies": | |
| 365 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | |
| 366 else: | |
| 367 ax.set_xlabel("Family size", fontsize=14) | |
| 368 | |
| 369 ax.set_xticks(numpy.array(ticks)) | |
| 370 ax.set_xticklabels(ticks1) | |
| 371 ax.set_ylabel(to_plot[l], fontsize=14) | |
| 372 if log_axis: | |
| 373 ax.set_yscale('log') | |
| 374 ax.grid(b=True, which="major", color="#424242", linestyle=":") | |
| 375 ax.margins(0.01, None) | |
| 376 | |
| 377 pdf.savefig(fig2) | |
| 378 plt.close() | |
| 379 | |
| 380 # write data to CSV file tags | |
| 381 output_file.write("Values from family size distribution with all datasets (tags)\n") | |
| 252 output_file.write("\nFamily size") | 382 output_file.write("\nFamily size") |
| 253 for i in label: | 383 for i in label: |
| 254 output_file.write("{}{}".format(sep, i)) | 384 output_file.write("{}{}".format(sep, i)) |
| 255 # output_file.write("{}sum".format(sep)) | 385 # output_file.write("{}sum".format(sep)) |
| 256 output_file.write("\n") | 386 output_file.write("\n") |
| 273 output_file.write("{}{}".format(int(sum(counts[0])), sep)) | 403 output_file.write("{}{}".format(int(sum(counts[0])), sep)) |
| 274 else: | 404 else: |
| 275 for i in counts[0]: | 405 for i in counts[0]: |
| 276 output_file.write("{}{}".format(int(sum(i)), sep)) | 406 output_file.write("{}{}".format(int(sum(i)), sep)) |
| 277 | 407 |
| 408 # write data to CSV file PE reads | |
| 409 output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n") | |
| 410 output_file.write("\nFamily size") | |
| 411 for i in label: | |
| 412 output_file.write("{}{}".format(sep, i)) | |
| 413 # output_file.write("{}sum".format(sep)) | |
| 414 output_file.write("\n") | |
| 415 j = 0 | |
| 416 for fs in bins: | |
| 417 if fs == 21: | |
| 418 fs = ">20" | |
| 419 else: | |
| 420 fs = "={}".format(fs) | |
| 421 output_file.write("FS{}{}".format(fs, sep)) | |
| 422 if len(label) == 1: | |
| 423 output_file.write("{}{}".format(int(reads[0][j]), sep)) | |
| 424 else: | |
| 425 for n in range(len(label)): | |
| 426 output_file.write("{}{}".format(int(reads[n][j]), sep)) | |
| 427 output_file.write("\n") | |
| 428 j += 1 | |
| 429 output_file.write("sum{}".format(sep)) | |
| 430 if len(label) == 1: | |
| 431 output_file.write("{}{}".format(int(sum(reads)), sep)) | |
| 432 else: | |
| 433 for i in reads: | |
| 434 output_file.write("{}{}".format(int(sum(i)), sep)) | |
| 435 output_file.write("\n") | |
| 436 | |
| 278 # Family size distribution after DCS and SSCS | 437 # Family size distribution after DCS and SSCS |
| 279 for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): | 438 for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): |
| 280 maximumX = numpy.amax(dataset) | 439 maximumX = numpy.amax(dataset) |
| 281 minimumX = numpy.amin(dataset) | 440 minimumX = numpy.amin(dataset) |
| 282 | 441 |
| 296 duplTags_o = duplTags_double_o[0::2] # ab of DCS | 455 duplTags_o = duplTags_double_o[0::2] # ab of DCS |
| 297 | 456 |
| 298 duplTagsBA = duplTags_double[1::2] # ba of DCS | 457 duplTagsBA = duplTags_double[1::2] # ba of DCS |
| 299 duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS | 458 duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS |
| 300 | 459 |
| 460 # duplTags_double_tag = tags[numpy.in1d(seq, d)] | |
| 461 # duplTags_double_seq = seq[numpy.in1d(seq, d)] | |
| 462 | |
| 301 # get family sizes for SSCS with no partner | 463 # get family sizes for SSCS with no partner |
| 302 ab = numpy.where(tags == "ab")[0] | 464 ab = numpy.where(tags == "ab")[0] |
| 303 abSeq = seq[ab] | 465 abSeq = seq[ab] |
| 304 ab_o = data_o[ab] | 466 ab_o = data_o[ab] |
| 305 ab = data[ab] | 467 ab = data[ab] |
| 320 # information for family size >= 3 | 482 # information for family size >= 3 |
| 321 dataAB_FS3 = dataAB[dataAB >= 3] | 483 dataAB_FS3 = dataAB[dataAB >= 3] |
| 322 dataAB_FS3_o = dataAB_o[dataAB_o >= 3] | 484 dataAB_FS3_o = dataAB_o[dataAB_o >= 3] |
| 323 dataBA_FS3 = dataBA[dataBA >= 3] | 485 dataBA_FS3 = dataBA[dataBA >= 3] |
| 324 dataBA_FS3_o = dataBA_o[dataBA_o >= 3] | 486 dataBA_FS3_o = dataBA_o[dataBA_o >= 3] |
| 325 ab_FS3 = ab[ab >= 3] | 487 # ab_FS3 = ab[ab >= 3] |
| 326 ba_FS3 = ba[ba >= 3] | 488 # ba_FS3 = ba[ba >= 3] |
| 327 ab_FS3_o = ab_o[ab_o >= 3] | 489 # ab_FS3_o = ab_o[ab_o >= 3] |
| 328 ba_FS3_o = ba_o[ba_o >= 3] | 490 # ba_FS3_o = ba_o[ba_o >= 3] |
| 329 | 491 |
| 330 duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 | 492 duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 |
| 331 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 | 493 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 |
| 332 duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA) # both ab and ba strands with FS>=3 | 494 duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA) # both ab and ba strands with FS>=3 |
| 333 | 495 |
| 335 duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ab+ba with FS>=3 | 497 duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ab+ba with FS>=3 |
| 336 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 | 498 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 |
| 337 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 | 499 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 |
| 338 | 500 |
| 339 fig = plt.figure() | 501 fig = plt.figure() |
| 340 plt.subplots_adjust(bottom=0.3) | 502 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) |
| 341 counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"], | 503 counts = plt.hist(list1, bins=numpy.arange(minimumX, maximumX + 2), stacked=True, label=["duplex", "ab", "ba"], |
| 342 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], | 504 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], |
| 343 rwidth=0.8) | 505 rwidth=0.8) |
| 344 # tick labels of x axis | 506 # tick labels of x axis |
| 345 ticks = numpy.arange(minimumX - 1, maximumX, 1) | 507 ticks = numpy.arange(1, 22, 1) |
| 346 ticks1 = map(str, ticks) | 508 ticks1 = map(str, ticks) |
| 347 ticks1[len(ticks1) - 1] = ">20" | 509 if maximumX > 20: |
| 510 ticks1[len(ticks1) - 1] = ">20" | |
| 348 plt.xticks(numpy.array(ticks), ticks1) | 511 plt.xticks(numpy.array(ticks), ticks1) |
| 349 singl = counts[0][2][0] # singletons | 512 singl = counts[0][2][0] # singletons |
| 350 last = counts[0][2][len(counts[0][0]) - 1] # large families | 513 last = counts[0][2][len(counts[0][0]) - 1] # large families |
| 351 | 514 if log_axis: |
| 515 plt.yscale('log') | |
| 352 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | 516 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) |
| 353 plt.title(name_file, fontsize=14) | 517 plt.title(name_file, fontsize=14) |
| 354 plt.xlabel("Family size", fontsize=14) | 518 plt.xlabel("Family size", fontsize=14) |
| 355 plt.ylabel("Absolute Frequency", fontsize=14) | 519 plt.ylabel("Absolute Frequency", fontsize=14) |
| 356 plt.margins(0.01, None) | 520 plt.margins(0.01, None) |
| 409 output_file.write("\nDataset:{}{}\n".format(sep, name_file)) | 573 output_file.write("\nDataset:{}{}\n".format(sep, name_file)) |
| 410 output_file.write("max. family size:{}{}\n".format(sep, max(integers))) | 574 output_file.write("max. family size:{}{}\n".format(sep, max(integers))) |
| 411 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) | 575 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) |
| 412 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) | 576 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) |
| 413 | 577 |
| 414 output_file.write("{}singletons:{}{}{}family size > 20:\n".format(sep, sep, sep, sep)) | 578 output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) |
| 415 output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) | 579 output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) |
| 416 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( | 580 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( |
| 417 name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep, | 581 name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep, |
| 418 last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o))) | 582 last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o))) |
| 419 | 583 |
