comparison fsd_regions.py @ 0:b82fdb006304 draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/fsd_regions commit 6055f8c5c052f528ff85fb5e0d43b4500830637a
author mheinzl
date Thu, 10 May 2018 07:28:39 -0400
parents
children 9ce2b4089c1b
comparison
equal deleted inserted replaced
-1:000000000000 0:b82fdb006304
1 #!/usr/bin/env python
2
3 # Family size distribution of tags which were aligned to the reference genome
4 #
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
6 # Contact: monika.heinzl@edumail.at
7 #
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS
9 # and a TXT with tags of reads that overlap the regions of the reference genome as input.
10 # The program produces a plot which shows the distribution of family sizes of the tags from the input files and
11 # a CSV file with the data of the plot.
12
13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf
14
15 import numpy
16 import matplotlib.pyplot as plt
17 import argparse
18 import sys
19 import os
20 from matplotlib.backends.backend_pdf import PdfPages
21
22 def readFileReferenceFree(file, delim):
23 with open(file, 'r') as dest_f:
24 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
25 return(data_array)
26
27 def make_argparser():
28 parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome')
29 parser.add_argument('--inputFile',
30 help='Tabular File with three columns: ab or ba, tag and family size.')
31 parser.add_argument('--inputName1')
32 parser.add_argument('--ref_genome',
33 help='TXT File with tags of reads that overlap the region.')
34 parser.add_argument('--output_csv', default="data.csv", type=str,
35 help='Name of the pdf and csv file.')
36 parser.add_argument('--output_pdf', default="data.pdf", type=str,
37 help='Name of the pdf and csv file.')
38 parser.add_argument('--sep', default=",",
39 help='Separator in the csv file.')
40 return parser
41
42 def compare_read_families_refGenome(argv):
43 parser = make_argparser()
44 args = parser.parse_args(argv[1:])
45
46 firstFile = args.inputFile
47 name1 = args.inputName1
48 name1 = name1.split(".tabular")[0]
49 refGenome = args.ref_genome
50 title_file = args.output_pdf
51 title_file2 = args.output_csv
52 sep = args.sep
53
54 if type(sep) is not str or len(sep) > 1:
55 print("Error: --sep must be a single character.")
56 exit(3)
57
58 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
59 data_array = readFileReferenceFree(firstFile, "\t")
60
61 mut_array = readFileReferenceFree(refGenome, " ")
62 length_regions = len(mut_array)
63
64 seq = numpy.array(data_array[:, 1])
65 tags = numpy.array(data_array[:, 2])
66 quant = numpy.array(data_array[:, 0]).astype(int)
67 group = numpy.array(mut_array[:, 0])
68
69 all_ab = seq[numpy.where(tags == "ab")[0]]
70 all_ba = seq[numpy.where(tags == "ba")[0]]
71 quant_ab = quant[numpy.where(tags == "ab")[0]]
72 quant_ba = quant[numpy.where(tags == "ba")[0]]
73
74 seqDic_ab = dict(zip(all_ab, quant_ab))
75 seqDic_ba = dict(zip(all_ba, quant_ba))
76
77 seq_mut = numpy.array(mut_array[:, 1])
78
79 groupUnique, group_index = numpy.unique(group, return_index=True)
80 groupUnique = groupUnique[numpy.argsort(group_index)]
81
82 lst_ab = []
83 for i in seq_mut:
84 lst_ab.append(seqDic_ab.get(i))
85
86 lst_ba = []
87 for i in seq_mut:
88 lst_ba.append(seqDic_ba.get(i))
89
90 quant_ab = numpy.array(lst_ab)
91 quant_ba = numpy.array(lst_ba)
92
93 quantAfterRegion = []
94 for i in groupUnique:
95 dataAB = quant_ab[numpy.where(group == i)[0]]
96 dataBA = quant_ba[numpy.where(group == i)[0]]
97 bigFamilies = numpy.where(dataAB > 20)[0]
98 dataAB[bigFamilies] = 22
99 bigFamilies = numpy.where(dataBA > 20)[0]
100 dataBA[bigFamilies] = 22
101
102 quantAll = numpy.concatenate((dataAB, dataBA))
103 quantAfterRegion.append(quantAll)
104
105 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion))
106 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion))
107
108 ### PLOT ###
109 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
110 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color
111 plt.rcParams['xtick.labelsize'] = 12
112 plt.rcParams['ytick.labelsize'] = 12
113 plt.rcParams['patch.edgecolor'] = "black"
114 fig = plt.figure()
115 plt.subplots_adjust(bottom=0.3)
116
117 colors = ["#6E6E6E", "#0431B4", "#5FB404", "#B40431", "#F4FA58", "#DF7401", "#81DAF5"]
118
119 col = []
120 for i in range(0, len(groupUnique)):
121 col.append(colors[i])
122
123 counts = plt.hist(quantAfterRegion, bins=range(minimumX, maximumX + 1), stacked=False, label=groupUnique,
124 align="left", alpha=1, color=col, edgecolor="black", linewidth=1)
125 ticks = numpy.arange(minimumX - 1, maximumX, 1)
126
127 ticks1 = map(str, ticks)
128 ticks1[len(ticks1) - 1] = ">20"
129 plt.xticks(numpy.array(ticks), ticks1)
130 count = numpy.bincount(map(int, quant_ab)) # original counts
131
132 legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads="
133 plt.text(0.15, 0.105, legend, size=11, transform=plt.gcf().transFigure)
134
135 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \
136 .format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count),
137 sum(numpy.array(data_array[:, 0]).astype(int)))
138 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
139
140 count2 = numpy.bincount(map(int, quant_ba)) # original counts
141
142 legend = "BA\n{}\n{}\n{:.5f}" \
143 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
144 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure)
145
146 legend1 = "total nr. of tags="
147 legend2 = "total numbers * \n{:,}".format(length_regions)
148 plt.text(0.6, 0.2, legend1, size=11, transform=plt.gcf().transFigure)
149 plt.text(0.75, 0.2, legend2, size=11, transform=plt.gcf().transFigure)
150 legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n"
151 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
152
153 space = numpy.arange(0, len(groupUnique), 0.02)
154 for i, s, count in zip(groupUnique, space, quantAfterRegion):
155 plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure)
156 plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)
157
158 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
159 plt.title(name1, fontsize=14)
160 plt.xlabel("No. of Family Members", fontsize=12)
161 plt.ylabel("Absolute Frequency", fontsize=12)
162 plt.grid(b=True, which="major", color="#424242", linestyle=":")
163 plt.margins(0.01, None)
164
165 # plt.savefig("{}_regions.pdf".format(title_file), bbox_inch="tight")
166 pdf.savefig(fig, bbox_inch="tight")
167 plt.close()
168
169 output_file.write("Dataset:{}{}\n".format(sep, firstFile))
170 output_file.write("{}AB{}BA\n".format(sep, sep))
171 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))
172 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
173 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
174 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
175 output_file.write("\n\nValues from family size distribution\n")
176 output_file.write("{}".format(sep))
177 for i in groupUnique:
178 output_file.write("{}{}".format(i,sep))
179 output_file.write("\n")
180 j=0
181 for fs in counts[1][0:len(counts[1])-1]:
182 if fs == 21:
183 fs = ">20"
184 else:
185 fs = "={}".format(fs)
186 output_file.write("FS{}{}".format(fs,sep))
187 for n in range(len(groupUnique)):
188 output_file.write("{}{}".format(int(counts[0][n][j]), sep))
189 output_file.write("\n")
190 j+=1
191 output_file.write("sum{}".format(sep))
192 for i in counts[0]:
193 output_file.write("{}{}".format(int(sum(i)), sep))
194 output_file.write("\n")
195 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n")
196 output_file.write("Region{}total nr. of tags per region\n".format(sep))
197 for i, count in zip(groupUnique, quantAfterRegion):
198 output_file.write("{}{}{}\n".format(i,sep,len(count) / 2))
199 output_file.write("sum of tags{}{}\n".format(sep,length_regions))
200
201 print("Files successfully created!")
202 #print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd()))
203
204 if __name__ == '__main__':
205 sys.exit(compare_read_families_refGenome(sys.argv))