comparison fsd_regions.py @ 4:b202c97deabe draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit dfaab79252a858e8df16bbea3607ebf1b6962e5a
author mheinzl
date Mon, 08 Oct 2018 05:53:50 -0400
parents 2631864873d7
children 52454637bc45
comparison
equal deleted inserted replaced
3:85d870b8ae92 4:b202c97deabe
6 # Contact: monika.heinzl@edumail.at 6 # Contact: monika.heinzl@edumail.at
7 # 7 #
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS 8 # Takes at least one TABULAR file with tags before the alignment to the SSCS
9 # and a TXT with tags of reads that overlap the regions of the reference genome as input. 9 # and a TXT with tags of reads that overlap the regions of the reference genome as input.
10 # The program produces a plot which shows the distribution of family sizes of the tags from the input files and 10 # The program produces a plot which shows the distribution of family sizes of the tags from the input files and
11 # a CSV file with the data of the plot. 11 # a tabular file with the data of the plot.
12 12
13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf 13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf
14 14
15 import numpy
16 import matplotlib.pyplot as plt
17 import argparse 15 import argparse
18 import sys 16 import sys
19 import os 17
18 import matplotlib.pyplot as plt
19 import numpy
20 from matplotlib.backends.backend_pdf import PdfPages 20 from matplotlib.backends.backend_pdf import PdfPages
21
22 plt.switch_backend('agg')
23
21 24
22 def readFileReferenceFree(file, delim): 25 def readFileReferenceFree(file, delim):
23 with open(file, 'r') as dest_f: 26 with open(file, 'r') as dest_f:
24 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') 27 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
25 return(data_array) 28 return(data_array)
26 29
30
27 def make_argparser(): 31 def make_argparser():
28 parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome') 32 parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome')
29 parser.add_argument('--inputFile', 33 parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.')
30 help='Tabular File with three columns: ab or ba, tag and family size.')
31 parser.add_argument('--inputName1') 34 parser.add_argument('--inputName1')
32 parser.add_argument('--ref_genome', 35 parser.add_argument('--ref_genome', help='TXT File with tags of reads that overlap the region.')
33 help='TXT File with tags of reads that overlap the region.') 36 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf and tabular file.')
34 parser.add_argument('--output_pdf', default="data.pdf", type=str, 37 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the pdf and tabular file.')
35 help='Name of the pdf and csv file.')
36 parser.add_argument('--output_csv', default="data.csv", type=str,
37 help='Name of the pdf and csv file.')
38 parser.add_argument('--sep', default=",",
39 help='Separator in the csv file.')
40 return parser 38 return parser
39
41 40
42 def compare_read_families_refGenome(argv): 41 def compare_read_families_refGenome(argv):
43 parser = make_argparser() 42 parser = make_argparser()
44 args = parser.parse_args(argv[1:]) 43 args = parser.parse_args(argv[1:])
45 44
46 firstFile = args.inputFile 45 firstFile = args.inputFile
47 name1 = args.inputName1 46 name1 = args.inputName1
48 name1 = name1.split(".tabular")[0] 47 name1 = name1.split(".tabular")[0]
49 refGenome = args.ref_genome 48 refGenome = args.ref_genome
50 title_file = args.output_pdf 49 title_file = args.output_pdf
51 title_file2 = args.output_csv 50 title_file2 = args.output_tabular
52 sep = args.sep 51 sep = "\t"
53
54 if type(sep) is not str or len(sep) > 1:
55 print("Error: --sep must be a single character.")
56 exit(3)
57 52
58 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: 53 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
59 data_array = readFileReferenceFree(firstFile, "\t") 54 data_array = readFileReferenceFree(firstFile, "\t")
60 55
61 mut_array = readFileReferenceFree(refGenome, " ") 56 mut_array = readFileReferenceFree(refGenome, " ")
103 quantAfterRegion.append(quantAll) 98 quantAfterRegion.append(quantAll)
104 99
105 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) 100 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion))
106 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) 101 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion))
107 102
108 ### PLOT ### 103 # PLOT
109 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 104 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
110 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color 105 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color
111 plt.rcParams['xtick.labelsize'] = 14 106 plt.rcParams['xtick.labelsize'] = 14
112 plt.rcParams['ytick.labelsize'] = 14 107 plt.rcParams['ytick.labelsize'] = 14
113 plt.rcParams['patch.edgecolor'] = "black" 108 plt.rcParams['patch.edgecolor'] = "black"
154 for i, s, count in zip(groupUnique, space, quantAfterRegion): 149 for i, s, count in zip(groupUnique, space, quantAfterRegion):
155 plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) 150 plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure)
156 plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) 151 plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)
157 152
158 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) 153 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
159 #plt.title(name1, fontsize=14) 154 # plt.title(name1, fontsize=14)
160 plt.xlabel("Family size", fontsize=14) 155 plt.xlabel("Family size", fontsize=14)
161 plt.ylabel("Absolute Frequency", fontsize=14) 156 plt.ylabel("Absolute Frequency", fontsize=14)
162 plt.grid(b=True, which="major", color="#424242", linestyle=":") 157 plt.grid(b=True, which="major", color="#424242", linestyle=":")
163 plt.margins(0.01, None) 158 plt.margins(0.01, None)
164 159
173 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) 168 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
174 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) 169 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
175 output_file.write("\n\nValues from family size distribution\n") 170 output_file.write("\n\nValues from family size distribution\n")
176 output_file.write("{}".format(sep)) 171 output_file.write("{}".format(sep))
177 for i in groupUnique: 172 for i in groupUnique:
178 output_file.write("{}{}".format(i,sep)) 173 output_file.write("{}{}".format(i, sep))
179 output_file.write("\n") 174 output_file.write("\n")
180 j=0 175 j = 0
181 for fs in counts[1][0:len(counts[1])-1]: 176 for fs in counts[1][0:len(counts[1]) - 1]:
182 if fs == 21: 177 if fs == 21:
183 fs = ">20" 178 fs = ">20"
184 else: 179 else:
185 fs = "={}".format(fs) 180 fs = "={}".format(fs)
186 output_file.write("FS{}{}".format(fs,sep)) 181 output_file.write("FS{}{}".format(fs, sep))
187 for n in range(len(groupUnique)): 182 for n in range(len(groupUnique)):
188 output_file.write("{}{}".format(int(counts[0][n][j]), sep)) 183 output_file.write("{}{}".format(int(counts[0][n][j]), sep))
189 output_file.write("\n") 184 output_file.write("\n")
190 j+=1 185 j += 1
191 output_file.write("sum{}".format(sep)) 186 output_file.write("sum{}".format(sep))
192 for i in counts[0]: 187 for i in counts[0]:
193 output_file.write("{}{}".format(int(sum(i)), sep)) 188 output_file.write("{}{}".format(int(sum(i)), sep))
194 output_file.write("\n") 189 output_file.write("\n")
195 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") 190 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n")
196 output_file.write("Region{}total nr. of tags per region\n".format(sep)) 191 output_file.write("Region{}total nr. of tags per region\n".format(sep))
197 for i, count in zip(groupUnique, quantAfterRegion): 192 for i, count in zip(groupUnique, quantAfterRegion):
198 output_file.write("{}{}{}\n".format(i,sep,len(count) / 2)) 193 output_file.write("{}{}{}\n".format(i, sep, len(count) / 2))
199 output_file.write("sum of tags{}{}\n".format(sep,length_regions)) 194 output_file.write("sum of tags{}{}\n".format(sep, length_regions))
200 195
201 print("Files successfully created!") 196 print("Files successfully created!")
202 #print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd())) 197 # print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd()))
198
203 199
204 if __name__ == '__main__': 200 if __name__ == '__main__':
205 sys.exit(compare_read_families_refGenome(sys.argv)) 201 sys.exit(compare_read_families_refGenome(sys.argv))