Mercurial > repos > yhoogstrate > edger_with_design_matrix
comparison bin/edger_dge_table_to_bedgraph @ 1:a4a4c88783ea draft
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/edger_with_design_matrix commit 2700e500a4fb135a20ede7d52221a9d31f1aaa5e-dirty
author | yhoogstrate |
---|---|
date | Tue, 01 Sep 2015 04:59:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:acb56603b268 | 1:a4a4c88783ea |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import re,sys,os,os.path,argparse,textwrap,datetime | |
4 | |
class GTF:
    """Minimal GTF parser: index genomic locations per gene symbol.

    For every GTF line whose feature type (column 3) is in ``features``, the
    value of the ``symbol`` attribute (column 9) is used as key; ``index``
    maps each key to a list of ``[chrom, start, end]`` string triples.
    """

    def __init__(self, filename, features=None, symbol="gene_id"):
        # Default to exon features; use a None sentinel instead of a shared
        # mutable default argument.
        self.features = ["exon"] if features is None else features
        # Restrict the attribute name to safe characters because it is
        # interpolated into a regular expression in parse_column_9().
        if re.match(r"^[a-zA-Z0-9_-]+$", symbol):
            self.symbol = symbol
        else:
            raise ValueError('False symbol matching symbol: ' + str(symbol))
        self.index = {}
        self.parse(filename)

    def parse(self, filename):
        """Parse the GTF file line by line."""
        with open(filename) as infile:
            for line in infile:
                self.parse_line(line)

    def parse_line(self, line):
        """Index one GTF line if it is a requested feature type."""
        columns = line.strip().split("\t")
        if len(columns) == 9 and columns[2] in self.features:
            gene_id = self.parse_column_9(columns[8])
            if gene_id:
                # 'in' instead of the Python-2-only dict.has_key()
                if gene_id not in self.index:
                    self.index[gene_id] = []
                # chrom, start, end kept as strings exactly as read
                self.index[gene_id].append([columns[0], columns[3], columns[4]])

    def parse_column_9(self, line):
        """Extract the configured attribute's value from GTF column 9.

        Returns the value with surrounding quotes stripped, or None when the
        attribute is absent.
        """
        m = re.search(self.symbol + '[ =]+([^ ;]+)', line)
        if m:
            return m.group(1).strip("'").strip('"')
        return None

    def get(self, key):
        """Return the location list for *key*, or False for unknown genes."""
        try:
            return self.index[key]
        except KeyError:  # narrow: a bare except also hid unrelated errors
            return False
44 | |
45 | |
class EdgeR_table:
    """Parse an EdgeR differential-expression table and export bedgraphs.

    Values of the requested 1-based table columns are indexed by genomic
    location (looked up through a GTF object).  For the p-value-like columns
    6 (PValue) and 7 (FDR) the stored value is ``1 - value`` so that larger
    bedgraph scores mean more significant.
    """

    def __init__(self, table, gtf, columns=None):
        self.index = {}
        self.gtf = gtf
        # None sentinel instead of a shared mutable default argument.
        self.columns = [3, 7] if columns is None else columns
        self.parse(table)

    def parse(self, filename):
        """Parse the table: first line is the header, the rest are genes."""
        with open(filename) as infile:
            for i, line in enumerate(infile):
                if i == 0:
                    self.parse_header(line)
                else:
                    self.parse_line(line)

    def parse_header(self, line):
        """Validate the expected EdgeR column layout; raise on mismatch."""
        params = line.strip().split("\t")
        if params[1].lower().find("genes") == -1:
            raise ValueError('False header in file - no "genes" in 2nd colum: ' + line)
        if params[2].lower().find("logfc") == -1:
            raise ValueError('False header in file - no "logfc" in 3rd colum: ' + line)
        if params[3].lower().find("logcpm") == -1:
            raise ValueError('False header in file - no "logcpm" in 4th colum: ' + line)
        if params[4].lower().find("lr") == -1:
            raise ValueError('False header in file - no "lr" in 5th colum: ' + line)
        if params[5].lower().find("pvalue") == -1:
            raise ValueError('False header in file - no "pvalue" in 6th colum: ' + line)
        if params[6].lower().find("fdr") == -1:
            raise ValueError('False header in file - no "fdr" in 7th colum: ' + line)

    def parse_line(self, line):
        """Index one gene's values at every location the GTF reports for it."""
        parts = line.strip().split("\t")
        if len(parts) == 7:
            gene_id = parts[1].strip('"').strip("'")
            column_data = {}
            for column in self.columns:
                if column in [6, 7]:
                    # PValue/FDR: store 1 - value (higher == more significant).
                    column_data[column] = str(1.0 - float(parts[column - 1]))
                else:
                    column_data[column] = parts[column - 1]

            locations = self.gtf.get(gene_id)
            if not locations:
                # Best effort: genes absent from the GTF are reported, not fatal.
                print("Warning: no location found for gene " + gene_id)
            else:
                for location in locations:
                    self.insert(location, column_data)

    def insert(self, location, data):
        """Store *data* under index[chrom][start][end] (all keys are strings)."""
        chrom, start, end = location

        # 'in' instead of the Python-2-only dict.has_key()
        if chrom not in self.index:
            self.index[chrom] = {}
        if start not in self.index[chrom]:
            self.index[chrom][start] = {}
        if end not in self.index[chrom][start]:
            self.index[chrom][start][end] = []

        self.index[chrom][start][end].append(data)

    def export(self, filenames=None):
        """Write one raw bedgraph per requested column, then create a
        coordinate-sorted copy next to it (suffix ``.sorted``).

        Only the first data entry per location is written.
        """
        if filenames is None:
            filenames = {3: "log_cpm.txt", 7: "fdr.txt"}
        for column in self.columns:
            # Context manager guarantees the handle is closed before sorting.
            with open(filenames[column], "w") as fh:
                for chrom in sorted(self.index.keys()):
                    for start in sorted(self.index[chrom].keys()):
                        for end in sorted(self.index[chrom][start].keys()):
                            fh.write(chrom + "\t" + start + "\t" + end + "\t" +
                                     self.index[chrom][start][end][0][column] + "\n")

            # NOTE(review): shell command with interpolated filename — fine for
            # trusted CLI arguments, unsafe for untrusted input.
            os.system("sort -k1,1V -k2,2g -k3,3g '" + filenames[column] + "' > '" + filenames[column] + ".sorted'")
127 | |
128 | |
def remove_overlap_in_bedgraph(bedgraph_file_dirty, bedgraph_file_clean):
    """Rewrite a coordinate-sorted bedgraph so consecutive entries don't overlap.

    Reads *bedgraph_file_dirty* (must be sorted by chrom, start, end) and
    writes *bedgraph_file_clean*.  For two consecutive overlapping entries:

    - same start: the scores are averaged and the buffered entry's end is
      replaced by the current entry's end;
    - different start: the buffered entry's unique prefix
      ``[buf.start, cur.start - 1]`` is written with the buffered score, and
      the remainder ``[cur.start, max(buf.end, cur.end)]`` stays buffered
      with the current entry's score.

    Non-overlapping entries are flushed unchanged.
    """
    with open(bedgraph_file_clean, "w") as fh:
        buf = False  # last entry not yet written: [chrom, start(int), end(int), score]

        with open(bedgraph_file_dirty, "r") as f:
            for line in f:
                cur = line.strip().split("\t")
                cur[1] = int(cur[1])
                cur[2] = int(cur[2])

                if not buf:
                    buf = cur
                elif cur[0] == buf[0] and cur[1] <= buf[2]:
                    if buf[1] == cur[1]:
                        # Same start (subset): average the scores and take
                        # the current entry's end.
                        buf[3] = (float(buf[3]) + float(cur[3])) / 2
                        buf[2] = cur[2]
                    else:
                        # Partial overlap: flush buf's unique prefix, keep the
                        # remainder buffered with cur's score.
                        fh.write(buf[0] + "\t" + str(buf[1]) + "\t" + str(cur[1] - 1) + "\t" + str(buf[3]) + "\n")
                        buf = [buf[0], cur[1], max(buf[2], cur[2]), cur[3]]
                else:
                    # No overlap: flush the buffered entry and start a new one.
                    fh.write(buf[0] + "\t" + str(buf[1]) + "\t" + str(buf[2]) + "\t" + str(buf[3]) + "\n")
                    buf = cur

        # Flush the trailing entry; guard against an empty input file, which
        # previously crashed on indexing buf == False.
        if buf:
            fh.write(buf[0] + "\t" + str(buf[1]) + "\t" + str(buf[2]) + "\t" + str(buf[3]) + "\n")
177 | |
178 | |
179 | |
if __name__ == "__main__":
    # Single parser (the original built one, then immediately replaced it).
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="For more info please visit:\n<https://github.com/yhoogstrate/fuma>")

    parser.add_argument("-t", help="CPM table to extract columns from", nargs=1, required=True)
    parser.add_argument("-g", help="GTF file used to extract genomic location", nargs=1, required=True)

    parser.add_argument("-c3", help="Output (bedgraph) for column 3 (logFC)", nargs="?", required=False)
    parser.add_argument("-c4", help="Output (bedgraph) for column 4 (logCPM)", nargs="?", required=False)
    parser.add_argument("-c5", help="Output (bedgraph) for column 5 (LR)", nargs="?", required=False)
    parser.add_argument("-c6", help="Output (bedgraph) for column 6 (PValue)", nargs="?", required=False)
    parser.add_argument("-c7", help="Output (bedgraph) for column 7 (FDR)", nargs="?", required=False)

    args = parser.parse_args()

    # Map requested 1-based table column -> output filename, only for the
    # flags that were actually provided.
    files = {}
    for column, filename in [(3, args.c3), (4, args.c4), (5, args.c5),
                             (6, args.c6), (7, args.c7)]:
        if filename:
            files[column] = filename

    print("Parsing GTF file")
    g = GTF(args.g[0])

    print("Parsing EdgeR table")
    e = EdgeR_table(args.t[0], g, list(files.keys()))

    print("Exporting raw bedgraph(s)")
    e.export(files)

    print("Removing overlapping entries in bedgraph(s)")
    for key in files.keys():
        remove_overlap_in_bedgraph(files[key] + ".sorted", files[key])
        # os.remove instead of shelling out to "rm" via os.system
        os.remove(files[key] + ".sorted")