comparison bin/edger_dge_table_to_bedgraph @ 1:a4a4c88783ea draft

planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/edger_with_design_matrix commit 2700e500a4fb135a20ede7d52221a9d31f1aaa5e-dirty
author yhoogstrate
date Tue, 01 Sep 2015 04:59:05 -0400
parents
children
comparison
equal deleted inserted replaced
0:acb56603b268 1:a4a4c88783ea
1 #!/usr/bin/env python
2
3 import re,sys,os,os.path,argparse,textwrap,datetime
4
5 class GTF:
6 def __init__(self,filename,features=["exon"],symbol="gene_id"):
7 self.features = features
8 if(re.match("^[a-zA-Z0-9_\-]+$",symbol)):
9 self.symbol = symbol
10 else:
11 raise ValueError('False symbol matching symbol: '+str(symbol))
12 self.index = {}
13 self.parse(filename)
14
15 def parse(self,filename):
16 with open(filename) as infile:
17 for line in infile:
18 self.parse_line(line)
19
20 def parse_line(self,line):
21 line = line.strip().split("\t")
22 if(len(line) == 9):
23 if(line[2] in self.features):
24 gene_id = self.parse_column_9(line[8])
25 if(gene_id):
26 if(not self.index.has_key(gene_id)):
27 self.index[gene_id] = []
28 self.index[gene_id].append([line[0],line[3],line[4]])
29
30 def parse_column_9(self,line):
31 query = self.symbol+'[ =]+([^ ;]+)'
32 m = re.search(query, line)
33
34 if(m):
35 return m.group(1).strip("'").strip('"')
36 else:
37 return None
38
39 def get(self,key):
40 try:
41 return self.index[key]
42 except:
43 return False
44
45
46 class EdgeR_table:
47 def __init__(self,table,gtf,columns=[3,7]):
48 self.index = {}
49 self.gtf = gtf
50 self.columns = columns
51 self.parse(table)
52
53 def parse(self,filename):
54 i = 0
55 with open(filename) as infile:
56 for line in infile:
57 if(i == 0):
58 self.parse_header(line)
59 else:
60 self.parse_line(line)
61 i += 1
62
63 def parse_header(self,line):
64 params = line.strip().split("\t")
65 if(params[1].lower().find("genes") == -1):
66 raise ValueError('False header in file - no "genes" in 2nd colum: '+line)
67 if(params[2].lower().find("logfc") == -1):
68 raise ValueError('False header in file - no "logfc" in 3rd colum: '+line)
69 if(params[3].lower().find("logcpm") == -1):
70 raise ValueError('False header in file - no "logcpm" in 4th colum: '+line)
71 if(params[4].lower().find("lr") == -1):
72 raise ValueError('False header in file - no "lr" in 5th colum: '+line)
73 if(params[5].lower().find("pvalue") == -1):
74 raise ValueError('False header in file - no "pvalue" in 6th colum: '+line)
75 if(params[6].lower().find("fdr") == -1):
76 raise ValueError('False header in file - no "fdr" in 7th colum: '+line)
77
78 def parse_line(self,line):
79 line = line.strip().split("\t")
80
81 if(len(line) == 7):
82 gene_id = line[1].strip('"').strip("'")
83 column_data = {}
84 for column in self.columns:
85 if(column in [6,7]):
86 column_data[column] = str(1.0 - float(line[column-1]))
87 else:
88 column_data[column] = line[column-1]
89
90 locations = self.gtf.get(gene_id)
91 if(not locations):
92 print "Warning: no location found for gene "+gene_id
93 else:
94 for location in locations:
95 self.insert(location,column_data)
96
97 def insert(self,location,data):
98 chrom = location[0]
99 start = location[1]
100 end = location[2]
101
102 if(not self.index.has_key(chrom)):
103 self.index[chrom] = {}
104
105 if(not self.index[chrom].has_key(start)):
106 self.index[chrom][start] = {}
107
108 if(not self.index[chrom][start].has_key(end)):
109 self.index[chrom][start][end] = []
110
111 self.index[chrom][start][end].append(data)
112
113 def export(self,filenames={3:"log_cpm.txt",7:"fdr.txt"}):
114 for column in self.columns:
115 fh = open(filenames[column],"w")
116
117 buf = False
118
119 for chrom in sorted(self.index.keys()):
120 for start in sorted(self.index[chrom].keys()):
121 for end in sorted(self.index[chrom][start].keys()):
122 fh.write(chrom+"\t"+start+"\t"+end+"\t"+self.index[chrom][start][end][0][column]+"\n")
123
124 fh.close()
125
126 os.system("sort -k1,1V -k2,2g -k3,3g '"+filenames[column]+"' > '"+filenames[column]+".sorted'")
127
128
129 def remove_overlap_in_bedgraph(bedgraph_file_dirty,bedgraph_file_clean):
130 fh = open(bedgraph_file_clean,"w")
131 buf = False
132
133 with open(bedgraph_file_dirty,"r") as f:
134 for line in f:
135 cur = line.strip().split("\t")
136 cur[1] = int(cur[1])
137 cur[2] = int(cur[2])
138
139 if(not buf):
140 buf = cur
141 else:
142 if(cur[0] == buf[0] and cur[1] <= buf[2] ):
143 if(buf[1] == cur[1]): #is subset
144 newscore = (float(buf[3])+float(cur[3]))/2
145 buf[2] = cur[2]
146 buf[3] = newscore
147 else:
148 c1 = buf[1]
149 c2 = cur[1]
150 c3 = min(buf[2],cur[2])
151 c4 = max(buf[2],cur[2])
152
153 fh.write(buf[0]+"\t"+str(c1)+"\t"+str(c2-1)+"\t"+str(buf[3])+"\n")
154
155 newscore = (float(buf[3])+float(cur[3]))/2
156 #fh.write(buf[0]+"\t"+str(c2+1)+"\t"+str(c3)+"\t"+str(newscore)+"\tp2\n")
157 #buf = [buf[0], c3+1 , c4 , cur[3]]
158
159 buf = [buf[0], c2 , c4 , cur[3]]
160
161 # find if buf is a subset -> if so, merge and send to buffer
162 # or find the overlapping region
163
164 # if current is overlapping with buffer; merge:
165 ## [ ] < buf
166 ## [ ] < cur
167 ##
168 ## [ ] < buf
169 ## [ ] < cur
170 ## 111112222333333 << write 1 and 2 and keep 3 in buf
171 else:
172 fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n")
173 buf=cur
174
175 fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n")
176 fh.close()
177
178
179
180 if __name__ == "__main__":
181 parser = argparse.ArgumentParser()
182
183 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,epilog="For more info please visit:\n<https://github.com/yhoogstrate/fuma>")
184
185 parser.add_argument("-t",help="CPM table to extract columns from",nargs=1,required=True)
186 parser.add_argument("-g",help="GTF file used to extract genomic location",nargs=1,required=True)
187
188 parser.add_argument("-c3",help="Output (bedgraph) for column 3 (logFC)",nargs="?",required=False)
189 parser.add_argument("-c4",help="Output (bedgraph) for column 4 (logCPM)",nargs="?",required=False)
190 parser.add_argument("-c5",help="Output (bedgraph) for column 5 (LR)",nargs="?",required=False)
191 parser.add_argument("-c6",help="Output (bedgraph) for column 6 (PValue)",nargs="?",required=False)
192 parser.add_argument("-c7",help="Output (bedgraph) for column 7 (FDR)",nargs="?",required=False)
193
194 args = parser.parse_args()
195
196 #files = {3:"VCAP_logFC.hg19.bedgraph",7:"VCAP_fdr.hg19.bedgraph"}
197 files = {}
198
199 if(args.c3):
200 files[3] = args.c3
201 if(args.c4):
202 files[4] = args.c4
203 if(args.c5):
204 files[5] = args.c5
205 if(args.c6):
206 files[6] = args.c6
207 if(args.c7):
208 files[7] = args.c7
209
210 print "Parsing GTF file"
211 g = GTF(args.g[0])
212
213 print "Parsing EdgeR table"
214 e = EdgeR_table(args.t[0],g,files.keys())
215
216 print "Exporting raw bedgraph(s)"
217 e.export(files)
218
219 print "Removing overlapping entries in bedgraph(s)"
220 for key in files.keys():
221 remove_overlap_in_bedgraph(files[key]+".sorted",files[key])
222 os.system("rm '"+files[key]+".sorted'")