0
|
1 #Module defines exon, transcript and gene object.
|
|
2 #It is extendable for more attributes and functions.
|
|
3 #It is part of py-PRADA.
|
|
4 #Author: Siyuan Zheng (szheng2@mdanderson.org)
|
|
5 #Last modified at 03/07/2013
|
|
6
|
|
7 class Exon(object):
|
|
8 """Is an object with information about Exons and the Transcripts it is found in.
|
|
9
|
|
10 If given an exon name, returns an object with exon location inforamtion and, gene and transcript information.
|
|
11
|
|
12 e.g.
|
|
13
|
|
14 >>> exon=bioclass.Exon(chr,int(start),int(end),strand,'tx','gene')
|
|
15 exon.start
|
|
16 exon.end
|
|
17 exon.chr
|
|
18 exon.strand
|
|
19 exon.gene
|
|
20 exon.transcript
|
|
21 exon.name
|
|
22 exon.length
|
|
23 """
|
|
24
|
|
25 def __init__(self,chr,start,end,strand,tx,gene):
|
|
26 if not all([isinstance(x,int) for x in [start,end]]):
|
|
27 raise Exception('start,end must be int!')
|
|
28 self.start=start
|
|
29 self.end=end
|
|
30 self.chr=chr
|
|
31 self.strand=strand #'1' or '-1'
|
|
32 self.gene=gene
|
|
33 self.transcript=tx #exon may map to multi-transcripts, but for simplicity, use one
|
|
34 self.name='%s:%s:%s:%s:%s'%(gene,chr,str(start),str(end),strand) #keep gene in the name too
|
|
35 self.length=self._length()
|
|
36 def _length(self):
|
|
37 return abs(self.end-self.start)+1
|
|
38
|
|
39 class Transcript(object):
|
|
40 """
|
|
41 Is an object with information about Transcript and the Gene it is found in.
|
|
42
|
|
43 If given an transcript name, returns an object with transcript location inforamtion, gene information and exon objects. .
|
|
44
|
|
45 e.g.
|
|
46
|
|
47 >>> tx=bioclass.Transcript('tx','gene')
|
|
48 tx.start
|
|
49 tx.end
|
|
50 tx.chr
|
|
51 tx.strand
|
|
52 tx.gene
|
|
53 tx.exon
|
|
54 tx.name
|
|
55 tx.length
|
|
56
|
|
57 #Add new exons to the transcript:
|
|
58 tx.add_exon(exon)
|
|
59
|
|
60 #Relative position of the transcript based on the exons defined:
|
|
61 tx.exon_relative_pos()
|
|
62 """
|
|
63
|
|
64 def __init__(self,name,gene):
|
|
65 self.exon=[]
|
|
66 self.name=name
|
|
67 self.gene=gene
|
|
68 self.length=0
|
|
69 self.strand=None
|
|
70 self.start=None
|
|
71 self.end=None
|
|
72 self.cds_start=None
|
|
73 self.cds_end=None
|
|
74 self.is_primary=None
|
|
75 def _update(self):
|
|
76 self._basics()
|
|
77 self._sort_exon()
|
|
78 def _basics(self):
|
|
79 assert len(self.exon)>0, 'no exon in the transcript'
|
|
80 self.strand=self.exon[0].strand
|
|
81 self.chr=self.exon[0].chr
|
|
82 self.length=reduce(lambda x,y:x+y, [e.length for e in self.exon])
|
|
83 self.start=min([e.start for e in self.exon])
|
|
84 self.end=max([e.end for e in self.exon])
|
|
85 def _sort_exon(self):
|
|
86 sorted_exons=sorted(self.exon, key=lambda x:x.start)
|
|
87 self.exon=sorted_exons
|
|
88 def add_exon(self,exon):
|
|
89 nameset=[x.name for x in self.exon]
|
|
90 if exon.name not in nameset:
|
|
91 self.exon.append(exon)
|
|
92 self._update()
|
|
93 def set_cds(self,start,end):
|
|
94 self.cds_start=start
|
|
95 self.cds_end=end
|
|
96 def set_primary(self,isprim):
|
|
97 self.is_primary=isprim
|
|
98 def exon_relative_pos(self):
|
|
99 L=[x.length for x in self.exon]
|
|
100 pos=[]
|
|
101 if self.strand=='1':
|
|
102 init=0
|
|
103 for item in L:
|
|
104 region=(init+1, init+item)
|
|
105 init=init+item
|
|
106 pos.append(region)
|
|
107 if self.strand=='-1':
|
|
108 init=self.length
|
|
109 for item in L:
|
|
110 region=(init-item+1,init)
|
|
111 init=init-item
|
|
112 pos.append(region)
|
|
113 relpos=dict(zip([x.name for x in self.exon],pos))
|
|
114 return relpos
|
|
115
|
|
116 class Gene(object):
|
|
117 """
|
|
118 Is an object with information about Gene.
|
|
119
|
|
120 If given an gene name, returns an object with gene location inforamtion, transcript and exon objects. .
|
|
121
|
|
122 e.g.
|
|
123
|
|
124 >>> gene=bioclass.Gene('gene')
|
|
125 gene.start
|
|
126 gene.end
|
|
127 gene.chr
|
|
128 gene.strand
|
|
129 gene.transcript
|
|
130 gene.name
|
|
131 gene.length
|
|
132
|
|
133 #Add new transcript to the gene:
|
|
134 gene.add_transcript()
|
|
135
|
|
136 #Obtain a list of all the exons define within that gene:
|
|
137 gene.get_exons()
|
|
138 """
|
|
139
|
|
140 def __init__(self,name):
|
|
141 self.name=name
|
|
142 self.transcript=[]
|
|
143 def _update(self):
|
|
144 self._basics()
|
|
145 def _basics(self):
|
|
146 assert len(self.transcript)>0, 'no transcript in the gene'
|
|
147 self.strand=self.transcript[0].strand
|
|
148 self.chr=self.transcript[0].chr
|
|
149 self.start=min([t.start for t in self.transcript])
|
|
150 self.end=max([t.end for t in self.transcript])
|
|
151
|
|
152 def add_transcript(self,tx):
|
|
153 nameset=[x.name for x in self.transcript]
|
|
154 if tx.name not in nameset:
|
|
155 self.transcript.append(tx)
|
|
156 self._update()
|
|
157 def get_exons(self):
|
|
158 exons={}
|
|
159 for t in self.transcript:
|
|
160 for e in t.exon:
|
|
161 exons[e.name]=e
|
|
162 return exons
|
|
163
|
|
164
|
|
165 if __name__=='__main__':
|
|
166 #below for testing purpose only.
|
|
167 infile=open('/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt')
|
|
168 txdb={} #keep track of all transcripts
|
|
169 exdb={} #keep track of all exons
|
|
170 genedb={} #keep track of all genes
|
|
171 i=0
|
|
172 for line in infile:
|
|
173 i+=1
|
|
174 if i%100000==0:
|
|
175 print '%d exon records loaded'%i
|
|
176 chr,start,end,tx,gene,strand,cat=line.split()
|
|
177 if cat != 'protein_coding':
|
|
178 continue
|
|
179 exon=Exon(chr,int(start),int(end),strand,tx,gene)
|
|
180 exdb[exon.name]=exon
|
|
181 if not txdb.has_key(tx):
|
|
182 txdb[tx]=Transcript(tx,gene)
|
|
183 txdb[tx].add_exon(exon)
|
|
184 for txname in txdb:
|
|
185 t=txdb[txname]
|
|
186 if not genedb.has_key(t.gene):
|
|
187 genedb[t.gene]=Gene(t.gene)
|
|
188 genedb[t.gene].add_transcript(t)
|
|
189 infile.close()
|
|
190
|