comparison pyPRADA_1.2/bioclass.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 #Module defines exon, transcript and gene object.
2 #It is extendable for more attributes and functions.
3 #It is part of py-PRADA.
4 #Author: Siyuan Zheng (szheng2@mdanderson.org)
5 #Last modified at 03/07/2013
6
7 class Exon(object):
8 """Is an object with information about Exons and the Transcripts it is found in.
9
10 If given an exon name, returns an object with exon location inforamtion and, gene and transcript information.
11
12 e.g.
13
14 >>> exon=bioclass.Exon(chr,int(start),int(end),strand,'tx','gene')
15 exon.start
16 exon.end
17 exon.chr
18 exon.strand
19 exon.gene
20 exon.transcript
21 exon.name
22 exon.length
23 """
24
25 def __init__(self,chr,start,end,strand,tx,gene):
26 if not all([isinstance(x,int) for x in [start,end]]):
27 raise Exception('start,end must be int!')
28 self.start=start
29 self.end=end
30 self.chr=chr
31 self.strand=strand #'1' or '-1'
32 self.gene=gene
33 self.transcript=tx #exon may map to multi-transcripts, but for simplicity, use one
34 self.name='%s:%s:%s:%s:%s'%(gene,chr,str(start),str(end),strand) #keep gene in the name too
35 self.length=self._length()
36 def _length(self):
37 return abs(self.end-self.start)+1
38
39 class Transcript(object):
40 """
41 Is an object with information about Transcript and the Gene it is found in.
42
43 If given an transcript name, returns an object with transcript location inforamtion, gene information and exon objects. .
44
45 e.g.
46
47 >>> tx=bioclass.Transcript('tx','gene')
48 tx.start
49 tx.end
50 tx.chr
51 tx.strand
52 tx.gene
53 tx.exon
54 tx.name
55 tx.length
56
57 #Add new exons to the transcript:
58 tx.add_exon(exon)
59
60 #Relative position of the transcript based on the exons defined:
61 tx.exon_relative_pos()
62 """
63
64 def __init__(self,name,gene):
65 self.exon=[]
66 self.name=name
67 self.gene=gene
68 self.length=0
69 self.strand=None
70 self.start=None
71 self.end=None
72 self.cds_start=None
73 self.cds_end=None
74 self.is_primary=None
75 def _update(self):
76 self._basics()
77 self._sort_exon()
78 def _basics(self):
79 assert len(self.exon)>0, 'no exon in the transcript'
80 self.strand=self.exon[0].strand
81 self.chr=self.exon[0].chr
82 self.length=reduce(lambda x,y:x+y, [e.length for e in self.exon])
83 self.start=min([e.start for e in self.exon])
84 self.end=max([e.end for e in self.exon])
85 def _sort_exon(self):
86 sorted_exons=sorted(self.exon, key=lambda x:x.start)
87 self.exon=sorted_exons
88 def add_exon(self,exon):
89 nameset=[x.name for x in self.exon]
90 if exon.name not in nameset:
91 self.exon.append(exon)
92 self._update()
93 def set_cds(self,start,end):
94 self.cds_start=start
95 self.cds_end=end
96 def set_primary(self,isprim):
97 self.is_primary=isprim
98 def exon_relative_pos(self):
99 L=[x.length for x in self.exon]
100 pos=[]
101 if self.strand=='1':
102 init=0
103 for item in L:
104 region=(init+1, init+item)
105 init=init+item
106 pos.append(region)
107 if self.strand=='-1':
108 init=self.length
109 for item in L:
110 region=(init-item+1,init)
111 init=init-item
112 pos.append(region)
113 relpos=dict(zip([x.name for x in self.exon],pos))
114 return relpos
115
116 class Gene(object):
117 """
118 Is an object with information about Gene.
119
120 If given an gene name, returns an object with gene location inforamtion, transcript and exon objects. .
121
122 e.g.
123
124 >>> gene=bioclass.Gene('gene')
125 gene.start
126 gene.end
127 gene.chr
128 gene.strand
129 gene.transcript
130 gene.name
131 gene.length
132
133 #Add new transcript to the gene:
134 gene.add_transcript()
135
136 #Obtain a list of all the exons define within that gene:
137 gene.get_exons()
138 """
139
140 def __init__(self,name):
141 self.name=name
142 self.transcript=[]
143 def _update(self):
144 self._basics()
145 def _basics(self):
146 assert len(self.transcript)>0, 'no transcript in the gene'
147 self.strand=self.transcript[0].strand
148 self.chr=self.transcript[0].chr
149 self.start=min([t.start for t in self.transcript])
150 self.end=max([t.end for t in self.transcript])
151
152 def add_transcript(self,tx):
153 nameset=[x.name for x in self.transcript]
154 if tx.name not in nameset:
155 self.transcript.append(tx)
156 self._update()
157 def get_exons(self):
158 exons={}
159 for t in self.transcript:
160 for e in t.exon:
161 exons[e.name]=e
162 return exons
163
164
165 if __name__=='__main__':
166 #below for testing purpose only.
167 infile=open('/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt')
168 txdb={} #keep track of all transcripts
169 exdb={} #keep track of all exons
170 genedb={} #keep track of all genes
171 i=0
172 for line in infile:
173 i+=1
174 if i%100000==0:
175 print '%d exon records loaded'%i
176 chr,start,end,tx,gene,strand,cat=line.split()
177 if cat != 'protein_coding':
178 continue
179 exon=Exon(chr,int(start),int(end),strand,tx,gene)
180 exdb[exon.name]=exon
181 if not txdb.has_key(tx):
182 txdb[tx]=Transcript(tx,gene)
183 txdb[tx].add_exon(exon)
184 for txname in txdb:
185 t=txdb[txname]
186 if not genedb.has_key(t.gene):
187 genedb[t.gene]=Gene(t.gene)
188 genedb[t.gene].add_transcript(t)
189 infile.close()
190