Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/bioclass.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #Module defines exon, transcript and gene object. | |
2 #It is extendable for more attributes and functions. | |
3 #It is part of py-PRADA. | |
4 #Author: Siyuan Zheng (szheng2@mdanderson.org) | |
5 #Last modified at 03/07/2013 | |
6 | |
7 class Exon(object): | |
8 """Is an object with information about Exons and the Transcripts it is found in. | |
9 | |
10 If given an exon name, returns an object with exon location inforamtion and, gene and transcript information. | |
11 | |
12 e.g. | |
13 | |
14 >>> exon=bioclass.Exon(chr,int(start),int(end),strand,'tx','gene') | |
15 exon.start | |
16 exon.end | |
17 exon.chr | |
18 exon.strand | |
19 exon.gene | |
20 exon.transcript | |
21 exon.name | |
22 exon.length | |
23 """ | |
24 | |
25 def __init__(self,chr,start,end,strand,tx,gene): | |
26 if not all([isinstance(x,int) for x in [start,end]]): | |
27 raise Exception('start,end must be int!') | |
28 self.start=start | |
29 self.end=end | |
30 self.chr=chr | |
31 self.strand=strand #'1' or '-1' | |
32 self.gene=gene | |
33 self.transcript=tx #exon may map to multi-transcripts, but for simplicity, use one | |
34 self.name='%s:%s:%s:%s:%s'%(gene,chr,str(start),str(end),strand) #keep gene in the name too | |
35 self.length=self._length() | |
36 def _length(self): | |
37 return abs(self.end-self.start)+1 | |
38 | |
39 class Transcript(object): | |
40 """ | |
41 Is an object with information about Transcript and the Gene it is found in. | |
42 | |
43 If given an transcript name, returns an object with transcript location inforamtion, gene information and exon objects. . | |
44 | |
45 e.g. | |
46 | |
47 >>> tx=bioclass.Transcript('tx','gene') | |
48 tx.start | |
49 tx.end | |
50 tx.chr | |
51 tx.strand | |
52 tx.gene | |
53 tx.exon | |
54 tx.name | |
55 tx.length | |
56 | |
57 #Add new exons to the transcript: | |
58 tx.add_exon(exon) | |
59 | |
60 #Relative position of the transcript based on the exons defined: | |
61 tx.exon_relative_pos() | |
62 """ | |
63 | |
64 def __init__(self,name,gene): | |
65 self.exon=[] | |
66 self.name=name | |
67 self.gene=gene | |
68 self.length=0 | |
69 self.strand=None | |
70 self.start=None | |
71 self.end=None | |
72 self.cds_start=None | |
73 self.cds_end=None | |
74 self.is_primary=None | |
75 def _update(self): | |
76 self._basics() | |
77 self._sort_exon() | |
78 def _basics(self): | |
79 assert len(self.exon)>0, 'no exon in the transcript' | |
80 self.strand=self.exon[0].strand | |
81 self.chr=self.exon[0].chr | |
82 self.length=reduce(lambda x,y:x+y, [e.length for e in self.exon]) | |
83 self.start=min([e.start for e in self.exon]) | |
84 self.end=max([e.end for e in self.exon]) | |
85 def _sort_exon(self): | |
86 sorted_exons=sorted(self.exon, key=lambda x:x.start) | |
87 self.exon=sorted_exons | |
88 def add_exon(self,exon): | |
89 nameset=[x.name for x in self.exon] | |
90 if exon.name not in nameset: | |
91 self.exon.append(exon) | |
92 self._update() | |
93 def set_cds(self,start,end): | |
94 self.cds_start=start | |
95 self.cds_end=end | |
96 def set_primary(self,isprim): | |
97 self.is_primary=isprim | |
98 def exon_relative_pos(self): | |
99 L=[x.length for x in self.exon] | |
100 pos=[] | |
101 if self.strand=='1': | |
102 init=0 | |
103 for item in L: | |
104 region=(init+1, init+item) | |
105 init=init+item | |
106 pos.append(region) | |
107 if self.strand=='-1': | |
108 init=self.length | |
109 for item in L: | |
110 region=(init-item+1,init) | |
111 init=init-item | |
112 pos.append(region) | |
113 relpos=dict(zip([x.name for x in self.exon],pos)) | |
114 return relpos | |
115 | |
116 class Gene(object): | |
117 """ | |
118 Is an object with information about Gene. | |
119 | |
120 If given an gene name, returns an object with gene location inforamtion, transcript and exon objects. . | |
121 | |
122 e.g. | |
123 | |
124 >>> gene=bioclass.Gene('gene') | |
125 gene.start | |
126 gene.end | |
127 gene.chr | |
128 gene.strand | |
129 gene.transcript | |
130 gene.name | |
131 gene.length | |
132 | |
133 #Add new transcript to the gene: | |
134 gene.add_transcript() | |
135 | |
136 #Obtain a list of all the exons define within that gene: | |
137 gene.get_exons() | |
138 """ | |
139 | |
140 def __init__(self,name): | |
141 self.name=name | |
142 self.transcript=[] | |
143 def _update(self): | |
144 self._basics() | |
145 def _basics(self): | |
146 assert len(self.transcript)>0, 'no transcript in the gene' | |
147 self.strand=self.transcript[0].strand | |
148 self.chr=self.transcript[0].chr | |
149 self.start=min([t.start for t in self.transcript]) | |
150 self.end=max([t.end for t in self.transcript]) | |
151 | |
152 def add_transcript(self,tx): | |
153 nameset=[x.name for x in self.transcript] | |
154 if tx.name not in nameset: | |
155 self.transcript.append(tx) | |
156 self._update() | |
157 def get_exons(self): | |
158 exons={} | |
159 for t in self.transcript: | |
160 for e in t.exon: | |
161 exons[e.name]=e | |
162 return exons | |
163 | |
164 | |
165 if __name__=='__main__': | |
166 #below for testing purpose only. | |
167 infile=open('/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt') | |
168 txdb={} #keep track of all transcripts | |
169 exdb={} #keep track of all exons | |
170 genedb={} #keep track of all genes | |
171 i=0 | |
172 for line in infile: | |
173 i+=1 | |
174 if i%100000==0: | |
175 print '%d exon records loaded'%i | |
176 chr,start,end,tx,gene,strand,cat=line.split() | |
177 if cat != 'protein_coding': | |
178 continue | |
179 exon=Exon(chr,int(start),int(end),strand,tx,gene) | |
180 exdb[exon.name]=exon | |
181 if not txdb.has_key(tx): | |
182 txdb[tx]=Transcript(tx,gene) | |
183 txdb[tx].add_exon(exon) | |
184 for txname in txdb: | |
185 t=txdb[txname] | |
186 if not genedb.has_key(t.gene): | |
187 genedb[t.gene]=Gene(t.gene) | |
188 genedb[t.gene].add_transcript(t) | |
189 infile.close() | |
190 |