annotate pyPRADA_1.2/bioclass.py @ 3:f17965495ec9 draft default tip

Uploaded
author siyuan
date Tue, 11 Mar 2014 12:14:01 -0400
parents acc2ca1a3ba4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #Module defines exon, transcript and gene object.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2 #It is extendable for more attributes and functions.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 #It is part of py-PRADA.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 #Author: Siyuan Zheng (szheng2@mdanderson.org)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 #Last modified at 03/07/2013
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 class Exon(object):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 """Is an object with information about Exons and the Transcripts it is found in.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 If given an exon name, returns an object with exon location inforamtion and, gene and transcript information.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 e.g.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 >>> exon=bioclass.Exon(chr,int(start),int(end),strand,'tx','gene')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 exon.start
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 exon.end
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 exon.chr
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 exon.strand
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 exon.gene
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 exon.transcript
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 exon.name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 exon.length
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 """
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 def __init__(self,chr,start,end,strand,tx,gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 if not all([isinstance(x,int) for x in [start,end]]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 raise Exception('start,end must be int!')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 self.start=start
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 self.end=end
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 self.chr=chr
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 self.strand=strand #'1' or '-1'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 self.gene=gene
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 self.transcript=tx #exon may map to multi-transcripts, but for simplicity, use one
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 self.name='%s:%s:%s:%s:%s'%(gene,chr,str(start),str(end),strand) #keep gene in the name too
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 self.length=self._length()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 def _length(self):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 return abs(self.end-self.start)+1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 class Transcript(object):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 """
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 Is an object with information about Transcript and the Gene it is found in.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 If given an transcript name, returns an object with transcript location inforamtion, gene information and exon objects. .
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 e.g.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 >>> tx=bioclass.Transcript('tx','gene')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 tx.start
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 tx.end
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 tx.chr
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 tx.strand
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 tx.gene
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 tx.exon
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 tx.name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 tx.length
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 #Add new exons to the transcript:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 tx.add_exon(exon)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 #Relative position of the transcript based on the exons defined:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 tx.exon_relative_pos()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 """
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 def __init__(self,name,gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 self.exon=[]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 self.name=name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 self.gene=gene
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 self.length=0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 self.strand=None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 self.start=None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 self.end=None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 self.cds_start=None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 self.cds_end=None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 self.is_primary=None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 def _update(self):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 self._basics()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 self._sort_exon()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 def _basics(self):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 assert len(self.exon)>0, 'no exon in the transcript'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 self.strand=self.exon[0].strand
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 self.chr=self.exon[0].chr
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 self.length=reduce(lambda x,y:x+y, [e.length for e in self.exon])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 self.start=min([e.start for e in self.exon])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 self.end=max([e.end for e in self.exon])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 def _sort_exon(self):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 sorted_exons=sorted(self.exon, key=lambda x:x.start)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 self.exon=sorted_exons
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 def add_exon(self,exon):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 nameset=[x.name for x in self.exon]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 if exon.name not in nameset:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 self.exon.append(exon)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 self._update()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 def set_cds(self,start,end):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 self.cds_start=start
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 self.cds_end=end
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 def set_primary(self,isprim):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 self.is_primary=isprim
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 def exon_relative_pos(self):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 L=[x.length for x in self.exon]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 pos=[]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 if self.strand=='1':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 init=0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 for item in L:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 region=(init+1, init+item)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 init=init+item
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 pos.append(region)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 if self.strand=='-1':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 init=self.length
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 for item in L:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 region=(init-item+1,init)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 init=init-item
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 pos.append(region)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 relpos=dict(zip([x.name for x in self.exon],pos))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 return relpos
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 class Gene(object):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 """
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 Is an object with information about Gene.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 If given an gene name, returns an object with gene location inforamtion, transcript and exon objects. .
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 e.g.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 >>> gene=bioclass.Gene('gene')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 gene.start
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 gene.end
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 gene.chr
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 gene.strand
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 gene.transcript
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 gene.name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 gene.length
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 #Add new transcript to the gene:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 gene.add_transcript()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 #Obtain a list of all the exons define within that gene:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 gene.get_exons()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138 """
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140 def __init__(self,name):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 self.name=name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 self.transcript=[]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 def _update(self):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 self._basics()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 def _basics(self):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146 assert len(self.transcript)>0, 'no transcript in the gene'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147 self.strand=self.transcript[0].strand
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 self.chr=self.transcript[0].chr
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149 self.start=min([t.start for t in self.transcript])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150 self.end=max([t.end for t in self.transcript])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 def add_transcript(self,tx):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153 nameset=[x.name for x in self.transcript]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 if tx.name not in nameset:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 self.transcript.append(tx)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156 self._update()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 def get_exons(self):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 exons={}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 for t in self.transcript:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 for e in t.exon:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161 exons[e.name]=e
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162 return exons
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165 if __name__=='__main__':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
166 #below for testing purpose only.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
167 infile=open('/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
168 txdb={} #keep track of all transcripts
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
169 exdb={} #keep track of all exons
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170 genedb={} #keep track of all genes
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
171 i=0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
172 for line in infile:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
173 i+=1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
174 if i%100000==0:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
175 print '%d exon records loaded'%i
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
176 chr,start,end,tx,gene,strand,cat=line.split()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177 if cat != 'protein_coding':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 exon=Exon(chr,int(start),int(end),strand,tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180 exdb[exon.name]=exon
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 if not txdb.has_key(tx):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182 txdb[tx]=Transcript(tx,gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 txdb[tx].add_exon(exon)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184 for txname in txdb:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185 t=txdb[txname]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186 if not genedb.has_key(t.gene):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187 genedb[t.gene]=Gene(t.gene)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188 genedb[t.gene].add_transcript(t)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189 infile.close()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190