Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/bioclass.py @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #Module defines exon, transcript and gene object. | |
| 2 #It is extendable for more attributes and functions. | |
| 3 #It is part of py-PRADA. | |
| 4 #Author: Siyuan Zheng (szheng2@mdanderson.org) | |
| 5 #Last modified at 03/07/2013 | |
| 6 | |
| 7 class Exon(object): | |
| 8 """Is an object with information about Exons and the Transcripts it is found in. | |
| 9 | |
| 10 If given an exon name, returns an object with exon location inforamtion and, gene and transcript information. | |
| 11 | |
| 12 e.g. | |
| 13 | |
| 14 >>> exon=bioclass.Exon(chr,int(start),int(end),strand,'tx','gene') | |
| 15 exon.start | |
| 16 exon.end | |
| 17 exon.chr | |
| 18 exon.strand | |
| 19 exon.gene | |
| 20 exon.transcript | |
| 21 exon.name | |
| 22 exon.length | |
| 23 """ | |
| 24 | |
| 25 def __init__(self,chr,start,end,strand,tx,gene): | |
| 26 if not all([isinstance(x,int) for x in [start,end]]): | |
| 27 raise Exception('start,end must be int!') | |
| 28 self.start=start | |
| 29 self.end=end | |
| 30 self.chr=chr | |
| 31 self.strand=strand #'1' or '-1' | |
| 32 self.gene=gene | |
| 33 self.transcript=tx #exon may map to multi-transcripts, but for simplicity, use one | |
| 34 self.name='%s:%s:%s:%s:%s'%(gene,chr,str(start),str(end),strand) #keep gene in the name too | |
| 35 self.length=self._length() | |
| 36 def _length(self): | |
| 37 return abs(self.end-self.start)+1 | |
| 38 | |
| 39 class Transcript(object): | |
| 40 """ | |
| 41 Is an object with information about Transcript and the Gene it is found in. | |
| 42 | |
| 43 If given an transcript name, returns an object with transcript location inforamtion, gene information and exon objects. . | |
| 44 | |
| 45 e.g. | |
| 46 | |
| 47 >>> tx=bioclass.Transcript('tx','gene') | |
| 48 tx.start | |
| 49 tx.end | |
| 50 tx.chr | |
| 51 tx.strand | |
| 52 tx.gene | |
| 53 tx.exon | |
| 54 tx.name | |
| 55 tx.length | |
| 56 | |
| 57 #Add new exons to the transcript: | |
| 58 tx.add_exon(exon) | |
| 59 | |
| 60 #Relative position of the transcript based on the exons defined: | |
| 61 tx.exon_relative_pos() | |
| 62 """ | |
| 63 | |
| 64 def __init__(self,name,gene): | |
| 65 self.exon=[] | |
| 66 self.name=name | |
| 67 self.gene=gene | |
| 68 self.length=0 | |
| 69 self.strand=None | |
| 70 self.start=None | |
| 71 self.end=None | |
| 72 self.cds_start=None | |
| 73 self.cds_end=None | |
| 74 self.is_primary=None | |
| 75 def _update(self): | |
| 76 self._basics() | |
| 77 self._sort_exon() | |
| 78 def _basics(self): | |
| 79 assert len(self.exon)>0, 'no exon in the transcript' | |
| 80 self.strand=self.exon[0].strand | |
| 81 self.chr=self.exon[0].chr | |
| 82 self.length=reduce(lambda x,y:x+y, [e.length for e in self.exon]) | |
| 83 self.start=min([e.start for e in self.exon]) | |
| 84 self.end=max([e.end for e in self.exon]) | |
| 85 def _sort_exon(self): | |
| 86 sorted_exons=sorted(self.exon, key=lambda x:x.start) | |
| 87 self.exon=sorted_exons | |
| 88 def add_exon(self,exon): | |
| 89 nameset=[x.name for x in self.exon] | |
| 90 if exon.name not in nameset: | |
| 91 self.exon.append(exon) | |
| 92 self._update() | |
| 93 def set_cds(self,start,end): | |
| 94 self.cds_start=start | |
| 95 self.cds_end=end | |
| 96 def set_primary(self,isprim): | |
| 97 self.is_primary=isprim | |
| 98 def exon_relative_pos(self): | |
| 99 L=[x.length for x in self.exon] | |
| 100 pos=[] | |
| 101 if self.strand=='1': | |
| 102 init=0 | |
| 103 for item in L: | |
| 104 region=(init+1, init+item) | |
| 105 init=init+item | |
| 106 pos.append(region) | |
| 107 if self.strand=='-1': | |
| 108 init=self.length | |
| 109 for item in L: | |
| 110 region=(init-item+1,init) | |
| 111 init=init-item | |
| 112 pos.append(region) | |
| 113 relpos=dict(zip([x.name for x in self.exon],pos)) | |
| 114 return relpos | |
| 115 | |
| 116 class Gene(object): | |
| 117 """ | |
| 118 Is an object with information about Gene. | |
| 119 | |
| 120 If given an gene name, returns an object with gene location inforamtion, transcript and exon objects. . | |
| 121 | |
| 122 e.g. | |
| 123 | |
| 124 >>> gene=bioclass.Gene('gene') | |
| 125 gene.start | |
| 126 gene.end | |
| 127 gene.chr | |
| 128 gene.strand | |
| 129 gene.transcript | |
| 130 gene.name | |
| 131 gene.length | |
| 132 | |
| 133 #Add new transcript to the gene: | |
| 134 gene.add_transcript() | |
| 135 | |
| 136 #Obtain a list of all the exons define within that gene: | |
| 137 gene.get_exons() | |
| 138 """ | |
| 139 | |
| 140 def __init__(self,name): | |
| 141 self.name=name | |
| 142 self.transcript=[] | |
| 143 def _update(self): | |
| 144 self._basics() | |
| 145 def _basics(self): | |
| 146 assert len(self.transcript)>0, 'no transcript in the gene' | |
| 147 self.strand=self.transcript[0].strand | |
| 148 self.chr=self.transcript[0].chr | |
| 149 self.start=min([t.start for t in self.transcript]) | |
| 150 self.end=max([t.end for t in self.transcript]) | |
| 151 | |
| 152 def add_transcript(self,tx): | |
| 153 nameset=[x.name for x in self.transcript] | |
| 154 if tx.name not in nameset: | |
| 155 self.transcript.append(tx) | |
| 156 self._update() | |
| 157 def get_exons(self): | |
| 158 exons={} | |
| 159 for t in self.transcript: | |
| 160 for e in t.exon: | |
| 161 exons[e.name]=e | |
| 162 return exons | |
| 163 | |
| 164 | |
| 165 if __name__=='__main__': | |
| 166 #below for testing purpose only. | |
| 167 infile=open('/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt') | |
| 168 txdb={} #keep track of all transcripts | |
| 169 exdb={} #keep track of all exons | |
| 170 genedb={} #keep track of all genes | |
| 171 i=0 | |
| 172 for line in infile: | |
| 173 i+=1 | |
| 174 if i%100000==0: | |
| 175 print '%d exon records loaded'%i | |
| 176 chr,start,end,tx,gene,strand,cat=line.split() | |
| 177 if cat != 'protein_coding': | |
| 178 continue | |
| 179 exon=Exon(chr,int(start),int(end),strand,tx,gene) | |
| 180 exdb[exon.name]=exon | |
| 181 if not txdb.has_key(tx): | |
| 182 txdb[tx]=Transcript(tx,gene) | |
| 183 txdb[tx].add_exon(exon) | |
| 184 for txname in txdb: | |
| 185 t=txdb[txname] | |
| 186 if not genedb.has_key(t.gene): | |
| 187 genedb[t.gene]=Gene(t.gene) | |
| 188 genedb[t.gene].add_transcript(t) | |
| 189 infile.close() | |
| 190 |
