Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/bioclass.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/bioclass.py Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,190 @@ +#Module defines exon, transcript and gene object. +#It is extendable for more attributes and functions. +#It is part of py-PRADA. +#Author: Siyuan Zheng (szheng2@mdanderson.org) +#Last modified at 03/07/2013 + +class Exon(object): + """Is an object with information about Exons and the Transcripts it is found in. + + If given an exon name, returns an object with exon location inforamtion and, gene and transcript information. + + e.g. + + >>> exon=bioclass.Exon(chr,int(start),int(end),strand,'tx','gene') + exon.start + exon.end + exon.chr + exon.strand + exon.gene + exon.transcript + exon.name + exon.length + """ + + def __init__(self,chr,start,end,strand,tx,gene): + if not all([isinstance(x,int) for x in [start,end]]): + raise Exception('start,end must be int!') + self.start=start + self.end=end + self.chr=chr + self.strand=strand #'1' or '-1' + self.gene=gene + self.transcript=tx #exon may map to multi-transcripts, but for simplicity, use one + self.name='%s:%s:%s:%s:%s'%(gene,chr,str(start),str(end),strand) #keep gene in the name too + self.length=self._length() + def _length(self): + return abs(self.end-self.start)+1 + +class Transcript(object): + """ + Is an object with information about Transcript and the Gene it is found in. + + If given an transcript name, returns an object with transcript location inforamtion, gene information and exon objects. . + + e.g. + + >>> tx=bioclass.Transcript('tx','gene') + tx.start + tx.end + tx.chr + tx.strand + tx.gene + tx.exon + tx.name + tx.length + + #Add new exons to the transcript: + tx.add_exon(exon) + + #Relative position of the transcript based on the exons defined: + tx.exon_relative_pos() + """ + + def __init__(self,name,gene): + self.exon=[] + self.name=name + self.gene=gene + self.length=0 + self.strand=None + self.start=None + self.end=None + self.cds_start=None + self.cds_end=None + self.is_primary=None + def _update(self): + self._basics() + self._sort_exon() + def _basics(self): + assert len(self.exon)>0, 'no exon in the transcript' + self.strand=self.exon[0].strand + self.chr=self.exon[0].chr + self.length=reduce(lambda x,y:x+y, [e.length for e in self.exon]) + self.start=min([e.start for e in self.exon]) + self.end=max([e.end for e in self.exon]) + def _sort_exon(self): + sorted_exons=sorted(self.exon, key=lambda x:x.start) + self.exon=sorted_exons + def add_exon(self,exon): + nameset=[x.name for x in self.exon] + if exon.name not in nameset: + self.exon.append(exon) + self._update() + def set_cds(self,start,end): + self.cds_start=start + self.cds_end=end + def set_primary(self,isprim): + self.is_primary=isprim + def exon_relative_pos(self): + L=[x.length for x in self.exon] + pos=[] + if self.strand=='1': + init=0 + for item in L: + region=(init+1, init+item) + init=init+item + pos.append(region) + if self.strand=='-1': + init=self.length + for item in L: + region=(init-item+1,init) + init=init-item + pos.append(region) + relpos=dict(zip([x.name for x in self.exon],pos)) + return relpos + +class Gene(object): + """ + Is an object with information about Gene. + + If given an gene name, returns an object with gene location inforamtion, transcript and exon objects. . + + e.g. + + >>> gene=bioclass.Gene('gene') + gene.start + gene.end + gene.chr + gene.strand + gene.transcript + gene.name + gene.length + + #Add new transcript to the gene: + gene.add_transcript() + + #Obtain a list of all the exons define within that gene: + gene.get_exons() + """ + + def __init__(self,name): + self.name=name + self.transcript=[] + def _update(self): + self._basics() + def _basics(self): + assert len(self.transcript)>0, 'no transcript in the gene' + self.strand=self.transcript[0].strand + self.chr=self.transcript[0].chr + self.start=min([t.start for t in self.transcript]) + self.end=max([t.end for t in self.transcript]) + + def add_transcript(self,tx): + nameset=[x.name for x in self.transcript] + if tx.name not in nameset: + self.transcript.append(tx) + self._update() + def get_exons(self): + exons={} + for t in self.transcript: + for e in t.exon: + exons[e.name]=e + return exons + + +if __name__=='__main__': + #below for testing purpose only. + infile=open('/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt') + txdb={} #keep track of all transcripts + exdb={} #keep track of all exons + genedb={} #keep track of all genes + i=0 + for line in infile: + i+=1 + if i%100000==0: + print '%d exon records loaded'%i + chr,start,end,tx,gene,strand,cat=line.split() + if cat != 'protein_coding': + continue + exon=Exon(chr,int(start),int(end),strand,tx,gene) + exdb[exon.name]=exon + if not txdb.has_key(tx): + txdb[tx]=Transcript(tx,gene) + txdb[tx].add_exon(exon) + for txname in txdb: + t=txdb[txname] + if not genedb.has_key(t.gene): + genedb[t.gene]=Gene(t.gene) + genedb[t.gene].add_transcript(t) + infile.close() +