prada: pyPRADA_1.2/bioclass.py comparison

comparison pyPRADA_1.2/bioclass.py @ 0:acc2ca1a3ba4

Uploaded

author	siyuan
date	Thu, 20 Feb 2014 00:44:58 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:acc2ca1a3ba4
+#Module defines exon, transcript and gene objects.
+#It is extendable for more attributes and functions.
+#It is part of py-PRADA.
+#Author: Siyuan Zheng (szheng2@mdanderson.org)
+#Last modified at 03/07/2013
class Exon(object):
    """A single exon together with the transcript and gene it belongs to.

    Example::

        >>> exon = Exon('1', 100, 200, '1', 'tx', 'gene')
        >>> exon.length
        101
        >>> exon.name
        'gene:1:100:200:1'

    Attributes:
        chr: chromosome name, stored exactly as given.
        start, end: exon coordinates (must be ``int``).
        strand: strand label, '1' or '-1'.
        gene: gene identifier.
        transcript: identifier of one transcript containing this exon.
        name: ``'gene:chr:start:end:strand'``.
        length: ``abs(end - start) + 1``, computed once at construction.
    """

    def __init__(self, chr, start, end, strand, tx, gene):
        """Store the exon location and its owning transcript/gene.

        Raises:
            TypeError: if ``start`` or ``end`` is not an ``int``.  (A
                subclass of ``Exception``, so existing ``except Exception``
                handlers keep working.)
        """
        if not (isinstance(start, int) and isinstance(end, int)):
            raise TypeError('start,end must be int!')
        self.start = start
        self.end = end
        self.chr = chr
        self.strand = strand  # '1' or '-1'
        self.gene = gene
        # An exon may map to several transcripts; for simplicity keep one.
        self.transcript = tx
        # Keep the gene in the name too.
        self.name = '%s:%s:%s:%s:%s' % (gene, chr, str(start), str(end), strand)
        self.length = self._length()

    def _length(self):
        """Return the exon length, counting both end coordinates."""
        return abs(self.end - self.start) + 1
class Transcript(object):
    """A transcript: an ordered collection of exons belonging to one gene.

    Example::

        tx = Transcript('tx', 'gene')
        tx.add_exon(exon)        # add a new exon to the transcript
        tx.exon_relative_pos()   # exon positions relative to the transcript

    Attributes:
        name: transcript identifier.
        gene: gene identifier.
        exon: list of exon objects, kept sorted by ``start``.
        length: sum of the exon lengths (0 while there are no exons).
        strand, chr: copied from the first exon in the list when an exon
            is added; ``None`` before that.
        start, end: smallest exon ``start`` / largest exon ``end``;
            ``None`` while there are no exons.
        cds_start, cds_end: set through ``set_cds``; ``None`` by default.
        is_primary: set through ``set_primary``; ``None`` by default.
    """

    def __init__(self, name, gene):
        self.exon = []
        self.name = name
        self.gene = gene
        self.length = 0
        self.strand = None
        self.chr = None  # defined up front like the other location fields
        self.start = None
        self.end = None
        self.cds_start = None
        self.cds_end = None
        self.is_primary = None

    def _update(self):
        """Recompute the derived attributes, then re-sort the exon list."""
        self._basics()
        self._sort_exon()

    def _basics(self):
        """Derive strand, chr, length, start and end from the exons."""
        assert len(self.exon) > 0, 'no exon in the transcript'
        self.strand = self.exon[0].strand
        self.chr = self.exon[0].chr
        # sum() rather than reduce(): reduce is not a builtin on Python 3.
        self.length = sum(e.length for e in self.exon)
        self.start = min(e.start for e in self.exon)
        self.end = max(e.end for e in self.exon)

    def _sort_exon(self):
        """Order the exons by ascending start coordinate."""
        self.exon = sorted(self.exon, key=lambda x: x.start)

    def add_exon(self, exon):
        """Add ``exon`` unless an exon with the same name is already present."""
        known_names = [x.name for x in self.exon]
        if exon.name not in known_names:
            self.exon.append(exon)
        self._update()

    def set_cds(self, start, end):
        """Record the coding-sequence start and end."""
        self.cds_start = start
        self.cds_end = end

    def set_primary(self, isprim):
        """Record whether this transcript is the primary one."""
        self.is_primary = isprim

    def exon_relative_pos(self):
        """Map each exon name to its ``(first, last)`` transcript position.

        Positions are 1-based and inclusive.  On strand '1' the exons are
        numbered upwards from 1 in list order; on strand '-1' they are
        numbered downwards from ``self.length``, so the exon with the
        largest start receives the lowest positions.

        Returns:
            dict of exon name -> ``(first, last)``.  The dict is empty when
            the strand is neither '1' nor '-1' (including a transcript
            without exons, whose strand is still ``None``).
        """
        lengths = [x.length for x in self.exon]
        pos = []
        if self.strand == '1':
            offset = 0
            for size in lengths:
                pos.append((offset + 1, offset + size))
                offset += size
        elif self.strand == '-1':
            offset = self.length
            for size in lengths:
                pos.append((offset - size + 1, offset))
                offset -= size
        return dict(zip([x.name for x in self.exon], pos))
class Gene(object):
    """A gene: a named collection of transcripts.

    Example::

        gene = Gene('gene')
        gene.add_transcript(tx)   # add a new transcript to the gene
        gene.get_exons()          # all exons defined within the gene

    Attributes:
        name: gene identifier.
        transcript: list of transcript objects, in insertion order.
        strand, chr: copied from the first transcript; ``None`` until a
            transcript has been added.
        start, end: smallest transcript ``start`` / largest transcript
            ``end``; ``None`` until a transcript has been added.

    Note: no ``length`` attribute is defined on a gene.
    """

    def __init__(self, name):
        self.name = name
        self.transcript = []
        # Defined up front so that reading them on an empty gene gives None
        # instead of raising AttributeError.
        self.strand = None
        self.chr = None
        self.start = None
        self.end = None

    def _update(self):
        """Recompute the attributes derived from the transcripts."""
        self._basics()

    def _basics(self):
        """Derive strand, chr, start and end from the transcripts."""
        assert len(self.transcript) > 0, 'no transcript in the gene'
        first = self.transcript[0]
        self.strand = first.strand
        self.chr = first.chr
        self.start = min(t.start for t in self.transcript)
        self.end = max(t.end for t in self.transcript)

    def add_transcript(self, tx):
        """Add ``tx`` unless a transcript with the same name is present."""
        known_names = [x.name for x in self.transcript]
        if tx.name not in known_names:
            self.transcript.append(tx)
        self._update()

    def get_exons(self):
        """Return a dict of exon name -> exon over all transcripts.

        When several transcripts hold an exon with the same name, the
        object from the last such transcript is the one kept.
        """
        return {e.name: e for t in self.transcript for e in t.exon}
if __name__ == '__main__':
    # Below for testing purposes only: load an exon table and build the
    # exon / transcript / gene lookups from it.
    import sys

    # The original hard-coded table stays the default; a different file may
    # be given as the first command-line argument.
    default_table = '/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
    table_path = sys.argv[1] if len(sys.argv) > 1 else default_table

    txdb = {}    # keep track of all transcripts
    exdb = {}    # keep track of all exons
    genedb = {}  # keep track of all genes

    # "with" closes the file even if a malformed line raises part-way.
    with open(table_path) as infile:
        for i, line in enumerate(infile, 1):
            if i % 100000 == 0:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print('%d exon records loaded' % i)
            fields = line.split()
            if not fields:
                # A blank line would otherwise break the 7-way unpack below.
                continue
            chr, start, end, tx, gene, strand, cat = fields
            if cat != 'protein_coding':
                continue
            exon = Exon(chr, int(start), int(end), strand, tx, gene)
            exdb[exon.name] = exon
            if tx not in txdb:
                txdb[tx] = Transcript(tx, gene)
            txdb[tx].add_exon(exon)

    for txname in txdb:
        t = txdb[txname]
        if t.gene not in genedb:
            genedb[t.gene] = Gene(t.gene)
        genedb[t.gene].add_transcript(t)

Mercurial > repos > siyuan > prada

comparison pyPRADA_1.2/bioclass.py @ 0:acc2ca1a3ba4