diff pyPRADA_1.2/bioclass.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/bioclass.py	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,190 @@
+#Module defines exon, transcript and gene object.
+#It is extendable for more attributes and functions. 
+#It is part of py-PRADA. 
+#Author: Siyuan Zheng (szheng2@mdanderson.org)
+#Last modified at 03/07/2013
+
+class Exon(object):
+    """Is an object with information about Exons and the Transcripts it is found in.
+
+    If given an exon name, returns an object with exon location inforamtion and, gene and transcript information.
+
+    e.g.
+
+    >>> exon=bioclass.Exon(chr,int(start),int(end),strand,'tx','gene')
+    exon.start
+    exon.end
+    exon.chr
+    exon.strand
+    exon.gene
+    exon.transcript
+    exon.name
+    exon.length
+    """
+
+    def __init__(self,chr,start,end,strand,tx,gene):
+        if not all([isinstance(x,int) for x in [start,end]]):
+            raise Exception('start,end must be int!')
+        self.start=start
+        self.end=end
+        self.chr=chr
+        self.strand=strand  #'1' or '-1'
+        self.gene=gene
+        self.transcript=tx  #exon may map to multi-transcripts, but for simplicity, use one
+        self.name='%s:%s:%s:%s:%s'%(gene,chr,str(start),str(end),strand) #keep gene in the name too
+        self.length=self._length()
+    def _length(self):
+        return abs(self.end-self.start)+1
+
+class Transcript(object):
+    """
+    Is an object with information about Transcript and the Gene it is found in.
+
+    If given an transcript name, returns an object with transcript location inforamtion, gene information and exon objects.   .
+
+    e.g.
+
+    >>> tx=bioclass.Transcript('tx','gene')
+    tx.start
+    tx.end
+    tx.chr
+    tx.strand
+    tx.gene
+    tx.exon
+    tx.name
+    tx.length
+
+    #Add new exons to the transcript:
+    tx.add_exon(exon)
+
+    #Relative position of the transcript based on the exons defined:
+    tx.exon_relative_pos()
+    """
+
+    def __init__(self,name,gene):
+        self.exon=[]
+        self.name=name
+        self.gene=gene
+        self.length=0
+        self.strand=None
+        self.start=None
+        self.end=None
+        self.cds_start=None
+        self.cds_end=None
+        self.is_primary=None
+    def _update(self):
+        self._basics()
+        self._sort_exon()
+    def _basics(self):
+        assert len(self.exon)>0, 'no exon in the transcript'
+        self.strand=self.exon[0].strand
+        self.chr=self.exon[0].chr
+        self.length=reduce(lambda x,y:x+y, [e.length for e in self.exon])
+        self.start=min([e.start for e in self.exon])
+        self.end=max([e.end for e in self.exon])
+    def _sort_exon(self):
+        sorted_exons=sorted(self.exon, key=lambda x:x.start)
+        self.exon=sorted_exons
+    def add_exon(self,exon):
+        nameset=[x.name for x in self.exon]
+        if exon.name not in nameset:
+            self.exon.append(exon)
+        self._update()
+    def set_cds(self,start,end):
+        self.cds_start=start
+        self.cds_end=end 
+    def set_primary(self,isprim):
+        self.is_primary=isprim
+    def exon_relative_pos(self):
+        L=[x.length for x in self.exon]
+        pos=[]
+        if self.strand=='1':
+            init=0
+            for item in L:
+                region=(init+1, init+item)
+                init=init+item
+                pos.append(region)
+        if self.strand=='-1':
+            init=self.length
+            for item in L:
+                region=(init-item+1,init)
+                init=init-item
+                pos.append(region)
+        relpos=dict(zip([x.name for x in self.exon],pos))
+        return relpos
+
+class Gene(object):
+    """
+    Is an object with information about Gene.
+
+    If given an gene name, returns an object with gene location inforamtion, transcript and exon objects.   .
+
+    e.g.
+
+    >>> gene=bioclass.Gene('gene')
+    gene.start
+    gene.end
+    gene.chr
+    gene.strand
+    gene.transcript
+    gene.name
+    gene.length
+
+    #Add new transcript to the gene:
+    gene.add_transcript()
+
+    #Obtain a list of all the exons define within that gene:
+    gene.get_exons()
+    """
+
+    def __init__(self,name):
+        self.name=name
+        self.transcript=[]
+    def _update(self):
+        self._basics()
+    def _basics(self):
+        assert len(self.transcript)>0, 'no transcript in the gene'
+        self.strand=self.transcript[0].strand
+        self.chr=self.transcript[0].chr
+        self.start=min([t.start for t in self.transcript])
+        self.end=max([t.end for t in self.transcript]) 
+    
+    def add_transcript(self,tx):
+        nameset=[x.name for x in self.transcript]
+        if tx.name not in nameset:
+            self.transcript.append(tx)
+        self._update()
+    def get_exons(self):
+        exons={}
+        for t in self.transcript:
+            for e in t.exon:
+                exons[e.name]=e
+        return exons
+
+ 
+if __name__=='__main__':
+    #below for testing purpose only.
+    infile=open('/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt')
+    txdb={}    #keep track of all transcripts
+    exdb={}    #keep track of all exons
+    genedb={}  #keep track of all genes
+    i=0
+    for line in infile:
+        i+=1
+        if i%100000==0:
+            print '%d exon records loaded'%i
+        chr,start,end,tx,gene,strand,cat=line.split()
+        if cat != 'protein_coding':
+            continue
+        exon=Exon(chr,int(start),int(end),strand,tx,gene)
+        exdb[exon.name]=exon
+        if not txdb.has_key(tx):
+            txdb[tx]=Transcript(tx,gene)
+        txdb[tx].add_exon(exon)
+    for txname in txdb:
+        t=txdb[txname]
+        if not genedb.has_key(t.gene):
+            genedb[t.gene]=Gene(t.gene)
+        genedb[t.gene].add_transcript(t)
+    infile.close()
+