Mercurial > repos > siyuan > prada

#define IO functions. May add more gradually.
import bioclass

def read_conf(conffile):
    '''read configure file (conf.txt)'''
    ref_input=open(conffile)
    refdict={}
    for line in ref_input:
        if not line.strip():
            continue
        if '#' in line:
            contline=line.split('#')[0]
            content=contline.strip()
        else:
            content=line.strip()
        if not content:
            continue
        if content.startswith('--'):
            key=content
            refdict[key]={}
        else:
            idx=content.index('=')
            a=content[0:idx].strip()
            b=content[(idx+1):].strip()
            refdict[key][a]=b
    ref_input.close()
    return refdict

def read_feature(featurefile,verbose=True):
    '''Read exon/transcript/gene info and return bioclass obj'''
    infile=open(featurefile)
    txdb={}    #keep track of all transcripts
    exdb={}    #keep track of all exons
    genedb={}  #keep track of all genes
    i=0
    for line in infile:
        i+=1
        if verbose:
            if i%100000==0:
                print '%d exon records loaded'%i
        chr,start,end,tx,gene,strand,cat=line.split()
        if cat != 'protein_coding':
            continue
        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
        exdb[exon.name]=exon
        if not txdb.has_key(tx):
            txdb[tx]=bioclass.Transcript(tx,gene)
        txdb[tx].add_exon(exon)
    for txname in txdb:
        t=txdb[txname]
        if not genedb.has_key(t.gene):
            genedb[t.gene]=bioclass.Gene(t.gene)
        genedb[t.gene].add_transcript(t)
    infile.close()
    return (txdb,genedb)

def read_feature_genes(featurefile,*args):
    '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
    infile=open(featurefile)
    txdb={}
    exdb={}
    genedb={}
    for line in infile:
        chr,start,end,tx,gene,strand,cat=line.split()
        if gene not in args:
            continue
        if cat != 'protein_coding':
            continue
        exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
        exdb[exon.name]=exon
        if not txdb.has_key(tx):
            txdb[tx]=bioclass.Transcript(tx,gene)
        txdb[tx].add_exon(exon)
    for txname in txdb:
        t=txdb[txname]
        if not genedb.has_key(t.gene):
            genedb[t.gene]=bioclass.Gene(t.gene)
        genedb[t.gene].add_transcript(t)
    res=[]
    for g in args:
        if genedb.has_key(g):
            gobj=genedb[g]
        else:
            gobj=None
        res.append(gobj)
    infile.close()
    return res

def read_cds(cdsfile):
    '''read CDS start/end for transcripts'''
    infile=open(cdsfile)
    tx_cds={}
    for line in infile:
        info=line.split()
        tx=info[0]
        cds_start,cds_end=int(info[3]),int(info[4])
        tx_cds[tx]=(cds_start,cds_end)
    infile.close()
    return tx_cds

def read_tx_cat(txcatfile):
    '''find primary transcript for each gene'''
    tx_primary={}
    tx_cat={}
    infile=open(txcatfile)
    for line in infile:
        if not line.strip():
            continue
        info=line.split()
        if len(info) != 4:
            continue
        gene=info[2]
        txid=info[1]
        acc=int(info[3][-3:])
        if tx_cat.has_key(gene):
            tx_cat[gene].append([txid,acc])
        else:
            tx_cat[gene]=[[txid,acc]]
    infile.close()
    for item in tx_cat:
        vv=tx_cat[item]
        vv=sorted(vv,key=lambda x: x[1])
        tx_primary[item]=vv[0][0]
    return tx_primary


if __name__=='__main__':
    #The following lines are for module testing purpose only.
    #They would not affect the program.
    featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
    cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt'
    txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt'
author	siyuan
date	Tue, 11 Mar 2014 12:14:01 -0400
parents	acc2ca1a3ba4
children