Mercurial > repos > siyuan > prada
view pyPRADA_1.2/ioprada.py @ 3:f17965495ec9 draft default tip
Uploaded
author | siyuan |
---|---|
date | Tue, 11 Mar 2014 12:14:01 -0400 |
parents | acc2ca1a3ba4 |
children |
line wrap: on
line source
#define IO functions. May add more gradually. import bioclass def read_conf(conffile): '''read configure file (conf.txt)''' ref_input=open(conffile) refdict={} for line in ref_input: if not line.strip(): continue if '#' in line: contline=line.split('#')[0] content=contline.strip() else: content=line.strip() if not content: continue if content.startswith('--'): key=content refdict[key]={} else: idx=content.index('=') a=content[0:idx].strip() b=content[(idx+1):].strip() refdict[key][a]=b ref_input.close() return refdict def read_feature(featurefile,verbose=True): '''Read exon/transcript/gene info and return bioclass obj''' infile=open(featurefile) txdb={} #keep track of all transcripts exdb={} #keep track of all exons genedb={} #keep track of all genes i=0 for line in infile: i+=1 if verbose: if i%100000==0: print '%d exon records loaded'%i chr,start,end,tx,gene,strand,cat=line.split() if cat != 'protein_coding': continue exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) exdb[exon.name]=exon if not txdb.has_key(tx): txdb[tx]=bioclass.Transcript(tx,gene) txdb[tx].add_exon(exon) for txname in txdb: t=txdb[txname] if not genedb.has_key(t.gene): genedb[t.gene]=bioclass.Gene(t.gene) genedb[t.gene].add_transcript(t) infile.close() return (txdb,genedb) def read_feature_genes(featurefile,*args): '''Read exon/transcript/gene info for spec genes and return bioclass obj''' infile=open(featurefile) txdb={} exdb={} genedb={} for line in infile: chr,start,end,tx,gene,strand,cat=line.split() if gene not in args: continue if cat != 'protein_coding': continue exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) exdb[exon.name]=exon if not txdb.has_key(tx): txdb[tx]=bioclass.Transcript(tx,gene) txdb[tx].add_exon(exon) for txname in txdb: t=txdb[txname] if not genedb.has_key(t.gene): genedb[t.gene]=bioclass.Gene(t.gene) genedb[t.gene].add_transcript(t) res=[] for g in args: if genedb.has_key(g): gobj=genedb[g] else: gobj=None res.append(gobj) infile.close() return res def read_cds(cdsfile): '''read CDS start/end for transcripts''' infile=open(cdsfile) tx_cds={} for line in infile: info=line.split() tx=info[0] cds_start,cds_end=int(info[3]),int(info[4]) tx_cds[tx]=(cds_start,cds_end) infile.close() return tx_cds def read_tx_cat(txcatfile): '''find primary transcript for each gene''' tx_primary={} tx_cat={} infile=open(txcatfile) for line in infile: if not line.strip(): continue info=line.split() if len(info) != 4: continue gene=info[2] txid=info[1] acc=int(info[3][-3:]) if tx_cat.has_key(gene): tx_cat[gene].append([txid,acc]) else: tx_cat[gene]=[[txid,acc]] infile.close() for item in tx_cat: vv=tx_cat[item] vv=sorted(vv,key=lambda x: x[1]) tx_primary[item]=vv[0][0] return tx_primary if __name__=='__main__': #The following lines are for module testing purpose only. #They would not affect the program. featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt' cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt' txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt'