Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/ioprada.py @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/ioprada.py Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,133 @@ +#define IO functions. May add more gradually. +import bioclass + +def read_conf(conffile): + '''read configure file (conf.txt)''' + ref_input=open(conffile) + refdict={} + for line in ref_input: + if not line.strip(): + continue + if '#' in line: + contline=line.split('#')[0] + content=contline.strip() + else: + content=line.strip() + if not content: + continue + if content.startswith('--'): + key=content + refdict[key]={} + else: + idx=content.index('=') + a=content[0:idx].strip() + b=content[(idx+1):].strip() + refdict[key][a]=b + ref_input.close() + return refdict + +def read_feature(featurefile,verbose=True): + '''Read exon/transcript/gene info and return bioclass obj''' + infile=open(featurefile) + txdb={} #keep track of all transcripts + exdb={} #keep track of all exons + genedb={} #keep track of all genes + i=0 + for line in infile: + i+=1 + if verbose: + if i%100000==0: + print '%d exon records loaded'%i + chr,start,end,tx,gene,strand,cat=line.split() + if cat != 'protein_coding': + continue + exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) + exdb[exon.name]=exon + if not txdb.has_key(tx): + txdb[tx]=bioclass.Transcript(tx,gene) + txdb[tx].add_exon(exon) + for txname in txdb: + t=txdb[txname] + if not genedb.has_key(t.gene): + genedb[t.gene]=bioclass.Gene(t.gene) + genedb[t.gene].add_transcript(t) + infile.close() + return (txdb,genedb) + +def read_feature_genes(featurefile,*args): + '''Read exon/transcript/gene info for spec genes and return bioclass obj''' + infile=open(featurefile) + txdb={} + exdb={} + genedb={} + for line in infile: + chr,start,end,tx,gene,strand,cat=line.split() + if gene not in args: + continue + if cat != 'protein_coding': + continue + exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene) + exdb[exon.name]=exon + if not txdb.has_key(tx): + txdb[tx]=bioclass.Transcript(tx,gene) + txdb[tx].add_exon(exon) + for txname in txdb: + t=txdb[txname] + if not genedb.has_key(t.gene): + genedb[t.gene]=bioclass.Gene(t.gene) + genedb[t.gene].add_transcript(t) + res=[] + for g in args: + if genedb.has_key(g): + gobj=genedb[g] + else: + gobj=None + res.append(gobj) + infile.close() + return res + +def read_cds(cdsfile): + '''read CDS start/end for transcripts''' + infile=open(cdsfile) + tx_cds={} + for line in infile: + info=line.split() + tx=info[0] + cds_start,cds_end=int(info[3]),int(info[4]) + tx_cds[tx]=(cds_start,cds_end) + infile.close() + return tx_cds + +def read_tx_cat(txcatfile): + '''find primary transcript for each gene''' + tx_primary={} + tx_cat={} + infile=open(txcatfile) + for line in infile: + if not line.strip(): + continue + info=line.split() + if len(info) != 4: + continue + gene=info[2] + txid=info[1] + acc=int(info[3][-3:]) + if tx_cat.has_key(gene): + tx_cat[gene].append([txid,acc]) + else: + tx_cat[gene]=[[txid,acc]] + infile.close() + for item in tx_cat: + vv=tx_cat[item] + vv=sorted(vv,key=lambda x: x[1]) + tx_primary[item]=vv[0][0] + return tx_primary + + +if __name__=='__main__': + #The following lines are for module testing purpose only. + #They would not affect the program. + featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt' + cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt' + txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt' +