0
|
1 #define IO functions. May add more gradually.
|
|
2 import bioclass
|
|
3
|
|
4 def read_conf(conffile):
|
|
5 '''read configure file (conf.txt)'''
|
|
6 ref_input=open(conffile)
|
|
7 refdict={}
|
|
8 for line in ref_input:
|
|
9 if not line.strip():
|
|
10 continue
|
|
11 if '#' in line:
|
|
12 contline=line.split('#')[0]
|
|
13 content=contline.strip()
|
|
14 else:
|
|
15 content=line.strip()
|
|
16 if not content:
|
|
17 continue
|
|
18 if content.startswith('--'):
|
|
19 key=content
|
|
20 refdict[key]={}
|
|
21 else:
|
|
22 idx=content.index('=')
|
|
23 a=content[0:idx].strip()
|
|
24 b=content[(idx+1):].strip()
|
|
25 refdict[key][a]=b
|
|
26 ref_input.close()
|
|
27 return refdict
|
|
28
|
|
29 def read_feature(featurefile,verbose=True):
|
|
30 '''Read exon/transcript/gene info and return bioclass obj'''
|
|
31 infile=open(featurefile)
|
|
32 txdb={} #keep track of all transcripts
|
|
33 exdb={} #keep track of all exons
|
|
34 genedb={} #keep track of all genes
|
|
35 i=0
|
|
36 for line in infile:
|
|
37 i+=1
|
|
38 if verbose:
|
|
39 if i%100000==0:
|
|
40 print '%d exon records loaded'%i
|
|
41 chr,start,end,tx,gene,strand,cat=line.split()
|
|
42 if cat != 'protein_coding':
|
|
43 continue
|
|
44 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
|
|
45 exdb[exon.name]=exon
|
|
46 if not txdb.has_key(tx):
|
|
47 txdb[tx]=bioclass.Transcript(tx,gene)
|
|
48 txdb[tx].add_exon(exon)
|
|
49 for txname in txdb:
|
|
50 t=txdb[txname]
|
|
51 if not genedb.has_key(t.gene):
|
|
52 genedb[t.gene]=bioclass.Gene(t.gene)
|
|
53 genedb[t.gene].add_transcript(t)
|
|
54 infile.close()
|
|
55 return (txdb,genedb)
|
|
56
|
|
57 def read_feature_genes(featurefile,*args):
|
|
58 '''Read exon/transcript/gene info for spec genes and return bioclass obj'''
|
|
59 infile=open(featurefile)
|
|
60 txdb={}
|
|
61 exdb={}
|
|
62 genedb={}
|
|
63 for line in infile:
|
|
64 chr,start,end,tx,gene,strand,cat=line.split()
|
|
65 if gene not in args:
|
|
66 continue
|
|
67 if cat != 'protein_coding':
|
|
68 continue
|
|
69 exon=bioclass.Exon(chr,int(start),int(end),strand,tx,gene)
|
|
70 exdb[exon.name]=exon
|
|
71 if not txdb.has_key(tx):
|
|
72 txdb[tx]=bioclass.Transcript(tx,gene)
|
|
73 txdb[tx].add_exon(exon)
|
|
74 for txname in txdb:
|
|
75 t=txdb[txname]
|
|
76 if not genedb.has_key(t.gene):
|
|
77 genedb[t.gene]=bioclass.Gene(t.gene)
|
|
78 genedb[t.gene].add_transcript(t)
|
|
79 res=[]
|
|
80 for g in args:
|
|
81 if genedb.has_key(g):
|
|
82 gobj=genedb[g]
|
|
83 else:
|
|
84 gobj=None
|
|
85 res.append(gobj)
|
|
86 infile.close()
|
|
87 return res
|
|
88
|
|
89 def read_cds(cdsfile):
|
|
90 '''read CDS start/end for transcripts'''
|
|
91 infile=open(cdsfile)
|
|
92 tx_cds={}
|
|
93 for line in infile:
|
|
94 info=line.split()
|
|
95 tx=info[0]
|
|
96 cds_start,cds_end=int(info[3]),int(info[4])
|
|
97 tx_cds[tx]=(cds_start,cds_end)
|
|
98 infile.close()
|
|
99 return tx_cds
|
|
100
|
|
101 def read_tx_cat(txcatfile):
|
|
102 '''find primary transcript for each gene'''
|
|
103 tx_primary={}
|
|
104 tx_cat={}
|
|
105 infile=open(txcatfile)
|
|
106 for line in infile:
|
|
107 if not line.strip():
|
|
108 continue
|
|
109 info=line.split()
|
|
110 if len(info) != 4:
|
|
111 continue
|
|
112 gene=info[2]
|
|
113 txid=info[1]
|
|
114 acc=int(info[3][-3:])
|
|
115 if tx_cat.has_key(gene):
|
|
116 tx_cat[gene].append([txid,acc])
|
|
117 else:
|
|
118 tx_cat[gene]=[[txid,acc]]
|
|
119 infile.close()
|
|
120 for item in tx_cat:
|
|
121 vv=tx_cat[item]
|
|
122 vv=sorted(vv,key=lambda x: x[1])
|
|
123 tx_primary[item]=vv[0][0]
|
|
124 return tx_primary
|
|
125
|
|
126
|
|
127 if __name__=='__main__':
|
|
128 #The following lines are for module testing purpose only.
|
|
129 #They would not affect the program.
|
|
130 featurefile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64.canonical.gene.exons.tab.txt'
|
|
131 cdsfile='/RIS/home/wtorres/RNAseq/hg19broad/ensembl.hg19.cds.txt'
|
|
132 txcatfile='/RIS/home/wtorres/RNAseq/hg19broad/Ensembl64_primary_transcript.txt'
|
|
133
|