annotate pyPRADA_1.2/privutils.py @ 2:74f6c4fcab2f draft

Uploaded
author siyuan
date Tue, 25 Feb 2014 11:07:15 -0500
parents acc2ca1a3ba4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #This module collects functions used in homology comparison and frame prediction
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 import subprocess
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 def maptranscript(gene,exon_end,part=''):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 """
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 Find all transcripts of the gene having the fusion boundary.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 "gene" is a Gene object.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 "exon_end" is the exon boundary that to be searched.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 "part" has to be '5' or '3'.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 Return a list of Transcript objects that has "exon_end" boundary.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 """
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 if part not in ['5','3']:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 raise Exception('part must be "5" or "3"')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 txuse=[] ##
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 g=gene.name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 strand=gene.strand
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 if part=='5':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 tag='big' if strand=='1' else 'small'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 tag='small' if strand=='1' else 'big'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 txs=gene.transcript
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 if tag=='small':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 for tx in txs:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 e_pos=[x.start for x in tx.exon]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 if exon_end in e_pos:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 txuse.append(tx)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 elif tag=='big':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 for tx in txs:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 e_pos=[x.end for x in tx.exon]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 if exon_end in e_pos:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 txuse.append(tx)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 return txuse
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 def overlap(r1,r2):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 """compare two ranges, return the overlapped range"""
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 if r1[0] <= r2[0]:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 ra,rb=r1[:],r2[:]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 ra,rb=r2[:],r1[:]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 if rb[0] > ra[1]:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 return []
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 if rb[1] <= ra[1]:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 return rb
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 return [rb[0],ra[1]]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 def br_front_len(tx,br,part):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 '''No matter it is 5'/3' partner,calculate the ORF length before the break'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 txname=tx.name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 strand=tx.strand
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 cds_start=tx.cds_start
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 cds_end=tx.cds_end
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 e_pos=[[x.start,x.end] for x in tx.exon]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 if part=='5' and strand=='1':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 posset=[x.end for x in tx.exon]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 if br not in posset:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 raise Exception('Br is not exon boundary')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 if br < cds_start:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 return '5UTR'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 elif br > cds_end:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 return '3UTR'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 chim_r=[cds_start,br]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 ol=[overlap(x,chim_r) for x in e_pos]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 return L
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 if part=='5' and strand=='-1':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 posset=[x.start for x in tx.exon]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 if br not in posset:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 raise Exception('Br is not exon boundary')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 if br > cds_end:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 return '5UTR'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 elif br < cds_start:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 return '3UTR'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 chim_r=[br,cds_end]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 ol=[overlap(x,chim_r) for x in e_pos]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 return L
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 if part=='3' and strand=='1':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 posset=[x.start for x in tx.exon]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 if br not in posset:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 raise Exception('Br is not exon boundary')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 if br < cds_start:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 return '5UTR'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 elif br > cds_end:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 return '3UTR'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 chim_r=[cds_start,br-1]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 ol=[overlap(x,chim_r) for x in e_pos]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 return L
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 if part=='3' and strand=='-1':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 posset=[x.end for x in tx.exon]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 if br not in posset:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 raise Exception('Br is not exon boundary')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 if br > cds_end:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 return '5UTR'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 elif br < cds_start:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 return '3UTR'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 chim_r=[br+1,cds_end]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 ol=[overlap(x,chim_r) for x in e_pos]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 L=sum([x[1]-x[0]+1 for x in ol if len(x)>0])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 return L
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 def fusion_frame(gene_a,ga_br,gene_b,gb_br):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 """gene_a:FGFR3,gene_b:TACC3,ga_br:1808661,gb_br:1741429
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 or chr4.1808661.chr4.1741429
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 gene_a, gene_b are Gene objects.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 """
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 ga_br=int(ga_br)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 gb_br=int(gb_br)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 ga_txs=maptranscript(gene_a,ga_br,part='5')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 gb_txs=maptranscript(gene_b,gb_br,part='3')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 res=[]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 for ta in ga_txs:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 for tb in gb_txs:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 s1=br_front_len(ta,ga_br,'5')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 s2=br_front_len(tb,gb_br,'3')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 fusion_conseq='Unknown'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 if isinstance(s1,int) and isinstance(s2,int): #both br in CDS region
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 if s1%3==s2%3:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 fusion_conseq='In-frame'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 fusion_conseq='Out-of-frame'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 elif isinstance(s1,int) and not isinstance(s2,int):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 fusion_conseq='CDS'+'-'+s2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 elif isinstance(s2,int) and not isinstance(s1,int):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 fusion_conseq=s1+'-'+'CDS'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135 fusion_conseq=s1+'-'+s2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 res.append((ta.name,tb.name,fusion_conseq))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 if ga_txs==[] and gb_txs==[]:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138 res.append(['NA','NA','NA'])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139 elif ga_txs==[]:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140 for tb in gb_txs:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 res.append(['NA',tb.name,'NA'])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 elif gb_txs==[]:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 for ta in ga_txs:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 res.append([ta.name,'NA','NA'])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 return res
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 def seqblast(seqA,seqB,blastn=None):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149 '''seqA,seqB:fasta files'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150 if blastn==None: blastn='blastn'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 cmdstr='%s -task=blastn -subject %s -query %s -outfmt 6'%(blastn,seqA,seqB)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 cmdout=subprocess.Popen(cmdstr.split(),stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153 result=cmdout.stdout.read()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 best_align=result.split('\n')[0]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 if best_align=='':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156 return None
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 info=best_align.split('\t')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 identity=info[2].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 align_len=info[3].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161 evalue=info[10].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162 bit_score=info[11].strip()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163 return [identity,align_len,evalue,bit_score]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165