annotate tools/mytools/altschulEriksonDinuclShuffle.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #! /usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 # altschulEriksonDinuclShuffle.py
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 # P. Clote, Oct 2003
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 # NOTE: One cannot use function "count(s,word)" to count the number
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 # of occurrences of dinucleotide word in string s, since the built-in
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 # function counts only nonoverlapping words, presumably in a left to
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 # right fashion.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 import sys,string,random
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 def computeCountAndLists(s):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 #WARNING: Use of function count(s,'UU') returns 1 on word UUU
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 #since it apparently counts only nonoverlapping words UU
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 #For this reason, we work with the indices.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 #Initialize lists and mono- and dinucleotide dictionaries
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 List = {} #List is a dictionary of lists
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 List['A'] = []; List['C'] = [];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 List['G'] = []; List['T'] = [];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 nuclList = ["A","C","G","T"]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 s = s.upper()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 s = s.replace("U","T")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 nuclCnt = {} #empty dictionary
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 dinuclCnt = {} #empty dictionary
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 for x in nuclList:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 nuclCnt[x]=0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 dinuclCnt[x]={}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 for y in nuclList:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 dinuclCnt[x][y]=0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 #Compute count and lists
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 nuclCnt[s[0]] = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 nuclTotal = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 dinuclTotal = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 for i in range(len(s)-1):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 x = s[i]; y = s[i+1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 List[x].append( y )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 nuclCnt[y] += 1; nuclTotal += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 dinuclCnt[x][y] += 1; dinuclTotal += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 assert (nuclTotal==len(s))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 assert (dinuclTotal==len(s)-1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 return nuclCnt,dinuclCnt,List
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 def chooseEdge(x,dinuclCnt):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 numInList = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 for y in ['A','C','G','T']:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 numInList += dinuclCnt[x][y]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 z = random.random()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 numerator = dinuclCnt[x]['A']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 if z < float(numerator)/float(denom):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 dinuclCnt[x]['A'] -= 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 return 'A'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 numerator += dinuclCnt[x]['C']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 if z < float(numerator)/float(denom):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 dinuclCnt[x]['C'] -= 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 return 'C'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 numerator += dinuclCnt[x]['G']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 if z < float(numerator)/float(denom):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 dinuclCnt[x]['G'] -= 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 return 'G'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 dinuclCnt[x]['T'] -= 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 return 'T'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 def connectedToLast(edgeList,nuclList,lastCh):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 D = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 for x in nuclList: D[x]=0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 for edge in edgeList:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 a = edge[0]; b = edge[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 if b==lastCh: D[a]=1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 for i in range(2):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 for edge in edgeList:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 a = edge[0]; b = edge[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 if D[b]==1: D[a]=1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 ok = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 for x in nuclList:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 if x!=lastCh and D[x]==0: return 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 return 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 def eulerian(s):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 #compute nucleotides appearing in s
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 nuclList = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 for x in ["A","C","G","T"]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 if x in s: nuclList.append(x)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 #compute numInList[x] = number of dinucleotides beginning with x
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 numInList = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 for x in nuclList:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 numInList[x]=0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 for y in nuclList:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 numInList[x] += dinuclCnt[x][y]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 #create dinucleotide shuffle L
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 firstCh = s[0] #start with first letter of s
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 lastCh = s[-1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 edgeList = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 for x in nuclList:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 ok = connectedToLast(edgeList,nuclList,lastCh)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 return ok,edgeList,nuclList,lastCh
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 def shuffleEdgeList(L):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 n = len(L); barrier = n
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 for i in range(n-1):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 z = int(random.random() * barrier)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 tmp = L[z]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 L[z]= L[barrier-1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 L[barrier-1] = tmp
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 barrier -= 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 return L
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 def dinuclShuffle(s):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 ok = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 while not ok:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 ok,edgeList,nuclList,lastCh = eulerian(s)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 #remove last edges from each vertex list, shuffle, then add back
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 #the removed edges at end of vertex lists.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 for [x,y] in edgeList: List[x].remove(y)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 for x in nuclList: shuffleEdgeList(List[x])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 for [x,y] in edgeList: List[x].append(y)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 #construct the eulerian path
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 L = [s[0]]; prevCh = s[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 for i in range(len(s)-2):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 ch = List[prevCh][0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 L.append( ch )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 del List[prevCh][0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 prevCh = ch
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 L.append(s[-1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 t = string.join(L,"")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 return t
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 if __name__ == '__main__':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 if len(sys.argv)!=3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 sys.exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 s = sys.argv[1].upper()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 for i in range(int(sys.argv[2])):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 print dinuclShuffle(s)