annotate mytools/altschulEriksonDinuclShuffle.py @ 9:87eb5c5ddfe9

Uploaded
author xuebing
date Fri, 09 Mar 2012 20:01:43 -0500
parents f0dc65e7f6c0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
1 #! /usr/bin/env python
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
2
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
3 # altschulEriksonDinuclShuffle.py
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
4 # P. Clote, Oct 2003
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
5 # NOTE: One cannot use function "count(s,word)" to count the number
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
6 # of occurrences of dinucleotide word in string s, since the built-in
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
7 # function counts only nonoverlapping words, presumably in a left to
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
8 # right fashion.
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
9
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
10
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
11 import sys,string,random
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
12
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
13
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
14
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
15 def computeCountAndLists(s):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
16 #WARNING: Use of function count(s,'UU') returns 1 on word UUU
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
17 #since it apparently counts only nonoverlapping words UU
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
18 #For this reason, we work with the indices.
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
19
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
20 #Initialize lists and mono- and dinucleotide dictionaries
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
21 List = {} #List is a dictionary of lists
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
22 List['A'] = []; List['C'] = [];
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
23 List['G'] = []; List['T'] = [];
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
24 nuclList = ["A","C","G","T"]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
25 s = s.upper()
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
26 s = s.replace("U","T")
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
27 nuclCnt = {} #empty dictionary
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
28 dinuclCnt = {} #empty dictionary
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
29 for x in nuclList:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
30 nuclCnt[x]=0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
31 dinuclCnt[x]={}
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
32 for y in nuclList:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
33 dinuclCnt[x][y]=0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
34
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
35 #Compute count and lists
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
36 nuclCnt[s[0]] = 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
37 nuclTotal = 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
38 dinuclTotal = 0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
39 for i in range(len(s)-1):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
40 x = s[i]; y = s[i+1]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
41 List[x].append( y )
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
42 nuclCnt[y] += 1; nuclTotal += 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
43 dinuclCnt[x][y] += 1; dinuclTotal += 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
44 assert (nuclTotal==len(s))
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
45 assert (dinuclTotal==len(s)-1)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
46 return nuclCnt,dinuclCnt,List
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
47
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
48
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
49 def chooseEdge(x,dinuclCnt):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
50 numInList = 0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
51 for y in ['A','C','G','T']:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
52 numInList += dinuclCnt[x][y]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
53 z = random.random()
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
54 denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
55 numerator = dinuclCnt[x]['A']
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
56 if z < float(numerator)/float(denom):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
57 dinuclCnt[x]['A'] -= 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
58 return 'A'
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
59 numerator += dinuclCnt[x]['C']
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
60 if z < float(numerator)/float(denom):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
61 dinuclCnt[x]['C'] -= 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
62 return 'C'
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
63 numerator += dinuclCnt[x]['G']
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
64 if z < float(numerator)/float(denom):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
65 dinuclCnt[x]['G'] -= 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
66 return 'G'
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
67 dinuclCnt[x]['T'] -= 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
68 return 'T'
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
69
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
70 def connectedToLast(edgeList,nuclList,lastCh):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
71 D = {}
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
72 for x in nuclList: D[x]=0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
73 for edge in edgeList:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
74 a = edge[0]; b = edge[1]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
75 if b==lastCh: D[a]=1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
76 for i in range(2):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
77 for edge in edgeList:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
78 a = edge[0]; b = edge[1]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
79 if D[b]==1: D[a]=1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
80 ok = 0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
81 for x in nuclList:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
82 if x!=lastCh and D[x]==0: return 0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
83 return 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
84
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
85
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
86
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
87 def eulerian(s):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
88 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
89 #compute nucleotides appearing in s
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
90 nuclList = []
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
91 for x in ["A","C","G","T"]:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
92 if x in s: nuclList.append(x)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
93 #compute numInList[x] = number of dinucleotides beginning with x
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
94 numInList = {}
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
95 for x in nuclList:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
96 numInList[x]=0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
97 for y in nuclList:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
98 numInList[x] += dinuclCnt[x][y]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
99 #create dinucleotide shuffle L
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
100 firstCh = s[0] #start with first letter of s
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
101 lastCh = s[-1]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
102 edgeList = []
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
103 for x in nuclList:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
104 if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
105 ok = connectedToLast(edgeList,nuclList,lastCh)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
106 return ok,edgeList,nuclList,lastCh
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
107
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
108
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
109 def shuffleEdgeList(L):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
110 n = len(L); barrier = n
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
111 for i in range(n-1):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
112 z = int(random.random() * barrier)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
113 tmp = L[z]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
114 L[z]= L[barrier-1]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
115 L[barrier-1] = tmp
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
116 barrier -= 1
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
117 return L
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
118
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
119 def dinuclShuffle(s):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
120 ok = 0
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
121 while not ok:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
122 ok,edgeList,nuclList,lastCh = eulerian(s)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
123 nuclCnt,dinuclCnt,List = computeCountAndLists(s)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
124
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
125 #remove last edges from each vertex list, shuffle, then add back
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
126 #the removed edges at end of vertex lists.
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
127 for [x,y] in edgeList: List[x].remove(y)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
128 for x in nuclList: shuffleEdgeList(List[x])
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
129 for [x,y] in edgeList: List[x].append(y)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
130
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
131 #construct the eulerian path
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
132 L = [s[0]]; prevCh = s[0]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
133 for i in range(len(s)-2):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
134 ch = List[prevCh][0]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
135 L.append( ch )
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
136 del List[prevCh][0]
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
137 prevCh = ch
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
138 L.append(s[-1])
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
139 t = string.join(L,"")
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
140 return t
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
141
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
142
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
143
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
144 if __name__ == '__main__':
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
145 if len(sys.argv)!=3:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
146 print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5"
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
147 sys.exit(1)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
148 s = sys.argv[1].upper()
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
149 for i in range(int(sys.argv[2])):
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
150 print dinuclShuffle(s)