annotate fasta-dinucleotide-shuffle.py @ 15:0e221dbd17b2 default tip

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:53:06 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
15
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/python
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
2
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
3 import sys, string, random
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
4 import sequence
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
5
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
#
# Optional accelerator: with psyco installed the script runs about 3X faster.
#
if __name__ == '__main__':
    try:
        import psyco
        psyco.full()
        psyco_found = True
    except ImportError:
        # psyco is not available; carry on with the plain interpreter.
        pass
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
19
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
20
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
21 # altschulEriksonDinuclShuffle.py
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
22 # P. Clote, Oct 2003
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
23
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
def computeCountAndLists(s):
    """Count nucleotides and dinucleotides of s and build successor lists.

    The sequence is upper-cased before counting. The alphabet is A, C, G, T
    plus N, which is handled as an ordinary fifth letter.

    Returns a tuple (nuclCnt, dinuclCnt, List) where
      nuclCnt[x]      is the number of occurrences of letter x,
      dinuclCnt[x][y] is the number of times y immediately follows x,
      List[x]         is the list of letters following x, in sequence order.

    An empty sequence gives all-zero counts and empty lists.  A letter
    outside the alphabet raises KeyError as soon as it takes part in a
    dinucleotide.
    """
    nuclList = ["A", "C", "G", "T", "N"]
    s = s.upper()
    #s = s.replace("U","T")

    # Initialize successor lists and mono- and dinucleotide dictionaries.
    List = {}        # List is a dictionary of lists
    nuclCnt = {}
    dinuclCnt = {}
    for x in nuclList:
        List[x] = []
        nuclCnt[x] = 0
        dinuclCnt[x] = {}
        for y in nuclList:
            dinuclCnt[x][y] = 0

    # Nothing to count; s[0] below would fail on an empty sequence.
    if not s:
        return nuclCnt, dinuclCnt, List

    # The first letter is never the second half of a pair, so count it here;
    # every other letter is counted as the 'y' of exactly one pair.
    nuclCnt[s[0]] = 1
    for x, y in zip(s, s[1:]):
        List[x].append(y)
        nuclCnt[y] += 1
        dinuclCnt[x][y] += 1
    return nuclCnt, dinuclCnt, List
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
55
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
56
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
def chooseEdge(x,dinuclCnt):
    """Randomly pick one remaining edge leaving x and consume it.

    x         -- source nucleotide
    dinuclCnt -- dinuclCnt[x][y] holds the number of unused x->y edges;
                 the count of the chosen edge is decremented in place

    The successor y is chosen with probability proportional to
    dinuclCnt[x][y], using a single call to random.random().  Returns y.
    Raises ZeroDivisionError when x has no edges left.
    """
    z = random.random()
    row = dinuclCnt[x]
    denom = float(row['A'] + row['C'] + row['G'] + row['T'] + row['N'])

    # Walk the cumulative distribution in the fixed order A, C, G, T.
    numerator = 0
    for y in ('A', 'C', 'G', 'T'):
        numerator += row[y]
        if z < float(numerator) / denom:
            row[y] -= 1
            return y

    # z fell past every other letter, so the choice is N.
    row['N'] -= 1
    return 'N'
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
78
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
def connectedToLast(edgeList,nuclList,lastCh):
    """Tell whether every vertex can reach lastCh along the given edges.

    edgeList -- sequence of edges; edge[0] is the source, edge[1] the target
    nuclList -- the vertices (nucleotides) that must reach lastCh
    lastCh   -- the target vertex

    Returns 1 when every vertex in nuclList other than lastCh is connected
    to lastCh, otherwise 0.
    """
    D = {}
    for x in nuclList:
        D[x] = 0

    # Vertices with an edge straight into lastCh.
    for edge in edgeList:
        if edge[1] == lastCh:
            D[edge[0]] = 1

    # Propagate reachability backwards until nothing changes, so the result
    # does not depend on the number of vertices or the order of the edges.
    changed = True
    while changed:
        changed = False
        for edge in edgeList:
            a = edge[0]; b = edge[1]
            if D[b] == 1 and D.get(a) != 1:
                D[a] = 1
                changed = True

    for x in nuclList:
        if x != lastCh and D[x] == 0:
            return 0
    return 1
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
93
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
def eulerian(s):
    """Draw one candidate set of 'last edges' for a dinucleotide shuffle of s.

    For every nucleotide present in s except the final one, a random
    outgoing edge is chosen with chooseEdge.  The candidate is usable only
    if those edges connect every nucleotide to the final one.

    Returns (ok, edgeList, nuclList, lastCh):
      ok       -- 1 if the chosen edges connect everything to lastCh, else 0
      edgeList -- list of [x, y] pairs, one per nucleotide x != lastCh
      nuclList -- nucleotides occurring in s, in the order A, C, G, T, N
      lastCh   -- final letter of s (upper case)

    s must not be empty.
    """
    # Work on the same upper-cased text that computeCountAndLists counts;
    # otherwise the membership test below misses lower-case letters.
    s = s.upper()
    nuclCnt, dinuclCnt, List = computeCountAndLists(s)

    # compute nucleotides appearing in s
    nuclList = [x for x in ["A", "C", "G", "T", "N"] if x in s]

    lastCh = s[-1]
    edgeList = []
    for x in nuclList:
        if x != lastCh:
            edgeList.append([x, chooseEdge(x, dinuclCnt)])

    ok = connectedToLast(edgeList, nuclList, lastCh)
    return ok, edgeList, nuclList, lastCh
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
108
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
109
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
def shuffleEdgeList(L):
    """Shuffle the list L in place and return it.

    Each step swaps a randomly chosen element of the not-yet-fixed prefix
    L[0:barrier] into position barrier-1, drawing one random.random() value
    per step.  random.shuffle is deliberately not used: it would consume
    the random stream differently and change the output for a given seed.
    """
    for barrier in range(len(L), 1, -1):
        z = int(random.random() * barrier)
        L[z], L[barrier - 1] = L[barrier - 1], L[z]
    return L
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
119
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
def dinuclShuffle(s):
    """Return a random rearrangement of s that keeps its dinucleotide counts.

    The sequence is upper-cased first.  The result starts and ends with the
    same letters as s.  Sequences shorter than two letters contain no
    dinucleotide and are returned as they are (upper-cased).

    Edges are drawn with eulerian() until a connected set of last edges is
    found; the remaining successors of each nucleotide are shuffled and the
    path is read off from the first letter.
    """
    # Upper-case once so the first/last letters index the same tables that
    # computeCountAndLists builds.
    s = s.upper()
    if len(s) < 2:
        return s

    ok = 0
    while not ok:
        ok,edgeList,nuclList,lastCh = eulerian(s)
    nuclCnt,dinuclCnt,List = computeCountAndLists(s)

    #remove last edges from each vertex list, shuffle, then add back
    #the removed edges at end of vertex lists.
    for x, y in edgeList:
        List[x].remove(y)
    for x in nuclList:
        shuffleEdgeList(List[x])
    for x, y in edgeList:
        List[x].append(y)

    #construct the eulerian path
    # A read position per vertex replaces repeated deletion from the front
    # of the successor lists.
    nextPos = {}
    for x in List:
        nextPos[x] = 0
    L = [s[0]]
    prevCh = s[0]
    for i in range(len(s)-2):
        ch = List[prevCh][nextPos[prevCh]]
        nextPos[prevCh] += 1
        L.append(ch)
        prevCh = ch
    # The final edge always leads to the last letter of s.
    L.append(s[-1])
    return "".join(L)
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
142
0e221dbd17b2 Uploaded
xuebing
parents:
diff changeset
def main():
    """Command-line entry point.

    Parses -f/-t/-s/-c/-h from sys.argv, seeds the random generator, reads
    the sequences with sequence.readFASTA and writes the requested number
    of dinucleotide-shuffled copies of each one to stdout in FASTA format.
    Usage problems print a message to stderr and exit with status 1.
    """

    #
    # defaults
    #
    file_name = None
    seed = 1
    copies = 1
    tag = ""

    #
    # get command line arguments
    #
    usage = """USAGE:
    %s [options]

    -f <filename>   file name (required)
    -t <tag>        added to shuffled sequence names
    -s <seed>       random seed; default: %d
    -c <n>          make <n> shuffled copies of each sequence; default: %d
    -h              print this usage message
    """ % (sys.argv[0], seed, copies)

    def usage_exit():
        # Print the usage text and stop with a failure status.
        sys.stderr.write(usage + "\n")
        sys.exit(1)

    # no arguments: print usage
    if len(sys.argv) == 1:
        usage_exit()

    # parse command line
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("-f", "-t", "-s", "-c"):
            # These options take a value; a missing or non-integer value
            # is a usage error.
            i += 1
            try:
                value = sys.argv[i]
                if arg == "-f":
                    file_name = value
                elif arg == "-t":
                    tag = value
                elif arg == "-s":
                    seed = int(value)
                else:
                    copies = int(value)
            except (IndexError, ValueError):
                usage_exit()
        elif arg == "-h":
            usage_exit()
        else:
            sys.stderr.write("Unknown command line argument: " + arg + "\n")
            sys.exit(1)
        i += 1

    # check that required arguments given
    if file_name is None:
        usage_exit()

    random.seed(seed)

    # read sequences
    seqs = sequence.readFASTA(file_name,'Extended DNA')

    for s in seqs:
        seq_str = s.getString()
        #FIXME altschul can't handle ambigs
        name = s.getName()

        for i in range(copies):
            shuffledSeq = dinuclShuffle(seq_str)

            # Copies are numbered only when more than one is requested.
            if copies == 1:
                sys.stdout.write(">%s\n%s\n" % (name+tag, shuffledSeq))
            else:
                sys.stdout.write(">%s_%d\n%s\n" % (name+tag, i, shuffledSeq))

if __name__ == '__main__': main()