annotate seq2meme.py @ 13:292186c14b08

Uploaded
author xuebing
date Sat, 10 Mar 2012 08:17:36 -0500
parents b7f1d9f8f3bc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
1 # -*- coding: iso-8859-1 -*-
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
2 import random,sys,math
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
3
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
4 #import pylab
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
5
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
6 def readMotifFile(filename):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
7
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
8 try:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
9 f=open(filename)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
10 except IOError:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
11 print "could not open",filename,"Are you sure this file exists?"
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
12 sys.exit(1)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
13
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
14 seqs=[]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
15 maxL = 0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
16 for line in f:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
17 if '>' in line or 'N' in line:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
18 next
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
19 else:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
20 seq = line.strip().upper()
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
21 if maxL < len(seq):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
22 maxL = len(seq)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
23 seqs.append(seq)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
24 f.close()
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
25
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
26 print len(seqs),'sequences loaded'
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
27 print 'max seq length:',maxL
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
28 for i in range(len(seqs)):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
29 if len(seqs[i]) < maxL:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
30 del seqs[i]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
31 print len(seqs),'sequences with length = ',maxL
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
32 return seqs
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
33
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
34
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
35 def createWeightMatrix(seqs,psuedocont):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
36
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
37 motifWidth = len(seqs[0])
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
38 weightMatrix = []
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
39 for i in range(motifWidth):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
40 weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont})
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
41
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
42 #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
43 #YOUR CODE HERE
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
44 for seq in seqs:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
45 for pos in range(motifWidth):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
46 weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
47
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
48 #Normalize your weight matrix (so that it contains probabilities rather than counts)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
49 #Remember the added psuedocounts when you normalize!
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
50 for pos in range(motifWidth):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
51 totalCount = sum(weightMatrix[pos].values())
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
52 for letter in weightMatrix[pos].keys():
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
53 weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
54
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
55 #Return your weight matrix
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
56 return weightMatrix
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
57
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
58 def printMemeFormat(weightMatrix,motifName,filename,nSite,background):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
59 f = open(filename,'w')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
60 f.write('MEME version 4.4\n\n')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
61
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
62 f.write('ALPHABET= ACGT\n\n')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
63
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
64 f.write('strands: + -\n\n')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
65
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
66 f.write('Background letter frequencies (from:\n')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
67 f.write(background+'\n\n')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
68
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
69 f.write('MOTIF '+motifName+'\n\n')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
70
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
71 f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
72 for position in range(len(weightMatrix)):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
73 probsThisPosition=weightMatrix[position]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
74 f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
75 f.write('\n\n')
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
76 f.close()
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
77
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
78 #get a two decimal-place string representation of a float f
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
79 def twoDecimal(f):
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
80 return "%.6f" %(f)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
81
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
82 def run():
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
83
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
84 #Get file name from command line
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
85 if len(sys.argv) < 3:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
86 print "python seq2meme.py motif_fasta outputfile motifName psuedocont background"
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
87 sys.exit(1)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
88 else:
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
89 motifFile=sys.argv[1] #
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
90 outFile=sys.argv[2]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
91 motifName=sys.argv[3]
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
92 psuedocont = float(sys.argv[4])
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
93 background=' '.join(sys.argv[5].strip().split(','))
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
94
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
95 motifs=readMotifFile(motifFile)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
96
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
97 #Create weight matrix
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
98 motifWeightMatrix=createWeightMatrix(motifs,psuedocont)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
99 printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background)
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
100 run()
b7f1d9f8f3bc Uploaded
xuebing
parents:
diff changeset
101