annotate seq_to_meme.py @ 1:54f508fcdc40 default tip

Uploaded
author xuebing
date Sat, 31 Mar 2012 21:50:15 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
1 # -*- coding: iso-8859-1 -*-
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
2 import random,sys,math
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
3
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
4 #import pylab
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
5
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
6 def readMotifFile(filename):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
7
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
8 try:
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
9 f=open(filename)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
10 except IOError:
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
11 print "could not open",filename,"Are you sure this file exists?"
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
12 sys.exit(1)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
13
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
14 seqs=[]
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
15 maxL = 0
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
16 for line in f:
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
17 if '>' in line or 'N' in line:
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
18 next
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
19 else:
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
20 seq = line.strip().upper()
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
21 if maxL < len(seq):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
22 maxL = len(seq)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
23 seqs.append(seq)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
24 f.close()
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
25
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
26 print len(seqs),'sequences loaded'
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
27 print 'max seq length:',maxL
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
28 for i in range(len(seqs)):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
29 if len(seqs[i]) < maxL:
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
30 del seqs[i]
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
31 print len(seqs),'sequences with length = ',maxL
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
32 return seqs
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
33
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
34
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
35 def createWeightMatrix(seqs,psuedocont):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
36
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
37 motifWidth = len(seqs[0])
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
38 weightMatrix = []
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
39 for i in range(motifWidth):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
40 weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont})
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
41
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
42 #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
43 #YOUR CODE HERE
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
44 for seq in seqs:
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
45 for pos in range(motifWidth):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
46 weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
47
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
48 #Normalize your weight matrix (so that it contains probabilities rather than counts)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
49 #Remember the added psuedocounts when you normalize!
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
50 for pos in range(motifWidth):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
51 totalCount = sum(weightMatrix[pos].values())
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
52 for letter in weightMatrix[pos].keys():
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
53 weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
54
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
55 #Return your weight matrix
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
56 return weightMatrix
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
57
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
58 def printMemeFormat(weightMatrix,motifName,filename,nSite,background):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
59 f = open(filename,'w')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
60 f.write('MEME version 4.4\n\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
61
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
62 f.write('ALPHABET= ACGT\n\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
63
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
64 f.write('strands: + -\n\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
65
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
66 f.write('Background letter frequencies (from:\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
67 f.write(background+'\n\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
68
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
69 f.write('MOTIF '+motifName+'\n\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
70
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
71 f.write('BL MOTIF '+motifName+' width='+str(len(weightMatrix))+' seqs='+str(nSite)+'\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
72 f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
73 for position in range(len(weightMatrix)):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
74 probsThisPosition=weightMatrix[position]
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
75 f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
76 f.write('\n\n')
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
77 f.close()
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
78
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
79 #get a two decimal-place string representation of a float f
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
80 def twoDecimal(f):
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
81 return "%.6f" %(f)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
82
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
83 def run():
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
84
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
85 #Get file name from command line
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
86 if len(sys.argv) < 3:
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
87 print "python seq2meme.py motif_fasta outputfile motifName psuedocont background"
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
88 sys.exit(1)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
89 else:
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
90 motifFile=sys.argv[1] #
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
91 outFile=sys.argv[2]
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
92 motifName=sys.argv[3]
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
93 psuedocont = float(sys.argv[4])
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
94 background=' '.join(sys.argv[5].strip().split(','))
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
95
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
96 motifs=readMotifFile(motifFile)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
97
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
98 #Create weight matrix
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
99 motifWeightMatrix=createWeightMatrix(motifs,psuedocont)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
100 printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background)
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
101 run()
54f508fcdc40 Uploaded
xuebing
parents:
diff changeset
102