annotate tools/mytools/seq2meme.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 # -*- coding: iso-8859-1 -*-
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 import random,sys,math
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 #import pylab
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 def readMotifFile(filename):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 f=open(filename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 except IOError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 print "could not open",filename,"Are you sure this file exists?"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 sys.exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 seqs=[]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 maxL = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 for line in f:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 if '>' in line or 'N' in line:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 next
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 seq = line.strip().upper()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 if maxL < len(seq):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 maxL = len(seq)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 seqs.append(seq)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 print len(seqs),'sequences loaded'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 print 'max seq length:',maxL
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 for i in range(len(seqs)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 if len(seqs[i]) < maxL:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 del seqs[i]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 print len(seqs),'sequences with length = ',maxL
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 return seqs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 def createWeightMatrix(seqs,psuedocont):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 motifWidth = len(seqs[0])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 weightMatrix = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 for i in range(motifWidth):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont})
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 #YOUR CODE HERE
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 for seq in seqs:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 for pos in range(motifWidth):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 #Normalize your weight matrix (so that it contains probabilities rather than counts)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 #Remember the added psuedocounts when you normalize!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 for pos in range(motifWidth):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 totalCount = sum(weightMatrix[pos].values())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 for letter in weightMatrix[pos].keys():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 #Return your weight matrix
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 return weightMatrix
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 def printMemeFormat(weightMatrix,motifName,filename,nSite,background):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 f = open(filename,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 f.write('MEME version 4.4\n\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 f.write('ALPHABET= ACGT\n\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 f.write('strands: + -\n\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 f.write('Background letter frequencies (from:\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 f.write(background+'\n\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 f.write('MOTIF '+motifName+'\n\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 for position in range(len(weightMatrix)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 probsThisPosition=weightMatrix[position]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 f.write('\n\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 #get a two decimal-place string representation of a float f
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 def twoDecimal(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 return "%.6f" %(f)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 def run():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 #Get file name from command line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 if len(sys.argv) < 3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 print "python seq2meme.py motif_fasta outputfile motifName psuedocont background"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 sys.exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 motifFile=sys.argv[1] #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 outFile=sys.argv[2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 motifName=sys.argv[3]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 psuedocont = float(sys.argv[4])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 background=' '.join(sys.argv[5].strip().split(','))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 motifs=readMotifFile(motifFile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 #Create weight matrix
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 motifWeightMatrix=createWeightMatrix(motifs,psuedocont)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 run()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101