comparison seq2meme.py @ 14:76e1b1b21cce default tip

Deleted selected files
author xuebing
date Tue, 13 Mar 2012 19:05:10 -0400
parents 292186c14b08
children
comparison
equal deleted inserted replaced
13:292186c14b08 14:76e1b1b21cce
1 # -*- coding: iso-8859-1 -*-
2 import random,sys,math
3
4 #import pylab
5
6 def readMotifFile(filename):
7
8 try:
9 f=open(filename)
10 except IOError:
11 print "could not open",filename,"Are you sure this file exists?"
12 sys.exit(1)
13
14 seqs=[]
15 maxL = 0
16 for line in f:
17 if '>' in line or 'N' in line:
18 next
19 else:
20 seq = line.strip().upper()
21 if maxL < len(seq):
22 maxL = len(seq)
23 seqs.append(seq)
24 f.close()
25
26 print len(seqs),'sequences loaded'
27 print 'max seq length:',maxL
28 for i in range(len(seqs)):
29 if len(seqs[i]) < maxL:
30 del seqs[i]
31 print len(seqs),'sequences with length = ',maxL
32 return seqs
33
34
35 def createWeightMatrix(seqs,psuedocont):
36
37 motifWidth = len(seqs[0])
38 weightMatrix = []
39 for i in range(motifWidth):
40 weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont})
41
42 #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif
43 #YOUR CODE HERE
44 for seq in seqs:
45 for pos in range(motifWidth):
46 weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0
47
48 #Normalize your weight matrix (so that it contains probabilities rather than counts)
49 #Remember the added psuedocounts when you normalize!
50 for pos in range(motifWidth):
51 totalCount = sum(weightMatrix[pos].values())
52 for letter in weightMatrix[pos].keys():
53 weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount
54
55 #Return your weight matrix
56 return weightMatrix
57
58 def printMemeFormat(weightMatrix,motifName,filename,nSite,background):
59 f = open(filename,'w')
60 f.write('MEME version 4.4\n\n')
61
62 f.write('ALPHABET= ACGT\n\n')
63
64 f.write('strands: + -\n\n')
65
66 f.write('Background letter frequencies (from:\n')
67 f.write(background+'\n\n')
68
69 f.write('MOTIF '+motifName+'\n\n')
70
71 f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n')
72 for position in range(len(weightMatrix)):
73 probsThisPosition=weightMatrix[position]
74 f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n')
75 f.write('\n\n')
76 f.close()
77
78 #get a two decimal-place string representation of a float f
79 def twoDecimal(f):
80 return "%.6f" %(f)
81
82 def run():
83
84 #Get file name from command line
85 if len(sys.argv) < 3:
86 print "python seq2meme.py motif_fasta outputfile motifName psuedocont background"
87 sys.exit(1)
88 else:
89 motifFile=sys.argv[1] #
90 outFile=sys.argv[2]
91 motifName=sys.argv[3]
92 psuedocont = float(sys.argv[4])
93 background=' '.join(sys.argv[5].strip().split(','))
94
95 motifs=readMotifFile(motifFile)
96
97 #Create weight matrix
98 motifWeightMatrix=createWeightMatrix(motifs,psuedocont)
99 printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background)
100 run()
101