### view tools/mytools/seq2meme.py @ 1:cdcb0ce84a1b

author xuebing Fri, 09 Mar 2012 19:45:15 -0500 9071e359b9a3
line wrap: on
line source
```
# -*- coding: iso-8859-1 -*-
import random,sys,math

#import pylab

try:
f=open(filename)
except IOError:
print "could not open",filename,"Are you sure this file exists?"
sys.exit(1)

seqs=[]
maxL = 0
for line in f:
if '>' in line or 'N' in line:
next
else:
seq = line.strip().upper()
if maxL < len(seq):
maxL = len(seq)
seqs.append(seq)
f.close()

print 'max seq length:',maxL
for i in range(len(seqs)):
if len(seqs[i]) < maxL:
del seqs[i]
print len(seqs),'sequences with length = ',maxL
return seqs

def createWeightMatrix(seqs,psuedocont):

motifWidth = len(seqs[0])
weightMatrix = []
for i in range(motifWidth):
weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont})

#Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif
for seq in seqs:
for pos in range(motifWidth):
weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0

#Normalize your weight matrix (so that it contains probabilities rather than counts)
#Remember the added psuedocounts when you normalize!
for pos in range(motifWidth):
totalCount = sum(weightMatrix[pos].values())
for letter in weightMatrix[pos].keys():
weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount

return weightMatrix

def printMemeFormat(weightMatrix,motifName,filename,nSite,background):
f = open(filename,'w')
f.write('MEME version 4.4\n\n')

f.write('ALPHABET= ACGT\n\n')

f.write('strands: + -\n\n')

f.write('Background letter frequencies (from:\n')
f.write(background+'\n\n')

f.write('MOTIF '+motifName+'\n\n')

f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n')
for position in range(len(weightMatrix)):
probsThisPosition=weightMatrix[position]
f.write('  '+"%.6f" %(probsThisPosition['A'])+'\t  '+"%.6f" %(probsThisPosition['C'])+'\t  '+"%.6f" %(probsThisPosition['G'])+'\t  '+"%.6f" %(probsThisPosition['T'])+'\t\n')
f.write('\n\n')
f.close()

#get a two decimal-place string representation of a float f
def twoDecimal(f):
return "%.6f" %(f)

def run():

#Get file name from command line
if len(sys.argv) < 3:
print "python seq2meme.py motif_fasta outputfile motifName psuedocont background"
sys.exit(1)
else:
motifFile=sys.argv[1] #
outFile=sys.argv[2]
motifName=sys.argv[3]
psuedocont = float(sys.argv[4])
background=' '.join(sys.argv[5].strip().split(','))