# HG changeset patch # User xuebing # Date 1333198366 14400 # Node ID d1f0f85ee5bce9f1f05d2339b0776f410a9e691a # Parent 5a93fe53194d500247b5f683173ee7e42ff87c58 Deleted selected files diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/dreme.xml --- a/mytools/dreme.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ - - short motif discovery - /Users/xuebing/bin/dreme.py -p $input -png -e $ethresh - #if $background_select.bg_select == "fromfile": - -n "${bgfile}" - #end if - - && mv dreme_out/dreme.html ${html_outfile} - - && mv dreme_out/dreme.txt ${txt_outfile} - - && mv dreme_out/dreme.xml ${xml_outfile} - - && rm -rf dreme_out - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -http://meme.sdsc.edu/meme/doc/dreme.html - -DREME (Discriminative Regular Expression Motif Elicitation) finds relatively short motifs (up to 8 bases) fast, and can perform discriminative motif discovery if given a negative set, consisting of sequences unlikely to contain a motif of interest that is however likely to be found in the main ("positive") sequence set. If you do not provide a negative set the program shuffles the positive set to provide a background (in the role of the negative set). - -The input to DREME is one or two sets of DNA sequences. The program uses a Fisher Exact Test to determine significance of each motif found in the postive set as compared with its representation in the negative set, using a significance threshold that may be set on the command line. - -DREME achieves its high speed by restricting its search to regular expressions based on the IUPAC alphabet representing bases and ambiguous characters, and by using a heuristic estimate of generalised motifs' statistical significance. - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/fasta-dinucleotide-shuffle.py --- a/mytools/fasta-dinucleotide-shuffle.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ -#!/usr/bin/python - -import sys, string, random -import sequence - -# -# turn on psyco to speed up by 3X -# -if __name__=='__main__': - try: - import psyco - #psyco.log() - psyco.full() - psyco_found = True - except ImportError: -# psyco_found = False - pass -# print >> sys.stderr, "psyco_found", psyco_found - - -# altschulEriksonDinuclShuffle.py -# P. Clote, Oct 2003 - -def computeCountAndLists(s): - - #Initialize lists and mono- and dinucleotide dictionaries - List = {} #List is a dictionary of lists - List['A'] = []; List['C'] = []; - List['G'] = []; List['T'] = []; - # FIXME: is this ok? - List['N'] = [] - nuclList = ["A","C","G","T","N"] - s = s.upper() - #s = s.replace("U","T") - nuclCnt = {} #empty dictionary - dinuclCnt = {} #empty dictionary - for x in nuclList: - nuclCnt[x]=0 - dinuclCnt[x]={} - for y in nuclList: - dinuclCnt[x][y]=0 - - #Compute count and lists - nuclCnt[s[0]] = 1 - nuclTotal = 1 - dinuclTotal = 0 - for i in range(len(s)-1): - x = s[i]; y = s[i+1] - List[x].append( y ) - nuclCnt[y] += 1; nuclTotal += 1 - dinuclCnt[x][y] += 1; dinuclTotal += 1 - assert (nuclTotal==len(s)) - assert (dinuclTotal==len(s)-1) - return nuclCnt,dinuclCnt,List - - -def chooseEdge(x,dinuclCnt): - z = random.random() - denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']+dinuclCnt[x]['N'] - numerator = dinuclCnt[x]['A'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['A'] -= 1 - return 'A' - numerator += dinuclCnt[x]['C'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['C'] -= 1 - return 'C' - numerator += dinuclCnt[x]['G'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['G'] -= 1 - return 'G' - numerator += dinuclCnt[x]['T'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['T'] -= 1 - return 'T' - dinuclCnt[x]['N'] -= 1 - return 'N' - -def connectedToLast(edgeList,nuclList,lastCh): - D = {} - for x in nuclList: D[x]=0 - for edge in edgeList: - a = edge[0]; b = edge[1] - if b==lastCh: D[a]=1 - for i in range(3): - for edge in edgeList: - a = edge[0]; b = edge[1] - if D[b]==1: D[a]=1 - ok = 0 - for x in nuclList: - if x!=lastCh and D[x]==0: return 0 - return 1 - -def eulerian(s): - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - #compute nucleotides appearing in s - nuclList = [] - for x in ["A","C","G","T","N"]: - if x in s: nuclList.append(x) - #create dinucleotide shuffle L - firstCh = s[0] #start with first letter of s - lastCh = s[-1] - edgeList = [] - for x in nuclList: - if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) - ok = connectedToLast(edgeList,nuclList,lastCh) - return ok,edgeList,nuclList,lastCh - - -def shuffleEdgeList(L): - n = len(L); barrier = n - for i in range(n-1): - z = int(random.random() * barrier) - tmp = L[z] - L[z]= L[barrier-1] - L[barrier-1] = tmp - barrier -= 1 - return L - -def dinuclShuffle(s): - ok = 0 - while not ok: - ok,edgeList,nuclList,lastCh = eulerian(s) - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - - #remove last edges from each vertex list, shuffle, then add back - #the removed edges at end of vertex lists. - for [x,y] in edgeList: List[x].remove(y) - for x in nuclList: shuffleEdgeList(List[x]) - for [x,y] in edgeList: List[x].append(y) - - #construct the eulerian path - L = [s[0]]; prevCh = s[0] - for i in range(len(s)-2): - ch = List[prevCh][0] - L.append( ch ) - del List[prevCh][0] - prevCh = ch - L.append(s[-1]) - t = string.join(L,"") - return t - -def main(): - - # - # defaults - # - file_name = None - seed = 1 - copies = 1 - - # - # get command line arguments - # - usage = """USAGE: - %s [options] - - -f file name (required) - -t added to shuffled sequence names - -s random seed; default: %d - -c make shuffled copies of each sequence; default: %d - -h print this usage message - """ % (sys.argv[0], seed, copies) - - # no arguments: print usage - if len(sys.argv) == 1: - print >> sys.stderr, usage; sys.exit(1) - - tag = ""; - - # parse command line - i = 1 - while i < len(sys.argv): - arg = sys.argv[i] - if (arg == "-f"): - i += 1 - try: file_name = sys.argv[i] - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-t"): - i += 1 - try: tag = sys.argv[i] - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-s"): - i += 1 - try: seed = string.atoi(sys.argv[i]) - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-c"): - i += 1 - try: copies = string.atoi(sys.argv[i]) - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-h"): - print >> sys.stderr, usage; sys.exit(1) - else: - print >> sys.stderr, "Unknown command line argument: " + arg - sys.exit(1) - i += 1 - - # check that required arguments given - if (file_name == None): - print >> sys.stderr, usage; sys.exit(1) - - random.seed(seed) - - # read sequences - seqs = sequence.readFASTA(file_name,'Extended DNA') - - for s in seqs: - str = s.getString() - #FIXME altschul can't handle ambigs - name = s.getName() - - #print >> sys.stderr, ">%s" % name - - for i in range(copies): - - shuffledSeq = dinuclShuffle(str) - - if (copies == 1): - print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq) - else: - print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq) - -if __name__ == '__main__': main() diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/fastamarkov.xml --- a/mytools/fastamarkov.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ - - of DNA sequence - cat $input | fasta-get-markov -m $m $norc > $output 2> err.txt - - - - - - - - - - - - -**What it does** - -This tool generates a markov model from a fasta sequence file. - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/fastashuffle1.xml --- a/mytools/fastashuffle1.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ - - preserving mono-nucleotide frequency - cat $input | fasta-shuffle-letters > $output - - - - - - - - -**Description** - -shuffle the position of nucleotides in each sequence, preserving the frequency of nucleotides in that sequence, but do not preserve di-nucleotide frequency. - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/fastashuffle2.xml --- a/mytools/fastashuffle2.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - preserving dinucleotide frequency - fasta-dinucleotide-shuffle.py -f $input -t $tag -c $n -s $seed > $output - - - - - - - - - - - -**What it does** - -This tool shuffles the sequences in the input file but preserves the dinucleotide frequency of each sequence. - -The code implements the Altschul-Erikson dinucleotide shuffle algorithm, described in "Significance of nucleotide sequence alignments: A method for random sequence permutation that preserves dinucleotide and codon usage", S.F. Altschul and B.W. Erikson, Mol. Biol. Evol., 2(6):526--538, 1985. - -Code adapted from http://bioinformatics.bc.edu/clotelab/RNAdinucleotideShuffle/dinucleotideShuffle.html - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/fastqdump.xml --- a/mytools/fastqdump.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ - - convert SRA to FASTQ - /Users/xuebing/tools/sratoolkit.2.1.9-mac32/fastq-dump -A $input -M $minReadLen -Z > $out_file1 - - - - - - - - - -**What it does** - -This is a wrapper of the fastq-dump tool from sra-toolkit. See http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/fimo2.xml --- a/mytools/fimo2.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ - - using FIMO - fimo - #if $background_select.bg_select == "fromfile": - -bgfile $bgfile - #end if - - $norc --max-stored-scores 5000000 --output-pthresh $pth --verbosity 1 $motif $database - && mv fimo_out/fimo.html ${html_outfile} - - && mv fimo_out/fimo.txt ${txt_outfile} - - && rm -rf fimo_out - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses FIMO to find matches of a motif in a fasta file. See more details: - -http://meme.sdsc.edu/meme/fimo-intro.html - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/fimo2bed.py --- a/mytools/fimo2bed.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -''' -#pattern name sequence name start stop score p-value q-value matched sequence -constitutive-donor mm9_chr1_39533592_39535592_- 1815 1823 12.032 4.26e-06 0.397 CAGGTAAGT -constitutive-donor mm9_chr1_59313750_59315750_+ 1889 1897 12.032 4.26e-06 0.397 CAGGTAAGT - -#pattern name sequence name start stop score p-value q-value matched sequence -constitutive-donor mm9_chr1_172019075_172021075_- 1947 1955 12.032 4.26e-06 0.843 CAGGTAAGT -constitutive-donor mm9_chr1_15300532_15302532_+ 156 164 12.032 4.26e-06 0.843 CAGGTAAGT -''' - -import sys - -def fimo2bed(filename,rc): - ''' - parse fimo output to make a bed file - rc: the sequence have been reverse complemented - ''' - f = open(filename) - header = f.readline() - for line in f: - pattern,posi,begin,stop,score,pv,qv,seq = line.strip().split('\t') - flds = posi.split('_') - start = flds[-3] - end = flds[-2] - strand = flds[-1] - chrom = '_'.join(flds[1:-3]) #'chrX_random' - if not rc: - if strand == '+': - start1 = str(int(start) + int(begin)-1) - end1 = str(int(start) + int(stop)) - print '\t'.join([chrom,start1,end1,seq,score,strand]) - else: - start1 = str(int(end) - int(stop)) - end1 = str(int(end) - int(begin)+1) - print '\t'.join([chrom,start1,end1,seq,score,strand]) - else: - if strand == '-': - start1 = str(int(start) + int(begin)-1) - end1 = str(int(start) + int(stop)) - print '\t'.join([chrom,start1,end1,seq,score,'+']) - else: - start1 = str(int(end) - int(stop)) - end1 = str(int(end) - int(begin)+1) - print '\t'.join([chrom,start1,end1,seq,score,'-']) - -fimo2bed(sys.argv[1],sys.argv[2]=='rc') diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/fimo2bed.xml --- a/mytools/fimo2bed.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ - - convert FIMO output to BED - fimo2bed.py $input $rc > $output - - - - - - - - - - Only works if your original FIMO input fasta sequences have ids like:: - - mm9_chr15_99358448_99360448_- - - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/iupac2meme.xml --- a/mytools/iupac2meme.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - from one sequence - iupac2meme - #if $background_select.bg_select == "fromfile": - -bg $bgfile - #end if - -numseqs $numseqs $logodds $motif > $output - - - - - - - - - - - - - - - - - - - - -**Description** - -Convert an IUPAC motif into a MEME version 4 formatted file suitable for use with FIMO and other MEME Suite programs. - -See additional information: - -http://meme.sdsc.edu/meme/doc/iupac2meme.html - -**IUPAC code**:: - - Nucleotide Code: Base: - ---------------- ----- - A.................Adenine - C.................Cytosine - G.................Guanine - T (or U)..........Thymine (or Uracil) - R.................A or G - Y.................C or T - S.................G or C - W.................A or T - K.................G or T - M.................A or C - B.................C or G or T - D.................A or G or T - H.................A or C or T - V.................A or C or G - N.................any base - - - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/match.xml --- a/mytools/match.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - find short motif occurrences - match $motif $seq $output $nmismatch $rc $bed > $log - - - - - - - - - - - - - - -**What it does** - -This tool searches occurrences of a short nucleotide seuqences (allowing mismatches) in a set of longer sequences. - -Example motif file:: - - >motif1 - CAGGTAAGT - >motif2 - GTTTGGGGGCC - -Example sequence file:: - - >hg18_chr6_122208322_122209078_+ - CGTCGTAGCTACTAGCTACGTACGTACGTAGCTAGCATGCATGCTACGTA - CGTAGCTAGCTAAAAAAAAAAAAAAACTGCGGCTAGCTAGCTAGCTACGT - CGATCGTAGCTAC... - >hg18_chr6_1208322_122209023_+ - CGATGCTAGCTAGCTAGCTACGTAGCTAGCTAGTCGATGCTAGCTAGCTA - ATGCTAGCTAGC.... - -Output (bed):: - - chr11 72790893 72790902 ACTTAACTG 1 - antisense 5ss,G4T:CAGTTAAGT-rc hg18_chr11_72790846_72791902_+ 47 - chr11 72791880 72791889 CAGGTAAGA 1 + sense 5ss,T9A:CAGGTAAGA hg18_chr11_72790846_72791902_+ 1034 - - -Output (tab):: - - Tmod4 802 5ss:CAGGTAAGT-rc ACTTACCTG - Atp7b 77 5ss:CAGGTAAGT CAGGTAAGT - Fnta 665 5ss:CAGGTAAGT CAGGTAAGT - - - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/meme.xml --- a/mytools/meme.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,349 +0,0 @@ - - meme - motif discovery - meme "$input1" -o "${html_outfile.files_path}" - -nostatus - - ##-p 8 ##number of processors - - #if str( $options_type.options_type_selector ) == 'advanced': - -sf "${ str( $options_type.sf ).replace( ' ', '_' ) }" - -${options_type.alphabet_type.alphabet_type_selector} - -mod "${options_type.mod_type.mod_type_selector}" - -nmotifs "${options_type.nmotifs}" - -wnsites "${options_type.wnsites}" - -maxsize "${options_type.maxsize}" - - #if $options_type.evt < float('inf'): - -evt "${options_type.evt}" - #end if - - #if str( $options_type.mod_type.mod_type_selector ) != 'oops': - #if str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'nsites': - -nsites "${options_type.mod_type.motif_occurrence_type.nsites}" - #elif str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'min_max_sites': - -minsites "${options_type.mod_type.motif_occurrence_type.minsites}" -maxsites "${options_type.mod_type.motif_occurrence_type.maxsites}" - #end if - #end if - - #if str( $options_type.motif_width_type.motif_width_type_selector ) == 'exact': - -w "${options_type.motif_width_type.width}" - #else - -minw "${options_type.motif_width_type.minw}" -maxw "${options_type.motif_width_type.maxw}" - #end if - - #if str( $options_type.motif_trim_type.motif_trim_type_selector ) == 'nomatrim': - -nomatrim - #else - -wg "${options_type.motif_trim_type.wg}" -ws "${options_type.motif_trim_type.ws}" ${options_type.motif_trim_type.noendgaps} - #end if - - #if str( $options_type.bfile ) != 'None': - -bfile "${options_type.bfile}" - #end if - - #if str( $options_type.pspfile ) != 'None': - -psp "${options_type.pspfile}" - #end if - - #if str( $options_type.alphabet_type.alphabet_type_selector ) == "dna": - ${options_type.alphabet_type.revcomp} ${options_type.alphabet_type.pal} - #end if - - -maxiter "${options_type.maxiter}" -distance "${options_type.distance}" - - -prior "${options_type.alphabet_type.prior_type.prior_type_selector}" - #if str( $options_type.alphabet_type.prior_type.prior_type_selector ) != 'addone': - -b "${options_type.alphabet_type.prior_type.prior_b}" - #if str( $options_type.alphabet_type.prior_type.plib ) != 'None': - -plib "${options_type.alphabet_type.prior_type.plib}" - #end if - #end if - - #if str( $options_type.alphabet_type.spmap_type.spmap_type_selector ) == 'cons': - -cons "${options_type.alphabet_type.spmap_type.cons}" - #else - -spmap "${options_type.alphabet_type.spmap_type.spmap_type_selector}" - -spfuzz "${options_type.alphabet_type.spmap_type.spfuzz}" - #end if - - #if str( $options_type.branching_type.branching_type_selector ) == 'x_branch': - -x_branch -bfactor "${options_type.branching_type.bfactor}" -heapsize "${options_type.branching_type.heapsize}" - #end if - - ##-maxsize "1000000" ##remove hardcoded maxsize? should increase number of processors instead - - #end if - - 2>&1 || echo "Error running MEME." - - - && mv ${html_outfile.files_path}/meme.html ${html_outfile} - - && mv ${html_outfile.files_path}/meme.txt ${txt_outfile} - - && mv ${html_outfile.files_path}/meme.xml ${xml_outfile} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - value == True - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.** - -If you want to specify sequence weights, you must include them at the top of your input FASTA file. - -.. class:: infomark - -**To cite MEME:** -Timothy L. Bailey and Charles Elkan, "Fitting a mixture model by expectation maximization to discover motifs in biopolymers", Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology, pp. 28-36, AAAI Press, Menlo Park, California, 1994. - - -For detailed information on MEME, click here_. To view the license_. - -.. _here: http://meme.nbcr.net/meme/meme-intro.html -.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/memelogo.xml --- a/mytools/memelogo.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/revcompl.py --- a/mytools/revcompl.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -import sys - -compldna = {'A':'T', - 'C':'G', - 'G':'C', - 'T':'A', - 'U':'A', - 'M':'K', - 'K':'M', - 'W':'W', - 'S':'S', - 'R':'Y', - 'Y':'R', - 'N':'N'} -complrna = {'A':'U', - 'C':'G', - 'G':'C', - 'T':'A', - 'U':'A', - 'M':'K', - 'K':'M', - 'W':'W', - 'S':'S', - 'R':'Y', - 'Y':'R', - 'N':'N'} -def complement(seq,compl): - complseq = [compl[base] for base in seq] - return complseq - -def reverse_complement(seq,compl): - seq = list(seq) - seq.reverse() - return ''.join(complement(seq,compl)) - -def readFastaFile(infile,outfile,compl): - - fin = open(infile) - out = open(outfile,'w') - - currSeq='' - currSeqname=None - for line in fin: - if '>' in line: - if currSeqname !=None: - out.write(currSeqname+reverse_complement(currSeq,compl)+'\n') - currSeqname=None - currSeq='' - currSeqname=line - else: - currSeq=currSeq+line.strip().upper() - - if currSeqname!=None: - out.write(currSeqname+reverse_complement(currSeq,compl)+'\n') - - fin.close() - out.close() - -def readrawseq(infile,outfile,compl): - ''' - each line is a sequence - ''' - fin = open(infile) - out = open(outfile,'w') - for line in fin: - out.write(reverse_complement(line.strip().upper(),compl)+'\n') - fin.close() - out.close() - -def main(): - seqfile = sys.argv[1] - outfile = sys.argv[2] - fasta = sys.argv[3] - rna = sys.argv[4] - if rna == 'rna': - compl = complrna - else: - compl = compldna - if fasta == 'fasta': - readFastaFile(seqfile,outfile,compl) - else: - readrawseq(seqfile,outfile,compl) - -main() diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/revcompl.xml --- a/mytools/revcompl.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - - of DNA/RNA sequences - revcompl.py $input $output $fasta $rna - - - - - - - - - - -**What it does** - -This tool outputs reverse complementary of DNA/RNA sequences in the input file. The input can be fasta format or raw sequences (each line is a sequence). - -Degenerate nucleotides are supported. Here is the match table: - -A to T/U - -C to G - -G to C - -T/U to A - -M to K - -W to W - -S to S - -R to Y - -Y to R - -N to N - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/seq2meme.py --- a/mytools/seq2meme.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -# -*- coding: iso-8859-1 -*- -import random,sys,math - -#import pylab - -def readMotifFile(filename): - - try: - f=open(filename) - except IOError: - print "could not open",filename,"Are you sure this file exists?" - sys.exit(1) - - seqs=[] - maxL = 0 - for line in f: - if '>' in line or 'N' in line: - next - else: - seq = line.strip().upper() - if maxL < len(seq): - maxL = len(seq) - seqs.append(seq) - f.close() - - print len(seqs),'sequences loaded' - print 'max seq length:',maxL - for i in range(len(seqs)): - if len(seqs[i]) < maxL: - del seqs[i] - print len(seqs),'sequences with length = ',maxL - return seqs - - -def createWeightMatrix(seqs,psuedocont): - - motifWidth = len(seqs[0]) - weightMatrix = [] - for i in range(motifWidth): - weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont}) - - #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif - #YOUR CODE HERE - for seq in seqs: - for pos in range(motifWidth): - weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0 - - #Normalize your weight matrix (so that it contains probabilities rather than counts) - #Remember the added psuedocounts when you normalize! - for pos in range(motifWidth): - totalCount = sum(weightMatrix[pos].values()) - for letter in weightMatrix[pos].keys(): - weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount - - #Return your weight matrix - return weightMatrix - -def printMemeFormat(weightMatrix,motifName,filename,nSite,background): - f = open(filename,'w') - f.write('MEME version 4.4\n\n') - - f.write('ALPHABET= ACGT\n\n') - - f.write('strands: + -\n\n') - - f.write('Background letter frequencies (from:\n') - f.write(background+'\n\n') - - f.write('MOTIF '+motifName+'\n\n') - - f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n') - for position in range(len(weightMatrix)): - probsThisPosition=weightMatrix[position] - f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n') - f.write('\n\n') - f.close() - -#get a two decimal-place string representation of a float f -def twoDecimal(f): - return "%.6f" %(f) - -def run(): - - #Get file name from command line - if len(sys.argv) < 3: - print "python seq2meme.py motif_fasta outputfile motifName psuedocont background" - sys.exit(1) - else: - motifFile=sys.argv[1] # - outFile=sys.argv[2] - motifName=sys.argv[3] - psuedocont = float(sys.argv[4]) - background=' '.join(sys.argv[5].strip().split(',')) - - motifs=readMotifFile(motifFile) - - #Create weight matrix - motifWeightMatrix=createWeightMatrix(motifs,psuedocont) - printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background) -run() - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/seq2meme.xml --- a/mytools/seq2meme.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ - - from fasta file - seq2meme.py $input $output $motifName $psuedocont $background - - - - - - - - - - - -**Description** - -Generate a MEME motif format file from a set of sequences. Input could be raw sequences (one sequence per line) or fasta format (one identifier line and one sequence line). - - - - diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/seqshuffle.py --- a/mytools/seqshuffle.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -import sys - -from altschulEriksonDinuclShuffle import * - -def readFastaFile(infile,outfile): - - fin = open(infile) - out = open(outfile,'w') - - currSeq='' - currSeqname=None - for line in fin: - if '>' in line: - if currSeqname !=None: - out.write(currSeqname+dinuclShuffle(currSeq)+'\n') - currSeqname=None - currSeq='' - currSeqname=line - else: - currSeq=currSeq+line.strip().upper() - - if currSeqname!=None: - out.write(currSeqname+dinuclShuffle(currSeq)+'\n') - - fin.close() - out.close() - -def readrawseq(infile,outfile): - ''' - each line is a sequence - ''' - fin = open(infile) - out = open(outfile,'w') - for line in fin: - out.write(dinuclShuffle(line.strip().upper())+'\n') - fin.close() - out.close() - -def main(): - seqfile = sys.argv[1] - outfile = sys.argv[2] - fasta = sys.argv[3] - - if fasta == 'fasta': - readFastaFile(seqfile,outfile) - else: - readrawseq(seqfile,outfile) - -main() diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/sequence.py --- a/mytools/sequence.py Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,720 +0,0 @@ -#!@WHICHPYTHON@ - -import copy, string, sys - -#------------------ Alphabet ------------------- - -class Alphabet(object): - """Biological alphabet class. - This defines the set of symbols from which various objects can be built, e.g. sequences and motifs. - The symbol set is immutable and accessed as a tuple. - symstr: symbols in alphabet as either a tuple or string - complement: dictionary defining letters and their complement - """ - def __init__(self, symstr, complement = None): - """Construct an alphabet from a string or tuple of characters. - Lower case characters will be converted to upper case. - An optional mapping for complements may be provided. - Example: - >>> alpha = sequence.Alphabet('ACGTttga', {'A':'C', 'G':'T'}) - >>> alpha.getSymbols() - will construct the DNA alphabet and output: - ('A', 'C', 'G', 'T') - """ - symlst = [] - for s in [str(sym).upper()[0] for sym in symstr]: - if not s in symlst: - symlst.append(s) - self.symbols = tuple(symlst) - if complement != None: - # expand the mapping and check for contradictions - cmap = {} - for s in self.symbols: - c = complement.get(s, None) - if c != None: - if s in cmap and cmap[s] != c: - raise RuntimeError("Alphabet complement map " - "contains contradictory mapping") - cmap[s] = c - cmap[c] = s - # replace mapping with indicies - cimap = {} - for idx in range (len(self.symbols)): - s = self.symbols[idx] - if s in cmap: - cimap[cmap[s]] = idx - # create tuple - cidxlst = [] - for idx in range (len(self.symbols)): - cidxlst.append(cimap.get(self.symbols[idx], None)) - self.complements = tuple(cidxlst) - else: - self.complements = None - - def getSymbols(self): - """Retrieve a tuple with all symbols, immutable membership and order""" - return self.symbols - - def getComplements(self): - """Retrieve a tuple with all complement indicies, immutable""" - return self.complements - - def isValidSymbol(self, sym): - """Check if the symbol is a member of alphabet""" - return any([s==sym for s in self.symbols]) - - def getIndex(self, sym): - """Retrieve the index of the symbol (immutable)""" - for idx in range (len(self.symbols)): - if self.symbols[idx] == sym: - return idx - raise RuntimeError("Symbol " + sym + " does not exist in alphabet") - - def isComplementable(self): - return self.complements != None - - def getComplement(self, sym): - """Retrieve the complement of the symbol (immutable)""" - return self.symbols[self.complements[self.getIndex(sym)]]; - - def isValidString(self, symstr): - """Check if the string contains only symbols that belong to the alphabet""" - found = True - for sym in symstr: - if self.isValidSymbol(sym) == False: - return False - return True - - def getLen(self): - """Retrieve the number of symbols in (the length of) the alphabet""" - return len(self.symbols) - -# pre-defined alphabets that can be specified by their name -predefAlphabets = [ - ("DNA" , Alphabet('ACGT', {'A':'T', 'G':'C'})), - ("RNA" , Alphabet('ACGU')), - ("Extended DNA" , Alphabet('ACGTYRN')), - ("Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWY')), - ("Extended Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWYX')), - ("TM Labels" , Alphabet('MIO')) -] - -def getAlphabet(name): - """Retrieve a pre-defined alphabet by name. - Currently, "Protein", "DNA", "RNA", "Extended DNA", "Extended Protein" and "TM Labels" are available. - Example: - >>> alpha = sequence.getAlphabet('Protein') - >>> alpha.getSymbols() - will retrieve the 20 amino acid alphabet and output the tuple: - ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') - """ - for (xname, xalpha) in predefAlphabets: - if xname == name: - return xalpha - return None - -#------------------ Sequence ------------------- - -class Sequence(object): - """Biological sequence class. Sequence data is immutable. - - data: the sequence data as a tuple or string - alpha: the alphabet from which symbols are taken - name: the sequence name, if any - info: can contain additional sequence information apart from the name - """ - def __init__(self, sequence, alpha = None, name = "", seqinfo = ""): - """Create a sequence with sequence data. - Specifying the alphabet is optional, so is the name and info. - Example: - >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF') - will create a sequence with name "", and assign one of the predefined alphabets on basis of what symbols were used. - >>> myseq.getAlphabet().getSymbols() - will most likely output the standard protein alphabet: - ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') - """ - if type(sequence) is str: - self.data = tuple(sequence.upper()) - elif type(sequence) is tuple: - self.data = sequence - elif type(sequence) is list: - self.data = tuple([s.upper() for s in sequence]) - else: - raise RuntimeError("Sequence data is not specified correctly: must be string or tuple") - # Resolve choice of alphabet - validAlphabet = False - if alpha == None: # Alphabet is not set, attempt to set it automatically... - for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, in order - if xalpha.isValidString( self.data ): # This alphabet works, go with it - self.alpha = alpha = xalpha - validAlphabet = True - break - self.name = name - self.info = seqinfo - if validAlphabet == False: # we were either unsuccessful above or the alphabet was specified so test it - if type(alpha) is str: # check if name is a predefined alphabet - for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, check for name - if (xname == alpha): - alpha = xalpha - break - if type(alpha) is Alphabet: # the alphabet is specified - if alpha.isValidString(self.data) == False: - raise RuntimeError("Invalid alphabet specified: "+"".join(alpha.getSymbols())+" is not compatible with sequence '"+"".join(self.data)+"'") - else: - self.alpha = alpha - else: - raise RuntimeError("Could not identify alphabet from sequence") - - #basic getters and setters for the class - def getName(self): - """Get the name of the sequence""" - return self.name - def getInfo(self): - """Get additional info of the sequence (e.g. from the defline in a FASTA file)""" - return self.info - def getAlphabet(self): - """Retrieve the alphabet that is assigned to this sequence""" - return self.alpha - def setName(self, name): - """Change the name of the sequence""" - self.name = name - def setAlphabet(self, alpha): - """Set the alphabet, throws an exception if it is not compatible with the sequence data""" - if type(alpha) is Alphabet: - if alpha.isValid( sequence ) == False: - raise RuntimeError( "Invalid alphabet specified" ) - #sequence functions - def getSequence(self): - """Retrieve the sequence data (a tuple of symbols)""" - return self.data - def getString(self): - """Retrieve the sequence data as a string (copy of actual data)""" - return "".join(self.data) - def getLen(self): - """Get the length of the sequence (number of symbols)""" - return len(self.data) - def getSite(self, position, length = 1): - """Retrieve a site in the sequence of desired length. - Note that positions go from 0 to length-1, and that if the requested site - extends beyond those the method throws an exception. - """ - if position >= 0 and position <= self.getLen() - length: - if length == 1: - return self.data[position] - else: - return self.data[position:position+length] - else: - raise RuntimeError( "Attempt to access invalid position in sequence "+self.getName() ) - - def nice(self): - """ A short description of the sequence """ - print self.getName(), ":", self.getLen() - -def readStrings(filename): - """ Read one or more lines of text from a file--for example an alignment. - Return as a list of strings. - filename: name of file - """ - txtlist = [] - f = open(filename) - for line in f.readlines(): - txtlist.extend(line.split()) - return txtlist - -def readFASTA(filename, alpha = None): - """ Read one or more sequences from a file in FASTA format. - filename: name of file to load sequences from - alpha: alphabet that is used (if left unspecified, an attempt is made to identify the alphabet for each individual sequence) - """ - seqlist = [] - seqname = None - seqinfo = None - seqdata = [] - fh = open(filename) - thisline = fh.readline() - while (thisline): - if (thisline[0] == '>'): # new sequence - if (seqname): # take care of the data that is already in the buffer before processing the new sequence - try: - seqnew = Sequence(seqdata, alpha, seqname, seqinfo) - seqlist.append(seqnew) - except RuntimeError, e: - print >> sys.stderr, "Warning: "+seqname+" is invalid (ignored): ", e - seqinfo = thisline[1:-1] # everything on the defline is "info" - seqname = seqinfo.split()[0] # up to first space - seqdata = [] - else: # pull out the sequence data - cleanline = thisline.split() - for line in cleanline: - seqdata.extend(tuple(line.strip('*'))) # sometimes a line ends with an asterisk in FASTA files - thisline = fh.readline() - - if (seqname): - try: - seqnew = Sequence(seqdata, alpha, seqname, seqinfo) - seqlist.append(seqnew) - except RuntimeError, e: - print >> sys.stderr, "Warning: " + seqname + " is invalid (ignored): ", e - else: - raise RuntimeError("No sequences on FASTA format found in this file") - fh.close() - return seqlist - -def _writeOneFASTA(sequence, filehandle): - """Write one sequence in FASTA format to an already open file""" - filehandle.write(">" + sequence.getName()+"\n") - data = sequence.getSequence() - lines = ( sequence.getLen() - 1) / 60 + 1 - for i in range(lines): - #note: python lets us get the last line (var length) free - #lineofseq = data[i*60 : (i+1)*60] + "\n" - lineofseq = "".join(data[i*60 : (i+1)*60]) + "\n" - filehandle.write(lineofseq) - -def writeFASTA(sequence, filename): - """Write a list (or a single) of sequences to a file in the FASTA format""" - fh = open(filename, "w") - if isinstance(sequence, Sequence): - _writeOneFASTA(sequence, fh) - else: - for seq in sequence: - if isinstance(seq, Sequence): - _writeOneFASTA(seq, fh) - else: - print >> sys.stderr, "Warning: could not write " + seq.getName() + " (ignored)." - fh.flush() - fh.close() - -#------------------ Distrib ------------------- - -class Distrib(object): - """Class for storing a multinomial probability distribution over the symbols in an alphabet""" - def __init__(self, alpha, pseudo_count = 0.0): - self.alpha = alpha - self.tot = pseudo_count * self.alpha.getLen() - self.cnt = [pseudo_count for _ in range( self.alpha.getLen() )] - - def __deepcopy__(self, memo): - dup = Distrib(self.alpha) - dup.tot = copy.deepcopy(self.tot, memo) - dup.cnt = copy.deepcopy(self.cnt, memo) - return dup - - def count(self, syms = None ): - """Count an observation of a symbol""" - if syms == None: - syms = self.alpha.getSymbols() - for sym in syms: - idx = self.alpha.getIndex( sym ) - self.cnt[idx] += 1.0 - self.tot += 1 - - def complement(self): - """Complement the counts, throw an error if this is impossible""" - if not self.alpha.isComplementable(): - raise RuntimeError("Attempt to complement a Distrib " - "based on a non-complementable alphabet.") - coms = self.alpha.getComplements() - new_count = [] - for idx in range(len(coms)): - cidx = coms[idx] - if cidx == None: - cidx = idx - new_count.append(self.cnt[cidx]) - self.cnt = new_count - return self - - def reset(self): - """Reset the distribution, that is, restart counting.""" - self.tot = 0 - self.cnt = [0.0 for _ in range( self.alpha.getLen() )] - - def getFreq(self, sym = None): - """Determine the probability distribution from the current counts. - The order in which probabilities are given follow the order of the symbols in the alphabet.""" - if self.tot > 0: - if sym == None: - freq = tuple([ y / self.tot for y in self.cnt ]) - return freq - else: - idx = self.alpha.getIndex( sym ) - return self.cnt[idx] / self.tot - return None - - def pretty(self): - """Retrieve the probabilites for all symbols and return as a pretty table (a list of text strings)""" - table = ["".join(["%4s " % s for s in self.alpha.getSymbols()])] - table.append("".join(["%3.2f " % y for y in Distrib.getFreq(self)])) - return table - - def getSymbols(self): - """Get the symbols in the alphabet in the same order as probabilities are given.""" - return self.alpha.getSymbols() - - def getAlphabet(self): - """Get the alphabet over which the distribution is defined.""" - return self.alpha - -#------------------ Motif (and subclasses) ------------------- - -class Motif(object): - """ Sequence motif class--defining a pattern that can be searched in sequences. - This class is not intended for direct use. Instead use and develop sub-classes (see below). - """ - def __init__(self, alpha): - self.len = 0 - self.alpha = alpha - - def getLen(self): - """Get the length of the motif""" - return self.len - - def getAlphabet(self): - """Get the alphabet that is used in the motif""" - return self.alpha - - def isAlphabet(self, seqstr): - """Check if the sequence can be processed by this motif""" - mystr = seqstr - if type(seqstr) is Sequence: - mystr = seqstr.getString() - return self.getAlphabet().isValidString(mystr) - -import re - -class RegExp(Motif): - """A motif class that defines the pattern in terms of a regular expression""" - def __init__(self, alpha, re_string): - Motif.__init__(self, alpha) - self.pattern = re.compile(re_string) - - def match(self, seq): - """Find matches to the motif in a specified sequence. - The method is a generator, hence subsequent hits can be retrieved using next(). - The returned result is a tuple (position, match-sequence, score), where score is - always 1.0 since a regular expression is either true or false (not returned). - """ - myseq = seq - if not type(seq) is Sequence: - myseq = Sequence(seq, self.alpha) - mystr = myseq.getString() - if not Motif.isAlphabet(self, mystr): - raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName()) - for m in re.finditer(self.pattern, mystr): - yield (m.start(), m.group(), 1.0) - -import math, time - -# Variables used by the PWM for creating an EPS file -_colour_def = ( - "/black [0 0 0] def\n" - "/red [0.8 0 0] def\n" - "/green [0 0.5 0] def\n" - "/blue [0 0 0.8] def\n" - "/yellow [1 1 0] def\n" - "/purple [0.8 0 0.8] def\n" - "/magenta [1.0 0 1.0] def\n" - "/cyan [0 1.0 1.0] def\n" - "/pink [1.0 0.8 0.8] def\n" - "/turquoise [0.2 0.9 0.8] def\n" - "/orange [1 0.7 0] def\n" - "/lightred [0.8 0.56 0.56] def\n" - "/lightgreen [0.35 0.5 0.35] def\n" - "/lightblue [0.56 0.56 0.8] def\n" - "/lightyellow [1 1 0.71] def\n" - "/lightpurple [0.8 0.56 0.8] def\n" - "/lightmagenta [1.0 0.7 1.0] def\n" - "/lightcyan [0.7 1.0 1.0] def\n" - "/lightpink [1.0 0.9 0.9] def\n" - "/lightturquoise [0.81 0.9 0.89] def\n" - "/lightorange [1 0.91 0.7] def\n") -_colour_dict = ( - "/fullColourDict <<\n" - " (G) orange\n" - " (T) green\n" - " (C) blue\n" - " (A) red\n" - " (U) green\n" - ">> def\n" - "/mutedColourDict <<\n" - " (G) lightorange\n" - " (T) lightgreen\n" - " (C) lightblue\n" - " (A) lightred\n" - " (U) lightgreen\n" - ">> def\n" - "/colorDict fullColourDict def\n") - -_eps_defaults = { - 'LOGOTYPE': 'NA', - 'FONTSIZE': '12', - 'TITLEFONTSIZE': '12', - 'SMALLFONTSIZE': '6', - 'TOPMARGIN': '0.9', - 'BOTTOMMARGIN': '0.9', - 'YAXIS': 'true', - 'YAXISLABEL': 'bits', - 'XAXISLABEL': '', - 'TITLE': '', - 'ERRORBARFRACTION': '1.0', - 'SHOWINGBOX': 'false', - 'BARBITS': '2.0', - 'TICBITS': '1', - 'COLORDEF': _colour_def, - 'COLORDICT': _colour_dict, - 'SHOWENDS': 'false', - 'NUMBERING': 'true', - 'OUTLINE': 'false', -} -class PWM(Motif): - """This motif subclass defines a pattern in terms of a position weight matrix. - An alphabet must be provided. A pseudo-count to be added to each count is - optional. A uniform background distribution is used by default. - """ - def __init__(self, alpha): - Motif.__init__(self, alpha) # set alphabet of this multinomial distribution - self.background = Distrib(alpha) # the default background ... - self.background.count(alpha.getSymbols()) # ... is uniform - self.nsites = 0 - - def setFromAlignment(self, aligned, pseudo_count = 0.0): - """Set the probabilities in the PWM from an alignment. - The alignment is a list of equal-length strings (see readStrings), OR - a list of Sequence. - """ - self.cols = -1 - self.nsites = len(aligned) - seqs = [] - # Below we create a list of Sequence from the alignment, - # while doing some error checking, and figure out the number of columns - for s in aligned: - # probably a text string, so we make a nameless sequence from it - if not type(s) is Sequence: - s=Sequence(s, Motif.getAlphabet(self)) - else: - # it was a sequence, so we check that the alphabet in - # this motif will be able to process it - if not Motif.isAlphabet(self, s): - raise RuntimeError("Motif alphabet is not valid for sequence " + s.getName()) - if self.cols == -1: - self.cols = s.getLen() - elif self.cols != s.getLen(): - raise RuntimeError("Sequences in alignment are not of equal length") - seqs.append(s) - # The line below initializes the list of Distrib (one for each column of the alignment) - self.counts = [Distrib(Motif.getAlphabet(self), pseudo_count) for _ in range(self.cols)] - # Next, we do the counting, column by column - for c in range( self.cols ): # iterate through columns - for s in seqs: # iterate through rows - # determine the index of the symbol we find at this position (row, column c) - self.counts[c].count(s.getSite(c)) - # Update the length - self.len = self.cols - - def reverseComplement(self): - """Reverse complement the PWM""" - i = 0 - j = len(self.counts)-1 - while (i < j): - temp = self.counts[i]; - self.counts[i] = self.counts[j] - self.counts[j] = temp - self.counts[i].complement() - self.counts[j].complement() - i += 1; - j -= 1; - if i == j: - self.counts[i].complement() - return self - - def getNSites(self): - """Get the number of sites that made the PWM""" - return self.nsites - - def setBackground(self, distrib): - """Set the background distribution""" - if not distrib.getAlphabet() == Motif.getAlphabet(self): - raise RuntimeError("Incompatible alphabets") - self.background = distrib - - def getFreq(self, col = None, sym = None): - """Get the probabilities for all positions in the PWM (a list of Distribs)""" - if (col == None): - return [y.getFreq() for y in self.counts] - else: - return self.counts[col].getFreq(sym) - - def pretty(self): - """Retrieve the probabilites for all positions in the PWM as a pretty table (a list of text strings)""" - #table = ["".join(["%8s " % s for s in self.alpha.getSymbols()])] - table = [] - for row in PWM.getFreq(self): - table.append("".join(["%8.6f " % y for y in row])) - return table - - def logoddsPretty(self, bkg): - """Retrieve the (base-2) log-odds for all positions in the PWM as a pretty table (a list of text strings)""" - table = [] - for row in PWM.getFreq(self): - #table.append("".join(["%8.6f " % (math.log((row[i]+1e-6)/bkg[i])/math.log(2)) for i in range(len(row))])) - table.append("".join(["%8.6f " % (math.log((row[i])/bkg[i])/math.log(2)) for i in range(len(row))])) - #table.append("".join(["%8.6f " % row[i] for i in range(len(row))])) - return table - - - def consensus_sequence(self): - """ - Get the consensus sequence corresponding to a PWM. - Consensus sequence is the letter in each column - with the highest probability. - """ - consensus = "" - alphabet = Motif.getAlphabet(self).getSymbols() - for pos in range(self.cols): - best_letter = alphabet[0] - best_p = self.counts[pos].getFreq(best_letter) - for letter in alphabet[1:]: - p = self.counts[pos].getFreq(letter) - if p > best_p: - best_p = p - best_letter = letter - consensus += best_letter - return consensus - - - def consensus(self): - """ - Get the consensus corresponding to a PWM. - Consensus at each column of motif is a list of - characters with non-zero probabilities. - """ - consensus = [] - for pos in range(self.cols): - matches = [] - for letter in Motif.getAlphabet(self).getSymbols(): - p = self.counts[pos].getFreq(letter) - if p > 0: - matches += letter - consensus.append(matches) - return consensus - - - def getScore(self, seq, start): - """Score this particular list of symbols using the PFM (background needs to be set separately)""" - sum = 0.0 - seqdata = seq.getSequence()[start : start+self.cols] - for pos in range(len(seqdata)): - q = self.counts[pos].getFreq(seqdata[pos]) - if q == 0: - q = 0.0001 # to avoid log(0) == -Infinity - logodds = math.log(q / self.background.getFreq(seqdata[pos])) - sum += logodds - return sum - - def match(self, seq, _LOG0 = -10): - """Find matches to the motif in a specified sequence. - The method is a generator, hence subsequent hits can be retrieved using next(). - The returned result is a tuple (position, match-sequence, score). - The optional parameter _LOG0 specifies a lower bound on reported logodds scores. - """ - myseq = seq - if not type(seq) is Sequence: - myseq = Sequence(seq, self.alpha) - if not Motif.isAlphabet(self, myseq): - raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName()) - for pos in range(myseq.getLen() - self.cols): - score = PWM.getScore(self, myseq, pos) - if score > _LOG0: - yield (pos, "".join(myseq.getSite(pos, self.cols)), score) - - def writeEPS(self, program, template_file, eps_fh, - timestamp = time.localtime()): - """Write out a DNA motif to EPS format.""" - small_dfmt = "%d.%m.%Y %H:%M" - full_dfmt = "%d.%m.%Y %H:%M:%S %Z" - small_date = time.strftime(small_dfmt, timestamp) - full_date = time.strftime(full_dfmt, timestamp) - points_per_cm = 72.0 / 2.54 - height = 4.5 - width = self.getLen() * 0.8 + 2 - width = min(30, width) - points_height = int(height * points_per_cm) - points_width = int(width * points_per_cm) - defaults = _eps_defaults.copy() - defaults['CREATOR'] = program - defaults['CREATIONDATE'] = full_date - defaults['LOGOHEIGHT'] = str(height) - defaults['LOGOWIDTH'] = str(width) - defaults['FINEPRINT'] = program + ' ' + small_date - defaults['CHARSPERLINE'] = str(self.getLen()) - defaults['BOUNDINGHEIGHT'] = str(points_height) - defaults['BOUNDINGWIDTH'] = str(points_width) - defaults['LOGOLINEHEIGHT'] = str(height) - with open(template_file, 'r') as template_fh: - m_var = re.compile("\{\$([A-Z]+)\}") - for line in template_fh: - last = 0 - match = m_var.search(line) - while (match): - if (last < match.start()): - prev = line[last:match.start()] - eps_fh.write(prev) - key = match.group(1) - if (key == "DATA"): - eps_fh.write("\nStartLine\n") - for pos in range(self.getLen()): - eps_fh.write("({0:d}) startstack\n".format(pos+1)) - stack = [] - # calculate the stack information content - alpha_ic = 2 - h = 0 - for sym in self.getAlphabet().getSymbols(): - freq = self.getFreq(pos, sym) - if (freq == 0): - continue - h -= (freq * math.log(freq, 2)) - stack_ic = alpha_ic - h - # calculate the heights of each symbol - for sym in self.getAlphabet().getSymbols(): - freq = self.getFreq(pos, sym) - if (freq == 0): - continue - stack.append((freq * stack_ic, sym)) - stack.sort(); - # output the symbols - for symh, sym in stack: - eps_fh.write(" {0:f} ({1:s}) numchar\n".format( - symh, sym)) - eps_fh.write("endstack\n\n") - eps_fh.write("EndLine\n") - elif (key in defaults): - eps_fh.write(defaults[key]) - else: - raise RuntimeError('Unknown variable "' + key + - '" in EPS template') - last = match.end(); - match = m_var.search(line, last) - if (last < len(line)): - eps_fh.write(line[last:]) - - -#------------------ Main method ------------------- -# Executed if you run this file from the operating system prompt, e.g. -# > python sequence.py - -if __name__=='__main__': - alpha = getAlphabet('Extended DNA') - #seqs = readFASTA('pos.fasta') - seqs = [] - aln = readStrings('tmp0') - #regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].') - pwm = PWM(alpha) - pwm.setFromAlignment(aln) - for row in pwm.pretty(): - print row - for s in seqs: - print s.getName(), s.getLen(), s.getAlphabet().getSymbols() - for m in regexp.match( s ): - print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]) - for m in pwm.match( s ): - print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]) diff -r 5a93fe53194d -r d1f0f85ee5bc mytools/splicesite.xml --- a/mytools/splicesite.xml Fri Mar 16 16:11:19 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ - - using max entropy model - $script $input > $out_file1 - - - - - - - - - - - - -**What it does** - -This tool computes splice site scores using a max entropy model. See more details here: - -http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html - ------ - -**Example input for 5' splice site sequence** - -3 exonic and 6 intronic nucleotides flanking the junction:: - - CTGGTGAGT - AAGGTACAG - -or fasta format:: - - >seq1 - CTGGTGAGT - >seq2 - AAGGTACAG - -Output:: - - CTGGTGAGT 10.10 - AAGGTACAG 8.04 - -or fasta format:: - - >seq1 CTGGTGAGT 10.10 - >seq2 AAGGTACAG 8.04 - - ------ - -**Example input for 3' splice site sequence** - -3 exonic and 20 intronic nucleotides flanking the junction:: - - CCTGCATCCTCTGTTCCCAGGTG - TTTCTTCCCTCCGGGAACAGTGG - -Output:: - - CCTGCATCCTCTGTTCCCAGGTG 10.47 - TTTCTTCCCTCCGGGAACAGTGG 6.22 - - -