Previous changeset 6:1aea7c1a9ab1 (2012-08-20) |
Commit message:
Uploaded revised kmer-SVM to include modules from kmer-visual. |
modified:
kmersvm/README.txt kmersvm/install.sh kmersvm/nullseq.xml kmersvm/scripts/kmersvm_train.py kmersvm/scripts/nullseq_generate.py kmersvm/train.xml |
added:
kmersvm/kmer2meme.pl kmersvm/kmertopwm.xml kmersvm/scripts/kmersvm_output_weights.out kmersvm/scripts/kmersvm_train_kfb_copy.py kmersvm/scripts/libkmersvm.pyc kmersvm/tomtom.xml |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/README.txt --- a/kmersvm/README.txt Mon Aug 20 21:42:29 2012 -0400 +++ b/kmersvm/README.txt Sun Jun 16 18:06:14 2013 -0400 |
b |
@@ -68,6 +68,8 @@ <tool file="kmersvm/train.xml"/> <tool file="kmersvm/split_genome.xml"/> <tool file="kmersvm/seqprofile.xml" /> + <tool file="kmersvm/kmertopwm.xml" /> + <tool file="kmersvm/tomtom.xml" /> </section> Tool Tests: |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/install.sh --- a/kmersvm/install.sh Mon Aug 20 21:42:29 2012 -0400 +++ b/kmersvm/install.sh Sun Jun 16 18:06:14 2013 -0400 |
b |
@@ -1,12 +1,11 @@ #!/bin/bash -cd "$1" -cp tool-data/nullseq_indices.loc.sample ../../tool-data/nullseq_indices.loc -cp tool-data/sample_roc_chen.png ../../tool-data -cp tool-data/classify_output.out ../../test-data -cp tool-data/classify_test.fa ../../test-data -cp tool-data/kmersvm_output_weights.out ../../test-data -cp tool-data/test_positive.fa ../../test-data -cp tool-data/test_negative.fa ../../test-data -cp tool-data/test_weights.out ../../test-data -cp tool-data/train_predictions.out ../../test-data +cp tool-data/nullseq_indices.loc.sample ~/galaxy-dist/tool-data/nullseq_indices.loc +cp tool-data/sample_roc_chen.png ~/galaxy-dist/tool-data +cp tool-data/classify_output.out ~/galaxy-dist/test-data +cp tool-data/classify_test.fa ~/galaxy-dist/test-data +cp tool-data/kmersvm_output_weights.out ~/galaxy-dist/test-data +cp tool-data/test_positive.fa ~/galaxy-dist/test-data +cp tool-data/test_negative.fa ~/galaxy-dist/test-data +cp tool-data/test_weights.out ~/galaxy-dist/test-data +cp tool-data/train_predictions.out ~/galaxy-dist/test-data |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/kmer2meme.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmersvm/kmer2meme.pl Sun Jun 16 18:06:14 2013 -0400 |
[ |
@@ -0,0 +1,49 @@ +use strict; + +open(my $w_fh, "<", $ARGV[0]); +my $num_kmers = $ARGV[1]; +my @weights = <$w_fh>; + +my @temp_k = @weights[8..(7+$num_kmers), (-$num_kmers..-1)]; + +my @kmers = (); +#cleanup kmers +for my $i (0..($#temp_k)){ + my @temp = split('\t',$temp_k[$i]); + #modified by dongwon 042713 + #push(@kmers, ($temp[0], $temp[1])); + push(@kmers, $temp[0]); +} + +open(my $o_fh, ">", "kmer2meme.meme"); + +print $o_fh +"MEME version 4 + +ALPHABET= ACGT + +strands: + - + +Background letter frequencies (from no specific genome): +A 0.25 C 0.25 G 0.25 T 0.25\n\n"; + +foreach my $kmer (@kmers) { + print $o_fh "MOTIF $kmer\n"; + my $l = length($kmer); + print $o_fh "letter-probability matrix: alength= 4 w= $l nsites= 1 E= 0\n"; + foreach my $i (0..($l-1)) { + my $nc = substr($kmer, $i, 1); + if ($nc eq "A") { + print $o_fh " 1.00 0.00 0.00 0.00\n"; + }elsif ($nc eq "C") { + print $o_fh " 0.00 1.00 0.00 0.00\n"; + }elsif ($nc eq "G") { + print $o_fh " 0.00 0.00 1.00 0.00\n"; + }elsif ($nc eq "T") { + print $o_fh " 0.00 0.00 0.00 1.00\n"; + }else { + print " 0.25 0.25 0.25 0.25\n"; + } + } + print $o_fh "\n"; +} |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/kmertopwm.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmersvm/kmertopwm.xml Sun Jun 16 18:06:14 2013 -0400 |
b |
@@ -0,0 +1,25 @@ +<tool id="kmer2meme" name="Kmer To MEME"> + <description>Convert kmers to MEME format for motif finding by Tomtom</description> + <command interpreter="perl">kmer2meme.pl + $weights $N + </command> + + <inputs> + <param format="tabular" name="weights" type="data" label="Kmer Weights"/> + <param type="integer" name="N" value="10" label="Kmer Number"> + <validator type="in_range" message="Kmer number must be in range 1 - 50" min="1" max="50"/> + </param> + </inputs> + + <outputs> + <data format="txt" from_work_dir="kmer2meme.meme" name="MEME for Kmers" label="${tool.name} on ${on_string}: MEME"/> + </outputs> + + <help> +This is a utility function that creates PWMs in MEME format for use with Tomtom. + +'Kmer Weights' is the weight file generated by 'Train SVM'. + +'Kmer Number' is the number of most positive and most negative kmers to be processed. + </help> +</tool> |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/nullseq.xml --- a/kmersvm/nullseq.xml Mon Aug 20 21:42:29 2012 -0400 +++ b/kmersvm/nullseq.xml Sun Jun 16 18:06:14 2013 -0400 |
b |
@@ -7,9 +7,9 @@ -x $fold -r $rseed -g $gc_err -t $rpt_err $input $dbkey ${indices_path.fields.path} </command> <inputs> - <param name="fold" type="integer" value="1" label="# of Fold-Increase" /> - <param name="gc_err" type="float" value="0.02" label="Allowable GC Error" /> - <param name="rpt_err" type="float" value="0.02" label="Allowable Repeat Error" /> + <param name="fold" type="integer" value="10" label="# of Fold-Increase" min="1" max="50" /> + <param name="gc_err" type="float" value="0.02" label="Allowable GC Error" min="0.01" max="0.1"/> + <param name="rpt_err" type="float" value="0.02" label="Allowable Repeat Error" min="0.01" max="0.1"/> <param name="rseed" type="integer" value="1" label="Random Number Seed" /> <param format="interval" name="input" type="data" label="BED File of Positive Regions" /> <validator type="unspecified_build" /> @@ -44,6 +44,16 @@ **What it does** Takes an input BED file and generates a set of sequences for use as negative data (null sequences) in Train SVM similar in length, GC content and repeat fraction. Uses random sampling for efficiency. + +---- + +**Recommended Settings** + +Fold-Increase: Default is recommended, up to 50x positive set. + +GC Error, Repeat Error: Default is recommended. + +---- **Parameters** |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/kmersvm_output_weights.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmersvm/scripts/kmersvm_output_weights.out Sun Jun 16 18:06:14 2013 -0400 |
b |
b'@@ -0,0 +1,2088 @@\n+#parameters:\n+#kernel=1\n+#kmerlen=6\n+#bias=0.930368454935\n+#A=0\n+#B=0\n+#NOTE: k-mers with large negative weights are also important. They can be found at the bottom of the list.\n+#k-mer\trevcomp\tSVM-weight\n+AAAAAA\tTTTTTT\t0.553324469582\n+AAAAAC\tGTTTTT\t1.0689111563\n+AAAAAG\tCTTTTT\t0.386997519222\n+AAAAAT\tATTTTT\t0.371506923691\n+AAAACA\tTGTTTT\t0.582941243013\n+AAAACC\tGGTTTT\t-0.00322550380692\n+AAAACG\tCGTTTT\t0.115121834279\n+AAAACT\tAGTTTT\t0.64234562623\n+AAAAGA\tTCTTTT\t0.180098364822\n+AAAAGC\tGCTTTT\t-0.370020965708\n+AAAAGG\tCCTTTT\t-0.148530185678\n+AAAAGT\tACTTTT\t1.19477154105\n+AAAATA\tTATTTT\t1.23378644064\n+AAAATC\tGATTTT\t-0.980691936551\n+AAAATG\tCATTTT\t0.221932570601\n+AAAATT\tAATTTT\t0.449293989111\n+AAACAA\tTTGTTT\t-1.57507857322\n+AAACAC\tGTGTTT\t-2.1383477652\n+AAACAG\tCTGTTT\t-0.720402198466\n+AAACAT\tATGTTT\t-0.915754056705\n+AAACCA\tTGGTTT\t0.959609519802\n+AAACCC\tGGGTTT\t0.150812627734\n+AAACCG\tCGGTTT\t-0.204853254781\n+AAACCT\tAGGTTT\t0.486872195933\n+AAACGA\tTCGTTT\t-0.404172254228\n+AAACGC\tGCGTTT\t0.471891306908\n+AAACGG\tCCGTTT\t-0.732914484007\n+AAACGT\tACGTTT\t-0.79028442459\n+AAACTA\tTAGTTT\t-0.200848111441\n+AAACTC\tGAGTTT\t-0.00260431934722\n+AAACTG\tCAGTTT\t0.456381173353\n+AAACTT\tAAGTTT\t0.639062115506\n+AAAGAA\tTTCTTT\t0.257495713463\n+AAAGAC\tGTCTTT\t-0.228023730318\n+AAAGAG\tCTCTTT\t0.247579662852\n+AAAGAT\tATCTTT\t-0.304817901111\n+AAAGCA\tTGCTTT\t-0.155658358179\n+AAAGCC\tGGCTTT\t0.416290507318\n+AAAGCG\tCGCTTT\t-0.319122803172\n+AAAGCT\tAGCTTT\t-0.10365386651\n+AAAGGA\tTCCTTT\t0.465546368844\n+AAAGGC\tGCCTTT\t0.293788204177\n+AAAGGG\tCCCTTT\t-0.738483493496\n+AAAGGT\tACCTTT\t-1.46557110152\n+AAAGTA\tTACTTT\t-0.487013201424\n+AAAGTC\tGACTTT\t-0.815561145197\n+AAAGTG\tCACTTT\t0.523242409873\n+AAAGTT\tAACTTT\t1.49610361616\n+AAATAA\tTTATTT\t-0.50775903415\n+AAATAC\tGTATTT\t-0.925034113885\n+AAATAG\tCTATTT\t-1.3099174763\n+AAATAT\tATATTT\t-1.8047214372\n+AAATCA\tTGATTT\t-0.899342838259\n+AAATCC\tGGATTT\t-0.146519411262\n+AAATCG\tCGATTT\t-0.267007765303\n+AAATCT\tAGATTT\t0.291560176957\n+AAATGA\tTCATTT\t-0.514145682209\n+AAATGC\tGCATTT\t0.954279511728\n+AAATGG\tCCATTT\t-0.711449233898\n+AAATGT\tACATTT\t-0.752526282583\n+AAATTA\tTAATTT\t0.00593646027611\n+AAATTC\tGAATTT\t1.26182226428\n+AAATTG\tCAATTT\t-0.0953103902516\n+AAATTT\tAAATTT\t0.189613989631\n+AACAAA\tTTTGTT\t-1.37122264124\n+AACAAC\tGTTGTT\t0.00146931165158\n+AACAAG\tCTTGTT\t-0.803037783522\n+AACAAT\tATTGTT\t0.00385062783094\n+AACACA\tTGTGTT\t-0.363356864114\n+AACACC\tGGTGTT\t-0.779849447985\n+AACACG\tCGTGTT\t-0.97471290289\n+AACACT\tAGTGTT\t-0.335935444604\n+AACAGA\tTCTGTT\t-1.28171369495\n+AACAGC\tGCTGTT\t-0.411448258216\n+AACAGG\tCCTGTT\t-0.469780016788\n+AACAGT\tACTGTT\t-0.453227635948\n+AACATA\tTATGTT\t-0.945101613087\n+AACATC\tGATGTT\t-0.0283361724906\n+AACATG\tCATGTT\t-0.575985749697\n+AACATT\tAATGTT\t-0.0429091030472\n+AACCAA\tTTGGTT\t-0.0823228706445\n+AACCAC\tGTGGTT\t2.58639657949\n+AACCAG\tCTGGTT\t-0.276555339554\n+AACCAT\tATGGTT\t-0.11357479766\n+AACCCA\tTGGGTT\t0.192569792654\n+AACCCC\tGGGGTT\t-0.0425603516266\n+AACCCG\tCGGGTT\t-0.404973603501\n+AACCCT\tAGGGTT\t0.0764485451656\n+AACCGA\tTCGGTT\t-0.137853811078\n+AACCGC\tGCGGTT\t0.710876928983\n+AACCGG\tCCGGTT\t0.272143672682\n+AACCGT\tACGGTT\t-1.42589113548\n+AACCTA\tTAGGTT\t-0.611888789113\n+AACCTC\tGAGGTT\t0.837839227815\n+AACCTG\tCAGGTT\t-0.422972816872\n+AACCTT\tAAGGTT\t0.0794552714245\n+AACGAA\tTTCGTT\t0.662384258058\n+AACGAC\tGTCGTT\t-0.711145623237\n+AACGAG\tCTCGTT\t0.198654543303\n+AACGAT\tATCGTT\t-1.14468704666\n+AACGCA\tTGCGTT\t-0.143027192823\n+AACGCC\tGGCGTT\t-0.0833645930753\n+AACGCG\tCGCGTT\t0.0613946992336\n+AACGCT\tAGCGTT\t0.379426684798\n+AACGGA\tTCCGTT\t-0.902189680896\n+AACGGC\tGCCGTT\t0.725518300654\n+AACGGG\tCCCGTT\t0.487999502266\n+AACGGT\tACCGTT\t-0.323411669378\n+AACGTA\tTACGTT\t0.429654445762\n+AACGTC\tGACGTT\t-0.392752266586\n+AACGTG\tCACGTT\t-1.04792887194\n+AACGTT\tAACGTT\t0.616207780774\n+AACTAA\tTTAGTT\t-0.843322479317\n+AACTAC\tGTAGTT\t0.184493095017\n+AACTAG\tCTAGTT\t0.0179086348231\n+AACTAT\tATAGTT\t0.994586833037\n+AACTCA\tTGAGTT\t-0.12838936418\n+AACTCC\tGGAGTT\t0.726028047244\n+AACTCG\tCGAGTT\t0.205501965615\n+AACTCT\tAGAGTT\t0.78739364499\n+AACTGA\tTCAGTT\t0.168022862889\n+AACTGC\tGCAGTT\t0.216791948549\n+AACTGG\tCCAGTT\t-0.314557426071\n+AACTGT\tACAGTT\t-0.011128'..b'2496048\n+TAACCA\tTGGTTA\t1.25918224499\n+TAACGA\tTCGTTA\t-0.245420018049\n+TAACTA\tTAGTTA\t0.608128184593\n+TAAGAA\tTTCTTA\t1.02556160172\n+TAAGCA\tTGCTTA\t-0.00178390331913\n+TAAGGA\tTCCTTA\t0.837198657402\n+TAAGTA\tTACTTA\t0.0721724774768\n+TAATAA\tTTATTA\t-0.189941492074\n+TAATCA\tTGATTA\t0.004309724706\n+TAATGA\tTCATTA\t-0.927698594817\n+TAATTA\tTAATTA\t-1.12948086023\n+TACAAA\tTTTGTA\t-0.668160668233\n+TACACA\tTGTGTA\t-1.5497729664\n+TACAGA\tTCTGTA\t-0.00599134182996\n+TACATA\tTATGTA\t-1.12195560298\n+TACCAA\tTTGGTA\t-0.281401801063\n+TACCCA\tTGGGTA\t0.279140164003\n+TACCGA\tTCGGTA\t-0.674861229256\n+TACCTA\tTAGGTA\t0.232148610987\n+TACGAA\tTTCGTA\t-0.107764774043\n+TACGCA\tTGCGTA\t-0.519175400062\n+TACGGA\tTCCGTA\t-0.402472215133\n+TACGTA\tTACGTA\t0.541316396612\n+TACTAA\tTTAGTA\t-0.135239105598\n+TACTCA\tTGAGTA\t0.639564666607\n+TACTGA\tTCAGTA\t-0.482886609395\n+TAGAAA\tTTTCTA\t0.0445682798936\n+TAGACA\tTGTCTA\t-0.45261373791\n+TAGAGA\tTCTCTA\t-0.0954516438137\n+TAGATA\tTATCTA\t-0.532370750539\n+TAGCAA\tTTGCTA\t-0.652336647837\n+TAGCCA\tTGGCTA\t0.291364926411\n+TAGCGA\tTCGCTA\t0.149053684558\n+TAGCTA\tTAGCTA\t0.686821117845\n+TAGGAA\tTTCCTA\t0.597312086206\n+TAGGCA\tTGCCTA\t-0.0392374245184\n+TAGGGA\tTCCCTA\t-0.898776106474\n+TAGTAA\tTTACTA\t-0.0409191001809\n+TAGTCA\tTGACTA\t1.32002868308\n+TAGTGA\tTCACTA\t0.306784253534\n+TATAAA\tTTTATA\t-0.121630582814\n+TATACA\tTGTATA\t-0.557527238557\n+TATAGA\tTCTATA\t-0.29108877277\n+TATATA\tTATATA\t-0.239193588677\n+TATCAA\tTTGATA\t-1.21281286807\n+TATCCA\tTGGATA\t-0.143758324237\n+TATCGA\tTCGATA\t-0.316281900517\n+TATGAA\tTTCATA\t0.567777510974\n+TATGCA\tTGCATA\t0.382062812319\n+TATGGA\tTCCATA\t0.18823076363\n+TATTAA\tTTAATA\t0.370202791227\n+TATTCA\tTGAATA\t-0.0890453547066\n+TATTGA\tTCAATA\t-3.17710597328\n+TCAAAA\tTTTTGA\t0.303875411059\n+TCAACA\tTGTTGA\t-2.00254784618\n+TCAAGA\tTCTTGA\t1.4720641259\n+TCACAA\tTTGTGA\t0.295459860043\n+TCACCA\tTGGTGA\t0.210574562759\n+TCACGA\tTCGTGA\t-0.515739048241\n+TCAGAA\tTTCTGA\t-0.292759366807\n+TCAGCA\tTGCTGA\t-0.620868456293\n+TCAGGA\tTCCTGA\t0.0391644847042\n+TCATAA\tTTATGA\t0.46490835305\n+TCATCA\tTGATGA\t0.156189576126\n+TCATGA\tTCATGA\t-0.28992182543\n+TCCAAA\tTTTGGA\t0.372353469926\n+TCCACA\tTGTGGA\t-0.303324084643\n+TCCAGA\tTCTGGA\t1.11791545491\n+TCCCAA\tTTGGGA\t1.0120690282\n+TCCCCA\tTGGGGA\t0.475206609554\n+TCCCGA\tTCGGGA\t-0.218933163821\n+TCCGAA\tTTCGGA\t0.231675785391\n+TCCGCA\tTGCGGA\t-0.0565746658942\n+TCCGGA\tTCCGGA\t-0.534349085125\n+TCCTAA\tTTAGGA\t0.426713370402\n+TCCTCA\tTGAGGA\t-0.101009274429\n+TCGAAA\tTTTCGA\t0.385602827192\n+TCGACA\tTGTCGA\t0.567716035117\n+TCGAGA\tTCTCGA\t-0.0811772342038\n+TCGCAA\tTTGCGA\t0.471042033643\n+TCGCCA\tTGGCGA\t-0.433031956525\n+TCGCGA\tTCGCGA\t-0.10259765071\n+TCGGAA\tTTCCGA\t0.152092661722\n+TCGGCA\tTGCCGA\t-0.277526003969\n+TCGTAA\tTTACGA\t0.6872310151\n+TCGTCA\tTGACGA\t-0.633071677798\n+TCTAAA\tTTTAGA\t-0.119569291318\n+TCTACA\tTGTAGA\t0.531537891496\n+TCTAGA\tTCTAGA\t-0.582228401295\n+TCTCAA\tTTGAGA\t0.0181823636774\n+TCTCCA\tTGGAGA\t-0.0990443536722\n+TCTGAA\tTTCAGA\t0.818735079074\n+TCTGCA\tTGCAGA\t-0.708985771217\n+TCTTAA\tTTAAGA\t0.684573742868\n+TCTTCA\tTGAAGA\t0.231633308785\n+TGAAAA\tTTTTCA\t-0.590753014937\n+TGAACA\tTGTTCA\t0.143205522462\n+TGACAA\tTTGTCA\t-0.109523687459\n+TGACCA\tTGGTCA\t1.47045933497\n+TGAGAA\tTTCTCA\t-0.555364387516\n+TGAGCA\tTGCTCA\t-0.498941620454\n+TGATAA\tTTATCA\t-1.78189703088\n+TGATCA\tTGATCA\t-0.58090561671\n+TGCAAA\tTTTGCA\t-0.642834024972\n+TGCACA\tTGTGCA\t-1.10325139721\n+TGCCAA\tTTGGCA\t-0.349619012912\n+TGCCCA\tTGGGCA\t0.294732505972\n+TGCGAA\tTTCGCA\t-0.0604088395824\n+TGCGCA\tTGCGCA\t0.0208781532991\n+TGCTAA\tTTAGCA\t-0.902952269579\n+TGGAAA\tTTTCCA\t-1.65646848688\n+TGGACA\tTGTCCA\t0.212014771381\n+TGGCAA\tTTGCCA\t-0.299231706691\n+TGGCCA\tTGGCCA\t0.0133271697043\n+TGGGAA\tTTCCCA\t-0.0169051652559\n+TGGTAA\tTTACCA\t-0.741148671428\n+TGTAAA\tTTTACA\t-1.90833491575\n+TGTACA\tTGTACA\t-0.781209085217\n+TGTCAA\tTTGACA\t-1.54128600175\n+TGTGAA\tTTCACA\t-2.24458672601\n+TGTTAA\tTTAACA\t-1.12490394498\n+TTAAAA\tTTTTAA\t-0.0119631686901\n+TTACAA\tTTGTAA\t0.81347268468\n+TTAGAA\tTTCTAA\t0.271221556202\n+TTATAA\tTTATAA\t0.774386643995\n+TTCAAA\tTTTGAA\t0.809237031692\n+TTCCAA\tTTGGAA\t0.158724969294\n+TTCGAA\tTTCGAA\t-0.405674192258\n+TTGAAA\tTTTCAA\t-0.306551492839\n+TTGCAA\tTTGCAA\t0.851414898595\n+TTTAAA\tTTTAAA\t0.479575745295\n' |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/kmersvm_train.py --- a/kmersvm/scripts/kmersvm_train.py Mon Aug 20 21:42:29 2012 -0400 +++ b/kmersvm/scripts/kmersvm_train.py Sun Jun 16 18:06:14 2013 -0400 |
b |
@@ -754,7 +754,8 @@ sids = sids_pos + sids_neg if options.weight == 0: - options.weight = 1 + log(nneg/npos) + #DEBUGGED by dlee 02/17/13 + options.weight = 1 + log(nneg/float(npos)) if options.quiet == False: sys.stderr.write('SVM parameters:\n') |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/kmersvm_train_kfb_copy.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmersvm/scripts/kmersvm_train_kfb_copy.py Sun Jun 16 18:06:14 2013 -0400 |
[ |
b'@@ -0,0 +1,894 @@\n+#!/usr/bin/env python\n+"""\n+\tkmersvm_train.py; train a support vector machine using shogun toolbox\n+\tCopyright (C) 2011 Dongwon Lee\n+\n+\tThis program is free software: you can redistribute it and/or modify\n+\tit under the terms of the GNU General Public License as published by\n+\tthe Free Software Foundation, either version 3 of the License, or\n+\t(at your option) any later version.\n+\n+\tThis program is distributed in the hope that it will be useful,\n+\tbut WITHOUT ANY WARRANTY; without even the implied warranty of\n+\tMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+\tGNU General Public License for more details.\n+\n+\tYou should have received a copy of the GNU General Public License\n+\talong with this program. If not, see <http://www.gnu.org/licenses/>.\n+\n+\n+"""\n+\n+\n+\n+import sys\n+import optparse\n+import random\n+import numpy\n+from math import log, exp\n+\n+from libkmersvm import *\n+try:\n+\tfrom shogun.PreProc import SortWordString, SortUlongString\n+except ImportError:\n+\tfrom shogun.Preprocessor import SortWordString, SortUlongString\n+from shogun.Kernel import CommWordStringKernel, CommUlongStringKernel, \\\n+\t\tCombinedKernel\n+\t\t\n+from shogun.Features import StringWordFeatures, StringUlongFeatures, \\\n+\t\tStringCharFeatures, CombinedFeatures, DNA, Labels\n+from shogun.Classifier import MSG_INFO, MSG_ERROR\n+try:\n+\tfrom shogun.Classifier import SVMLight\n+except ImportError:\n+\tfrom shogun.Classifier import LibSVM\n+\n+"""\n+global variables\n+"""\n+g_kmers = []\n+g_rcmap = []\n+\n+\n+def kmerid2kmer(kmerid, kmerlen):\n+\t"""convert integer kmerid to kmer string\n+\n+\tArguments:\n+\tkmerid -- integer, id of k-mer\n+\tkmerlen -- integer, length of k-mer\n+\n+\tReturn:\n+\tkmer string\n+\t"""\n+\n+\tnts = "ACGT"\n+\tkmernts = []\n+\tkmerid2 = kmerid\n+\n+\tfor i in xrange(kmerlen):\n+\t\tntid = kmerid2 % 4\n+\t\tkmernts.append(nts[ntid])\n+\t\tkmerid2 = int((kmerid2-ntid)/4)\n+\n+\treturn \'\'.join(reversed(kmernts))\n+\n+\n+def kmer2kmerid(kmer, kmerlen):\n+\t"""convert kmer string to integer kmerid\n+\n+\tArguments:\n+\tkmerid -- integer, id of k-mer\n+\tkmerlen -- integer, length of k-mer\n+\n+\tReturn:\n+\tid of k-mer\n+\t"""\n+\n+\tnt2id = {\'A\':0, \'C\':1, \'G\':2, \'T\':3}\n+\n+\treturn reduce(lambda x, y: (4*x+y), [nt2id[x] for x in kmer])\n+\n+\n+def get_rcmap(kmerid, kmerlen):\n+\t"""mapping kmerid to its reverse complement k-mer on-the-fly\n+\n+\tArguments:\n+\tkmerid -- integer, id of k-mer\n+\tkmerlen -- integer, length of k-mer\n+\n+\tReturn:\n+\tinteger kmerid after mapping to its reverse complement\n+\t"""\n+\n+\t#1. get kmer from kmerid\n+\t#2. get reverse complement kmer\n+\t#3. get kmerid from revcomp kmer\n+\trckmerid = kmer2kmerid(revcomp(kmerid2kmer(kmerid, kmerlen)), kmerlen)\n+\n+\tif rckmerid < kmerid:\n+\t\treturn rckmerid\n+\n+\treturn kmerid\n+\n+\n+def non_redundant_word_features(feats, kmerlen):\n+\t"""convert the features from Shogun toolbox to non-redundant word features (handle reverse complements)\n+\tArguments:\n+\tfeats -- StringWordFeatures\n+\tkmerlen -- integer, length of k-mer\n+\n+\tReturn:\n+\tStringWordFeatures after converting reverse complement k-mer ids\n+\t"""\n+\n+\trcmap = g_rcmap\n+\n+\tfor i in xrange(feats.get_num_vectors()):\n+\t\tnf = [rcmap[int(kmerid)] for kmerid in feats.get_feature_vector(i)]\n+\n+\t\tfeats.set_feature_vector(numpy.array(nf, numpy.dtype(\'u2\')), i)\n+\n+\tpreproc = SortWordString()\n+\tpreproc.init(feats)\n+\ttry:\n+\t\tfeats.add_preproc(preproc)\n+\t\tfeats.apply_preproc()\n+\texcept AttributeError:\n+\t\tfeats.add_preprocessor(preproc)\n+\t\tfeats.apply_preprocessor()\t\n+\n+\treturn feats\n+\n+\n+def non_redundant_ulong_features(feats, kmerlen):\n+\t"""convert the features from Shogun toolbox to non-redundant ulong features\n+\tArguments:\n+\tfeats -- StringUlongFeatures\n+\tkmerlen -- integer, length of k-mer\n+\n+\tReturn:\n+\tStringUlongFeatures after converting reverse complement k-mer ids\n+\t"""\n+\n+\tfor i in xrange(feats.get_num_vectors()):\n+\t\tnf = [get_rcmap(int(kmerid), kmerlen) \\\n+\t\t\t\tfor kmerid in feats.get_feature_vector(i)]\n+\n+\t\tfeats.set_feature_vector(numpy.array(nf, numpy.dtype(\'u8\')), i)\n+\n+\tpreproc = SortUlongStr'..b'0]\n+\tnegf = args[1]\n+\t\n+\tseqs_pos, sids_pos = read_fastafile(posf)\n+\tseqs_neg, sids_neg = read_fastafile(negf)\n+\tnpos = len(seqs_pos)\n+\tnneg = len(seqs_neg)\n+\tseqs = seqs_pos + seqs_neg\n+\tsids = sids_pos + sids_neg\n+\n+\tif options.weight == 0:\n+\t\t#DEBUGGED by dlee 02/17/13\n+\t\toptions.weight = 1 + log(nneg/float(npos))\n+\n+\tif options.quiet == False:\n+\t\tsys.stderr.write(\'SVM parameters:\\n\')\n+\t\tsys.stderr.write(\' kernel-type: \' + str(options.ktype) + "." + ktype_str[options.ktype] + \'\\n\')\n+\t\tsys.stderr.write(\' svm-C: \' + str(options.svmC) + \'\\n\')\n+\t\tsys.stderr.write(\' epsilon: \' + str(options.epsilon) + \'\\n\')\n+\t\tsys.stderr.write(\' weight: \' + str(options.weight) + \'\\n\')\n+\t\tsys.stderr.write(\'\\n\')\n+\n+\t\tsys.stderr.write(\'Other options:\\n\')\n+\t\tsys.stderr.write(\' kmerlen: \' + str(options.kmerlen) + \'\\n\')\n+\t\tif options.ktype == 2:\n+\t\t\tsys.stderr.write(\' kmerlen2: \' + str(options.kmerlen2) + \'\\n\')\n+\t\tsys.stderr.write(\' outputname: \' + options.outputname + \'\\n\')\n+\t\tsys.stderr.write(\' posteriorp: \' + str(options.posteriorp) + \'\\n\')\n+\t\tif options.ncv > 0:\n+\t\t\tsys.stderr.write(\' ncv: \' + str(options.ncv) + \'\\n\')\n+\t\t\tsys.stderr.write(\' rseed: \' + str(options.rseed) + \'\\n\')\n+\t\tsys.stderr.write(\' sorted-weight: \' + str(options.sort) + \'\\n\')\n+\n+\t\tsys.stderr.write(\'\\n\')\n+\n+\t\tsys.stderr.write(\'Input args:\\n\')\n+\t\tsys.stderr.write(\' positive sequence file: \' + posf + \'\\n\')\n+\t\tsys.stderr.write(\' negative sequence file: \' + negf + \'\\n\')\n+\t\tsys.stderr.write(\'\\n\')\n+\n+\t\tsys.stderr.write(\'numer of total positive seqs: \' + str(npos) + \'\\n\')\n+\t\tsys.stderr.write(\'numer of total negative seqs: \' + str(nneg) + \'\\n\')\n+\t\tsys.stderr.write(\'\\n\')\n+\n+\t#generate labels\n+\tlabels = [1]*npos + [-1]*nneg\n+\n+\tif options.ktype == 1:\n+\t\tget_features = get_spectrum_features\n+\t\tget_kernel = get_spectrum_kernel\n+\t\tget_weights = get_sksvm_weights\n+\t\tsave_weights = save_sksvm_weights\n+\t\tsvm_classify = sksvm_classify\n+\telif options.ktype == 2:\n+\t\tget_features = get_weighted_spectrum_features\n+\t\tget_kernel = get_weighted_spectrum_kernel\n+\t\tget_weights = get_wsksvm_weights\n+\t\tsave_weights = save_wsksvm_weights\n+\t\tsvm_classify = wsksvm_classify\n+\telse:\n+\t\tsys.stderr.write(\'..unknown kernel..\\n\')\n+\t\tsys.exit(0)\n+\n+\tA = B = 0\n+\tif options.ncv > 0:\n+\t\tif options.quiet == False:\n+\t\t\tsys.stderr.write(\'..Cross-validation\\n\')\n+\n+\t\tcvlist = generate_cv_list(options.ncv, npos, nneg)\n+\t\tlabels_cv = []\n+\t\tpreds_cv = []\n+\t\tsids_cv = []\n+\t\tindices_cv = []\n+\t\tfor icv in xrange(options.ncv):\n+\t\t\t#split data into training and test set\n+\t\t\tseqs_tr, seqs_te = split_cv_list(cvlist, icv, seqs) \n+\t\t\tlabs_tr, labs_te = split_cv_list(cvlist, icv, labels)\n+\t\t\tsids_tr, sids_te = split_cv_list(cvlist, icv, sids)\n+\t\t\tindices_tr, indices_te = split_cv_list(cvlist, icv, range(len(seqs)))\n+\n+\t\t\t#train SVM\n+\t\t\tfeats_tr = get_features(seqs_tr, options)\n+\t\t\tkernel_tr = get_kernel(feats_tr, options)\n+\t\t\tsvm_cv = svm_learn(kernel_tr, labs_tr, options)\n+\n+\t\t\tpreds_cv = preds_cv + svm_classify(seqs_te, svm_cv, kernel_tr, feats_tr, options)\n+\t\t\t\n+\t\t\tlabels_cv = labels_cv + labs_te\n+\t\t\tsids_cv = sids_cv + sids_te\n+\t\t\tindices_cv = indices_cv + indices_te\n+\n+\t\toutput_cvpred = options.outputname + "_cvpred.out"\n+\t\tprediction_results = sorted(zip(indices_cv, sids_cv, preds_cv, labels_cv), key=lambda p: p[0])\n+\t\tsave_predictions(output_cvpred, prediction_results, cvlist)\n+\n+\t\tif options.posteriorp:\n+\t\t\tA, B = LMAI(preds_cv, labels_cv, labels_cv.count(-1), labels_cv.count(1))\n+\n+\t\t\tif options.quiet == False:\n+\t\t\t\tsys.stderr.write(\'Estimated Parameters:\\n\')\n+\t\t\t\tsys.stderr.write(\' A: \' + str(A) + \'\\n\')\n+\t\t\t\tsys.stderr.write(\' B: \' + str(B) + \'\\n\')\n+\n+\tif options.quiet == False:\n+\t\tsys.stderr.write(\'..SVM weights\\n\')\n+\n+\tfeats = get_features(seqs, options)\n+\tkernel = get_kernel(feats, options)\n+\tsvm = svm_learn(kernel, labels, options)\n+\tjj = get_feature_counts(svm, feats, options)\n+\tw = get_weights(svm, feats, options)\n+\tb = svm.get_bias()\n+\n+\tsave_weights(w, b, A, B, options)\n+\n+if __name__==\'__main__\': main()\n' |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/libkmersvm.pyc |
b |
Binary file kmersvm/scripts/libkmersvm.pyc has changed |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/nullseq_generate.py --- a/kmersvm/scripts/nullseq_generate.py Mon Aug 20 21:42:29 2012 -0400 +++ b/kmersvm/scripts/nullseq_generate.py Sun Jun 16 18:06:14 2013 -0400 |
[ |
@@ -71,8 +71,7 @@ def sample_sequences(positions, buildname, basedir, options): """ """ - rpt_err = options.rpt_err - gc_err = options.gc_err + max_fails = 20 max_trys = options.max_trys norpt = options.norpt nogc = options.nogc @@ -121,6 +120,12 @@ else: count = options.count + #initialize paramter + #added by dlee 2/17/13 + ncfails = 0 + rpt_err = options.rpt_err + gc_err = options.gc_err + sampled_positions = [] while len(sampled_positions) < count: sampled_prof = random.choice(profiles) @@ -128,6 +133,15 @@ sampled_gc = sampled_prof[2] sampled_rpt = sampled_prof[3] + #relax rpt_err and gc_err if it keep fail to sample a region + #added by dlee 2/17/13 + if ncfails >= max_fails: + if options.quiet == False: + sys.stderr.write("reached max_fail. relax gc and rpt err criteria\n") + ncfails = 0 + rpt_err += 0.01 + gc_err += 0.01 + rpt_err_allowed = int(rpt_err*sampled_len) gc_err_allowed = int(gc_err*sampled_len) trys = 0 @@ -156,9 +170,17 @@ sampled_positions.append((chrom, pos, pos_e)) + #reset the counter of consecutive fails + #added by dlee 2/17/13 + ncfails = 0 + #print trys, chrom, pos, pos_e, sampled_len, pos_rpt, sampled_rpt, pos_gc, sampled_gc break else: + #increase the counter of consecutive fails + #added by dlee 2/17/13 + ncfails += 1 + if options.quiet == False: sys.stderr.write(' '.join(["fail to sample from", \ "len=", str(sampled_len), \ |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/tomtom.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmersvm/tomtom.xml Sun Jun 16 18:06:14 2013 -0400 |
b |
@@ -0,0 +1,84 @@ +<tool id="tomtom" name="Tomtom" version="1.0.0"> + + <description>Tomtom tool for motif searching</description> + <command>/home/galaxy/meme/bin/tomtom -no-ssc -internal -text -verbosity 1 -thresh $thresh + #if str($cut.cut_choice) == 'e.value': + -evalue + #end if + + #if str($dist.dist) == 'ed': + -dist ed + #elif str($dist.dist) == 'sw': + -dist sandelin + #else + -dist pearson + #end if + + $input1 /home/galaxy/meme/db/combined_db.meme > tomtom_out.txt + + </command> + <inputs> + <param format="txt" name="input1" type="data" label="PWM File"/> + <param type="float" value="0.5" label="Threshold" name="thresh"/> + <conditional name="cut"> + <param name="cut_choice" type="select" label="Threshold Type"> + <option value="q.value" selected="true">q-value</option> + <option value="e.value">E-value</option> + </param> + </conditional> + + <conditional name="dist"> + <param name="dist" type="select" label="Distance Metric"> + <option value="pearson" selected="true">Pearson</option> + <option value="ed">Euclidean</option> + <option value="sw">Sandelin-Wasserman Function</option> + </param> + </conditional> + </inputs> + + <outputs> + <data format="txt" name="Tomtom Results" from_work_dir="tomtom_out.txt" label="${tool.name} on ${on_string}: Tomtom Matches"/> + + </outputs> + <help> + +Tomtom is a tool for comparing a DNA motif to a database of known motifs. For an in-depth explanation of the Tomtom software see here_. + +---- + +**Recommended Settings** + +We recommend most users use the Tomtom defaults of q-value for score, the cutoff of 0.5 and the Pearson correlation coefficent for distance metric. + +---- + +**Parameters** + +We offer users the options of choosing which distance metric can be used to find matching motifs. Specifically, we offer the Pearson correlation coefficient, the Euclidean distance and the Sandelin-Wasserman Function. + + * The Pearson correlation coefficient measures the similarity between columns of position weight matrices (PWMs). + + * The Euclidean distance can be thought of as the length of the straight line between two PWMs. + + * The Sandelin-Wasserman function sums the column-wise differences between PWMs. + +We also offer the choice of E-value and q-value to threshold the results returned by Tomtom. + + * The E-value controls the expected number of false positives and can be any number. + + * The q-value controls the false discovery rate and is a number between 0 and 1. + +---- + +Note that at this time we only offer Tomtom output in txt format. + +---- + +**Citation** + +If you use this tool, please cite: Shobhit Gupta, JA Stamatoyannopolous, Timothy Bailey and William Stafford Noble, "Quantifying similarity between motifs", Genome Biology, 8(2):R24, 2007. + +.. _here: http://meme.nbcr.net/meme/tomtom-intro.html + + </help> +</tool> |
b |
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/train.xml --- a/kmersvm/train.xml Mon Aug 20 21:42:29 2012 -0400 +++ b/kmersvm/train.xml Sun Jun 16 18:06:14 2013 -0400 |
b |
@@ -47,8 +47,12 @@ <param name="weight" type="float" value="1" label="Input The Value of Positive Set Weight" /> </when> </conditional> - <param name="SVMC" type="integer" value="1" label="Regularization Param C" /> - <param name="EPS" type="float" value="0.00001" label="Precision Param E" /> + <param name="SVMC" type="float" value="1" label="Regularization Param C" > + <validator type="in_range" message="SVMC must be in range 1 - 10" min="0.01" max="1" /> + </param> + <param name="EPS" type="float" value="0.00001" label="Precision Param E" > + <validator type="in_range" message="EPS must be in range 1e-1 to 1e-5" min="0.00001" max="0.1" /> + </param> </inputs> <outputs> <data format="tabular" name="SVM_weights" from_work_dir="kmersvm_output_weights.out" label="${tool.name} on ${on_string} : Weights" /> @@ -79,11 +83,27 @@ Takes as input 2 FASTA files, 1 of positive sequences and 1 of negative sequences. Produces 2 outputs: - A) Weights: list of sequences of length K ranked by score and posterior probability for that score. + A) Weights: list of sequences of length K ranked by score. - B) Predictions: results of N-fold cross validation + B) Predictions: results of N-fold cross validation. ---- + +**Recommended Settings** + +Kernel: Spectrum + +Kmer length: 6 + +N-Fold Cross-Validation: 5 + +Weight: We recommend letting the Positive Set Weight be selected automatically, unless it has been separately optimized. + +Regularization Parameter C: We recommend values between 0.1 and 1. + +Precision Parameter E: We recommend using the default and staying below 0.1. + +---- **Parameters** @@ -91,8 +111,9 @@ A) Spectrum Kernel: Analyzes a sequence using strings of length K. - B) Weighted Spectrum Kernel: Analyzes a sequence using strings of range of lengths K1 - Kn. - + B) Weighted Spectrum Kernel: Analyzes a sequence using strings of range of lengths K_min - K_max. + + N-Fold Cross Validation: Number of partitions of training data used for cross validation. Weight: Increases importance of positive data (increase if positive sets are very trustworthy or for training with very large negative sequence sets). @@ -100,7 +121,7 @@ Regularization Parameter: Penalty for misclassification. Trade-off is overfitting (high parameter) versus high error rate (low parameter). Precision Parameter: Insensitivity zone. Affects precision of SVM by altering number of support vectors used. - + ---- **Example** |