# HG changeset patch
# User cafletezbrant
# Date 1371420374 14400
# Node ID fd740d51550265a6c62c8c0fa6d3f970caab3075
# Parent 1aea7c1a9ab1663122d5a2eb2796919a46ae452a
Uploaded revised kmer-SVM to include modules from kmer-visual.
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/README.txt
--- a/kmersvm/README.txt Mon Aug 20 21:42:29 2012 -0400
+++ b/kmersvm/README.txt Sun Jun 16 18:06:14 2013 -0400
@@ -68,6 +68,8 @@
+
+
Tool Tests:
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/install.sh
--- a/kmersvm/install.sh Mon Aug 20 21:42:29 2012 -0400
+++ b/kmersvm/install.sh Sun Jun 16 18:06:14 2013 -0400
@@ -1,12 +1,11 @@
#!/bin/bash
-cd "$1"
-cp tool-data/nullseq_indices.loc.sample ../../tool-data/nullseq_indices.loc
-cp tool-data/sample_roc_chen.png ../../tool-data
-cp tool-data/classify_output.out ../../test-data
-cp tool-data/classify_test.fa ../../test-data
-cp tool-data/kmersvm_output_weights.out ../../test-data
-cp tool-data/test_positive.fa ../../test-data
-cp tool-data/test_negative.fa ../../test-data
-cp tool-data/test_weights.out ../../test-data
-cp tool-data/train_predictions.out ../../test-data
+cp tool-data/nullseq_indices.loc.sample ~/galaxy-dist/tool-data/nullseq_indices.loc
+cp tool-data/sample_roc_chen.png ~/galaxy-dist/tool-data
+cp tool-data/classify_output.out ~/galaxy-dist/test-data
+cp tool-data/classify_test.fa ~/galaxy-dist/test-data
+cp tool-data/kmersvm_output_weights.out ~/galaxy-dist/test-data
+cp tool-data/test_positive.fa ~/galaxy-dist/test-data
+cp tool-data/test_negative.fa ~/galaxy-dist/test-data
+cp tool-data/test_weights.out ~/galaxy-dist/test-data
+cp tool-data/train_predictions.out ~/galaxy-dist/test-data
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/kmer2meme.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/kmersvm/kmer2meme.pl Sun Jun 16 18:06:14 2013 -0400
@@ -0,0 +1,80 @@
+# kmer2meme.pl - convert the top and bottom k-mers of a kmer-SVM weights
+# file into single-site motifs in MEME text format (for use with Tomtom).
+#
+# Usage: perl kmer2meme.pl <weights_file> <num_kmers>
+#   weights_file  tab-separated k-mer weights file; its first 8 lines are
+#                 treated as header and skipped.
+#   num_kmers     how many k-mers to take from the top of the list and how
+#                 many from the bottom (up to 2 * num_kmers motifs).
+#
+# Output is always written to "kmer2meme.meme" in the current directory.
+use strict;
+use warnings;
+
+# Number of header lines preceding the first k-mer row.
+my $HEADER_LINES = 8;
+
+# One-hot probability rows, columns in A C G T order.
+my %row_for = (
+    A => " 1.00 0.00 0.00 0.00\n",
+    C => " 0.00 1.00 0.00 0.00\n",
+    G => " 0.00 0.00 1.00 0.00\n",
+    T => " 0.00 0.00 0.00 1.00\n",
+);
+# Any other character is written as a uniform column.
+my $uniform_row = " 0.25 0.25 0.25 0.25\n";
+
+my ($weights_path, $num_kmers) = @ARGV;
+die "Usage: $0 <weights_file> <num_kmers>\n"
+    unless defined $weights_path && defined $num_kmers;
+die "num_kmers must be a non-negative integer, got '$num_kmers'\n"
+    unless $num_kmers =~ /\A\d+\z/;
+
+open(my $w_fh, "<", $weights_path)
+    or die "Cannot open weights file '$weights_path': $!\n";
+my @weights = <$w_fh>;
+close($w_fh);
+
+# First $num_kmers rows after the header plus the last $num_kmers rows.
+# Out-of-range indices (short file) yield undef and are dropped below.
+my @temp_k = @weights[$HEADER_LINES..($HEADER_LINES - 1 + $num_kmers), (-$num_kmers..-1)];
+
+my @kmers = ();
+for my $row (@temp_k) {
+    # Skip rows the slice could not supply and any header/comment line a
+    # large num_kmers may have reached from the bottom of a short file.
+    next if !defined $row || $row =~ /^#/;
+    chomp $row;
+    # Only the first column (the k-mer itself) is used.
+    my ($kmer) = split(/\t/, $row);
+    next if !defined $kmer || $kmer eq '';
+    push(@kmers, $kmer);
+}
+
+open(my $o_fh, ">", "kmer2meme.meme")
+    or die "Cannot open 'kmer2meme.meme' for writing: $!\n";
+
+print $o_fh
+"MEME version 4
+
+ALPHABET= ACGT
+
+strands: + -
+
+Background letter frequencies (from no specific genome):
+A 0.25 C 0.25 G 0.25 T 0.25\n\n";
+
+foreach my $kmer (@kmers) {
+    print $o_fh "MOTIF $kmer\n";
+    my $width = length($kmer);
+    print $o_fh "letter-probability matrix: alength= 4 w= $width nsites= 1 E= 0\n";
+    foreach my $nc (split(//, $kmer)) {
+        # The fallback row must go to the MEME file as well; printing it
+        # to STDOUT would leave the matrix shorter than the declared w=.
+        my $row = exists $row_for{$nc} ? $row_for{$nc} : $uniform_row;
+        print $o_fh $row;
+    }
+    print $o_fh "\n";
+}
+
+close($o_fh) or die "Error closing 'kmer2meme.meme': $!\n";
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/kmertopwm.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/kmersvm/kmertopwm.xml Sun Jun 16 18:06:14 2013 -0400
@@ -0,0 +1,25 @@
+
+ Convert kmers to MEME format for motif finding by Tomtom
+ kmer2meme.pl
+ $weights $N
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This is a utility function that creates PWMs in MEME format for use with Tomtom.
+
+'Kmer Weights' is the weight file generated by 'Train SVM'.
+
+'Kmer Number' (N) is how many kmers are taken from each end of the weight list: the N most positive and the N most negative, giving up to 2N motifs.
+
+
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/nullseq.xml
--- a/kmersvm/nullseq.xml Mon Aug 20 21:42:29 2012 -0400
+++ b/kmersvm/nullseq.xml Sun Jun 16 18:06:14 2013 -0400
@@ -7,9 +7,9 @@
-x $fold -r $rseed -g $gc_err -t $rpt_err $input $dbkey ${indices_path.fields.path}
-
-
-
+
+
+
@@ -44,6 +44,16 @@
**What it does**
Takes an input BED file and generates a set of sequences for use as negative data (null sequences) in Train SVM similar in length, GC content and repeat fraction. Uses random sampling for efficiency.
+
+----
+
+**Recommended Settings**
+
+Fold-Increase: Default is recommended, up to 50x positive set.
+
+GC Error, Repeat Error: Default is recommended.
+
+----
**Parameters**
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/kmersvm_output_weights.out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/kmersvm/scripts/kmersvm_output_weights.out Sun Jun 16 18:06:14 2013 -0400
@@ -0,0 +1,2088 @@
+#parameters:
+#kernel=1
+#kmerlen=6
+#bias=0.930368454935
+#A=0
+#B=0
+#NOTE: k-mers with large negative weights are also important. They can be found at the bottom of the list.
+#k-mer revcomp SVM-weight
+AAAAAA TTTTTT 0.553324469582
+AAAAAC GTTTTT 1.0689111563
+AAAAAG CTTTTT 0.386997519222
+AAAAAT ATTTTT 0.371506923691
+AAAACA TGTTTT 0.582941243013
+AAAACC GGTTTT -0.00322550380692
+AAAACG CGTTTT 0.115121834279
+AAAACT AGTTTT 0.64234562623
+AAAAGA TCTTTT 0.180098364822
+AAAAGC GCTTTT -0.370020965708
+AAAAGG CCTTTT -0.148530185678
+AAAAGT ACTTTT 1.19477154105
+AAAATA TATTTT 1.23378644064
+AAAATC GATTTT -0.980691936551
+AAAATG CATTTT 0.221932570601
+AAAATT AATTTT 0.449293989111
+AAACAA TTGTTT -1.57507857322
+AAACAC GTGTTT -2.1383477652
+AAACAG CTGTTT -0.720402198466
+AAACAT ATGTTT -0.915754056705
+AAACCA TGGTTT 0.959609519802
+AAACCC GGGTTT 0.150812627734
+AAACCG CGGTTT -0.204853254781
+AAACCT AGGTTT 0.486872195933
+AAACGA TCGTTT -0.404172254228
+AAACGC GCGTTT 0.471891306908
+AAACGG CCGTTT -0.732914484007
+AAACGT ACGTTT -0.79028442459
+AAACTA TAGTTT -0.200848111441
+AAACTC GAGTTT -0.00260431934722
+AAACTG CAGTTT 0.456381173353
+AAACTT AAGTTT 0.639062115506
+AAAGAA TTCTTT 0.257495713463
+AAAGAC GTCTTT -0.228023730318
+AAAGAG CTCTTT 0.247579662852
+AAAGAT ATCTTT -0.304817901111
+AAAGCA TGCTTT -0.155658358179
+AAAGCC GGCTTT 0.416290507318
+AAAGCG CGCTTT -0.319122803172
+AAAGCT AGCTTT -0.10365386651
+AAAGGA TCCTTT 0.465546368844
+AAAGGC GCCTTT 0.293788204177
+AAAGGG CCCTTT -0.738483493496
+AAAGGT ACCTTT -1.46557110152
+AAAGTA TACTTT -0.487013201424
+AAAGTC GACTTT -0.815561145197
+AAAGTG CACTTT 0.523242409873
+AAAGTT AACTTT 1.49610361616
+AAATAA TTATTT -0.50775903415
+AAATAC GTATTT -0.925034113885
+AAATAG CTATTT -1.3099174763
+AAATAT ATATTT -1.8047214372
+AAATCA TGATTT -0.899342838259
+AAATCC GGATTT -0.146519411262
+AAATCG CGATTT -0.267007765303
+AAATCT AGATTT 0.291560176957
+AAATGA TCATTT -0.514145682209
+AAATGC GCATTT 0.954279511728
+AAATGG CCATTT -0.711449233898
+AAATGT ACATTT -0.752526282583
+AAATTA TAATTT 0.00593646027611
+AAATTC GAATTT 1.26182226428
+AAATTG CAATTT -0.0953103902516
+AAATTT AAATTT 0.189613989631
+AACAAA TTTGTT -1.37122264124
+AACAAC GTTGTT 0.00146931165158
+AACAAG CTTGTT -0.803037783522
+AACAAT ATTGTT 0.00385062783094
+AACACA TGTGTT -0.363356864114
+AACACC GGTGTT -0.779849447985
+AACACG CGTGTT -0.97471290289
+AACACT AGTGTT -0.335935444604
+AACAGA TCTGTT -1.28171369495
+AACAGC GCTGTT -0.411448258216
+AACAGG CCTGTT -0.469780016788
+AACAGT ACTGTT -0.453227635948
+AACATA TATGTT -0.945101613087
+AACATC GATGTT -0.0283361724906
+AACATG CATGTT -0.575985749697
+AACATT AATGTT -0.0429091030472
+AACCAA TTGGTT -0.0823228706445
+AACCAC GTGGTT 2.58639657949
+AACCAG CTGGTT -0.276555339554
+AACCAT ATGGTT -0.11357479766
+AACCCA TGGGTT 0.192569792654
+AACCCC GGGGTT -0.0425603516266
+AACCCG CGGGTT -0.404973603501
+AACCCT AGGGTT 0.0764485451656
+AACCGA TCGGTT -0.137853811078
+AACCGC GCGGTT 0.710876928983
+AACCGG CCGGTT 0.272143672682
+AACCGT ACGGTT -1.42589113548
+AACCTA TAGGTT -0.611888789113
+AACCTC GAGGTT 0.837839227815
+AACCTG CAGGTT -0.422972816872
+AACCTT AAGGTT 0.0794552714245
+AACGAA TTCGTT 0.662384258058
+AACGAC GTCGTT -0.711145623237
+AACGAG CTCGTT 0.198654543303
+AACGAT ATCGTT -1.14468704666
+AACGCA TGCGTT -0.143027192823
+AACGCC GGCGTT -0.0833645930753
+AACGCG CGCGTT 0.0613946992336
+AACGCT AGCGTT 0.379426684798
+AACGGA TCCGTT -0.902189680896
+AACGGC GCCGTT 0.725518300654
+AACGGG CCCGTT 0.487999502266
+AACGGT ACCGTT -0.323411669378
+AACGTA TACGTT 0.429654445762
+AACGTC GACGTT -0.392752266586
+AACGTG CACGTT -1.04792887194
+AACGTT AACGTT 0.616207780774
+AACTAA TTAGTT -0.843322479317
+AACTAC GTAGTT 0.184493095017
+AACTAG CTAGTT 0.0179086348231
+AACTAT ATAGTT 0.994586833037
+AACTCA TGAGTT -0.12838936418
+AACTCC GGAGTT 0.726028047244
+AACTCG CGAGTT 0.205501965615
+AACTCT AGAGTT 0.78739364499
+AACTGA TCAGTT 0.168022862889
+AACTGC GCAGTT 0.216791948549
+AACTGG CCAGTT -0.314557426071
+AACTGT ACAGTT -0.0111281613254
+AACTTA TAAGTT -0.183787054209
+AACTTC GAAGTT 0.84215541061
+AACTTG CAAGTT 0.376469022105
+AAGAAA TTTCTT 0.832667586229
+AAGAAC GTTCTT 0.93622383333
+AAGAAG CTTCTT 0.271875957941
+AAGAAT ATTCTT 1.43057617592
+AAGACA TGTCTT -0.132932072786
+AAGACC GGTCTT 0.0903286121328
+AAGACG CGTCTT 0.184576127381
+AAGACT AGTCTT 0.222042406341
+AAGAGA TCTCTT -0.0130328461327
+AAGAGC GCTCTT -0.37404789079
+AAGAGG CCTCTT -0.163448118904
+AAGAGT ACTCTT 0.769472446615
+AAGATA TATCTT -0.790403171158
+AAGATC GATCTT -0.120008098951
+AAGATG CATCTT 0.598644235302
+AAGATT AATCTT 1.39556497538
+AAGCAA TTGCTT -0.984888908248
+AAGCAC GTGCTT 0.783311673894
+AAGCAG CTGCTT -0.551197739368
+AAGCAT ATGCTT 0.368035643478
+AAGCCA TGGCTT 0.281990338241
+AAGCCC GGGCTT -0.699900156956
+AAGCCG CGGCTT 0.986454860217
+AAGCCT AGGCTT 0.446570897308
+AAGCGA TCGCTT -0.717502459474
+AAGCGC GCGCTT 0.292841378565
+AAGCGG CCGCTT 0.726632808198
+AAGCGT ACGCTT -0.441284795806
+AAGCTA TAGCTT -1.65918664431
+AAGCTC GAGCTT 0.0882183240244
+AAGCTG CAGCTT -0.134531324525
+AAGCTT AAGCTT 0.499772590447
+AAGGAA TTCCTT 0.608701292821
+AAGGAC GTCCTT 0.174988238866
+AAGGAG CTCCTT 0.56638313976
+AAGGAT ATCCTT 0.86759780737
+AAGGCA TGCCTT -0.0660388079911
+AAGGCC GGCCTT 0.353849453837
+AAGGCG CGCCTT -0.103035156648
+AAGGCT AGCCTT 0.226115108478
+AAGGGA TCCCTT -0.0242048325334
+AAGGGC GCCCTT -0.991808457742
+AAGGGG CCCCTT -0.108713197864
+AAGGGT ACCCTT 0.426314095539
+AAGGTA TACCTT 0.0063082317847
+AAGGTC GACCTT -2.22148605405
+AAGGTG CACCTT -0.171279446553
+AAGTAA TTACTT -0.660359179691
+AAGTAC GTACTT 1.37990716767
+AAGTAG CTACTT 0.00364551059326
+AAGTAT ATACTT 0.0627944758868
+AAGTCA TGACTT -0.0836841889637
+AAGTCC GGACTT -0.0783950838806
+AAGTCG CGACTT -0.331907177283
+AAGTCT AGACTT 0.922862248198
+AAGTGA TCACTT 0.301713638482
+AAGTGC GCACTT 0.372118346492
+AAGTGG CCACTT -0.29435234237
+AAGTGT ACACTT 0.453500782049
+AAGTTA TAACTT -0.0413529710922
+AAGTTC GAACTT 0.490694081954
+AAGTTG CAACTT 1.43527540302
+AATAAA TTTATT -0.464685282825
+AATAAC GTTATT 0.473126192871
+AATAAG CTTATT -0.361537503398
+AATAAT ATTATT -1.14135793996
+AATACA TGTATT -0.0868676244573
+AATACC GGTATT -0.432019199972
+AATACG CGTATT -0.812104843229
+AATACT AGTATT -0.160253986465
+AATAGA TCTATT -0.819357586187
+AATAGC GCTATT -0.582278240034
+AATAGG CCTATT -0.102936475866
+AATAGT ACTATT 0.64849424254
+AATATA TATATT -0.875327209013
+AATATC GATATT -0.538482532464
+AATATG CATATT 0.442497512442
+AATATT AATATT -1.60012723551
+AATCAA TTGATT -1.37621833951
+AATCAC GTGATT 0.91628767144
+AATCAG CTGATT 0.070484765244
+AATCAT ATGATT 0.650606183815
+AATCCA TGGATT -1.02720580521
+AATCCC GGGATT -0.352811994914
+AATCCG CGGATT 0.0165933980204
+AATCCT AGGATT 0.712428149182
+AATCGA TCGATT -0.592300021647
+AATCGC GCGATT 0.812676084435
+AATCGG CCGATT 0.39632534305
+AATCGT ACGATT -0.342808208442
+AATCTA TAGATT -1.480756961
+AATCTC GAGATT -0.751509737277
+AATCTG CAGATT -0.0237559933613
+AATGAA TTCATT -0.771072829647
+AATGAC GTCATT -0.14602458728
+AATGAG CTCATT 0.492363745269
+AATGAT ATCATT -0.609265638394
+AATGCA TGCATT 0.167952139321
+AATGCC GGCATT 0.965994735545
+AATGCG CGCATT -0.166276358058
+AATGCT AGCATT 1.03827471911
+AATGGA TCCATT -0.187500612316
+AATGGC GCCATT 0.216365462216
+AATGGG CCCATT -0.0888492445946
+AATGGT ACCATT 0.14433579757
+AATGTA TACATT 0.283672586491
+AATGTC GACATT -0.913297517025
+AATGTG CACATT 1.12759664753
+AATTAA TTAATT -2.21736658818
+AATTAC GTAATT 0.126090373031
+AATTAG CTAATT -0.499643372776
+AATTAT ATAATT 0.0590969699364
+AATTCA TGAATT -0.535790423504
+AATTCC GGAATT 1.30916473709
+AATTCG CGAATT 0.0530089957774
+AATTCT AGAATT 1.3354098108
+AATTGA TCAATT -1.66820825185
+AATTGC GCAATT 0.00310194879804
+AATTGG CCAATT 0.419449404673
+AATTGT ACAATT 0.464182538132
+AATTTA TAAATT -0.570808133223
+AATTTC GAAATT 0.929450761295
+AATTTG CAAATT 0.406154967173
+ACAAAA TTTTGT 0.534003859773
+ACAAAC GTTTGT -0.750597270967
+ACAAAG CTTTGT -0.174225381133
+ACAAAT ATTTGT -0.967477603914
+ACAACA TGTTGT 0.684531901144
+ACAACC GGTTGT 0.306111794846
+ACAACG CGTTGT -0.492170779986
+ACAACT AGTTGT 0.12647703187
+ACAAGA TCTTGT 0.435693866629
+ACAAGC GCTTGT 0.162579020622
+ACAAGG CCTTGT -0.482270829511
+ACAAGT ACTTGT 0.368700538071
+ACAATA TATTGT -0.556960796215
+ACAATC GATTGT 0.447210789307
+ACAATG CATTGT -0.652007172748
+ACACAA TTGTGT -0.58426177344
+ACACAC GTGTGT 0.0976403710637
+ACACAG CTGTGT -0.67562334546
+ACACAT ATGTGT -0.783431510249
+ACACCA TGGTGT -0.0559002312137
+ACACCC GGGTGT -0.279278917913
+ACACCG CGGTGT 0.927647825457
+ACACCT AGGTGT -2.31444811782
+ACACGA TCGTGT -0.290620517011
+ACACGC GCGTGT -0.433731209379
+ACACGG CCGTGT 0.133955133112
+ACACGT ACGTGT -0.0840820691034
+ACACTA TAGTGT -1.53601873195
+ACACTC GAGTGT -0.0411823725391
+ACACTG CAGTGT -0.133117765869
+ACAGAA TTCTGT 0.192830326341
+ACAGAC GTCTGT -0.344297166277
+ACAGAG CTCTGT 0.0995945155779
+ACAGAT ATCTGT -1.71253677969
+ACAGCA TGCTGT 0.103705732884
+ACAGCC GGCTGT -0.141361720091
+ACAGCG CGCTGT 0.0923052988622
+ACAGCT AGCTGT -2.06591471431
+ACAGGA TCCTGT 0.106606820089
+ACAGGC GCCTGT 0.243978095226
+ACAGGG CCCTGT -0.163198751642
+ACAGGT ACCTGT -1.35902898114
+ACAGTA TACTGT 0.330450384923
+ACAGTC GACTGT -0.0441089075653
+ACAGTG CACTGT 0.410210459073
+ACATAA TTATGT -0.124637932465
+ACATAC GTATGT 0.0408944886861
+ACATAG CTATGT -0.281098621777
+ACATAT ATATGT -1.5461561949
+ACATCA TGATGT 0.404823860207
+ACATCC GGATGT -0.0771250376801
+ACATCG CGATGT 0.348036576745
+ACATCT AGATGT -1.0135367165
+ACATGA TCATGT -0.0533364791011
+ACATGC GCATGT -0.230663552067
+ACATGG CCATGT 0.354870946287
+ACATGT ACATGT -0.255821119156
+ACATTA TAATGT -0.305695214437
+ACATTC GAATGT 2.10633976985
+ACATTG CAATGT -0.969761944969
+ACCAAA TTTGGT -0.0894837549998
+ACCAAC GTTGGT -0.310977709975
+ACCAAG CTTGGT 0.126792582447
+ACCAAT ATTGGT -0.208620509384
+ACCACA TGTGGT 3.95883323455
+ACCACC GGTGGT -0.550768309866
+ACCACG CGTGGT 1.64662237122
+ACCACT AGTGGT -0.252204442565
+ACCAGA TCTGGT -0.182102497222
+ACCAGC GCTGGT -0.666347426374
+ACCAGG CCTGGT -0.302144138217
+ACCAGT ACTGGT 0.0178732652384
+ACCATA TATGGT -0.128894926297
+ACCATC GATGGT -0.113161940262
+ACCATG CATGGT 0.0567971909973
+ACCCAA TTGGGT -0.112170340264
+ACCCAC GTGGGT -0.0932909430755
+ACCCAG CTGGGT 0.38534995457
+ACCCAT ATGGGT 0.86383897393
+ACCCCA TGGGGT 0.69577964714
+ACCCCC GGGGGT 0.336687664266
+ACCCCG CGGGGT -0.0474965784183
+ACCCCT AGGGGT 0.0583362287737
+ACCCGA TCGGGT -0.651964147142
+ACCCGC GCGGGT 0.430185118239
+ACCCGG CCGGGT 0.0136510502891
+ACCCGT ACGGGT -0.66740789625
+ACCCTA TAGGGT -0.0140774209654
+ACCCTC GAGGGT -0.0866227575362
+ACCCTG CAGGGT 0.831880338582
+ACCGAA TTCGGT 0.162092610395
+ACCGAC GTCGGT -0.213537840127
+ACCGAG CTCGGT -0.798483849782
+ACCGAT ATCGGT -0.171301259624
+ACCGCA TGCGGT 1.12176529563
+ACCGCC GGCGGT -0.0399431587546
+ACCGCG CGCGGT -0.00200779866329
+ACCGCT AGCGGT 0.807679694982
+ACCGGA TCCGGT 0.475032143564
+ACCGGC GCCGGT 0.644168012631
+ACCGGG CCCGGT 0.0514057931436
+ACCGGT ACCGGT -0.201116834029
+ACCGTA TACGGT -0.0510651546867
+ACCGTC GACGGT -1.28871756987
+ACCGTG CACGGT 0.480206522572
+ACCTAA TTAGGT -0.0164396390877
+ACCTAC GTAGGT -0.948871399654
+ACCTAG CTAGGT 0.902143796699
+ACCTAT ATAGGT -0.805157821049
+ACCTCA TGAGGT 0.0288013729329
+ACCTCC GGAGGT -0.123113465161
+ACCTCG CGAGGT 0.262603098349
+ACCTCT AGAGGT -0.105633795468
+ACCTGA TCAGGT -1.25693360781
+ACCTGC GCAGGT -2.19188428198
+ACCTGG CCAGGT -1.22596740891
+ACCTTA TAAGGT -0.397764436469
+ACCTTC GAAGGT -0.365879915113
+ACCTTG CAAGGT -0.468430599887
+ACGAAA TTTCGT 0.264150600855
+ACGAAC GTTCGT -0.264756483319
+ACGAAG CTTCGT 0.204030639912
+ACGAAT ATTCGT -0.0361589742531
+ACGACA TGTCGT -0.610965249538
+ACGACC GGTCGT -0.241773839788
+ACGACG CGTCGT 0.0286911417218
+ACGACT AGTCGT -0.446205245539
+ACGAGA TCTCGT 0.605153633971
+ACGAGC GCTCGT -0.265340587538
+ACGAGG CCTCGT -0.268759016858
+ACGAGT ACTCGT 0.640180324145
+ACGATA TATCGT 0.155583247136
+ACGATC GATCGT 0.0616053169407
+ACGATG CATCGT -0.458209843991
+ACGCAA TTGCGT 0.525980393513
+ACGCAC GTGCGT -1.07211159219
+ACGCAG CTGCGT 0.297215899525
+ACGCAT ATGCGT -0.62466515887
+ACGCCA TGGCGT -0.802770461001
+ACGCCC GGGCGT 0.447952405036
+ACGCCG CGGCGT -0.115846359904
+ACGCCT AGGCGT -0.238430995845
+ACGCGA TCGCGT 0.095946201326
+ACGCGC GCGCGT -0.0433332223788
+ACGCGG CCGCGT 0.266229064785
+ACGCGT ACGCGT -0.163327119327
+ACGCTA TAGCGT 0.237235200727
+ACGCTC GAGCGT -0.0921458803262
+ACGCTG CAGCGT 0.199715587503
+ACGGAA TTCCGT -0.329353600663
+ACGGAC GTCCGT 0.344570189474
+ACGGAG CTCCGT -0.0548389021114
+ACGGAT ATCCGT -0.545997471972
+ACGGCA TGCCGT 0.280450942962
+ACGGCC GGCCGT 1.41582328399
+ACGGCG CGCCGT -0.196303455354
+ACGGCT AGCCGT 0.227947803931
+ACGGGA TCCCGT 0.280829778818
+ACGGGC GCCCGT 0.00018403437262
+ACGGGG CCCCGT 0.580486853705
+ACGGTA TACCGT 0.075495718471
+ACGGTC GACCGT 0.348584555757
+ACGGTG CACCGT -0.182120731866
+ACGTAA TTACGT -0.0621196361007
+ACGTAC GTACGT 0.466909984366
+ACGTAG CTACGT -0.663033009337
+ACGTAT ATACGT -0.308630159174
+ACGTCA TGACGT -1.99820059064
+ACGTCC GGACGT 0.420415612207
+ACGTCG CGACGT -0.0602876602791
+ACGTCT AGACGT -0.634849462137
+ACGTGA TCACGT -0.198905336876
+ACGTGC GCACGT 0.414998502382
+ACGTGG CCACGT 0.200447884465
+ACGTTA TAACGT 0.185859329067
+ACGTTC GAACGT 0.52815687831
+ACGTTG CAACGT 0.211039795795
+ACTAAA TTTAGT -0.50736915368
+ACTAAC GTTAGT 0.475747236187
+ACTAAG CTTAGT 0.138308785668
+ACTAAT ATTAGT 1.06019516177
+ACTACA TGTAGT 0.319192883857
+ACTACC GGTAGT 0.487897945196
+ACTACG CGTAGT -0.74407054614
+ACTACT AGTAGT 0.0639271446503
+ACTAGA TCTAGT -0.0298376878721
+ACTAGC GCTAGT -0.0429928576347
+ACTAGG CCTAGT 0.333780155394
+ACTAGT ACTAGT -0.16686552786
+ACTATA TATAGT -0.405746674892
+ACTATC GATAGT -0.111727361497
+ACTATG CATAGT -0.162666443308
+ACTCAA TTGAGT 0.0769394839718
+ACTCAC GTGAGT 1.11965808913
+ACTCAG CTGAGT 0.702350819167
+ACTCAT ATGAGT 1.21992890886
+ACTCCA TGGAGT 0.159797702837
+ACTCCC GGGAGT -0.25369524982
+ACTCCG CGGAGT -0.22211947957
+ACTCCT AGGAGT 0.573765565902
+ACTCGA TCGAGT -0.52369931313
+ACTCGC GCGAGT 0.0889533091085
+ACTCGG CCGAGT 0.228330956989
+ACTCTA TAGAGT 0.143486764445
+ACTCTC GAGAGT 0.112394817019
+ACTCTG CAGAGT -0.15818031926
+ACTGAA TTCAGT -0.255344152434
+ACTGAC GTCAGT -0.494035697197
+ACTGAG CTCAGT 0.356908231789
+ACTGAT ATCAGT -0.39586503844
+ACTGCA TGCAGT 0.67076450454
+ACTGCC GGCAGT -0.33621057783
+ACTGCG CGCAGT 0.397171550083
+ACTGCT AGCAGT -0.0353519946569
+ACTGGA TCCAGT 0.221117097972
+ACTGGC GCCAGT -0.148482643928
+ACTGGG CCCAGT 0.36938530952
+ACTGTA TACAGT 0.387293260858
+ACTGTC GACAGT -0.0178021629868
+ACTGTG CACAGT 0.618305777696
+ACTTAA TTAAGT 0.437644834694
+ACTTAC GTAAGT 0.033919287324
+ACTTAG CTAAGT -0.516377419414
+ACTTAT ATAAGT 0.698841633408
+ACTTCA TGAAGT 0.611347347435
+ACTTCC GGAAGT 0.0973285263686
+ACTTCG CGAAGT -0.0915669240628
+ACTTCT AGAAGT 0.518303185233
+ACTTGA TCAAGT -0.216079683422
+ACTTGC GCAAGT 0.575477942051
+ACTTGG CCAAGT 0.00543146924231
+ACTTTA TAAAGT -0.300213848597
+ACTTTC GAAAGT -0.148863314977
+ACTTTG CAAAGT -0.245595583167
+AGAAAA TTTTCT -0.137268535318
+AGAAAC GTTTCT 0.572093479149
+AGAAAG CTTTCT 0.098472865858
+AGAAAT ATTTCT 0.410453396261
+AGAACA TGTTCT 0.793454650212
+AGAACC GGTTCT 0.0115494156458
+AGAACG CGTTCT 0.82393226583
+AGAACT AGTTCT -0.119807464497
+AGAAGA TCTTCT -0.333758403042
+AGAAGC GCTTCT 0.0844487814689
+AGAAGG CCTTCT -0.205219218112
+AGAATA TATTCT 0.0633993564776
+AGAATC GATTCT -0.538136371548
+AGAATG CATTCT 0.931621596147
+AGACAA TTGTCT -1.50574942362
+AGACAC GTGTCT -0.0167669078258
+AGACAG CTGTCT -0.12906421364
+AGACAT ATGTCT 1.35404000457
+AGACCA TGGTCT 0.328218872799
+AGACCC GGGTCT -0.0200725289044
+AGACCG CGGTCT 0.765626742963
+AGACCT AGGTCT -0.456848517462
+AGACGA TCGTCT -0.802053904091
+AGACGC GCGTCT 0.0253871768361
+AGACGG CCGTCT -0.139266727141
+AGACTA TAGTCT -0.320057583318
+AGACTC GAGTCT -0.834249291646
+AGACTG CAGTCT 0.193508572354
+AGAGAA TTCTCT 0.0931165815382
+AGAGAC GTCTCT -0.5884698684
+AGAGAG CTCTCT 0.615476951972
+AGAGAT ATCTCT -0.299853214526
+AGAGCA TGCTCT -0.335642528646
+AGAGCC GGCTCT -0.883469618392
+AGAGCG CGCTCT 0.251578584118
+AGAGCT AGCTCT 0.487429375142
+AGAGGA TCCTCT 0.463823218153
+AGAGGC GCCTCT 0.916276432149
+AGAGGG CCCTCT -0.18684096125
+AGAGTA TACTCT -0.495138709385
+AGAGTC GACTCT -0.484655774219
+AGAGTG CACTCT 0.115993902393
+AGATAA TTATCT -1.58907716419
+AGATAC GTATCT 0.339921215435
+AGATAG CTATCT -0.367780034415
+AGATAT ATATCT 0.287484551209
+AGATCA TGATCT -0.123844683894
+AGATCC GGATCT 0.00247282057627
+AGATCG CGATCT -0.0307897262914
+AGATCT AGATCT 0.227651621052
+AGATGA TCATCT 0.386848181305
+AGATGC GCATCT -0.0213670882284
+AGATGG CCATCT -1.81021353589
+AGATTA TAATCT -0.8443233316
+AGATTC GAATCT -0.532888247722
+AGATTG CAATCT -0.107371277313
+AGCAAA TTTGCT -1.94599552918
+AGCAAC GTTGCT -0.0162424474542
+AGCAAG CTTGCT -0.342005889721
+AGCAAT ATTGCT 1.13914459658
+AGCACA TGTGCT -0.00361898399215
+AGCACC GGTGCT -0.315148789185
+AGCACG CGTGCT 0.303979187648
+AGCACT AGTGCT 1.06273224797
+AGCAGA TCTGCT -0.762081808432
+AGCAGC GCTGCT -0.718835795316
+AGCAGG CCTGCT -0.805919785711
+AGCATA TATGCT -1.23962747197
+AGCATC GATGCT -0.72646586189
+AGCATG CATGCT 0.464440738258
+AGCCAA TTGGCT -0.328449603835
+AGCCAC GTGGCT 1.94437802586
+AGCCAG CTGGCT -0.220210493763
+AGCCAT ATGGCT -0.106927471131
+AGCCCA TGGGCT -0.206762784221
+AGCCCC GGGGCT 0.51046992709
+AGCCCG CGGGCT 1.1594447744
+AGCCCT AGGGCT -0.414622923074
+AGCCGA TCGGCT -0.0926922352106
+AGCCGC GCGGCT 0.262876228393
+AGCCGG CCGGCT 0.055499757051
+AGCCTA TAGGCT -0.197316160999
+AGCCTC GAGGCT 0.21702983432
+AGCCTG CAGGCT 0.435309995039
+AGCGAA TTCGCT -1.1066505883e-05
+AGCGAC GTCGCT -0.47360327321
+AGCGAG CTCGCT 0.164778505221
+AGCGAT ATCGCT 0.356498907503
+AGCGCA TGCGCT 0.0505610378921
+AGCGCC GGCGCT -0.0890290036684
+AGCGCG CGCGCT 0.178915191469
+AGCGCT AGCGCT 0.11567687362
+AGCGGA TCCGCT -0.227235460911
+AGCGGC GCCGCT 0.0584119341573
+AGCGGG CCCGCT 0.144626430325
+AGCGTA TACGCT -0.573602800156
+AGCGTC GACGCT 0.0226782464247
+AGCGTG CACGCT 0.229831864487
+AGCTAA TTAGCT -0.845230674233
+AGCTAC GTAGCT 0.164831388872
+AGCTAG CTAGCT 0.500762258636
+AGCTAT ATAGCT -0.488234536435
+AGCTCA TGAGCT 0.187034773314
+AGCTCC GGAGCT -0.401339822528
+AGCTCG CGAGCT 0.156048007048
+AGCTGA TCAGCT -0.516709892962
+AGCTGC GCAGCT -1.11014090329
+AGCTGG CCAGCT -0.932348608248
+AGCTTA TAAGCT -0.176665598631
+AGCTTC GAAGCT 0.0711278180534
+AGCTTG CAAGCT -0.434225228906
+AGGAAA TTTCCT -0.704291527622
+AGGAAC GTTCCT 0.303469917629
+AGGAAG CTTCCT 0.00524866323846
+AGGAAT ATTCCT 1.58780848354
+AGGACA TGTCCT -0.15505321324
+AGGACC GGTCCT 0.289757945024
+AGGACG CGTCCT 0.481778993922
+AGGACT AGTCCT 0.640823779903
+AGGAGA TCTCCT -0.235490942376
+AGGAGC GCTCCT 0.235768365124
+AGGAGG CCTCCT -0.414415365132
+AGGATA TATCCT -0.0664042613035
+AGGATC GATCCT -0.434947351654
+AGGATG CATCCT 0.366955472562
+AGGCAA TTGCCT -1.01850518506
+AGGCAC GTGCCT 0.736932368634
+AGGCAG CTGCCT 0.256955994847
+AGGCAT ATGCCT 0.369133304411
+AGGCCA TGGCCT -0.561290343016
+AGGCCC GGGCCT 1.03465518609
+AGGCCG CGGCCT 0.522868342351
+AGGCCT AGGCCT 0.564743685355
+AGGCGA TCGCCT 0.424607293004
+AGGCGC GCGCCT -0.236182049346
+AGGCGG CCGCCT 0.0445043432832
+AGGCTA TAGCCT 0.0242969706881
+AGGCTC GAGCCT -0.0471097341017
+AGGCTG CAGCCT 0.456595572887
+AGGGAA TTCCCT 0.498834635405
+AGGGAC GTCCCT -0.260200051734
+AGGGAG CTCCCT 0.240503537084
+AGGGAT ATCCCT -0.535599280412
+AGGGCA TGCCCT -0.00133954725493
+AGGGCC GGCCCT 0.265601558657
+AGGGCG CGCCCT -0.545673685243
+AGGGGA TCCCCT 0.513483423778
+AGGGGC GCCCCT -0.265469144844
+AGGGGG CCCCCT 0.36693029363
+AGGGTA TACCCT 0.104498083465
+AGGGTC GACCCT 0.278328586346
+AGGGTG CACCCT 0.428836989395
+AGGTAA TTACCT -0.725162219135
+AGGTAC GTACCT -1.09752982535
+AGGTAG CTACCT -0.341120431869
+AGGTAT ATACCT 0.314857124255
+AGGTCA TGACCT -1.91741654129
+AGGTCC GGACCT 0.810994316372
+AGGTCG CGACCT 0.394159438614
+AGGTGA TCACCT -1.07617897961
+AGGTGC GCACCT -1.39197112403
+AGGTGG CCACCT -1.18178211992
+AGGTTA TAACCT -0.510476009759
+AGGTTC GAACCT -0.625187182346
+AGGTTG CAACCT 0.143869769481
+AGTAAA TTTACT -1.50793279028
+AGTAAC GTTACT 0.427455316037
+AGTAAG CTTACT 0.279690202349
+AGTAAT ATTACT 0.291866593552
+AGTACA TGTACT 0.880452697751
+AGTACC GGTACT -0.445269994312
+AGTACG CGTACT 0.579750490517
+AGTACT AGTACT 0.829122008787
+AGTAGA TCTACT -0.191593190325
+AGTAGC GCTACT -0.0893420501576
+AGTAGG CCTACT 0.14556137321
+AGTATA TATACT -0.849140839165
+AGTATC GATACT -0.61282065974
+AGTATG CATACT 0.912596605147
+AGTCAA TTGACT -1.70769354035
+AGTCAC GTGACT 1.34966048832
+AGTCAG CTGACT 1.37319346673
+AGTCAT ATGACT 2.72186586472
+AGTCCA TGGACT -0.0192132003426
+AGTCCC GGGACT 0.578612760429
+AGTCCG CGGACT -0.212053165155
+AGTCGA TCGACT -0.0786424670568
+AGTCGC GCGACT -0.458716046909
+AGTCGG CCGACT 0.248365258652
+AGTCTA TAGACT -0.320914441937
+AGTCTC GAGACT 0.0850550293497
+AGTCTG CAGACT 0.488675888517
+AGTGAA TTCACT 0.0266570650857
+AGTGAC GTCACT -0.387049263014
+AGTGAG CTCACT 0.430045767393
+AGTGAT ATCACT 0.827141172737
+AGTGCA TGCACT 0.070601211825
+AGTGCC GGCACT -0.966563356734
+AGTGCG CGCACT -0.281729870678
+AGTGGA TCCACT -0.381263157411
+AGTGGC GCCACT 0.206378092816
+AGTGGG CCCACT 0.389632179038
+AGTGTA TACACT -0.820978049153
+AGTGTC GACACT -0.600524497894
+AGTGTG CACACT 0.713037712467
+AGTTAA TTAACT -0.0106066013167
+AGTTAC GTAACT 0.0455408515653
+AGTTAG CTAACT 0.466941093618
+AGTTAT ATAACT 0.768829672421
+AGTTCA TGAACT -0.125866517956
+AGTTCC GGAACT 0.668936562441
+AGTTCG CGAACT 0.387968939941
+AGTTGA TCAACT -0.0474971487734
+AGTTGC GCAACT 1.1127899844
+AGTTGG CCAACT -0.465573167102
+AGTTTA TAAACT 0.284055967528
+AGTTTC GAAACT 0.18619349173
+AGTTTG CAAACT 0.12731264248
+ATAAAA TTTTAT 1.02648675816
+ATAAAC GTTTAT 0.732880225054
+ATAAAG CTTTAT -1.87548823673
+ATAAAT ATTTAT -0.873007961387
+ATAACA TGTTAT -1.36018600643
+ATAACC GGTTAT -0.202228583494
+ATAACG CGTTAT 0.589468702095
+ATAAGA TCTTAT -0.257841865173
+ATAAGC GCTTAT -0.610042148574
+ATAAGG CCTTAT -1.22287623937
+ATAATA TATTAT -0.164184507604
+ATAATC GATTAT -0.722314659784
+ATAATG CATTAT -0.0511255174539
+ATACAA TTGTAT -0.950800433133
+ATACAC GTGTAT -0.119925598684
+ATACAG CTGTAT -0.0262094856091
+ATACAT ATGTAT 0.249201529788
+ATACCA TGGTAT 0.574806016908
+ATACCC GGGTAT -0.101360243386
+ATACCG CGGTAT -0.611029854896
+ATACGA TCGTAT -0.920389648385
+ATACGC GCGTAT -0.174514324887
+ATACGG CCGTAT -0.51787444727
+ATACTA TAGTAT 0.229563124064
+ATACTC GAGTAT -0.595979909399
+ATACTG CAGTAT 0.236738075475
+ATAGAA TTCTAT -1.07321125666
+ATAGAC GTCTAT -0.918035212801
+ATAGAG CTCTAT 0.338560548219
+ATAGAT ATCTAT 0.330811757112
+ATAGCA TGCTAT -0.923663860458
+ATAGCC GGCTAT 0.16558344397
+ATAGCG CGCTAT 0.462369612811
+ATAGGA TCCTAT 0.425973895672
+ATAGGC GCCTAT 0.314093043047
+ATAGGG CCCTAT 0.219751962566
+ATAGTA TACTAT -0.900554379694
+ATAGTC GACTAT -0.0847074884239
+ATAGTG CACTAT -0.211791001641
+ATATAA TTATAT -0.276221858749
+ATATAC GTATAT -1.19823462791
+ATATAG CTATAT 0.0633032493853
+ATATAT ATATAT 0.429750726604
+ATATCA TGATAT -1.31621436594
+ATATCC GGATAT -0.127518411175
+ATATCG CGATAT -0.1582300913
+ATATGA TCATAT -1.22039654692
+ATATGC GCATAT -0.733718702885
+ATATGG CCATAT -0.410024811352
+ATATTA TAATAT -0.556385064056
+ATATTC GAATAT -0.193606749951
+ATATTG CAATAT -1.84656827598
+ATCAAA TTTGAT -1.48651833301
+ATCAAC GTTGAT -0.177019712196
+ATCAAG CTTGAT 0.993564451911
+ATCAAT ATTGAT -1.73845783884
+ATCACA TGTGAT -0.7187838349
+ATCACC GGTGAT -0.522216302483
+ATCACG CGTGAT -0.717922773896
+ATCAGA TCTGAT -0.432098796815
+ATCAGC GCTGAT -0.177781089389
+ATCAGG CCTGAT -0.185638177915
+ATCATA TATGAT 0.532155893918
+ATCATC GATGAT -0.124663780341
+ATCATG CATGAT -0.00689408482313
+ATCCAA TTGGAT 0.860993420444
+ATCCAC GTGGAT -0.0725321924413
+ATCCAG CTGGAT -0.27103003808
+ATCCAT ATGGAT -1.06148966182
+ATCCCA TGGGAT 0.61836822508
+ATCCCC GGGGAT 0.953992982003
+ATCCCG CGGGAT -0.399268842253
+ATCCGA TCGGAT 0.30018596428
+ATCCGC GCGGAT 0.267936181036
+ATCCGG CCGGAT -0.621204560642
+ATCCTA TAGGAT -0.243979632628
+ATCCTC GAGGAT 0.744817578624
+ATCCTG CAGGAT -0.504079419831
+ATCGAA TTCGAT -0.127059008672
+ATCGAC GTCGAT -0.0155499225919
+ATCGAG CTCGAT -0.547153990807
+ATCGAT ATCGAT -0.243041889295
+ATCGCA TGCGAT 0.617626698462
+ATCGCC GGCGAT -0.549200964359
+ATCGCG CGCGAT -0.319928310366
+ATCGGA TCCGAT -0.0213478576162
+ATCGGC GCCGAT 0.01012445378
+ATCGGG CCCGAT -0.16502415031
+ATCGTA TACGAT 0.209770307092
+ATCGTC GACGAT 0.446879950629
+ATCGTG CACGAT -0.0107015895665
+ATCTAA TTAGAT 0.275281219402
+ATCTAC GTAGAT -0.505485858388
+ATCTAG CTAGAT -1.40045656324
+ATCTCA TGAGAT -0.796214522768
+ATCTCC GGAGAT 0.357919780998
+ATCTCG CGAGAT 0.330326321378
+ATCTGA TCAGAT 0.54409789464
+ATCTGC GCAGAT -1.11848953262
+ATCTGG CCAGAT -1.74540660353
+ATCTTA TAAGAT 0.653446604954
+ATCTTC GAAGAT -0.402855143504
+ATCTTG CAAGAT 1.12868592482
+ATGAAA TTTCAT -0.931491848926
+ATGAAC GTTCAT -0.668998480935
+ATGAAG CTTCAT -0.234340414147
+ATGAAT ATTCAT 1.32890513335
+ATGACA TGTCAT -0.407101486566
+ATGACC GGTCAT -0.10517111194
+ATGACG CGTCAT -1.81349916863
+ATGAGA TCTCAT -0.143474073688
+ATGAGC GCTCAT -0.422453674778
+ATGAGG CCTCAT 0.24497065201
+ATGATA TATCAT -0.457971919127
+ATGATC GATCAT -0.956283043587
+ATGATG CATCAT 0.280192974098
+ATGCAA TTGCAT 0.0243701389727
+ATGCAC GTGCAT -0.19959196082
+ATGCAG CTGCAT 0.349978081089
+ATGCAT ATGCAT 0.13057511889
+ATGCCA TGGCAT 0.381740306749
+ATGCCC GGGCAT -0.0217736885435
+ATGCCG CGGCAT 0.62665478233
+ATGCGA TCGCAT 0.335703166837
+ATGCGC GCGCAT 0.783616840941
+ATGCGG CCGCAT 0.0472938578306
+ATGCTA TAGCAT -0.0735655576493
+ATGCTC GAGCAT -0.840560139265
+ATGCTG CAGCAT 0.15741810487
+ATGGAA TTCCAT -0.775848717153
+ATGGAC GTCCAT 0.41782311792
+ATGGAG CTCCAT -0.163619300573
+ATGGCA TGCCAT -0.129694663435
+ATGGCC GGCCAT -0.416279326696
+ATGGCG CGCCAT -0.445757591838
+ATGGGA TCCCAT -0.128926465653
+ATGGGC GCCCAT 0.771709467651
+ATGGGG CCCCAT -0.0745872493387
+ATGGTA TACCAT 0.246690854305
+ATGGTC GACCAT 0.0673503554008
+ATGGTG CACCAT -0.253032937261
+ATGTAA TTACAT -0.995516665438
+ATGTAC GTACAT -0.525286748875
+ATGTAG CTACAT 0.68550253572
+ATGTCA TGACAT -2.11241919747
+ATGTCC GGACAT 0.845134601462
+ATGTCG CGACAT 0.516595304184
+ATGTGA TCACAT 0.141325506081
+ATGTGC GCACAT 0.0828091804383
+ATGTGG CCACAT 0.947400656873
+ATGTTA TAACAT -0.947214214102
+ATGTTC GAACAT 0.431810543746
+ATGTTG CAACAT -0.125258264151
+ATTAAA TTTAAT -1.31975538764
+ATTAAC GTTAAT -1.41512566175
+ATTAAG CTTAAT -0.199537767266
+ATTAAT ATTAAT -0.436068858224
+ATTACA TGTAAT -0.201133348119
+ATTACC GGTAAT 0.172300094885
+ATTACG CGTAAT -0.417719262734
+ATTAGA TCTAAT 0.93500523245
+ATTAGC GCTAAT -2.10811581856
+ATTAGG CCTAAT -0.467480191357
+ATTATA TATAAT 0.363629728768
+ATTATC GATAAT -0.861960495475
+ATTATG CATAAT 0.562723819118
+ATTCAA TTGAAT 0.0200498460273
+ATTCAC GTGAAT 0.418891749324
+ATTCAG CTGAAT -0.230767178642
+ATTCCA TGGAAT 1.05154227319
+ATTCCC GGGAAT -0.13155797816
+ATTCCG CGGAAT 0.0812893553365
+ATTCGA TCGAAT -0.0838650208544
+ATTCGC GCGAAT 0.222296881979
+ATTCGG CCGAAT -0.139283061422
+ATTCTA TAGAAT -0.224824743806
+ATTCTC GAGAAT -0.557117996579
+ATTCTG CAGAAT 1.0695468056
+ATTGAA TTCAAT -0.463986709186
+ATTGAC GTCAAT -1.69772334912
+ATTGAG CTCAAT -0.117012580083
+ATTGCA TGCAAT 0.257329206735
+ATTGCC GGCAAT 0.0584571835771
+ATTGCG CGCAAT 0.406025923639
+ATTGGA TCCAAT -0.62851834273
+ATTGGC GCCAAT 0.079438863957
+ATTGGG CCCAAT 0.913438482245
+ATTGTA TACAAT 0.450423686846
+ATTGTC GACAAT -1.01726290552
+ATTGTG CACAAT 0.60669560725
+ATTTAA TTAAAT 0.253196011315
+ATTTAC GTAAAT -2.97235568716
+ATTTAG CTAAAT -0.292154315684
+ATTTCA TGAAAT 0.636662944131
+ATTTCC GGAAAT -0.6231521332
+ATTTCG CGAAAT 0.503689389655
+ATTTGA TCAAAT -0.338609456935
+ATTTGC GCAAAT -2.07758131186
+ATTTGG CCAAAT 0.333462478492
+ATTTTA TAAAAT -0.556525692846
+ATTTTC GAAAAT -0.840026641213
+ATTTTG CAAAAT 1.35270004621
+CAAAAA TTTTTG 1.47166885108
+CAAAAC GTTTTG 0.109633599033
+CAAAAG CTTTTG -0.10115908148
+CAAACA TGTTTG -3.89334182039
+CAAACC GGTTTG 0.963184089308
+CAAACG CGTTTG -0.0395814581096
+CAAAGA TCTTTG -0.0118964430112
+CAAAGC GCTTTG 0.224207599526
+CAAAGG CCTTTG -0.175494095276
+CAAATA TATTTG -3.52207271503
+CAAATC GATTTG 0.660053835664
+CAAATG CATTTG -0.727424259718
+CAACAA TTGTTG -1.19656276713
+CAACAC GTGTTG 0.660747434725
+CAACAG CTGTTG -0.857485267387
+CAACCA TGGTTG 0.404850702022
+CAACCC GGGTTG 0.503951138249
+CAACCG CGGTTG -0.644416232942
+CAACGA TCGTTG -0.224419054868
+CAACGC GCGTTG -0.508914854709
+CAACGG CCGTTG -0.200076485177
+CAACTA TAGTTG 0.0981933677308
+CAACTC GAGTTG 0.150972486588
+CAACTG CAGTTG -0.921035707419
+CAAGAA TTCTTG 0.922374683971
+CAAGAC GTCTTG 0.423676757958
+CAAGAG CTCTTG -0.187883399321
+CAAGCA TGCTTG 0.636543530499
+CAAGCC GGCTTG 0.00071745286602
+CAAGCG CGCTTG -0.254180305152
+CAAGGA TCCTTG 0.821021319418
+CAAGGC GCCTTG 0.0604176049684
+CAAGGG CCCTTG 0.226570746088
+CAAGTA TACTTG 0.216374948595
+CAAGTC GACTTG 0.729235603486
+CAAGTG CACTTG -0.56315324029
+CAATAA TTATTG -0.549020618619
+CAATAC GTATTG -0.208803364217
+CAATAG CTATTG -0.0777355282063
+CAATCA TGATTG -0.263891707036
+CAATCC GGATTG 0.700155593708
+CAATCG CGATTG 0.357897326676
+CAATGA TCATTG 0.638987959753
+CAATGC GCATTG -0.191968495388
+CAATGG CCATTG 0.665776651287
+CAATTA TAATTG -0.271883139449
+CAATTC GAATTG 0.0983322987509
+CAATTG CAATTG -0.158614672809
+CACAAA TTTGTG 0.598792191689
+CACAAC GTTGTG 0.340518077962
+CACAAG CTTGTG 0.510003745313
+CACACA TGTGTG -0.400928902962
+CACACC GGTGTG -1.11996656805
+CACACG CGTGTG -0.718947444703
+CACAGA TCTGTG -0.100922777882
+CACAGC GCTGTG -0.50134148629
+CACAGG CCTGTG 0.252298561921
+CACATA TATGTG -0.110647463006
+CACATC GATGTG -0.651447413877
+CACATG CATGTG 0.00381644641394
+CACCAA TTGGTG 0.70840160186
+CACCAC GTGGTG 0.582686488096
+CACCAG CTGGTG -0.793184758924
+CACCCA TGGGTG -0.0284926248858
+CACCCC GGGGTG 0.893138419231
+CACCCG CGGGTG 0.417490759867
+CACCGA TCGGTG -0.0699188856569
+CACCGC GCGGTG 0.522058993046
+CACCGG CCGGTG 0.0180860340686
+CACCTA TAGGTG -0.592893592202
+CACCTC GAGGTG 0.676528270972
+CACCTG CAGGTG -5.27584299853
+CACGAA TTCGTG -0.382612993115
+CACGAC GTCGTG 0.032371814243
+CACGAG CTCGTG 0.254629628132
+CACGCA TGCGTG 0.117247261513
+CACGCC GGCGTG 0.00637478726345
+CACGCG CGCGTG -0.0917099552667
+CACGGA TCCGTG 0.368340586739
+CACGGC GCCGTG 1.30441576203
+CACGGG CCCGTG -0.264381491117
+CACGTA TACGTG -1.02682296878
+CACGTC GACGTG -0.219968085956
+CACGTG CACGTG 1.22865752463
+CACTAA TTAGTG 0.253977430376
+CACTAC GTAGTG -0.301750384238
+CACTAG CTAGTG -0.669946880366
+CACTCA TGAGTG -0.734281923173
+CACTCC GGAGTG -0.204493843897
+CACTCG CGAGTG -0.172095955086
+CACTGA TCAGTG -0.318554612147
+CACTGC GCAGTG 0.452355062655
+CACTGG CCAGTG 0.0113022350057
+CACTTA TAAGTG 0.159408091311
+CACTTC GAAGTG 0.111535665099
+CAGAAA TTTCTG 0.013661920057
+CAGAAC GTTCTG -0.34634558884
+CAGAAG CTTCTG -0.268726628204
+CAGACA TGTCTG -0.0884146723771
+CAGACC GGTCTG 0.848596588006
+CAGACG CGTCTG -0.176588448217
+CAGAGA TCTCTG 0.213650101773
+CAGAGC GCTCTG 0.72706788894
+CAGAGG CCTCTG 0.222483758838
+CAGATA TATCTG 0.134724484734
+CAGATC GATCTG 0.135060768551
+CAGATG CATCTG -3.82365153576
+CAGCAA TTGCTG 0.216246738966
+CAGCAC GTGCTG 0.134612451799
+CAGCAG CTGCTG -0.782809242533
+CAGCCA TGGCTG 0.25352023069
+CAGCCC GGGCTG 0.892449121017
+CAGCCG CGGCTG -1.34095163643
+CAGCGA TCGCTG 0.368898676461
+CAGCGC GCGCTG 0.40336227877
+CAGCGG CCGCTG -0.041543776559
+CAGCTA TAGCTG 0.347266756135
+CAGCTC GAGCTG 1.48884499806
+CAGCTG CAGCTG -3.03276014117
+CAGGAA TTCCTG 0.0661704451321
+CAGGAC GTCCTG 0.86939329333
+CAGGAG CTCCTG -0.480221178798
+CAGGCA TGCCTG 0.506805708618
+CAGGCC GGCCTG 0.756166029763
+CAGGCG CGCCTG 0.233566907124
+CAGGGA TCCCTG 0.970287665608
+CAGGGC GCCCTG -0.32000823828
+CAGGGG CCCCTG -0.0409673970414
+CAGGTA TACCTG -1.16422982988
+CAGGTC GACCTG 0.991958843425
+CAGTAA TTACTG -0.456065885023
+CAGTAC GTACTG 0.866479551705
+CAGTAG CTACTG 0.0492891641913
+CAGTCA TGACTG -0.864906645492
+CAGTCC GGACTG 0.436595236134
+CAGTCG CGACTG -0.0778031521909
+CAGTGA TCACTG 0.095401665363
+CAGTGC GCACTG 0.809574408241
+CAGTGG CCACTG 0.130805602072
+CAGTTA TAACTG 0.102237048367
+CAGTTC GAACTG -0.329116232229
+CATAAA TTTATG 1.09056149453
+CATAAC GTTATG -0.240307894957
+CATAAG CTTATG -0.274744434381
+CATACA TGTATG -0.316826193863
+CATACC GGTATG 0.907504112042
+CATACG CGTATG -0.646410594912
+CATAGA TCTATG 0.0743941919121
+CATAGC GCTATG 0.604402059297
+CATAGG CCTATG 0.132614581735
+CATATA TATATG 0.102595570806
+CATATC GATATG -0.685577322551
+CATATG CATATG -1.85636550314
+CATCAA TTGATG 0.528225242508
+CATCAC GTGATG -0.779076990822
+CATCAG CTGATG 0.0792779772782
+CATCCA TGGATG 0.380324367299
+CATCCC GGGATG 0.476823595932
+CATCCG CGGATG -0.735460908783
+CATCGA TCGATG -0.364712952887
+CATCGC GCGATG -0.445769472916
+CATCGG CCGATG -0.213744964599
+CATCTA TAGATG 0.591942642834
+CATCTC GAGATG 0.175954704824
+CATGAA TTCATG -0.0345496547785
+CATGAC GTCATG 0.128585442856
+CATGAG CTCATG 0.093844395721
+CATGCA TGCATG 0.500579541195
+CATGCC GGCATG 0.534363670334
+CATGCG CGCATG -0.07767446656
+CATGGA TCCATG -0.915298913786
+CATGGC GCCATG 0.496643001128
+CATGGG CCCATG 0.412095419236
+CATGTA TACATG 0.170530452886
+CATGTC GACATG 0.409305171927
+CATTAA TTAATG -1.21674231256
+CATTAC GTAATG 0.0363120647791
+CATTAG CTAATG 0.113980713568
+CATTCA TGAATG 0.211633133094
+CATTCC GGAATG 1.3667834662
+CATTCG CGAATG -0.306612061174
+CATTGA TCAATG 0.261275457012
+CATTGC GCAATG 0.422488072429
+CATTGG CCAATG -0.0683147570932
+CATTTA TAAATG -0.674511635662
+CATTTC GAAATG 0.149816454289
+CCAAAA TTTTGG 0.722553040783
+CCAAAC GTTTGG 0.136597481362
+CCAAAG CTTTGG 0.335940148998
+CCAACA TGTTGG 0.242613946152
+CCAACC GGTTGG 0.193037046526
+CCAACG CGTTGG -0.464564126497
+CCAAGA TCTTGG 0.449792823487
+CCAAGC GCTTGG 0.0105858416551
+CCAAGG CCTTGG 0.750008716703
+CCAATA TATTGG 0.380191080614
+CCAATC GATTGG -0.291650702172
+CCACAA TTGTGG 2.48928712228
+CCACAC GTGTGG 0.042494328422
+CCACAG CTGTGG 1.98559943258
+CCACCA TGGTGG 0.139131185109
+CCACCC GGGTGG 0.411865066012
+CCACCG CGGTGG 0.214345177527
+CCACGA TCGTGG 0.329121460372
+CCACGC GCGTGG 0.853862453394
+CCACGG CCGTGG 0.958047076313
+CCACTA TAGTGG 0.990324695059
+CCACTC GAGTGG -0.865871154042
+CCAGAA TTCTGG 1.35087980364
+CCAGAC GTCTGG 0.330013500547
+CCAGAG CTCTGG 0.850843625887
+CCAGCA TGCTGG -0.105656155248
+CCAGCC GGCTGG 0.10376516441
+CCAGCG CGCTGG -0.151192099436
+CCAGGA TCCTGG -0.00188020552734
+CCAGGC GCCTGG 0.477368129975
+CCAGGG CCCTGG 0.330067567163
+CCAGTA TACTGG 0.201689404211
+CCAGTC GACTGG 0.172972219351
+CCATAA TTATGG 0.864471006772
+CCATAC GTATGG -0.053674529194
+CCATAG CTATGG -0.628472779515
+CCATCA TGATGG 0.316290664784
+CCATCC GGATGG 0.541183344469
+CCATCG CGATGG -0.585435010409
+CCATGA TCATGG 0.842999014507
+CCATGC GCATGG 0.885922909323
+CCATGG CCATGG -1.19148404887
+CCATTA TAATGG 0.461535283786
+CCATTC GAATGG -0.448015532577
+CCCAAA TTTGGG 1.00513456604
+CCCAAC GTTGGG 0.0589630047868
+CCCAAG CTTGGG 0.612217890411
+CCCACA TGTGGG 1.49280537935
+CCCACC GGTGGG -0.540327896372
+CCCACG CGTGGG -0.189765626178
+CCCAGA TCTGGG 0.906005293679
+CCCAGC GCTGGG 0.00258121804065
+CCCAGG CCTGGG 0.385280928829
+CCCATA TATGGG 0.580077252911
+CCCATC GATGGG 0.105886687275
+CCCCAA TTGGGG 0.64902551958
+CCCCAC GTGGGG 0.177279574499
+CCCCAG CTGGGG 1.0367549673
+CCCCCA TGGGGG -0.220124853692
+CCCCCC GGGGGG -0.180081108721
+CCCCCG CGGGGG 0.373246680376
+CCCCGA TCGGGG 0.635830201709
+CCCCGC GCGGGG 0.000651576611126
+CCCCGG CCGGGG 0.854947986857
+CCCCTA TAGGGG 0.325441244643
+CCCCTC GAGGGG 0.483696397734
+CCCGAA TTCGGG 0.714032030274
+CCCGAC GTCGGG -0.208075368152
+CCCGAG CTCGGG 0.440636877923
+CCCGCA TGCGGG 0.402712812803
+CCCGCC GGCGGG 0.476459210594
+CCCGCG CGCGGG 0.023503459093
+CCCGGA TCCGGG -0.538869263981
+CCCGGC GCCGGG 0.346443397541
+CCCGGG CCCGGG 0.548834083398
+CCCGTA TACGGG 0.139729969322
+CCCGTC GACGGG -0.214648757928
+CCCTAA TTAGGG -0.290368330245
+CCCTAC GTAGGG 0.144663483304
+CCCTAG CTAGGG 0.0802948074314
+CCCTCA TGAGGG 0.408278749762
+CCCTCC GGAGGG -0.517676283937
+CCCTCG CGAGGG 0.0815315059399
+CCCTGA TCAGGG 0.0384525153021
+CCCTGC GCAGGG 0.376976503211
+CCCTTA TAAGGG -0.343987739741
+CCCTTC GAAGGG 0.152780345376
+CCGAAA TTTCGG 0.701489171099
+CCGAAC GTTCGG 0.371848478306
+CCGAAG CTTCGG 0.108859525197
+CCGACA TGTCGG -0.0475200223019
+CCGACC GGTCGG 0.190388885966
+CCGACG CGTCGG -0.43295891756
+CCGAGA TCTCGG -0.851488238354
+CCGAGC GCTCGG 0.402946122489
+CCGAGG CCTCGG 0.447649638689
+CCGATA TATCGG -0.583698466004
+CCGATC GATCGG 0.214495804491
+CCGCAA TTGCGG 0.704274885421
+CCGCAC GTGCGG -0.124930120604
+CCGCAG CTGCGG 0.796608954246
+CCGCCA TGGCGG 0.0933037773855
+CCGCCC GGGCGG -0.761788986379
+CCGCCG CGGCGG 0.118893616093
+CCGCGA TCGCGG -0.214050849808
+CCGCGC GCGCGG 0.427088586284
+CCGCGG CCGCGG -0.163354064297
+CCGCTA TAGCGG 0.075663139979
+CCGCTC GAGCGG -0.0840465139463
+CCGGAA TTCCGG -0.815717995392
+CCGGAC GTCCGG -0.0199681503018
+CCGGAG CTCCGG -0.142607842875
+CCGGCA TGCCGG -0.211088707568
+CCGGCC GGCCGG 0.20978106433
+CCGGCG CGCCGG 0.186483274301
+CCGGGA TCCCGG -0.120892474243
+CCGGGC GCCCGG 0.286207705144
+CCGGTA TACCGG -0.364061582654
+CCGGTC GACCGG 0.537401184601
+CCGTAA TTACGG -0.368122547637
+CCGTAC GTACGG 0.388891995455
+CCGTAG CTACGG -0.14921907099
+CCGTCA TGACGG -0.731374154141
+CCGTCC GGACGG 0.142640525033
+CCGTCG CGACGG -0.219815741986
+CCGTGA TCACGG 0.351505600383
+CCGTGC GCACGG 0.0466167726132
+CCGTTA TAACGG -0.177534573202
+CCGTTC GAACGG 0.930924136276
+CCTAAA TTTAGG -0.437162767665
+CCTAAC GTTAGG 0.715494549297
+CCTAAG CTTAGG 0.834398157528
+CCTACA TGTAGG 0.352513677162
+CCTACC GGTAGG -0.188597324766
+CCTACG CGTAGG -0.527855146408
+CCTAGA TCTAGG -0.358834707149
+CCTAGC GCTAGG -0.48327631422
+CCTAGG CCTAGG 0.712974479815
+CCTATA TATAGG -0.00662770590644
+CCTATC GATAGG 0.321915138696
+CCTCAA TTGAGG 0.478167716035
+CCTCAC GTGAGG -0.603310279315
+CCTCAG CTGAGG 1.18451331899
+CCTCCA TGGAGG -0.156185760342
+CCTCCC GGGAGG 0.196588926242
+CCTCCG CGGAGG 0.34975369542
+CCTCGA TCGAGG -0.12704382375
+CCTCGC GCGAGG -0.690558364248
+CCTCTA TAGAGG 0.666120847354
+CCTCTC GAGAGG 0.326083338601
+CCTGAA TTCAGG 0.0108115210728
+CCTGAC GTCAGG 0.0764004186754
+CCTGAG CTCAGG 0.140494023385
+CCTGCA TGCAGG -0.350864542641
+CCTGCC GGCAGG -0.156177205702
+CCTGCG CGCAGG 0.202361500118
+CCTGGA TCCAGG -0.165955838374
+CCTGGC GCCAGG 0.0156098804816
+CCTGTA TACAGG -0.631427205006
+CCTGTC GACAGG -0.29284206294
+CCTTAA TTAAGG 0.313884227175
+CCTTAC GTAAGG 0.666799661556
+CCTTAG CTAAGG 0.577269508525
+CCTTCA TGAAGG -0.228064410555
+CCTTCC GGAAGG 0.127618019043
+CCTTCG CGAAGG 0.233061732027
+CCTTGA TCAAGG 0.697814875901
+CCTTGC GCAAGG 0.163330658666
+CCTTTA TAAAGG -1.10202858977
+CCTTTC GAAAGG -0.15641139741
+CGAAAA TTTTCG 0.570946981635
+CGAAAC GTTTCG 0.71891048699
+CGAAAG CTTTCG 0.00936335629174
+CGAACA TGTTCG -0.544911197699
+CGAACC GGTTCG 0.236185790077
+CGAACG CGTTCG 0.0439294539998
+CGAAGA TCTTCG 0.0635388313512
+CGAAGC GCTTCG -0.312490378281
+CGAATA TATTCG 0.205254003141
+CGAATC GATTCG 0.0901499283283
+CGACAA TTGTCG 0.238050713336
+CGACAC GTGTCG -0.109116574917
+CGACAG CTGTCG -0.769009564377
+CGACCA TGGTCG 0.0360443347963
+CGACCC GGGTCG 0.216209586146
+CGACCG CGGTCG -0.00608417190027
+CGACGA TCGTCG 0.109094760764
+CGACGC GCGTCG -0.0821660131027
+CGACTA TAGTCG -0.154626936131
+CGACTC GAGTCG -0.246273419905
+CGAGAA TTCTCG 0.612418964386
+CGAGAC GTCTCG -0.0448240636466
+CGAGAG CTCTCG -0.751517796087
+CGAGCA TGCTCG -0.330608606967
+CGAGCC GGCTCG 0.693245611149
+CGAGCG CGCTCG 0.278683035903
+CGAGGA TCCTCG -0.187835599873
+CGAGGC GCCTCG -0.952734572307
+CGAGTA TACTCG 0.184943150492
+CGAGTC GACTCG 0.375230854406
+CGATAA TTATCG -0.393190761045
+CGATAC GTATCG 0.0742054445512
+CGATAG CTATCG -0.519678403883
+CGATCA TGATCG 0.0526959654511
+CGATCC GGATCG 0.0664840380862
+CGATCG CGATCG 0.15660945983
+CGATGA TCATCG -0.774933636073
+CGATGC GCATCG -0.391779119657
+CGATTA TAATCG -0.0474517133734
+CGATTC GAATCG 0.235116843424
+CGCAAA TTTGCG -0.0809989053442
+CGCAAC GTTGCG 0.914321615152
+CGCAAG CTTGCG 0.902729943974
+CGCACA TGTGCG -0.643522000175
+CGCACC GGTGCG -0.127073746796
+CGCACG CGTGCG -0.771541292094
+CGCAGA TCTGCG 0.165580560798
+CGCAGC GCTGCG 0.0469455940157
+CGCATA TATGCG 0.335290623274
+CGCATC GATGCG 0.437930163851
+CGCCAA TTGGCG -0.109846650695
+CGCCAC GTGGCG -0.994485238576
+CGCCAG CTGGCG -0.142749553444
+CGCCCA TGGGCG 1.05328824356
+CGCCCC GGGGCG 0.0137809042937
+CGCCCG CGGGCG -0.583692912482
+CGCCGA TCGGCG 0.403802926462
+CGCCGC GCGGCG 0.0867379096548
+CGCCTA TAGGCG -0.0424714321252
+CGCCTC GAGGCG -0.196079793222
+CGCGAA TTCGCG 0.198826174488
+CGCGAC GTCGCG 0.0519079932143
+CGCGAG CTCGCG -0.0095714045129
+CGCGCA TGCGCG 0.29733863628
+CGCGCC GGCGCG 0.00534076408975
+CGCGCG CGCGCG 0.133525541683
+CGCGGA TCCGCG 0.0295590736422
+CGCGGC GCCGCG 0.0211820057043
+CGCGTA TACGCG -0.00215667699398
+CGCGTC GACGCG 0.177954856123
+CGCTAA TTAGCG -0.241135973823
+CGCTAC GTAGCG 0.239902173491
+CGCTAG CTAGCG 0.0596735417366
+CGCTCA TGAGCG -0.0533438512221
+CGCTCC GGAGCG 0.0208450744523
+CGCTGA TCAGCG 0.68121260517
+CGCTGC GCAGCG 0.319030638925
+CGCTTA TAAGCG 0.331742647103
+CGCTTC GAAGCG -0.220266241715
+CGGAAA TTTCCG -1.28580550145
+CGGAAC GTTCCG 0.926154437465
+CGGAAG CTTCCG -0.532165941151
+CGGACA TGTCCG 0.26801379322
+CGGACC GGTCCG 0.208564671836
+CGGACG CGTCCG 0.185658917881
+CGGAGA TCTCCG -0.0783602636003
+CGGAGC GCTCCG 0.213447050966
+CGGATA TATCCG 0.180933559441
+CGGATC GATCCG 0.0115933607686
+CGGCAA TTGCCG -0.141333522478
+CGGCAC GTGCCG -0.0989810302207
+CGGCAG CTGCCG -0.315504532197
+CGGCCA TGGCCG 0.520189196688
+CGGCCC GGGCCG 0.0770477004768
+CGGCCG CGGCCG -0.0100293295169
+CGGCGA TCGCCG 0.335335269266
+CGGCGC GCGCCG 0.142338129608
+CGGCTA TAGCCG 0.483570773894
+CGGCTC GAGCCG 0.386036576617
+CGGGAA TTCCCG -0.464940358426
+CGGGAC GTCCCG 0.447833530318
+CGGGAG CTCCCG 0.0731420049918
+CGGGCA TGCCCG 0.282329634235
+CGGGCC GGCCCG 0.812549782764
+CGGGGA TCCCCG 0.869420112553
+CGGGGC GCCCCG 0.86056528281
+CGGGTA TACCCG -0.203275385127
+CGGGTC GACCCG -0.444672989957
+CGGTAA TTACCG -0.649508330752
+CGGTAC GTACCG -0.117712116536
+CGGTAG CTACCG 0.281181384004
+CGGTCA TGACCG 1.13661174882
+CGGTCC GGACCG 0.0903119623658
+CGGTGA TCACCG -0.00769608361013
+CGGTGC GCACCG -0.800050077623
+CGGTTA TAACCG -0.240948411238
+CGGTTC GAACCG 0.347248429131
+CGTAAA TTTACG 0.413018459532
+CGTAAC GTTACG 0.133628748349
+CGTAAG CTTACG -0.181667545798
+CGTACA TGTACG 0.487388635308
+CGTACC GGTACG -0.0604415421058
+CGTACG CGTACG -0.23751294459
+CGTAGA TCTACG -0.125539098097
+CGTAGC GCTACG 0.386530915361
+CGTATA TATACG -0.194131124774
+CGTATC GATACG -0.267160757641
+CGTCAA TTGACG -1.00863955903
+CGTCAC GTGACG -0.521674552853
+CGTCAG CTGACG -1.76038907438
+CGTCCA TGGACG 0.0777766757824
+CGTCCC GGGACG 0.803357191448
+CGTCGA TCGACG 0.0477564437913
+CGTCGC GCGACG 0.103336677444
+CGTCTA TAGACG -0.60967168838
+CGTCTC GAGACG -0.791340367
+CGTGAA TTCACG 0.634022023138
+CGTGAC GTCACG 0.443531335362
+CGTGAG CTCACG -0.289769453913
+CGTGCA TGCACG 0.109747972914
+CGTGCC GGCACG 0.464253873914
+CGTGGA TCCACG 0.275127549406
+CGTGGC GCCACG 0.531223592152
+CGTGTA TACACG 0.642083015658
+CGTGTC GACACG 0.136541839783
+CGTTAA TTAACG -0.497209435042
+CGTTAC GTAACG -0.0943435759699
+CGTTAG CTAACG -0.242661499343
+CGTTCA TGAACG 0.504828978427
+CGTTCC GGAACG 0.656549347858
+CGTTGA TCAACG 0.482938477348
+CGTTGC GCAACG -0.32248741119
+CGTTTA TAAACG -0.413518821035
+CGTTTC GAAACG -0.819946564926
+CTAAAA TTTTAG -0.474998981835
+CTAAAC GTTTAG -0.0266242942178
+CTAAAG CTTTAG 0.297713951577
+CTAACA TGTTAG -0.621545633382
+CTAACC GGTTAG 0.63074726461
+CTAAGA TCTTAG 0.597123137729
+CTAAGC GCTTAG 0.459779530522
+CTAATA TATTAG 0.051814051491
+CTAATC GATTAG -0.192871916975
+CTACAA TTGTAG 0.415390381047
+CTACAC GTGTAG 0.314508278002
+CTACAG CTGTAG -0.112671340464
+CTACCA TGGTAG -0.264705309307
+CTACCC GGGTAG 0.552961906046
+CTACGA TCGTAG 0.489104112382
+CTACGC GCGTAG -0.684139469516
+CTACTA TAGTAG -0.106050671268
+CTACTC GAGTAG -0.218538947551
+CTAGAA TTCTAG 0.76196340125
+CTAGAC GTCTAG -0.314054292133
+CTAGAG CTCTAG 0.170762667208
+CTAGCA TGCTAG -0.201084106705
+CTAGCC GGCTAG 0.657253973787
+CTAGGA TCCTAG -0.343732437303
+CTAGGC GCCTAG -0.399983888794
+CTAGTA TACTAG 0.279272209924
+CTAGTC GACTAG 0.63773352209
+CTATAA TTATAG -0.540591918194
+CTATAC GTATAG 0.0316033486072
+CTATAG CTATAG -0.110441744258
+CTATCA TGATAG -0.986583008494
+CTATCC GGATAG 0.0938549592711
+CTATGA TCATAG 1.19060286606
+CTATGC GCATAG 0.77976599654
+CTATTA TAATAG 0.494683632528
+CTATTC GAATAG -0.0659811786152
+CTCAAA TTTGAG -0.435334739428
+CTCAAC GTTGAG 0.673295862992
+CTCAAG CTTGAG 0.814663961939
+CTCACA TGTGAG -0.125711909834
+CTCACC GGTGAG -0.0607596963595
+CTCAGA TCTGAG 0.550689436444
+CTCAGC GCTGAG -0.272692249315
+CTCATA TATGAG 0.669202456635
+CTCATC GATGAG -0.214653088571
+CTCCAA TTGGAG -0.404875066778
+CTCCAC GTGGAG 0.544704103667
+CTCCAG CTGGAG 0.300968023705
+CTCCCA TGGGAG -0.166515521703
+CTCCCC GGGGAG -0.0107883337468
+CTCCGA TCGGAG 0.475523145763
+CTCCGC GCGGAG -0.234881416584
+CTCCTA TAGGAG -0.440390954167
+CTCCTC GAGGAG 0.252207292196
+CTCGAA TTCGAG 0.458096440119
+CTCGAC GTCGAG -0.648461079866
+CTCGAG CTCGAG 0.238421232408
+CTCGCA TGCGAG -0.770225157474
+CTCGCC GGCGAG 0.367453385031
+CTCGGA TCCGAG 0.339650543736
+CTCGGC GCCGAG 0.421157785256
+CTCGTA TACGAG 0.362284718227
+CTCGTC GACGAG -0.209973906057
+CTCTAA TTAGAG -0.585120608238
+CTCTAC GTAGAG -0.0381226169544
+CTCTCA TGAGAG 0.412400836814
+CTCTCC GGAGAG -0.500495460815
+CTCTGA TCAGAG -0.0509816398658
+CTCTGC GCAGAG -0.137150029459
+CTCTTA TAAGAG -0.0995119601555
+CTCTTC GAAGAG 0.0293315576627
+CTGAAA TTTCAG 0.637905924395
+CTGAAC GTTCAG 0.0801852652274
+CTGAAG CTTCAG 1.29884632524
+CTGACA TGTCAG -1.45442440761
+CTGACC GGTCAG 1.08710062922
+CTGAGA TCTCAG -0.182920626709
+CTGAGC GCTCAG -0.261916169948
+CTGATA TATCAG -1.5286762778
+CTGATC GATCAG -0.261819591815
+CTGCAA TTGCAG -0.367828379023
+CTGCAC GTGCAG -0.0851053710219
+CTGCAG CTGCAG 0.017373979076
+CTGCCA TGGCAG -0.328262173072
+CTGCCC GGGCAG 0.396136640158
+CTGCGA TCGCAG -0.236577233975
+CTGCGC GCGCAG -0.0340134961712
+CTGCTA TAGCAG -0.90401510954
+CTGCTC GAGCAG -0.482167659855
+CTGGAA TTCCAG 0.480592453838
+CTGGAC GTCCAG 0.124783447514
+CTGGCA TGCCAG -0.744896236415
+CTGGCC GGCCAG 0.508550304958
+CTGGGA TCCCAG 0.5595983837
+CTGGGC GCCCAG -0.420503142996
+CTGGTA TACCAG 0.351352305298
+CTGGTC GACCAG -0.333524602904
+CTGTAA TTACAG -0.344930581819
+CTGTAC GTACAG 0.422072688223
+CTGTCA TGACAG -1.32137018932
+CTGTCC GGACAG -0.0204497524117
+CTGTGA TCACAG -0.318233094975
+CTGTGC GCACAG -0.452689661808
+CTGTTA TAACAG -0.682203584522
+CTGTTC GAACAG -0.032586371998
+CTTAAA TTTAAG 0.522514091698
+CTTAAC GTTAAG -1.0306648626
+CTTAAG CTTAAG 1.38874859741
+CTTACA TGTAAG 0.301805562881
+CTTACC GGTAAG 0.0844070155044
+CTTAGA TCTAAG 0.170474445016
+CTTAGC GCTAAG 0.138185199667
+CTTATA TATAAG 0.454300274475
+CTTATC GATAAG -1.35801301952
+CTTCAA TTGAAG 0.19993761629
+CTTCAC GTGAAG 0.108019941224
+CTTCCA TGGAAG 0.362648019349
+CTTCCC GGGAAG 0.131998098522
+CTTCGA TCGAAG -0.105193206331
+CTTCGC GCGAAG -0.390704103225
+CTTCTA TAGAAG -0.155899121808
+CTTCTC GAGAAG -0.0739598398458
+CTTGAA TTCAAG -0.0719916676968
+CTTGAC GTCAAG 0.068351491377
+CTTGCA TGCAAG 0.463268254042
+CTTGCC GGCAAG -0.766831412922
+CTTGGA TCCAAG 0.314061693873
+CTTGGC GCCAAG 0.405573462214
+CTTGTA TACAAG 0.65851026313
+CTTGTC GACAAG 0.421432544956
+CTTTAA TTAAAG 0.0423904357395
+CTTTAC GTAAAG 0.655866345313
+CTTTCA TGAAAG -0.707971281847
+CTTTCC GGAAAG -0.305665628725
+CTTTGA TCAAAG -0.390594122457
+CTTTGC GCAAAG -0.253785395266
+CTTTTA TAAAAG 0.520772205443
+CTTTTC GAAAAG -0.0834230798663
+GAAAAA TTTTTC -0.394594492275
+GAAAAC GTTTTC -0.889752597605
+GAAACA TGTTTC 0.981733612947
+GAAACC GGTTTC -0.0721495314535
+GAAAGA TCTTTC -0.475164148563
+GAAAGC GCTTTC -0.0120812417699
+GAAATA TATTTC 0.128630511111
+GAAATC GATTTC -0.258506502151
+GAACAA TTGTTC 0.072282920944
+GAACAC GTGTTC 0.701467646999
+GAACCA TGGTTC -0.542881494194
+GAACCC GGGTTC 0.340658710073
+GAACGA TCGTTC -0.199593582018
+GAACGC GCGTTC 0.454300760466
+GAACTA TAGTTC -0.295720723936
+GAACTC GAGTTC 1.20394131743
+GAAGAA TTCTTC 0.506500337578
+GAAGAC GTCTTC 0.0907880643499
+GAAGCA TGCTTC -1.19156928449
+GAAGCC GGCTTC 0.397646726543
+GAAGGA TCCTTC 0.253377264863
+GAAGGC GCCTTC -0.525939145222
+GAAGTA TACTTC 0.662195710792
+GAAGTC GACTTC -0.419934443362
+GAATAA TTATTC 0.123387568539
+GAATAC GTATTC -0.383087785228
+GAATCA TGATTC 1.27939625155
+GAATCC GGATTC -0.0498963377435
+GAATGA TCATTC -0.620744593633
+GAATGC GCATTC 1.04194079095
+GAATTA TAATTC -0.149020713486
+GAATTC GAATTC 0.644297830158
+GACAAA TTTGTC -0.134936102515
+GACAAC GTTGTC -0.309021420318
+GACACA TGTGTC 1.29887852736
+GACACC GGTGTC -0.787922948829
+GACAGA TCTGTC -0.863814253112
+GACAGC GCTGTC -0.676822231763
+GACATA TATGTC 0.827736434537
+GACATC GATGTC -0.130237675939
+GACCAA TTGGTC -0.475509904338
+GACCAC GTGGTC 2.00597812344
+GACCCA TGGGTC 0.594605764824
+GACCCC GGGGTC -0.25038724278
+GACCGA TCGGTC 0.077158904808
+GACCGC GCGGTC 0.996665204047
+GACCTA TAGGTC -0.0140856102395
+GACCTC GAGGTC -0.0141400877685
+GACGAA TTCGTC -0.24855256907
+GACGAC GTCGTC -0.787091955547
+GACGCA TGCGTC -0.79119100016
+GACGCC GGCGTC -0.064943799736
+GACGGA TCCGTC 0.507946533831
+GACGGC GCCGTC 0.0996726291695
+GACGTA TACGTC -1.129683455
+GACGTC GACGTC -0.325314874102
+GACTAA TTAGTC 1.9212514778
+GACTAC GTAGTC -0.489310082414
+GACTCA TGAGTC 3.18513477299
+GACTCC GGAGTC 0.00318724773961
+GACTGA TCAGTC -0.284654331124
+GACTGC GCAGTC 0.253234563706
+GACTTA TAAGTC 0.53783328072
+GAGAAA TTTCTC -0.107943987489
+GAGAAC GTTCTC 0.763065903804
+GAGACA TGTCTC 0.0238226221677
+GAGACC GGTCTC 0.000346769354435
+GAGAGA TCTCTC -0.366063998677
+GAGAGC GCTCTC -0.0716698934044
+GAGATA TATCTC -0.479728321524
+GAGATC GATCTC 0.647198822351
+GAGCAA TTGCTC 0.278077669471
+GAGCAC GTGCTC 0.442986942774
+GAGCCA TGGCTC -0.0889483545925
+GAGCCC GGGCTC -0.310590547764
+GAGCGA TCGCTC 0.294194487089
+GAGCGC GCGCTC 0.0827346373539
+GAGCTA TAGCTC -0.730332859895
+GAGCTC GAGCTC -0.199506840782
+GAGGAA TTCCTC 0.543305692157
+GAGGAC GTCCTC -0.349616541062
+GAGGCA TGCCTC -0.105697066717
+GAGGCC GGCCTC 0.858282680338
+GAGGGA TCCCTC -0.124510451688
+GAGGGC GCCCTC 0.323767770759
+GAGGTA TACCTC -0.844332688078
+GAGTAA TTACTC 0.758120437701
+GAGTAC GTACTC 0.104306241785
+GAGTCA TGACTC 3.58563459353
+GAGTCC GGACTC 0.339341967911
+GAGTGA TCACTC 0.066165066026
+GAGTGC GCACTC -0.514233890221
+GAGTTA TAACTC 0.671276468041
+GATAAA TTTATC -1.16448423447
+GATAAC GTTATC -0.476107186466
+GATACA TGTATC -0.172248071576
+GATACC GGTATC 0.922018577146
+GATAGA TCTATC -0.678310902513
+GATAGC GCTATC -1.0739725457
+GATATA TATATC 0.323693340333
+GATATC GATATC -0.13829875248
+GATCAA TTGATC -0.263370217811
+GATCAC GTGATC -0.383180034094
+GATCCA TGGATC -0.418998551659
+GATCCC GGGATC 0.459522351525
+GATCGA TCGATC 0.174830497643
+GATCGC GCGATC -0.0493224221696
+GATCTA TAGATC -0.114539458553
+GATGAA TTCATC 0.113123798729
+GATGAC GTCATC 0.420960424606
+GATGCA TGCATC -0.740595126486
+GATGCC GGCATC -0.0146831740442
+GATGGA TCCATC -0.44789540412
+GATGGC GCCATC -0.976518189565
+GATGTA TACATC 0.315735783162
+GATTAA TTAATC -0.768397991554
+GATTAC GTAATC -0.159295927856
+GATTCA TGAATC 1.91569273635
+GATTCC GGAATC 0.028441642914
+GATTGA TCAATC 0.586533848838
+GATTGC GCAATC 0.0359466523255
+GATTTA TAAATC -0.43347973413
+GCAAAA TTTTGC 1.57245669278
+GCAAAC GTTTGC -2.41567256424
+GCAACA TGTTGC -0.0212126198682
+GCAACC GGTTGC 0.167308203586
+GCAAGA TCTTGC 0.230496794516
+GCAAGC GCTTGC -0.279906494617
+GCAATA TATTGC 1.03289974237
+GCACAA TTGTGC 0.168080718089
+GCACAC GTGTGC -0.126747337538
+GCACCA TGGTGC -0.370948557776
+GCACCC GGGTGC 0.486126157128
+GCACGA TCGTGC 0.115382004439
+GCACGC GCGTGC -0.384704484018
+GCACTA TAGTGC -0.732242903933
+GCAGAA TTCTGC -0.441638279756
+GCAGAC GTCTGC 0.577048501416
+GCAGCA TGCTGC 0.0643315561072
+GCAGCC GGCTGC 0.0719820075452
+GCAGGA TCCTGC 0.034078237827
+GCAGGC GCCTGC 0.0751087587256
+GCAGTA TACTGC -0.140907462619
+GCATAA TTATGC -0.216901949377
+GCATAC GTATGC -0.417062652314
+GCATCA TGATGC -0.509813438412
+GCATCC GGATGC 0.157724061262
+GCATGA TCATGC 0.311792502655
+GCATGC GCATGC 0.0273955313285
+GCATTA TAATGC -0.038383608378
+GCCAAA TTTGGC -0.0363591138599
+GCCAAC GTTGGC -0.48002846426
+GCCACA TGTGGC 0.478891420952
+GCCACC GGTGGC 0.046888332347
+GCCAGA TCTGGC -0.325597697717
+GCCAGC GCTGGC -0.199926154665
+GCCATA TATGGC -0.951578017522
+GCCCAA TTGGGC 1.10276199076
+GCCCAC GTGGGC 0.676887327364
+GCCCCA TGGGGC 0.35936393068
+GCCCCC GGGGGC -0.398755700607
+GCCCGA TCGGGC 1.06470764147
+GCCCGC GCGGGC 0.47965310734
+GCCCTA TAGGGC 0.435710100752
+GCCGAA TTCGGC 0.0377072993252
+GCCGAC GTCGGC 0.22768512467
+GCCGCA TGCGGC 0.115534020858
+GCCGCC GGCGGC -0.732735770282
+GCCGGA TCCGGC -0.38520088123
+GCCGGC GCCGGC -0.185586613226
+GCCGTA TACGGC -0.332516669945
+GCCTAA TTAGGC 0.0973747347692
+GCCTAC GTAGGC 0.0804171248438
+GCCTCA TGAGGC 0.64675754844
+GCCTCC GGAGGC -0.235791468531
+GCCTGA TCAGGC 0.871312603586
+GCCTTA TAAGGC 0.66398535763
+GCGAAA TTTCGC 0.216500601374
+GCGAAC GTTCGC 0.176665221767
+GCGACA TGTCGC -0.206525750848
+GCGACC GGTCGC 0.867020226992
+GCGAGA TCTCGC 0.39834591882
+GCGAGC GCTCGC -0.0047486305465
+GCGATA TATCGC -0.212587858109
+GCGCAA TTGCGC 0.249470449698
+GCGCAC GTGCGC -0.576649184299
+GCGCCA TGGCGC -0.370226281141
+GCGCCC GGGCGC 0.381404303337
+GCGCGA TCGCGC 0.244534402725
+GCGCGC GCGCGC 0.0601779542866
+GCGCTA TAGCGC -0.184211933615
+GCGGAA TTCCGC -0.14917985125
+GCGGAC GTCCGC -0.0437387199239
+GCGGCA TGCCGC 0.124099157921
+GCGGCC GGCCGC -0.868770297365
+GCGGGA TCCCGC -0.330802769301
+GCGGTA TACCGC -0.293792191657
+GCGTAA TTACGC -0.308730784191
+GCGTAC GTACGC -0.493705045035
+GCGTCA TGACGC -1.40894601949
+GCGTCC GGACGC 0.650092537073
+GCGTGA TCACGC 0.485604781064
+GCGTTA TAACGC -0.16348782665
+GCTAAA TTTAGC 0.0633281870082
+GCTAAC GTTAGC 0.0446987046769
+GCTACA TGTAGC 0.139951985661
+GCTACC GGTAGC -0.582191094114
+GCTAGA TCTAGC 0.425355245083
+GCTAGC GCTAGC 0.544395057547
+GCTATA TATAGC -0.122878741617
+GCTCAA TTGAGC 0.357260216643
+GCTCAC GTGAGC -0.362077448999
+GCTCCA TGGAGC 0.234865357242
+GCTCCC GGGAGC 0.410356067134
+GCTCGA TCGAGC 0.523443780445
+GCTCTA TAGAGC -0.503747504809
+GCTGAA TTCAGC 0.292869298061
+GCTGAC GTCAGC -0.149757550449
+GCTGCA TGCAGC 0.0619331271517
+GCTGCC GGCAGC 0.116465057744
+GCTGGA TCCAGC 0.00864608267586
+GCTGTA TACAGC -0.0817336518524
+GCTTAA TTAAGC 0.664269635099
+GCTTAC GTAAGC -0.573448259095
+GCTTCA TGAAGC -0.527498202661
+GCTTCC GGAAGC 0.0466074141633
+GCTTGA TCAAGC -0.223589414042
+GCTTTA TAAAGC 0.213691377209
+GGAAAA TTTTCC -2.05613895077
+GGAAAC GTTTCC -0.31843375495
+GGAACA TGTTCC 0.878948290896
+GGAACC GGTTCC -0.41733947832
+GGAAGA TCTTCC 0.292117518108
+GGAATA TATTCC -0.310943681072
+GGACAA TTGTCC 0.390414984809
+GGACAC GTGTCC 0.174292489707
+GGACCA TGGTCC -0.523110637041
+GGACCC GGGTCC 0.5472877955
+GGACGA TCGTCC 0.53114236268
+GGACTA TAGTCC 0.793132421151
+GGAGAA TTCTCC -0.163792630984
+GGAGAC GTCTCC 0.206727520829
+GGAGCA TGCTCC 0.456407748447
+GGAGCC GGCTCC 0.295421561451
+GGAGGA TCCTCC 0.328364640098
+GGAGTA TACTCC -0.151742588951
+GGATAA TTATCC -0.0585638459289
+GGATAC GTATCC -0.309412420714
+GGATCA TGATCC -0.480562203251
+GGATCC GGATCC 0.335487965753
+GGATGA TCATCC 0.177410986413
+GGATTA TAATCC -1.1939070998
+GGCAAA TTTGCC 0.000624112947654
+GGCAAC GTTGCC -1.15978921165
+GGCACA TGTGCC 0.693216860903
+GGCACC GGTGCC -0.881907424195
+GGCAGA TCTGCC 0.202002559673
+GGCATA TATGCC 0.0739074460809
+GGCCAA TTGGCC 0.233613871256
+GGCCAC GTGGCC 0.349420517541
+GGCCCA TGGGCC 0.76801497851
+GGCCCC GGGGCC 0.27541536492
+GGCCGA TCGGCC 0.663089975748
+GGCCTA TAGGCC 0.219005607787
+GGCGAA TTCGCC 0.226797469513
+GGCGAC GTCGCC 0.127570597429
+GGCGCA TGCGCC -0.0446136938549
+GGCGCC GGCGCC -0.0745645953993
+GGCGGA TCCGCC -0.226505856389
+GGCGTA TACGCC -0.566154746416
+GGCTAA TTAGCC 0.410090602753
+GGCTAC GTAGCC 0.167521133316
+GGCTCA TGAGCC -0.182448764472
+GGCTGA TCAGCC 0.305315803898
+GGCTTA TAAGCC 0.0549165179339
+GGGAAA TTTCCC 0.211185115876
+GGGAAC GTTCCC 0.0891726153306
+GGGACA TGTCCC 0.798353329738
+GGGACC GGTCCC 0.327460829345
+GGGAGA TCTCCC -0.00772602123998
+GGGATA TATCCC 0.042485553485
+GGGCAA TTGCCC -0.398865388366
+GGGCAC GTGCCC 0.531369020502
+GGGCCA TGGCCC 0.421850727294
+GGGCCC GGGCCC 0.13715245724
+GGGCGA TCGCCC -0.207341953811
+GGGCTA TAGCCC 0.860581648468
+GGGGAA TTCCCC 0.414828826681
+GGGGAC GTCCCC 1.09337858056
+GGGGCA TGCCCC 0.188790419324
+GGGGGA TCCCCC 0.564505418018
+GGGGTA TACCCC 0.337102528271
+GGGTAA TTACCC 0.152772959878
+GGGTAC GTACCC 0.0850292801245
+GGGTCA TGACCC -0.277523149632
+GGGTGA TCACCC 0.389877602576
+GGGTTA TAACCC -1.1488463254
+GGTAAA TTTACC -1.93485319848
+GGTAAC GTTACC -0.444349645812
+GGTACA TGTACC 0.533161182312
+GGTACC GGTACC -0.307030306627
+GGTAGA TCTACC 0.0409651099541
+GGTATA TATACC -1.16022327914
+GGTCAA TTGACC 0.139153520691
+GGTCAC GTGACC -0.450515748733
+GGTCCA TGGACC 0.169460723222
+GGTCGA TCGACC -0.122888767331
+GGTCTA TAGACC -0.0850652207286
+GGTGAA TTCACC 0.524001085704
+GGTGAC GTCACC -0.17925252457
+GGTGCA TGCACC -0.7324301946
+GGTGGA TCCACC 0.377800190353
+GGTGTA TACACC 0.617772381551
+GGTTAA TTAACC 0.156217996285
+GGTTAC GTAACC -0.890319968576
+GGTTCA TGAACC -0.551493468635
+GGTTGA TCAACC 0.213366284121
+GGTTTA TAAACC -0.015482073629
+GTAAAA TTTTAC 1.18200993615
+GTAAAC GTTTAC -3.59764423275
+GTAACA TGTTAC 0.863881272261
+GTAAGA TCTTAC 0.696118619435
+GTAATA TATTAC -0.0790481100302
+GTACAA TTGTAC 0.959255249776
+GTACAC GTGTAC -0.381103239165
+GTACCA TGGTAC 0.673058575835
+GTACGA TCGTAC 0.303418617214
+GTACTA TAGTAC 0.368286056561
+GTAGAA TTCTAC 0.777466438187
+GTAGAC GTCTAC -0.260392489063
+GTAGCA TGCTAC -0.557171886797
+GTAGGA TCCTAC 0.249764914617
+GTAGTA TACTAC 1.10613154186
+GTATAA TTATAC -0.225205794143
+GTATAC GTATAC -0.709411981795
+GTATCA TGATAC -0.102084079336
+GTATGA TCATAC 0.883480966804
+GTATTA TAATAC 0.112117340424
+GTCAAA TTTGAC 0.143115014765
+GTCAAC GTTGAC -2.14211821008
+GTCACA TGTGAC 0.438253073163
+GTCAGA TCTGAC -0.426249226609
+GTCATA TATGAC 0.201153560659
+GTCCAA TTGGAC 0.0470205002071
+GTCCAC GTGGAC -0.21284193494
+GTCCCA TGGGAC 1.6176606758
+GTCCGA TCGGAC 0.114722967227
+GTCCTA TAGGAC 0.771952812089
+GTCGAA TTCGAC 0.724088020994
+GTCGAC GTCGAC 0.251614152656
+GTCGCA TGCGAC 0.675711700536
+GTCGGA TCCGAC 0.193393621986
+GTCGTA TACGAC 0.0186019337447
+GTCTAA TTAGAC -0.187936713171
+GTCTCA TGAGAC -0.491335152028
+GTCTGA TCAGAC 0.360208311696
+GTCTTA TAAGAC 0.0625579178154
+GTGAAA TTTCAC -0.563337549375
+GTGAAC GTTCAC -0.349524060758
+GTGACA TGTCAC -0.410586341492
+GTGAGA TCTCAC -1.0355009926
+GTGATA TATCAC -0.736410349306
+GTGCAA TTGCAC -0.260755290625
+GTGCAC GTGCAC -0.744407650846
+GTGCCA TGGCAC -1.07262578178
+GTGCGA TCGCAC -0.164782650071
+GTGCTA TAGCAC -0.662274879805
+GTGGAA TTCCAC 0.176989759501
+GTGGCA TGCCAC -0.159364738365
+GTGGGA TCCCAC 0.988068726378
+GTGGTA TACCAC -0.246902705808
+GTGTAA TTACAC -0.891262927583
+GTGTCA TGACAC -0.0379293872633
+GTGTGA TCACAC -1.95184795358
+GTGTTA TAACAC -1.3607239668
+GTTAAA TTTAAC -0.186873285101
+GTTAAC GTTAAC 0.482940767605
+GTTACA TGTAAC 0.329573937825
+GTTAGA TCTAAC -1.00236498509
+GTTATA TATAAC -0.163258685865
+GTTCAA TTGAAC 0.577729784916
+GTTCCA TGGAAC 0.42827050466
+GTTCGA TCGAAC -0.160584230436
+GTTCTA TAGAAC 0.628168497995
+GTTGAA TTCAAC 0.0122596223788
+GTTGCA TGCAAC 1.28935831187
+GTTGGA TCCAAC 0.501609010251
+GTTGTA TACAAC 0.536022722898
+GTTTAA TTAAAC -0.00554204032662
+GTTTCA TGAAAC -0.106888470485
+GTTTGA TCAAAC -0.313019133275
+GTTTTA TAAAAC 0.992384413805
+TAAAAA TTTTTA 0.579463327305
+TAAACA TGTTTA -2.43807962232
+TAAAGA TCTTTA -0.0559249115822
+TAAATA TATTTA -2.16727500814
+TAACAA TTGTTA 0.5362496048
+TAACCA TGGTTA 1.25918224499
+TAACGA TCGTTA -0.245420018049
+TAACTA TAGTTA 0.608128184593
+TAAGAA TTCTTA 1.02556160172
+TAAGCA TGCTTA -0.00178390331913
+TAAGGA TCCTTA 0.837198657402
+TAAGTA TACTTA 0.0721724774768
+TAATAA TTATTA -0.189941492074
+TAATCA TGATTA 0.004309724706
+TAATGA TCATTA -0.927698594817
+TAATTA TAATTA -1.12948086023
+TACAAA TTTGTA -0.668160668233
+TACACA TGTGTA -1.5497729664
+TACAGA TCTGTA -0.00599134182996
+TACATA TATGTA -1.12195560298
+TACCAA TTGGTA -0.281401801063
+TACCCA TGGGTA 0.279140164003
+TACCGA TCGGTA -0.674861229256
+TACCTA TAGGTA 0.232148610987
+TACGAA TTCGTA -0.107764774043
+TACGCA TGCGTA -0.519175400062
+TACGGA TCCGTA -0.402472215133
+TACGTA TACGTA 0.541316396612
+TACTAA TTAGTA -0.135239105598
+TACTCA TGAGTA 0.639564666607
+TACTGA TCAGTA -0.482886609395
+TAGAAA TTTCTA 0.0445682798936
+TAGACA TGTCTA -0.45261373791
+TAGAGA TCTCTA -0.0954516438137
+TAGATA TATCTA -0.532370750539
+TAGCAA TTGCTA -0.652336647837
+TAGCCA TGGCTA 0.291364926411
+TAGCGA TCGCTA 0.149053684558
+TAGCTA TAGCTA 0.686821117845
+TAGGAA TTCCTA 0.597312086206
+TAGGCA TGCCTA -0.0392374245184
+TAGGGA TCCCTA -0.898776106474
+TAGTAA TTACTA -0.0409191001809
+TAGTCA TGACTA 1.32002868308
+TAGTGA TCACTA 0.306784253534
+TATAAA TTTATA -0.121630582814
+TATACA TGTATA -0.557527238557
+TATAGA TCTATA -0.29108877277
+TATATA TATATA -0.239193588677
+TATCAA TTGATA -1.21281286807
+TATCCA TGGATA -0.143758324237
+TATCGA TCGATA -0.316281900517
+TATGAA TTCATA 0.567777510974
+TATGCA TGCATA 0.382062812319
+TATGGA TCCATA 0.18823076363
+TATTAA TTAATA 0.370202791227
+TATTCA TGAATA -0.0890453547066
+TATTGA TCAATA -3.17710597328
+TCAAAA TTTTGA 0.303875411059
+TCAACA TGTTGA -2.00254784618
+TCAAGA TCTTGA 1.4720641259
+TCACAA TTGTGA 0.295459860043
+TCACCA TGGTGA 0.210574562759
+TCACGA TCGTGA -0.515739048241
+TCAGAA TTCTGA -0.292759366807
+TCAGCA TGCTGA -0.620868456293
+TCAGGA TCCTGA 0.0391644847042
+TCATAA TTATGA 0.46490835305
+TCATCA TGATGA 0.156189576126
+TCATGA TCATGA -0.28992182543
+TCCAAA TTTGGA 0.372353469926
+TCCACA TGTGGA -0.303324084643
+TCCAGA TCTGGA 1.11791545491
+TCCCAA TTGGGA 1.0120690282
+TCCCCA TGGGGA 0.475206609554
+TCCCGA TCGGGA -0.218933163821
+TCCGAA TTCGGA 0.231675785391
+TCCGCA TGCGGA -0.0565746658942
+TCCGGA TCCGGA -0.534349085125
+TCCTAA TTAGGA 0.426713370402
+TCCTCA TGAGGA -0.101009274429
+TCGAAA TTTCGA 0.385602827192
+TCGACA TGTCGA 0.567716035117
+TCGAGA TCTCGA -0.0811772342038
+TCGCAA TTGCGA 0.471042033643
+TCGCCA TGGCGA -0.433031956525
+TCGCGA TCGCGA -0.10259765071
+TCGGAA TTCCGA 0.152092661722
+TCGGCA TGCCGA -0.277526003969
+TCGTAA TTACGA 0.6872310151
+TCGTCA TGACGA -0.633071677798
+TCTAAA TTTAGA -0.119569291318
+TCTACA TGTAGA 0.531537891496
+TCTAGA TCTAGA -0.582228401295
+TCTCAA TTGAGA 0.0181823636774
+TCTCCA TGGAGA -0.0990443536722
+TCTGAA TTCAGA 0.818735079074
+TCTGCA TGCAGA -0.708985771217
+TCTTAA TTAAGA 0.684573742868
+TCTTCA TGAAGA 0.231633308785
+TGAAAA TTTTCA -0.590753014937
+TGAACA TGTTCA 0.143205522462
+TGACAA TTGTCA -0.109523687459
+TGACCA TGGTCA 1.47045933497
+TGAGAA TTCTCA -0.555364387516
+TGAGCA TGCTCA -0.498941620454
+TGATAA TTATCA -1.78189703088
+TGATCA TGATCA -0.58090561671
+TGCAAA TTTGCA -0.642834024972
+TGCACA TGTGCA -1.10325139721
+TGCCAA TTGGCA -0.349619012912
+TGCCCA TGGGCA 0.294732505972
+TGCGAA TTCGCA -0.0604088395824
+TGCGCA TGCGCA 0.0208781532991
+TGCTAA TTAGCA -0.902952269579
+TGGAAA TTTCCA -1.65646848688
+TGGACA TGTCCA 0.212014771381
+TGGCAA TTGCCA -0.299231706691
+TGGCCA TGGCCA 0.0133271697043
+TGGGAA TTCCCA -0.0169051652559
+TGGTAA TTACCA -0.741148671428
+TGTAAA TTTACA -1.90833491575
+TGTACA TGTACA -0.781209085217
+TGTCAA TTGACA -1.54128600175
+TGTGAA TTCACA -2.24458672601
+TGTTAA TTAACA -1.12490394498
+TTAAAA TTTTAA -0.0119631686901
+TTACAA TTGTAA 0.81347268468
+TTAGAA TTCTAA 0.271221556202
+TTATAA TTATAA 0.774386643995
+TTCAAA TTTGAA 0.809237031692
+TTCCAA TTGGAA 0.158724969294
+TTCGAA TTCGAA -0.405674192258
+TTGAAA TTTCAA -0.306551492839
+TTGCAA TTGCAA 0.851414898595
+TTTAAA TTTAAA 0.479575745295
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/kmersvm_train.py
--- a/kmersvm/scripts/kmersvm_train.py Mon Aug 20 21:42:29 2012 -0400
+++ b/kmersvm/scripts/kmersvm_train.py Sun Jun 16 18:06:14 2013 -0400
@@ -754,7 +754,8 @@
sids = sids_pos + sids_neg
if options.weight == 0:
- options.weight = 1 + log(nneg/npos)
+ #DEBUGGED by dlee 02/17/13
+ options.weight = 1 + log(nneg/float(npos))
if options.quiet == False:
sys.stderr.write('SVM parameters:\n')
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/kmersvm_train_kfb_copy.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/kmersvm/scripts/kmersvm_train_kfb_copy.py Sun Jun 16 18:06:14 2013 -0400
@@ -0,0 +1,894 @@
+#!/usr/bin/env python
+"""
+ kmersvm_train.py; train a support vector machine using shogun toolbox
+ Copyright (C) 2011 Dongwon Lee
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+"""
+
+
+
+import sys
+import optparse
+import random
+import numpy
+from math import log, exp
+
+from libkmersvm import *
+try:
+ from shogun.PreProc import SortWordString, SortUlongString
+except ImportError:
+ from shogun.Preprocessor import SortWordString, SortUlongString
+from shogun.Kernel import CommWordStringKernel, CommUlongStringKernel, \
+ CombinedKernel
+
+from shogun.Features import StringWordFeatures, StringUlongFeatures, \
+ StringCharFeatures, CombinedFeatures, DNA, Labels
+from shogun.Classifier import MSG_INFO, MSG_ERROR
+try:
+ from shogun.Classifier import SVMLight
+except ImportError:
+ from shogun.Classifier import LibSVM
+
+"""
+global variables
+"""
+g_kmers = []
+g_rcmap = []
+
+
+def kmerid2kmer(kmerid, kmerlen):
+ """convert integer kmerid to kmer string
+
+ Arguments:
+ kmerid -- integer, id of k-mer
+ kmerlen -- integer, length of k-mer
+
+ Return:
+ kmer string
+ """
+
+ nts = "ACGT"
+ kmernts = []
+ kmerid2 = kmerid
+
+ for i in xrange(kmerlen):
+ ntid = kmerid2 % 4
+ kmernts.append(nts[ntid])
+ kmerid2 = int((kmerid2-ntid)/4)
+
+ return ''.join(reversed(kmernts))
+
+
+def kmer2kmerid(kmer, kmerlen):
+ """convert kmer string to integer kmerid
+
+ Each nucleotide is read as a base-4 digit (A=0, C=1, G=2, T=3), with the
+ first character of the k-mer as the most significant digit.
+
+ Arguments:
+ kmer -- string, k-mer made up of the characters A, C, G and T
+ kmerlen -- integer, length of k-mer (not used by the computation)
+
+ Return:
+ integer id of k-mer
+
+ Raises:
+ KeyError if kmer contains a character other than A, C, G or T
+ """
+
+ nt2id = {'A':0, 'C':1, 'G':2, 'T':3}
+
+ return reduce(lambda x, y: (4*x+y), [nt2id[x] for x in kmer])
+
+
+def get_rcmap(kmerid, kmerlen):
+ """mapping kmerid to its reverse complement k-mer on-the-fly
+
+ Arguments:
+ kmerid -- integer, id of k-mer
+ kmerlen -- integer, length of k-mer
+
+ Return:
+ integer kmerid after mapping to its reverse complement
+ """
+
+ #1. get kmer from kmerid
+ #2. get reverse complement kmer
+ #3. get kmerid from revcomp kmer
+ rckmerid = kmer2kmerid(revcomp(kmerid2kmer(kmerid, kmerlen)), kmerlen)
+
+ if rckmerid < kmerid:
+ return rckmerid
+
+ return kmerid
+
+
+def non_redundant_word_features(feats, kmerlen):
+ """convert the features from Shogun toolbox to non-redundant word features (handle reverse complements)
+ Arguments:
+ feats -- StringWordFeatures
+ kmerlen -- integer, length of k-mer
+
+ Return:
+ StringWordFeatures after converting reverse complement k-mer ids
+ """
+
+ rcmap = g_rcmap
+
+ for i in xrange(feats.get_num_vectors()):
+ nf = [rcmap[int(kmerid)] for kmerid in feats.get_feature_vector(i)]
+
+ feats.set_feature_vector(numpy.array(nf, numpy.dtype('u2')), i)
+
+ preproc = SortWordString()
+ preproc.init(feats)
+ try:
+ feats.add_preproc(preproc)
+ feats.apply_preproc()
+ except AttributeError:
+ feats.add_preprocessor(preproc)
+ feats.apply_preprocessor()
+
+ return feats
+
+
+def non_redundant_ulong_features(feats, kmerlen):
+ """convert the features from Shogun toolbox to non-redundant ulong features
+ Arguments:
+ feats -- StringUlongFeatures
+ kmerlen -- integer, length of k-mer
+
+ Return:
+ StringUlongFeatures after converting reverse complement k-mer ids
+ """
+
+ for i in xrange(feats.get_num_vectors()):
+ nf = [get_rcmap(int(kmerid), kmerlen) \
+ for kmerid in feats.get_feature_vector(i)]
+
+ feats.set_feature_vector(numpy.array(nf, numpy.dtype('u8')), i)
+
+ preproc = SortUlongString()
+ preproc.init(feats)
+ try:
+ feats.add_preproc(preproc)
+ feats.apply_preproc()
+ except AttributeError:
+ feats.add_preprocessor(preproc)
+ feats.apply_preprocessor()
+
+ return feats
+
+
+def svm_learn(kernel, labels, options):
+ """train SVM using SVMLight or LibSVM
+
+ SVMLight is used when it could be imported; otherwise LibSVM is used.
+
+ Arguments:
+ kernel -- kernel object from Shogun toolbox
+ labels -- list of labels
+ options -- object containing option data (svmC, epsilon, weight and quiet are read)
+
+ Return:
+ trained svm object
+ """
+
+ # SVMLight is an unbound name (NameError) when its import at the top of the file failed
+ try:
+ svm=SVMLight(options.svmC, kernel, Labels(numpy.array(labels, dtype=numpy.double)))
+ except NameError:
+ svm=LibSVM(options.svmC, kernel, Labels(numpy.array(labels, dtype=numpy.double)))
+
+ if options.quiet == False:
+ svm.io.set_loglevel(MSG_INFO)
+ svm.io.set_target_to_stderr()
+
+ svm.set_epsilon(options.epsilon)
+ svm.parallel.set_num_threads(1)
+ # NOTE(review): the second cost is scaled by options.weight, presumably the
+ # cost of the positive class -- confirm the argument order of Shogun's set_C
+ if options.weight != 1.0:
+ svm.set_C(options.svmC, options.svmC*options.weight)
+ svm.train()
+
+ if options.quiet == False:
+ svm.io.set_loglevel(MSG_ERROR)
+
+ return svm
+
+
+def _get_spectrum_features(seqs, kmerlen):
+ """generate spectrum features (internal)
+
+ Arguments:
+ seqs -- list of sequences
+ kmerlen -- integer, length of k-mer
+
+ Return:
+ StringWord(Ulong)Features after treatment of redundant reverse complement k-mers
+ """
+
+ char_feats = StringCharFeatures(seqs, DNA)
+
+ if kmerlen <= 8:
+ string_features = StringWordFeatures
+ non_redundant_features = non_redundant_word_features
+ else:
+ string_features = StringUlongFeatures
+ non_redundant_features = non_redundant_ulong_features
+
+ feats = string_features(DNA)
+ feats.obtain_from_char(char_feats, kmerlen-1, kmerlen, 0, False)
+ return non_redundant_features(feats, kmerlen)
+
+
+def get_spectrum_features(seqs, options):
+ """generate spectrum features (wrapper)
+ """
+ return _get_spectrum_features(seqs, options.kmerlen)
+
+
+def get_weighted_spectrum_features(seqs, options):
+ """generate weighted spectrum features
+ """
+ global g_kmers
+ global g_rcmap
+
+ subfeats_list = []
+
+ for k in xrange(options.kmerlen, options.kmerlen2+1):
+ char_feats = StringCharFeatures(seqs, DNA)
+ if k <= 8:
+ g_kmers = generate_kmers(k)
+ g_rcmap = generate_rcmap_table(k, g_kmers)
+
+ subfeats = _get_spectrum_features(seqs, k)
+ subfeats_list.append(subfeats)
+
+ return subfeats_list
+
+
+def get_spectrum_kernel(feats, options):
+ """build spectrum kernel with non-redundant k-mer list (removing reverse complement)
+
+ Arguments:
+ feats -- feature object
+ options -- object containing option data
+
+ Return:
+ StringWord(Ulong)Features, CommWord(Ulong)StringKernel
+ """
+ if options.kmerlen <= 8:
+ return CommWordStringKernel(feats, feats)
+ else:
+ return CommUlongStringKernel(feats, feats)
+
+
+def get_weighted_spectrum_kernel(subfeats_list, options):
+ """build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)
+
+ Arguments:
+ subfeats_list -- list of sub-feature objects
+ options -- object containing option data
+
+ Return:
+ CombinedFeatures of StringWord(Ulong)Features, CombinedKernel of CommWord(Ulong)StringKernel
+ """
+ kmerlen = options.kmerlen
+ kmerlen2 = options.kmerlen2
+
+ subkernels = 0
+ kernel = CombinedKernel()
+ feats = CombinedFeatures()
+
+ for subfeats in subfeats_list:
+ feats.append_feature_obj(subfeats)
+
+ for k in xrange(kmerlen, kmerlen2+1):
+ if k <= 8:
+ subkernel = CommWordStringKernel(10, False)
+ else:
+ subkernel = CommUlongStringKernel(10, False)
+
+ kernel.append_kernel(subkernel)
+ subkernels+=1
+
+ kernel.init(feats, feats)
+
+ kernel.set_subkernel_weights(numpy.array([1/float(subkernels)]*subkernels, numpy.dtype('float64')))
+
+ return kernel
+
+
+def init_spectrum_kernel(kern, feats_lhs, feats_rhs):
+ """initialize spectrum kernel (wrapper function)
+ """
+ kern.init(feats_lhs, feats_rhs)
+
+
+def init_weighted_spectrum_kernel(kern, subfeats_list_lhs, subfeats_list_rhs):
+ """initialize weighted spectrum kernel (wrapper function)
+ """
+ feats_lhs = CombinedFeatures()
+ feats_rhs = CombinedFeatures()
+
+ for subfeats in subfeats_list_lhs:
+ feats_lhs.append_feature_obj(subfeats)
+
+ for subfeats in subfeats_list_rhs:
+ feats_rhs.append_feature_obj(subfeats)
+
+ kern.init(feats_lhs, feats_rhs)
+
+
+def get_sksvm_weights(svm, feats, options):
+ """calculate the SVM weight vector of spectrum kernel
+ """
+ kmerlen = options.kmerlen
+ alphas = svm.get_alphas()
+ support_vector_ids = svm.get_support_vectors()
+
+ w = numpy.array([0]*(2**(2*kmerlen)), numpy.double)
+
+ for i in xrange(len(alphas)):
+ x = [0]*(2**(2*kmerlen))
+ for kmerid in feats.get_feature_vector(int(support_vector_ids[i])):
+ x[int(kmerid)] += 1
+ x = numpy.array(x, numpy.double)
+ w += (alphas[i]*x/numpy.sqrt(numpy.sum(x**2)))
+
+ return w
+
+def get_feature_counts(svm, feats, options):
+    """write the summed k-mer counts of the support vectors to a file
+
+    For every support vector, the occurrences of each k-mer id in its
+    feature vector are counted, and the counts are summed over all support
+    vectors. The totals are written to <outputname>_counts.out, one
+    non-redundant k-mer per line: k-mer, reverse complement, count.
+
+    Arguments:
+    svm -- trained svm object (only the support vector ids are read)
+    feats -- StringWord(Ulong)Features the svm was trained on
+    options -- object containing option data (ktype, kmerlen, outputname
+               and sort are read)
+
+    Return:
+    None
+    """
+    # with the weighted spectrum kernel (ktype 2) feats is a list of
+    # per-k feature objects, which this single-k counting cannot handle
+    if options.ktype != 1:
+        return
+
+    kmerlen = options.kmerlen
+    output = options.outputname + "_counts.out"
+
+    w = numpy.zeros(2**(2*kmerlen), numpy.double)
+    for svid in svm.get_support_vectors():
+        for kmerid in feats.get_feature_vector(int(svid)):
+            w[int(kmerid)] += 1
+
+    if options.sort:
+        w_sorted = sorted(zip(range(len(w)), w), key=lambda x: x[1], reverse=True)
+    else:
+        w_sorted = zip(range(len(w)), w)
+
+    with open(output, 'w') as f:
+        for i, count in w_sorted:
+            if kmerlen <= 8:
+                # g_kmers/g_rcmap lookup tables are only generated for k <= 8
+                if i != g_rcmap[i]:
+                    continue
+                kmer = g_kmers[i]
+            else:
+                # no lookup table for long k-mers: map ids on the fly
+                if i != get_rcmap(i, kmerlen):
+                    continue
+                kmer = kmerid2kmer(i, kmerlen)
+
+            f.write('\t'.join([kmer, revcomp(kmer), str(count)]) + '\n')
+
+
+
+def get_wsksvm_weights(svm, subfeats_list, options):
+ """calculate the SVM weight vector of weighted spectrum kernel
+ """
+ kmerlen = options.kmerlen
+ kmerlen2 = options.kmerlen2
+ alphas = svm.get_alphas()
+ support_vector_ids = svm.get_support_vectors()
+ kmerlens = range(kmerlen, kmerlen2+1)
+
+ weights = []
+ for idx in xrange(len(kmerlens)):
+ subfeats = subfeats_list[idx]
+
+ k = kmerlens[idx]
+ w = numpy.array([0]*(2**(2*k)), numpy.double)
+
+ for i in xrange(len(alphas)):
+ x = [0]*(2**(2*k))
+ for kmerid in subfeats.get_feature_vector(int(support_vector_ids[i])):
+ x[int(kmerid)] += 1
+ x = numpy.array(x, numpy.double)
+ w += (alphas[i]*x/numpy.sqrt(numpy.sum(x**2)))
+
+ w /= len(kmerlens)
+ weights.append(w)
+
+ return weights
+
+
+def save_header(f, bias, A, B, options):
+ f.write("#parameters:\n")
+ f.write("#kernel=" + str(options.ktype) + "\n")
+ f.write("#kmerlen=" + str(options.kmerlen) + "\n")
+ if options.ktype == 2:
+ f.write("#kmerlen2=" + str(options.kmerlen2) + "\n")
+ f.write("#bias=" + str(bias) + "\n")
+ f.write("#A=" + str(A) + "\n")
+ f.write("#B=" + str(B) + "\n")
+ f.write("#NOTE: k-mers with large negative weights are also important. They can be found at the bottom of the list.\n")
+ f.write("#k-mer\trevcomp\tSVM-weight\n")
+
+
+def save_sksvm_weights(w, bias, A, B, options):
+ """save the SVM weight vector from spectrum kernel
+ """
+ output = options.outputname + "_weights.out"
+ kmerlen = options.kmerlen
+
+ f = open(output, 'w')
+ save_header(f, bias, A, B, options)
+
+ global g_kmers
+ global g_rcmap
+
+ if options.sort:
+ w_sorted = sorted(zip(range(len(w)), w), key=lambda x: x[1], reverse=True)
+ else:
+ w_sorted = zip(range(len(w)), w)
+
+ if kmerlen <= 8:
+ for i in map(lambda x: x[0], w_sorted):
+ if i == g_rcmap[i]:
+ f.write('\t'.join( [g_kmers[i], revcomp(g_kmers[i]), str(w[i])] ) + '\n')
+ else:
+ for i in map(lambda x: x[0], w_sorted):
+ if i == get_rcmap(i, kmerlen):
+ kmer = kmerid2kmer(i, kmerlen)
+ f.write('\t'.join( [kmer, revcomp(kmer), str(w[i])] ) + '\n')
+
+ f.close()
+
+
+def save_wsksvm_weights(w, bias, A, B, options):
+    """save the SVM weight vectors from weighted spectrum kernel
+
+    Writes the header followed by one block of lines per k-mer length in
+    kmerlen..kmerlen2 to <outputname>_weights.out. Each line holds a
+    non-redundant k-mer, its reverse complement and its weight.
+
+    Arguments:
+    w -- list of weight vectors, one per k-mer length in increasing order
+    bias -- bias of the trained svm (written to the header)
+    A, B -- posterior probability parameters (written to the header)
+    options -- object containing option data (outputname, kmerlen, kmerlen2
+               and sort are read here)
+
+    Side effect:
+    the globals g_kmers and g_rcmap are regenerated for every k <= 8
+    """
+    global g_kmers
+    global g_rcmap
+
+    output = options.outputname + "_weights.out"
+    kmerlens = range(options.kmerlen, options.kmerlen2+1)
+
+    with open(output, 'w') as f:
+        save_header(f, bias, A, B, options)
+
+        for idx, k in enumerate(kmerlens):
+            subw = w[idx]
+
+            if options.sort:
+                subw_sorted = sorted(zip(range(len(subw)), subw), key=lambda x: x[1], reverse=True)
+            else:
+                subw_sorted = zip(range(len(subw)), subw)
+
+            if k <= 8:
+                # short k-mers: use precomputed lookup tables
+                g_kmers = generate_kmers(k)
+                g_rcmap = generate_rcmap_table(k, g_kmers)
+                for i in map(lambda x: x[0], subw_sorted):
+                    if i == g_rcmap[i]:
+                        f.write('\t'.join( [g_kmers[i], revcomp(g_kmers[i]), str(subw[i])] ) + "\n")
+            else:
+                # long k-mers: map ids to strings on the fly
+                for i in map(lambda x: x[0], subw_sorted):
+                    if i == get_rcmap(i, k):
+                        kmer = kmerid2kmer(i, k)
+                        f.write('\t'.join( [kmer, revcomp(kmer), str(subw[i])] ) + "\n")
+
+
+def save_predictions(output, preds, cvs):
+ """save prediction
+ """
+ f = open(output, 'w')
+ f.write('\t'.join(["#seq_id", "SVM score", "label", "NCV"]) + "\n")
+ for i in xrange(len(preds)):
+ f.write('\t'.join([preds[i][1], str(preds[i][2]), str(preds[i][3]), str(cvs[i])]) + "\n")
+ f.close()
+
+
+def generate_cv_list(ncv, n1, n2):
+ """generate the N-fold cross validation list
+
+ Arguments:
+ ncv -- integer, number of cross-validation
+ n1 -- integer, number of positives
+ n2 -- integer, number of negatives
+
+ Return:
+ a list of N-fold cross validation
+ """
+
+ shuffled_idx_list1 = range(n1)
+ shuffled_idx_list2 = range(n1,n1+n2)
+
+ random.shuffle(shuffled_idx_list1)
+ random.shuffle(shuffled_idx_list2)
+
+ shuffled_idx_list = shuffled_idx_list1 + shuffled_idx_list2
+
+ idx = 0
+ icv = 0
+ cv = [0] * (n1+n2)
+ while(idx < (n1+n2)):
+ cv[shuffled_idx_list[idx]] = icv
+
+ idx += 1
+ icv += 1
+ if icv == ncv:
+ icv = 0
+
+ return cv
+
+
+def split_cv_list(cvlist, icv, data):
+ """split data into training and test based on cross-validation list
+
+ Element i of data goes to the test set when cvlist[i] equals icv and to
+ the training set otherwise; the original order is kept in both lists.
+
+ Arguments:
+ cvlist -- list, cross-validation list (fold number of each element of data)
+ icv -- integer, cross-validation set of interest
+ data -- list, data set to be split
+
+ Return:
+ a list of training set and a list of test set
+ """
+
+ tr_data = []
+ te_data = []
+
+ for i in xrange(len(data)):
+ if cvlist[i] == icv:
+ te_data.append(data[i])
+ else:
+ tr_data.append(data[i])
+
+ return tr_data, te_data
+
+
+def LMAI(svms, labels, prior0, prior1):
+ """fitting svms to sigmoid function (improved version introduced by Lin 2003)
+
+ The objective is minimized with Newton steps on a regularized Hessian
+ (H + sigma*I) combined with a backtracking line search that halves the
+ step size until the objective decreases sufficiently.
+
+ Arguments:
+ svms -- list of svm scores
+ labels -- list of labels (1 is treated as positive, any other value as negative)
+ prior0 -- prior of negative set
+ prior1 -- prior of positive set
+
+ Return:
+ A, B parameter of 1/(1+exp(A*SVM+B))
+ """
+
+ #parameter settings
+ maxiter = 100
+ minstep = 1e-10
+ sigma = 1e-3
+
+ #target probabilities used instead of hard 1/0 targets
+ hiTarget = (prior1+1.0)/float(prior1+2.0)
+ loTarget = 1/float(prior0+2.0)
+
+ t = [0]*len(labels)
+ for i in xrange(len(labels)):
+ if labels[i] == 1:
+ t[i] = hiTarget
+ else:
+ t[i] = loTarget
+
+ A = 0.0
+ B = log((prior0+1.0)/float(prior1+1.0))
+ fval = 0.0
+
+ #initial objective value; the two branches avoid exp() of a large positive number
+ for i in xrange(len(labels)):
+ fApB = svms[i]*A+B
+ if fApB >= 0:
+ fval += (t[i]*fApB+log(1+exp(-fApB)))
+ else:
+ fval += ((t[i]-1)*fApB+log(1+exp(fApB)))
+
+
+ for it in xrange(maxiter):
+ #print "iteration:", it
+ #Update Gradient and Hessian (use H'= H + sigma I)
+ h11 = sigma
+ h22 = sigma
+ h21 = 0.0
+ g1 = 0.0
+ g2 = 0.0
+
+ for i in xrange(len(labels)):
+ fApB = svms[i]*A+B
+ if fApB >= 0:
+ p = exp(-fApB) / float(1.0+exp(-fApB))
+ q = 1.0 / float(1.0 + exp(-fApB))
+ else:
+ p = 1.0 / float(1.0 + exp(fApB))
+ q = exp(fApB) / float(1.0+exp(fApB))
+ d2 = p*q
+ h11 += (svms[i]*svms[i]*d2)
+ h22 += d2
+ h21 += (svms[i]*d2)
+ d1 = t[i]-p
+ g1 += (svms[i]*d1)
+ g2 += d1
+
+ #Stopping criteria
+ if (abs(g1)<1e-5) and (abs(g2)<1e-5):
+ break
+
+ #Newton direction: solve the 2x2 system H' * (dA, dB) = -(g1, g2)
+ det = h11*h22-h21*h21
+ dA = -(h22*g1-h21*g2)/float(det)
+ dB = -(-h21*g1+h11*g2)/float(det)
+ gd = g1*dA+g2*dB
+ stepsize=1
+ while stepsize >= minstep:
+ newA = A+stepsize*dA
+ newB = B+stepsize*dB
+ newf = 0.0
+
+ for i in xrange(len(labels)):
+ fApB = svms[i]*newA+newB
+ if fApB >= 0:
+ newf += (t[i]*fApB + log(1+exp(-fApB)))
+ else:
+ newf += ((t[i]-1)*fApB + log(1+exp(fApB)))
+
+ #accept the step only when the objective decreases sufficiently
+ if newf < (fval+0.0001*stepsize*gd):
+ A=newA
+ B=newB
+ fval=newf
+ break
+ else:
+ stepsize=stepsize/float(2.0)
+
+ #Line search fails
+ if stepsize < minstep:
+ #print "Line search fails"
+ break
+
+ #if it >= maxiter:
+ # print "Reaching maximum iterations"
+
+ return A, B
+
+
+def wsksvm_classify(seqs, svm, kern, feats, options):
+ feats_te = get_weighted_spectrum_features(seqs, options)
+ init_weighted_spectrum_kernel(kern, feats, feats_te)
+
+ return svm.apply().get_labels().tolist()
+
+
+def score_seq(s, svmw, kmerlen):
+ """calculate SVM score of given sequence using single set of svm weights
+
+ Arguments:
+ s -- string, DNA sequence
+ svmw -- numpy array, SVM weights
+ kmerlen -- integer, length of k-mer of SVM weight
+
+ Return:
+ SVM score
+ """
+
+ global g_rcmap
+ kmer2kmerid_func = kmer2kmerid
+
+ x = [0]*(2**(2*kmerlen))
+ for j in xrange(len(s)-kmerlen+1):
+ x[ g_rcmap[kmer2kmerid_func(s[j:j+kmerlen], kmerlen)] ] += 1
+
+ x = numpy.array(x, numpy.double)
+ score_norm = numpy.dot(svmw, x)/numpy.sqrt(numpy.sum(x**2))
+
+ return score_norm
+
+
+def sksvm_classify(seqs, svm, kern, feats, options):
+ """classify the given sequences
+ """
+ if options.kmerlen <= 8:
+ #this is much faster when the length of kmer is short, and SVs are many
+ svmw = get_sksvm_weights(svm, feats, options)
+ return [score_seq(s, svmw, options.kmerlen)+svm.get_bias() for s in seqs]
+ else:
+ feats_te = get_spectrum_features(seqs, options)
+ init_spectrum_kernel(kern, feats, feats_te)
+
+ return svm.apply().get_labels().tolist()
+
+
+def main(argv = sys.argv):
+ usage = "Usage: %prog [options] POSITIVE_SEQ NEGATIVE_SEQ"
+ desc = "1. take two files(FASTA format) as input, 2. train an SVM and store the trained SVM weights"
+ parser = optparse.OptionParser(usage=usage, description=desc)
+ parser.add_option("-t", dest="ktype", type="int", default=1, \
+ help="set the type of kernel, 1:Spectrum, 2:Weighted Spectrums (default=1.Spectrum)")
+
+ parser.add_option("-C", dest="svmC", type="float", default=1, \
+ help="set the regularization parameter svmC (default=1)")
+
+ parser.add_option("-e", dest="epsilon", type="float", default=0.00001, \
+ help="set the precision parameter epsilon (default=0.00001)")
+
+ parser.add_option("-w", dest="weight", type="float", default=0.0, \
+ help="set the weight for positive set (default=auto, 1+log(N/P))")
+
+ parser.add_option("-k", dest="kmerlen", type="int",default=6, \
+ help="set the (min) length of k-mer for (weighted) spectrum kernel (default = 6)")
+
+ parser.add_option("-K", dest="kmerlen2", type="int",default=8, \
+ help="set the max length of k-mer for weighted spectrum kernel (default = 8)")
+
+ parser.add_option("-n", dest="outputname", default="kmersvm_output", \
+ help="set the name of output files (default=kmersvm_output)")
+
+ parser.add_option("-v", dest="ncv", type="int", default=0, \
+ help="if set, it will perform N-fold cross-validation and generate a prediction file (default = 0)")
+
+ parser.add_option("-p", dest="posteriorp", default=False, action="store_true", \
+ help="estimate parameters for posterior probability with N-CV. this option requires -v option to be set (default=false)")
+
+ parser.add_option("-r", dest="rseed", type="int", default=1, \
+ help="set the random number seed for cross-validation (-p option) (default=1)")
+
+ parser.add_option("-q", dest="quiet", default=False, action="store_true", \
+ help="supress messages (default=false)")
+
+ parser.add_option("-s", dest="sort", default=False, action="store_true", \
+ help="sort the kmers by absolute values of SVM weights (default=false)")
+
+ ktype_str = ["", "Spectrum", "Weighted Spectrums"]
+
+ (options, args) = parser.parse_args()
+
+ if len(args) == 0:
+ parser.print_help()
+ sys.exit(0)
+
+ if len(args) != 2:
+ parser.error("incorrect number of arguments")
+ parser.print_help()
+ sys.exit(0)
+
+ if options.posteriorp and options.ncv == 0:
+ parser.error("posterior probability estimation requires N-fold CV process (-v option should be set)")
+ parser.print_help()
+ sys.exit(0)
+
+ random.seed(options.rseed)
+
+ """
+ set global variable
+ """
+ if (options.ktype == 1) and (options.kmerlen <= 8):
+ global g_kmers
+ global g_rcmap
+
+ g_kmers = generate_kmers(options.kmerlen)
+ g_rcmap = generate_rcmap_table(options.kmerlen, g_kmers)
+
+ posf = args[0]
+ negf = args[1]
+
+ seqs_pos, sids_pos = read_fastafile(posf)
+ seqs_neg, sids_neg = read_fastafile(negf)
+ npos = len(seqs_pos)
+ nneg = len(seqs_neg)
+ seqs = seqs_pos + seqs_neg
+ sids = sids_pos + sids_neg
+
+ if options.weight == 0:
+ #DEBUGGED by dlee 02/17/13
+ options.weight = 1 + log(nneg/float(npos))
+
+ if options.quiet == False:
+ sys.stderr.write('SVM parameters:\n')
+ sys.stderr.write(' kernel-type: ' + str(options.ktype) + "." + ktype_str[options.ktype] + '\n')
+ sys.stderr.write(' svm-C: ' + str(options.svmC) + '\n')
+ sys.stderr.write(' epsilon: ' + str(options.epsilon) + '\n')
+ sys.stderr.write(' weight: ' + str(options.weight) + '\n')
+ sys.stderr.write('\n')
+
+ sys.stderr.write('Other options:\n')
+ sys.stderr.write(' kmerlen: ' + str(options.kmerlen) + '\n')
+ if options.ktype == 2:
+ sys.stderr.write(' kmerlen2: ' + str(options.kmerlen2) + '\n')
+ sys.stderr.write(' outputname: ' + options.outputname + '\n')
+ sys.stderr.write(' posteriorp: ' + str(options.posteriorp) + '\n')
+ if options.ncv > 0:
+ sys.stderr.write(' ncv: ' + str(options.ncv) + '\n')
+ sys.stderr.write(' rseed: ' + str(options.rseed) + '\n')
+ sys.stderr.write(' sorted-weight: ' + str(options.sort) + '\n')
+
+ sys.stderr.write('\n')
+
+ sys.stderr.write('Input args:\n')
+ sys.stderr.write(' positive sequence file: ' + posf + '\n')
+ sys.stderr.write(' negative sequence file: ' + negf + '\n')
+ sys.stderr.write('\n')
+
+ sys.stderr.write('numer of total positive seqs: ' + str(npos) + '\n')
+ sys.stderr.write('numer of total negative seqs: ' + str(nneg) + '\n')
+ sys.stderr.write('\n')
+
+ #generate labels
+ labels = [1]*npos + [-1]*nneg
+
+ if options.ktype == 1:
+ get_features = get_spectrum_features
+ get_kernel = get_spectrum_kernel
+ get_weights = get_sksvm_weights
+ save_weights = save_sksvm_weights
+ svm_classify = sksvm_classify
+ elif options.ktype == 2:
+ get_features = get_weighted_spectrum_features
+ get_kernel = get_weighted_spectrum_kernel
+ get_weights = get_wsksvm_weights
+ save_weights = save_wsksvm_weights
+ svm_classify = wsksvm_classify
+ else:
+ sys.stderr.write('..unknown kernel..\n')
+ sys.exit(0)
+
+ A = B = 0
+ if options.ncv > 0:
+ if options.quiet == False:
+ sys.stderr.write('..Cross-validation\n')
+
+ cvlist = generate_cv_list(options.ncv, npos, nneg)
+ labels_cv = []
+ preds_cv = []
+ sids_cv = []
+ indices_cv = []
+ for icv in xrange(options.ncv):
+ #split data into training and test set
+ seqs_tr, seqs_te = split_cv_list(cvlist, icv, seqs)
+ labs_tr, labs_te = split_cv_list(cvlist, icv, labels)
+ sids_tr, sids_te = split_cv_list(cvlist, icv, sids)
+ indices_tr, indices_te = split_cv_list(cvlist, icv, range(len(seqs)))
+
+ #train SVM
+ feats_tr = get_features(seqs_tr, options)
+ kernel_tr = get_kernel(feats_tr, options)
+ svm_cv = svm_learn(kernel_tr, labs_tr, options)
+
+ preds_cv = preds_cv + svm_classify(seqs_te, svm_cv, kernel_tr, feats_tr, options)
+
+ labels_cv = labels_cv + labs_te
+ sids_cv = sids_cv + sids_te
+ indices_cv = indices_cv + indices_te
+
+ output_cvpred = options.outputname + "_cvpred.out"
+ prediction_results = sorted(zip(indices_cv, sids_cv, preds_cv, labels_cv), key=lambda p: p[0])
+ save_predictions(output_cvpred, prediction_results, cvlist)
+
+ if options.posteriorp:
+ A, B = LMAI(preds_cv, labels_cv, labels_cv.count(-1), labels_cv.count(1))
+
+ if options.quiet == False:
+ sys.stderr.write('Estimated Parameters:\n')
+ sys.stderr.write(' A: ' + str(A) + '\n')
+ sys.stderr.write(' B: ' + str(B) + '\n')
+
+ if options.quiet == False:
+ sys.stderr.write('..SVM weights\n')
+
+ feats = get_features(seqs, options)
+ kernel = get_kernel(feats, options)
+ svm = svm_learn(kernel, labels, options)
+ jj = get_feature_counts(svm, feats, options)
+ w = get_weights(svm, feats, options)
+ b = svm.get_bias()
+
+ save_weights(w, b, A, B, options)
+
+if __name__=='__main__': main()
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/libkmersvm.pyc
Binary file kmersvm/scripts/libkmersvm.pyc has changed
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/scripts/nullseq_generate.py
--- a/kmersvm/scripts/nullseq_generate.py Mon Aug 20 21:42:29 2012 -0400
+++ b/kmersvm/scripts/nullseq_generate.py Sun Jun 16 18:06:14 2013 -0400
@@ -71,8 +71,7 @@
def sample_sequences(positions, buildname, basedir, options):
"""
"""
- rpt_err = options.rpt_err
- gc_err = options.gc_err
+ max_fails = 20
max_trys = options.max_trys
norpt = options.norpt
nogc = options.nogc
@@ -121,6 +120,12 @@
else:
count = options.count
+ #initialize parameters
+ #added by dlee 2/17/13
+ ncfails = 0
+ rpt_err = options.rpt_err
+ gc_err = options.gc_err
+
sampled_positions = []
while len(sampled_positions) < count:
sampled_prof = random.choice(profiles)
@@ -128,6 +133,15 @@
sampled_gc = sampled_prof[2]
sampled_rpt = sampled_prof[3]
+ #relax rpt_err and gc_err if it keeps failing to sample a region
+ #added by dlee 2/17/13
+ if ncfails >= max_fails:
+ if options.quiet == False:
+ sys.stderr.write("reached max_fail. relax gc and rpt err criteria\n")
+ ncfails = 0
+ rpt_err += 0.01
+ gc_err += 0.01
+
rpt_err_allowed = int(rpt_err*sampled_len)
gc_err_allowed = int(gc_err*sampled_len)
trys = 0
@@ -156,9 +170,17 @@
sampled_positions.append((chrom, pos, pos_e))
+ #reset the counter of consecutive fails
+ #added by dlee 2/17/13
+ ncfails = 0
+
#print trys, chrom, pos, pos_e, sampled_len, pos_rpt, sampled_rpt, pos_gc, sampled_gc
break
else:
+ #increase the counter of consecutive fails
+ #added by dlee 2/17/13
+ ncfails += 1
+
if options.quiet == False:
sys.stderr.write(' '.join(["fail to sample from", \
"len=", str(sampled_len), \
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/tomtom.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/kmersvm/tomtom.xml Sun Jun 16 18:06:14 2013 -0400
@@ -0,0 +1,84 @@
+
+
+ Tomtom tool for motif searching
+ /home/galaxy/meme/bin/tomtom -no-ssc -internal -text -verbosity 1 -thresh $thresh
+ #if str($cut.cut_choice) == 'e.value':
+ -evalue
+ #end if
+
+ #if str($dist.dist) == 'ed':
+ -dist ed
+ #elif str($dist.dist) == 'sw':
+ -dist sandelin
+ #else
+ -dist pearson
+ #end if
+
+ $input1 /home/galaxy/meme/db/combined_db.meme > tomtom_out.txt
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Tomtom is a tool for comparing a DNA motif to a database of known motifs. For an in-depth explanation of the Tomtom software see here_.
+
+----
+
+**Recommended Settings**
+
+We recommend most users use the Tomtom defaults of q-value for score, the cutoff of 0.5 and the Pearson correlation coefficient for distance metric.
+
+----
+
+**Parameters**
+
+We offer users the option of choosing which distance metric is used to find matching motifs. Specifically, we offer the Pearson correlation coefficient, the Euclidean distance and the Sandelin-Wasserman function.
+
+ * The Pearson correlation coefficient measures the similarity between columns of position weight matrices (PWMs).
+
+ * The Euclidean distance can be thought of as the length of the straight line between two PWMs.
+
+ * The Sandelin-Wasserman function sums the column-wise differences between PWMs.
+
+We also offer the choice of E-value and q-value to threshold the results returned by Tomtom.
+
+ * The E-value controls the expected number of false positives and can be any number.
+
+ * The q-value controls the false discovery rate and is a number between 0 and 1.
+
+----
+
+Note that at this time we only offer Tomtom output in txt format.
+
+----
+
+**Citation**
+
+If you use this tool, please cite: Shobhit Gupta, JA Stamatoyannopoulos, Timothy Bailey and William Stafford Noble, "Quantifying similarity between motifs", Genome Biology, 8(2):R24, 2007.
+
+.. _here: http://meme.nbcr.net/meme/tomtom-intro.html
+
+
+
diff -r 1aea7c1a9ab1 -r fd740d515502 kmersvm/train.xml
--- a/kmersvm/train.xml Mon Aug 20 21:42:29 2012 -0400
+++ b/kmersvm/train.xml Sun Jun 16 18:06:14 2013 -0400
@@ -47,8 +47,12 @@
-
-
+
+
+
+
+
+
@@ -79,11 +83,27 @@
Takes as input 2 FASTA files, 1 of positive sequences and 1 of negative sequences. Produces 2 outputs:
- A) Weights: list of sequences of length K ranked by score and posterior probability for that score.
+ A) Weights: list of sequences of length K ranked by score.
- B) Predictions: results of N-fold cross validation
+ B) Predictions: results of N-fold cross validation.
----
+
+**Recommended Settings**
+
+Kernel: Spectrum
+
+Kmer length: 6
+
+N-Fold Cross-Validation: 5
+
+Weight: We recommend letting the Positive Set Weight be selected automatically, unless it has been separately optimized.
+
+Regularization Parameter C: We recommend values between 0.1 and 1.
+
+Precision Parameter E: We recommend using the default and staying below 0.1.
+
+----
**Parameters**
@@ -91,8 +111,9 @@
A) Spectrum Kernel: Analyzes a sequence using strings of length K.
- B) Weighted Spectrum Kernel: Analyzes a sequence using strings of range of lengths K1 - Kn.
-
+ B) Weighted Spectrum Kernel: Analyzes a sequence using strings of range of lengths K_min - K_max.
+
+
N-Fold Cross Validation: Number of partitions of training data used for cross validation.
Weight: Increases importance of positive data (increase if positive sets are very trustworthy or for training with very large negative sequence sets).
@@ -100,7 +121,7 @@
Regularization Parameter: Penalty for misclassification. Trade-off is overfitting (high parameter) versus high error rate (low parameter).
Precision Parameter: Insensitivity zone. Affects precision of SVM by altering number of support vectors used.
-
+
----
**Example**