changeset 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
files ._BSseeker2 BSseeker2/._AUTHORS BSseeker2/._FilterReads.py BSseeker2/._README.md BSseeker2/._RELEASE_NOTES BSseeker2/._bs_align BSseeker2/._bs_index BSseeker2/._bs_seeker2-align.py BSseeker2/._bs_seeker2-build.py BSseeker2/._bs_seeker2-call_methylation.py BSseeker2/._bs_utils BSseeker2/._galaxy BSseeker2/AUTHORS BSseeker2/FilterReads.py BSseeker2/LICENSE BSseeker2/README.md BSseeker2/RELEASE_NOTES BSseeker2/bs_align/.___init__.py BSseeker2/bs_align/._bs_align_utils.py BSseeker2/bs_align/._bs_pair_end.py BSseeker2/bs_align/._bs_rrbs.py BSseeker2/bs_align/._bs_single_end.py BSseeker2/bs_align/._output.py BSseeker2/bs_align/__init__.py BSseeker2/bs_align/bs_align_utils.py BSseeker2/bs_align/bs_pair_end.py BSseeker2/bs_align/bs_rrbs.py BSseeker2/bs_align/bs_single_end.py BSseeker2/bs_align/output.py BSseeker2/bs_index/.___init__.py BSseeker2/bs_index/._rrbs_build.py BSseeker2/bs_index/._wg_build.py BSseeker2/bs_index/__init__.py BSseeker2/bs_index/rrbs_build.py BSseeker2/bs_index/wg_build.py BSseeker2/bs_seeker2-align.py BSseeker2/bs_seeker2-build.py BSseeker2/bs_seeker2-call_methylation.py BSseeker2/bs_utils/.___init__.py BSseeker2/bs_utils/._utils.py BSseeker2/bs_utils/__init__.py BSseeker2/bs_utils/utils.py BSseeker2/galaxy/.___init__.py BSseeker2/galaxy/._bs_seeker2_wrapper.py BSseeker2/galaxy/._bs_seeker2_wrapper.xml BSseeker2/galaxy/__init__.py BSseeker2/galaxy/bs_seeker2_wrapper.py BSseeker2/galaxy/bs_seeker2_wrapper.xml
diffstat 48 files changed, 5719 insertions(+), 0 deletions(-)
Binary file ._BSseeker2 has changed
Binary file BSseeker2/._AUTHORS has changed
Binary file BSseeker2/._FilterReads.py has changed
Binary file BSseeker2/._README.md has changed
Binary file BSseeker2/._RELEASE_NOTES has changed
Binary file BSseeker2/._bs_align has changed
Binary file BSseeker2/._bs_index has changed
Binary file BSseeker2/._bs_seeker2-align.py has changed
Binary file BSseeker2/._bs_seeker2-build.py has changed
Binary file BSseeker2/._bs_seeker2-call_methylation.py has changed
Binary file BSseeker2/._bs_utils has changed
Binary file BSseeker2/._galaxy has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/AUTHORS	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,7 @@
+BS-Seeker2 authors
+
+Pao-Yang Chen <paoyang@gate.sinica.edu.tw> 
+Petko Fiziev <peplafi@gmail.com>
+Weilong Guo <guoweilong@gmail.com>
+
+May 2013
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/FilterReads.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,237 @@
+#!/usr/bin/python
+
+import sys
+import os
+import os.path
+import re
+
+# Filter functions for each supported input format; each keeps only the first occurrence of every distinct read
+
+# ===========================================
+# 
+def FilterFastq (infile, outfile) :
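+    # Keep only the first occurrence of each distinct read sequence
+    # (the 2nd line of every 4-line FastQ record); duplicates are dropped.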
+    try:
+        INFILE = open(infile, 'r')
+    except IOError:
+        print "\n[Error]:\n\t File cannot be open: ", infile;
+        exit(-1)
+    try:
+        OUTFILE = open(outfile, 'w')
+    except IOError:
+        print "\n[Error]:\n\t File cannot be open: ", outfile;
+        exit(-1)
+    Dict = {}
+    count = 0
+    total = 0
+    while 1 :
+        line1 = INFILE.readline()
+        if not line1 :
+            break
+        line2 = INFILE.readline()
+        line3 = INFILE.readline()
+        line4 = INFILE.readline()
+        total += 1
+ #       print line2
+        if ( line2 in Dict ) :
+            Dict[line2] += 1
+        else :
+            Dict[line2] = 1
+            OUTFILE.write(line1)
+            OUTFILE.write(line2)
+            OUTFILE.write(line3)
+            OUTFILE.write(line4)
+            count += 1
+    INFILE.close()
+    OUTFILE.close()
+    print "Count of total raw reads: ", total
+    print "Count of reads left: ", count
+    print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
+
+
+# ===============================
+#
+
+def FilterSequence (infile, outfile) :
+    try:
+        INFILE = open(infile, 'r')
+    except IOError:
+        print "\n[Error]:\n\t File cannot be open: ", infile;
+        exit(-1)
+    try:
+        OUTFILE = open(outfile, 'w')
+    except IOError:
+        print "\n[Error]:\n\t File cannot be open: ", outfile;
+        exit(-1)
+    Dict = {}
+    count = 0
+    total = 0
+    for line in INFILE:
+        line = line.strip()
+        total += 1
+        if ( line in Dict ) :
+            Dict[line] += 1
+        else :
+            Dict[line] = 1
+            count += 1
+            OUTFILE.write(line + "\n")
+    INFILE.close()
+    OUTFILE.close()
+    print "Count of total raw reads: ", total
+    print "Count of reads left: ", count
+    print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
+
+# ===============================
+# SN603   WA047   6       1101    41.40   99.10   0       1       .GGGA.......TTAG..............       @SZ__@@@@@@@RR__@@@@@@@@@@@@@@       0
+#
+def FilterQseq (infile, outfile, keepquality) :
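+    # Column 9 of a qseq line holds the read sequence and column 11 the
+    # chastity/PF flag ("1" = passed filtering); unless -k is given, reads
+    # with flag "0" are discarded before deduplication.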
+    if keepquality :
+        print "User specified '-k' and read quality will not be considered"
+    else :
+        print "Reads with PF=0 will be filtered"
+
+    try:
+        INFILE = open(infile, 'r')
+    except IOError:
+        print "\n[Error]:\n\t File cannot be open: ", infile;
+        exit(-1)
+    try:
+        OUTFILE = open(outfile, 'w')
+    except IOError:
+        print "\n[Error]:\n\t File cannot be open: ", outfile;
+        exit(-1)
+    Dict = {}
+    count = 0
+    total = 0
+    for line in INFILE:
+        tokens = line.strip().split()
+        total += 1
+        if keepquality or tokens[10] == "1" :
+            if ( tokens[8] in Dict ) :
+                Dict[tokens[8]] += 1
+            else :
+                Dict[tokens[8]] = 1
+                count += 1
+                OUTFILE.write(line)
+
+    INFILE.close()
+    OUTFILE.close()
+    print "Count of total raw reads: ", total
+    print "Count of reads left: ", count
+    print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
+
+
+# ===============================
+#
+
+def FilterFasta (infile, outfile) :
+    try:
+        INFILE = open(infile, 'r')
+    except IOError:
+        print "\n[Error]:\n\t File cannot be open: ", infile;
+        exit(-1)
+    try:
+        OUTFILE = open(outfile, 'w')
+    except IOError:
+        print "\n[Error]:\n\t File cannot be open: ", outfile;
+        exit(-1)
+    Dict = {}
+    count = 0
+    total = 0
+    name = ""
+    read = ""
+    for line in INFILE:
+        line = line.strip()
+        if line == "" :
+            continue
+        if (line[0] == '>') :
+            total += 1
+            if ( name != "" ) :
+                if ( read in Dict ) :
+                    Dict[read] += 1
+                else :
+                    Dict[read] = 1
+                    count += 1
+                    OUTFILE.write(">" + name + "\n")
+                    OUTFILE.write(read + "\n")
+            name = line[1:]
+            read = ""
+        else :
+            read = read + line
+    if (name != "") :
+        if ( read in Dict ) :
+            Dict[read] += 1
+        else :
+            Dict[read] = 1
+            count += 1
+            OUTFILE.write(">" + name + "\n")
+            OUTFILE.write(read + "\n")
+    print "Count of total raw reads: ", total
+    print "Count of reads left: ", count
+    print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
+
+
+
+# ==============================
+#  Decide branch functions for different format
+
+def FilterReads (infile, outfile, keepquality):
+    try:
+        INFILE = open(infile, 'r')
+    except IOError:
+        print "\n[Error]:\n\t File cannot be open: ", infile;
+        exit(-1)
+    # Detect the input format from the first line of the file
+    line = INFILE.readline()
+    tokens = line.strip().split()
+    input_format="";
+    if line[0]=="@" : 
+        input_format = "FastQ"
+        n_fastq = 0
+    elif len(tokens) == 1 and line[0] != ">":
+        input_format = "sequence"
+    elif len(tokens) == 11:
+        input_format = "qseq"
+    elif line[0]==">" :
+        input_format = "fasta"
+    INFILE.close()
+
+    print "Input file format detected: ", input_format
+
+    if input_format == "FastQ" :
+        FilterFastq(infile, outfile)
+    elif input_format == "sequence" :
+        FilterSequence(infile, outfile)
+    elif input_format == "qseq" :
+        FilterQseq(infile, outfile, keepquality)
+    elif input_format == "fasta" :
+        FilterFasta(infile, outfile)
+
+
+from optparse import OptionParser
+
+# ===========================================
+def main():
+    usage = "Usage: %prog -i <input> -o <output> [-k]\n" \
+            "Author : Guo, Weilong; guoweilong@gmail.com; 2012-11-10\n" \
+            "Last Update: 2013-04-01" \
+            "Description: Unique reads for qseq/fastq/fasta/sequencce,\n" \
+            "       and filter low quality reads in qseq file."
+    parser = OptionParser(usage)
+    parser.add_option("-i", dest="infile", 
+                  help="Name of the input qseq/fastq/fasta/sequence file", metavar="FILE")
+    parser.add_option("-o", dest="outfile",
+                  help="Name of the output file", metavar="FILE")
+    parser.add_option("-k", dest="keepquality", default = False, action = "store_true",
+                  help="Would not filter low quality reads if specified")
+    (options, args) = parser.parse_args()
+    
+    if (options.infile is None) or (options.outfile is None) :
+        parser.print_help()
+        exit(-1)
+    FilterReads(options.infile, options.outfile, options.keepquality)
+
+
+# ===========================================
+if __name__ == "__main__":
+    main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/LICENSE	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2012-2013 BS-Seeker2 Developers.  All  Rights  Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/README.md	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,590 @@
+BS-Seeker2
+=========
+
+BS-Seeker2 (BS Seeker 2) performs accurate and fast mapping of bisulfite-treated short reads. BS-Seeker2 is an updated version of BS-Seeker.
+
+0. Availability
+============
+
+Homepage of [BS-Seeker2](http://pellegrini.mcdb.ucla.edu/BS_Seeker2/).
+
+The source code for this package is available from
+[https://github.com/BSSeeker/BSseeker2](https://github.com/BSSeeker/BSseeker2).
+Also, you can use an instance of BS-Seeker 2 in Galaxy from [http://galaxy.hoffman2.idre.ucla.edu](http://galaxy.hoffman2.idre.ucla.edu).
+(Label: "NGS: Methylation Mapping"/"Methylation Map with BS Seeker2")
+
+
+1. Notable new features
+============
+* Reduced index for RRBS, accelerating mapping and increasing mappability
+* Local alignment with Bowtie 2, increasing mappability
+
+2. Other features
+============
+* Supported library types
+	- whole-genome bisulfite sequencing (WGBS)
+	- reduced representation bisulfite sequencing (RRBS)
+
+* Supported formats for input file
+	- [fasta](http://en.wikipedia.org/wiki/FASTA_format)
+	- [fastq](http://en.wikipedia.org/wiki/FASTQ_format)
+	- [qseq](http://jumpgate.caltech.edu/wiki/QSeq)
+	- pure sequence (one-line one-sequence)
+
+* Supported alignment tools
+	- [bowtie](http://bowtie-bio.sourceforge.net/index.shtml) : Single-seed
+	- [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) : Multiple-seed, gapped-alignment
+		- [local alignment](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#local-alignment-example)
+		- [end-to-end alignment](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#end-to-end-alignment-example)
+	- [SOAP](http://soap.genomics.org.cn/)
+
+* Supported formats for mapping results
+	- [BAM](http://genome.ucsc.edu/FAQ/FAQformat.html#format5.1)
+	- [SAM](http://samtools.sourceforge.net/)
+	- [BS-seeker 1](http://pellegrini.mcdb.ucla.edu/BS_Seeker/USAGE.html)
+
+3. System requirements
+============
+
+* Linux or Mac OS platform
+* One of the following aligners
+  - [bowtie](http://bowtie-bio.sourceforge.net/)
+  - [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/) (recommended)
+  - [soap](http://soap.genomics.org.cn/)
+* [Python](http://www.python.org/download/) (Version 2.6 +)
+
+  (It is normally pre-installed on Linux. Type "python -V" to see the installed version.)
+
+* The [pysam](http://code.google.com/p/pysam/) package is needed.
+
+  (Read "Questions & Answers" if you have problems installing this package.)
+
+
+
+4. Modules' descriptions
+============
+
+(0) FilterReads.py
+------------
+
+Optional and independent module.
+Some reads may be heavily amplified during PCR. This script helps you get unique reads before mapping; you can decide whether or not to filter your reads first.
+
+##Usage :
+
+	$ python FilterReads.py
+	Usage: FilterReads.py -i <input> -o <output> [-k]
+	Author : Guo, Weilong; guoweilong@gmail.com; 2012-11-10
+	Unique reads for qseq/fastq/fasta/sequence, and filter
+	low quality reads in qseq file.
+
+	Options:
+	  -h, --help  show this help message and exit
+	  -i FILE     Name of the input qseq/fastq/fasta/sequence file
+	  -o FILE     Name of the output file
+	  -k          Would not filter low quality reads if specified
+
+
+##Tip :
+
+- This step is not suggested for RRBS libraries, as RRBS reads are likely to start at the same cut sites even when they are not PCR duplicates.
+
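+- The core of the deduplication is simply keeping the first occurrence of each distinct sequence. A minimal Python sketch of the idea (hypothetical file names; the real script also handles fastq/qseq/fasta and quality filtering):
+
+        seen = {}
+        fout = open("unique.seq", "w")
+        for line in open("reads.seq"):
+            read = line.strip()
+            if read not in seen:  # first time this sequence is seen
+                seen[read] = 1
+                fout.write(read + "\n")
+        fout.close()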
+
+(1) bs_seeker2-build.py
+------------
+
+Module to build the index for BS-Seeker2.
+
+
+##Usage :
+
+    $ python bs_seeker2-build.py -h
+    Usage: bs_seeker2-build.py [options]
+
+    Options:
+      -h, --help            show this help message and exit
+      -f FILE, --file=FILE  Input your reference genome file (fasta)
+      --aligner=ALIGNER     Aligner program to perform the analysis: bowtie,
+                            bowtie2, soap [Default: bowtie2]
+      -p PATH, --path=PATH  Path to the aligner program. Defaults:
+                            bowtie: /u/home/mcdb/weilong/install/bowtie-0.12.8
+                            bowtie2:
+                            /u/home/mcdb/weilong/install/bowtie2-2.0.0-beta7
+                            soap: /u/home/mcdb/weilong/install/soap2.21release/
+      -d DBPATH, --db=DBPATH
+                            Path to the reference genome library (generated in
+                            preprocessing genome) [Default: /u/home/mcdb/weilong/i
+                            nstall/BSseeker2/bs_utils/reference_genomes]
+      -v, --version         show version of BS-Seeker2
+
+      Reduced Representation Bisulfite Sequencing Options:
+        Use these options in conjunction with -r [--rrbs]
+
+        -r, --rrbs          Build index specially for Reduced Representation
+                            Bisulfite Sequencing experiments. Genome other than
+                            certain fragments will be masked. [Default: False]
+        -l LOW_BOUND, --low=LOW_BOUND
+                            lower bound of fragment length (excluding recognition
+                            sequence such as C-CGG) [Default: 40]
+        -u UP_BOUND, --up=UP_BOUND
+                            upper bound of fragment length (excluding recognition
+                            sequence such as C-CGG ends) [Default: 500]
+        -c CUT_FORMAT, --cut-site=CUT_FORMAT
+                            Cut sites of restriction enzyme. Ex: MspI(C-CGG),
+                            Mael:(C-TAG), double-enzyme MspI&Mael:(C-CGG,C-TAG).
+                            [Default: C-CGG]
+
+##Example
+
+* Build genome index for WGBS using bowtie, path of bowtie should be included in $PATH
+
+        python bs_seeker2-build.py -f genome.fa --aligner=bowtie
+
+* Build genome index for RRBS with default parameters specifying the path for bowtie2
+
+        python bs_seeker2-build.py -f genome.fa --aligner=bowtie2 -p ~/install/bowtie2-2.0.0-beta7/ -r
+
+* Build genome index for RRBS library using bowtie2, with fragment lengths in [40bp, 400bp]
+
+        python bs_seeker2-build.py -f genome.fa -r -l 40 -u 400 --aligner=bowtie2
+
+* Build genome index for RRBS library for double-enzyme : MspI (C-CGG) & ApeKI (G-CWGC, where W=A|T, see [IUPAC code](http://www.bioinformatics.org/sms/iupac.html))
+
+        python bs_seeker2-build.py -f genome.fa -r -c C-CGG,G-CWGC
+
+##Tips:
+
+- The index built by BS-Seeker2 is different from the index for BS-Seeker 1.
+For RRBS, you need to specify "-r" in the parameters. Also, you need to specify LOW_BOUND and UP_BOUND for the range of fragment lengths according to your protocol.
+
+- The fragment length is different from the read length. Fragments refer to the DNA fragments obtained by the size-selection step (i.e. gel cut or AMPure beads). Fragment lengths are expected to fall in a range, such as [50bp, 250bp].
+
+- The indexes for RRBS and WGBS are different. Also, indexes for RRBS are specific for fragment length parameters (LOW_BOUND and UP_BOUND).
+
+
+
+
+(2) bs_seeker2-align.py
+------------
+
+Module to map reads on 3-letter converted genome.
+
+##Usage :
+
+    $ python ~/install/BSseeker2/bs_seeker2-align.py -h
+    Usage: bs_seeker2-align.py [options]
+
+    Options:
+      -h, --help            show this help message and exit
+
+      For single end reads:
+        -i INFILE, --input=INFILE
+                            Input your read file name (FORMAT: sequences,
+                            fastq, qseq,fasta)
+
+      For pair end reads:
+        -1 FILE, --input_1=FILE
+                            Input your read file end 1 (FORMAT: sequences,
+                            qseq, fasta, fastq)
+        -2 FILE, --input_2=FILE
+                            Input your read file end 2 (FORMAT: sequences,
+                            qseq, fasta, fastq)
+        --minins=MIN_INSERT_SIZE
+                            The minimum insert size for valid paired-end
+                            alignments [Default: -1]
+        --maxins=MAX_INSERT_SIZE
+                            The maximum insert size for valid paired-end
+                            alignments [Default: 400]
+
+      Reduced Representation Bisulfite Sequencing Options:
+        -r, --rrbs          Process reads from Reduced Representation Bisulfite
+                            Sequencing experiments
+        -c pattern, --cut-site=pattern
+                            Cutting sites of restriction enzyme. Ex: MspI(C-CGG),
+                            Mael:(C-TAG), double-enzyme MspI&Mael:(C-CGG,C-TAG).
+        -L RRBS_LOW_BOUND, --low=RRBS_LOW_BOUND
+                            lower bound of fragment length (excluding C-CGG ends)
+                            [Default: 40]
+        -U RRBS_UP_BOUND, --up=RRBS_UP_BOUND
+                            upper bound of fragment length (excluding C-CGG ends)
+                            [Default: 500]
+
+      General options:
+        -t TAG, --tag=TAG   [Y]es for undirectional lib, [N]o for directional
+                            [Default: N]
+        -s CUTNUMBER1, --start_base=CUTNUMBER1
+                            The first base of your read to be mapped [Default: 1]
+        -e CUTNUMBER2, --end_base=CUTNUMBER2
+                            The last cycle number of your read to be mapped
+                            [Default: 200]
+        -a FILE, --adapter=FILE
+                            Input text file of your adaptor sequences (to be
+                            trimmed from the 3' end of the reads). Input 1 seq for
+                            dir. lib., 2 seqs for undir. lib. One line per
+                            sequence
+        --am=ADAPTER_MISMATCH
+                            Number of mismatches allowed in adaptor [Default: 1]
+        -g GENOME, --genome=GENOME
+                            Name of the reference genome (the same as the
+                            reference genome file in the preprocessing step) [ex.
+                            chr21_hg18.fa]
+        -m INT_NO_MISMATCHES, --mismatches=INT_NO_MISMATCHES
+                            Number of mismatches in one read [Default: 4]
+        --aligner=ALIGNER   Aligner program to perform the analysis: bowtie,
+                            bowtie2, soap [Default: bowtie2]
+        -p PATH, --path=PATH
+                            Path to the aligner program. Defaults:
+                            bowtie: /u/home/mcdb/weilong/install/bowtie-0.12.8
+                            bowtie2:
+                            /u/home/mcdb/weilong/install/bowtie2-2.0.0-beta7
+                            soap: /u/home/mcdb/weilong/soap2.21release/
+        -d DBPATH, --db=DBPATH
+                            Path to the reference genome library (generated in
+                            preprocessing genome) [Default: /u/home/mcdb/weilong/i
+                            nstall/BSseeker2/bs_utils/reference_genomes]
+        -l NO_SPLIT, --split_line=NO_SPLIT
+                            Number of lines per split (the read file will be
+                            split into small files for mapping; the results
+                            will be merged) [Default: 4000000]
+        -o OUTFILE, --output=OUTFILE
+                            The name of output file [INFILE.bs(se|pe|rrbs)]
+        -f FORMAT, --output-format=FORMAT
+                            Output format: bam, sam, bs_seeker1 [Default: bam]
+        --no-header         Suppress SAM header lines [Default: False]
+        --temp_dir=PATH     The path to your temporary directory [Default: /tmp]
+        --XS=XS_FILTER      Filter definition for tag XS, format X,Y. X=0.8 and
+                            Y=5 indicate that for one read, if #(mCH sites)/#(all
+                            CH sites)>0.8 and #(mCH sites)>5, then tag XS=1;
+                            otherwise tag XS=0. [Default: 0.5,5]
+        --multiple-hit      Output reads with multiple hits to
+                            file "Multiple_hit.fa"
+        -v, --version       show version of BS-Seeker2
+
+      Aligner Options:
+        You may specify any additional options for the aligner. You just have
+        to prefix them with --bt- for bowtie, --bt2- for bowtie2, --soap- for
+        soap, and BS Seeker will pass them on. For example: --bt-p 4 will
+        increase the number of threads for bowtie to 4, --bt--tryhard will
+        instruct bowtie to try as hard as possible to find valid alignments
+        when they exist, and so on. Be sure that you know what you are doing
+        when using these options! Also, we don't do any validation on the
+        values.
+
+
+
+##Examples :
+
+* Align from fasta format with bowtie2 (local alignment) for whole genome, allowing 3 mismatches
+
+        python bs_seeker2-align.py -i WGBS.fa -m 3 --aligner=bowtie2 -o WGBS.bam -f bam -g genome.fa
+
+* Align from fasta format for RRBS with bowtie, default parameters for RRBS fragments
+
+        python bs_seeker2-align.py -i RRBS.fa --aligner=bowtie -o RRBS.sam -f sam -g genome.fa -r -a adapter.txt
+
+* Align from qseq format for RRBS with bowtie2 (end-to-end), specifying fragment lengths in [40bp, 400bp]
+
+        python bs_seeker2-align.py -i RRBS.qseq --aligner=bowtie2 --bt2--end-to-end -o RRBS.bam -f bam -g genome.fa -r --low=40 --up=400 -a adapter.txt
+
+The parameters '--low' and '--up' should be the same as the corresponding parameters used when building the genome index.
+
+
+
+##Input file:
+
+- Adapter.txt (example) :
+
+            AGATCGGAAGAGCACACGTC
+
+
+##Output files:
+
+- SAM file
+
+    Sample:
+
+        10918   0       chr1    133859922       255     100M    *       0       0       TGGTTGTTTTTGTTATAGTTTTTTGTTGTAGAGTTTTTTTTGGAAAGTTGTGTTTATTTTTTTTTTTGTTTGGGTTTTGTTTGAAAGGGGTGGATGAGTT        *       XO:Z:+FW        XS:i:0  NM:i:3  XM:Z:x--yx-zzzy--y--y--zz-zyx-yx-y--------z------------x--------z--zzz----y----y--x-zyx--------y--------z   XG:Z:-C_CGGCCGCCCCTGCTGCAGCCTCCCGCCGCAGAGTTTTCTTTGGAAAGTTGCGTTTATTTCTTCCCTTGTCTGGGCTGCGCCCGAAAGGGGCAGATGAGTC_AC
+
+
+    Format descriptions:
+
+        BS-Seeker2 specific tags:
+        XO : orientation of the mapping, forward or reverse
+        XS : 1 when the read is recognized as not fully converted by bisulfite treatment, otherwise 0
+        XM : methylation call string, one character per position:
+                X: methylated CG
+                x: un-methylated CG
+                Y: methylated CHG
+                y: un-methylated CHG
+                Z: methylated CHH
+                z: un-methylated CHH
+        XG : genome sequences, with 2bp extended on both ends, from 5' to 3'
+        YR : tag only for RRBS, serial id of mapped fragment
+        YS : tag only for RRBS, start position of mapped fragment
+        YE : tag only for RRBS, end position of mapped fragment
+
+        Note:
+            For reads mapped on the Crick (minus) strand, the 10th column in the SAM file is not the original read but its reverse complement.
+
+
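+As an illustration, the XS tag described under the "--XS" option can be computed from the XM string. A minimal Python sketch (the function name is hypothetical; the default threshold 0.5,5 is assumed):
+
+        def xs_tag(xm, pct=0.5, count=5):
+            # mCH = methylated CHG/CHH calls; CH = all CHG/CHH calls
+            mch = xm.count("Y") + xm.count("Z")
+            ch = mch + xm.count("y") + xm.count("z")
+            if ch > 0 and float(mch) / ch > pct and mch > count:
+                return 1  # read looks not fully bisulfite-converted
+            return 0
+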
+##Tips:
+
+- Removing adapters is recommended.
+
+	If you don't know the adapter sequence, please ask the person who generated the library for you.
+
+	If you are too shy to ask, you can try de novo motif-finding tools (such as [DME](http://cb1.utdallas.edu/dme/index.htm) and [MEME](http://meme.nbcr.net/meme/cgi-bin/meme.cgi)) to find the enriched pattern in a sample of reads (e.g. the first 1000).
+
+	Of course, you can also use other tools (such as [cutadapt](http://code.google.com/p/cutadapt/)) to remove adapters first.
+
+- It's always safer to use a wider range for the fragment length.
+
+	For example, if 95% of reads come from fragments with lengths in [50bp, 250bp], you'd better choose [40bp, 300bp].
+
+
+(3) bs_seeker2-call_methylation.py
+------------
+
+
+This module calls methylation levels from the mapping result.
+
+##Usage:
+
+        $ python bs_seeker2-call_methylation.py -h
+        Usage: bs_seeker2-call_methylation.py [options]
+
+        Options:
+          -h, --help            show this help message and exit
+          -i INFILE, --input=INFILE
+                                BAM output from bs_seeker2-align.py
+          -d DBPATH, --db=DBPATH
+                                Path to the reference genome library (generated in
+                                preprocessing genome) [Default: /u/home/mcdb/weilong/i
+                                nstall/BSseeker2/bs_utils/reference_genomes]
+          -o OUTFILE, --output-prefix=OUTFILE
+                                The output prefix to create ATCGmap and wiggle files
+                                [INFILE]
+          --wig=OUTFILE         The output .wig file [INFILE.wig]
+          --CGmap=OUTFILE       The output .CGmap file [INFILE.CGmap]
+          --ATCGmap=OUTFILE     The output .ATCGmap file [INFILE.ATCGmap]
+          -x, --rm-SX           Remove reads with tag 'XS:i:1', which would be
+                                considered as not fully converted by bisulfite
+                                treatment [Default: False]
+          -r READ_NO, --read-no=READ_NO
+                                The least number of reads covering one site to be
+                                shown in wig file [Default: 1]
+          -v, --version         show version of BS-Seeker2
+
+
+##Example :
+
+- For WGBS (whole genome bisulfite sequencing):
+
+        python bs_seeker2-call_methylation.py -i WGBS.bam -o output --db /path/to/BSseeker2/bs_utils/reference_genomes/genome.fa_bowtie/
+
+- For RRBS:
+
+        python bs_seeker2-call_methylation.py -i RRBS.bam -o output --db /path/to/BSseeker2/bs_utils/reference_genomes/genome.fa_rrbs_40_400_bowtie2/
+
+- For RRBS, removing unconverted reads (with tag XS=1):
+
+        python bs_seeker2-call_methylation.py -x -i RRBS.bam -o output --db /path/to/BSseeker2/bs_utils/reference_genomes/genome.fa_rrbs_75_280_bowtie2/
+
+- For RRBS, only showing sites covered by at least 10 reads in the WIG file:
+
+        python bs_seeker2-call_methylation.py -r 10 -i RRBS.bam -o output --db /path/to/BSseeker2/bs_utils/reference_genomes/genome.fa_rrbs_75_280_bowtie2/
+
+
+The folder "genome.fa\_rrbs\_40\_500\_bowtie2" is built in the index-building step.
+
+##Output files:
+
+- wig file
+
+        Sample:
+
+        variableStep chrom=chr1
+        3000419	0.000000
+        3000423	-0.2
+        3000440	0.000000
+        3000588	0.5
+        3000593	-0.000000
+
+
+        Format descriptions:
+        WIG file format. A negative value in the 2nd column indicates a cytosine on the minus strand.
+
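+A minimal Python sketch for reading this strand encoding back out of a wig file (hypothetical file name):
+
+        for line in open("output.wig"):
+            if line.startswith("variableStep"):
+                continue
+            pos, value = line.split()
+            strand = "-" if value.startswith("-") else "+"
+            level = abs(float(value))
+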
+
+- CGmap file
+
+    Sample:
+
+        chr1	G	3000851	CHH	CC	0.1	1	10
+        chr1	C	3001624	CHG	CA	0.0	0	9
+        chr1	C	3001631	CG	CG	1.0	5	5
+
+    Format descriptions:
+
+        (1) chromosome
+        (2) nucleotide on Watson (+) strand
+        (3) position
+        (4) context (CG/CHG/CHH)
+        (5) dinucleotide-context (CA/CC/CG/CT)
+        (6) methylation-level = #-of-C / (#-of-C + #-of-T)
+        (7) #-of-C (methylated)
+        (8) #-of-C + #-of-T (all reads supporting C or T)
+
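+For example, the mean CpG methylation level can be computed from a CGmap file in a few lines. A minimal Python sketch (hypothetical file name; sites covered by fewer than 5 reads are skipped):
+
+        levels = []
+        for line in open("output.CGmap"):
+            f = line.split()
+            # f[3] = context, f[5] = methylation level, f[7] = coverage
+            if f[3] == "CG" and int(f[7]) >= 5:
+                levels.append(float(f[5]))
+        if levels:
+            print "mean CpG methylation: %.3f" % (sum(levels) / len(levels))
+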
+
+- ATCGmap file
+
+    Sample:
+
+        chr1	T	3009410	--	--	0	10	0	0	0	0	0	0	0	0	na
+        chr1	C	3009411	CHH	CC	0	10	0	0	0	0	0	0	0	0	0.0
+        chr1	C	3009412	CHG	CC	0	10	0	0	0	0	0	0	0	0	0.0
+        chr1	C	3009413	CG	CG	0	10	50	0	0	0	0	0	0	0	0.833333333333
+
+
+    Format descriptions:
+
+        (1) chromosome
+        (2) nucleotide on Watson (+) strand
+        (3) position
+        (4) context (CG/CHG/CHH)
+        (5) dinucleotide-context (CA/CC/CG/CT)
+
+        (6) - (10) plus strand
+        (6) # of reads from Watson strand mapped here, support A on Watson strand
+        (7) # of reads from Watson strand mapped here, support T on Watson strand
+        (8) # of reads from Watson strand mapped here, support C on Watson strand
+        (9) # of reads from Watson strand mapped here, support G on Watson strand
+        (10) # of reads from Watson strand mapped here, support N
+
+        (11) - (15) minus strand
+        (11) # of reads from Crick strand mapped here, support A on Watson strand and T on Crick strand
+        (12) # of reads from Crick strand mapped here, support T on Watson strand and A on Crick strand
+        (13) # of reads from Crick strand mapped here, support C on Watson strand and G on Crick strand
+        (14) # of reads from Crick strand mapped here, support G on Watson strand and C on Crick strand
+        (15) # of reads from Crick strand mapped here, support N
+
+        (16) methylation_level = #C/(#C+#T) = (C8+C14)/(C7+C8+C11+C14); "nan" means no reads support C/T at this position.
+
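+To illustrate formula (16), a minimal Python sketch that recomputes the methylation level from one ATCGmap line (the function name is hypothetical; list indices are the 0-based equivalents of the 1-based columns above):
+
+        def atcg_methylation(line):
+            c = line.split()
+            # numerator: columns (8)+(14); denominator: (7)+(8)+(11)+(14)
+            num = int(c[7]) + int(c[13])
+            denom = int(c[6]) + int(c[7]) + int(c[10]) + int(c[13])
+            return float(num) / denom if denom > 0 else None  # None ~ "nan"
+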
+
+
+Contact Information
+============
+
+If you still have questions about BS-Seeker2, find bugs, or have suggestions, please email guoweilong@gmail.com (Weilong Guo).
+
+
+
+Questions & Answers
+============
+
+(1) Speed-up your alignment
+
+Q: "It takes me days to do the alignment for one lane" ...
+
+A: Yes, alignment is time-consuming, especially as sequencing depth keeps increasing. An efficient way to align is:
+
+    i. cut the original sequence file into multiple small pieces;
+
+        Ex: split -l 4000000 input.fq
+
+    ii. align them in parallel;
+    iii. merge all the BAM files into a single one before running "bs_seeker2-call_methylation.py" (use the "samtools merge" command).
+
+        Ex: samtools merge out.bam in1.bam in2.bam in3.bam
+
+(2) read in BAM/SAM
+
+Q: Is the read sequence in the BAM/SAM file the same as my original one?
+
+A: NO. They are different for several reasons.
+
+    i. For RRBS, some reads are shorter because the adapters were trimmed
+    ii. For reads mapped on the Crick (-) strand, the reported sequence is the reverse complement of the original read
+
+(3) "Pysam" package related problem
+
+Q: I'm a normal (non-root) user on a Linux cluster and I can't install "pysam". I get the following error messages:
+
+
+        $ python setup.py install
+        running install
+        error: can't create or remove files in install directory
+        The following error occurred while trying to add or remove files in the
+        installation directory:
+            [Errno 13] Permission denied: '/usr/lib64/python2.6/site-packages/test-easy-install-26802.write-test'
+        ...
+
+
+A: You can ask the administrator of your cluster to install pysam. If you don't want to bother him/her, you can build your own Python and then install the "pysam" package. The following script could be helpful.
+
+
+        mkdir ~/install
+        cd ~/install/
+
+        # install python
+        wget http://www.python.org/ftp/python/2.7.4/Python-2.7.4.tgz # download the Python source
+        tar zxvf Python-2.7.4.tgz # decompress
+        cd Python-2.7.4
+        ./configure --prefix=`pwd`
+        make
+        make install
+
+        # Add the path of Python to $PATH
+        #  Please add the following line to file ~/.bashrc
+
+            export PATH=~/install/Python-2.7.4:$PATH
+
+        # save the ~/.bashrc file
+        source ~/.bashrc
+
+        # install pysam package
+        wget https://pysam.googlecode.com/files/pysam-0.7.4.tar.gz
+        tar zxvf pysam-0.7.4.tar.gz
+        cd pysam-0.7.4
+        python setup.py build
+        python setup.py install
+        # re-login the shell after finish installing pysam
+
+        # install BS-Seeker2
+        wget https://github.com/BSSeeker/BSseeker2/archive/master.zip
+        mv master BSSeeker2.zip
+        unzip BSSeeker2.zip
+        cd BSseeker2-master/
+
+
+(4) Run BS-Seeker2
+
+Q: Can I add the path of BS-Seeker2's *.py to the $PATH, so I can call
+BS-Seeker2 from anywhere?
+
+A: If you're using the "python" from path "/usr/bin/python", you can directly
+add the path of BS-Seeker2 in file "~/.bash_profile" (bash) or "~/.profile"
+(other shell) or "~/.bashrc" (per-interactive-shell startup).
+But if you are using python under other directories, you might need to modify
+BS-Seeker2's scripts first. For example, if your python path is "/my_python/python",
+please change the first line in "bs_seeker2-build.py", "bs_seeker2-align.py" and
+"bs_seeker2-call_methylation.py" to
+
+        #!/my_python/python
+
+Then add
+
+        export PATH=/path/to/BS-Seeker2/:$PATH
+
+to file "~/.bash_profile" (e.g.), and source the file:
+
+        source  ~/.bash_profile
+
+Then you can use BS-Seeker2 globally by typing:
+
+        bs_seeker2-build.py -h
+        bs_seeker2-align.py -h
+        bs_seeker2-call_methylation.py -h
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/RELEASE_NOTES	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,7 @@
+
+v2.0.3 - May 19, 2013
+1. Fix bug in utils.py
+2. Add UPDATE file
+
+
+
Binary file BSseeker2/bs_align/.___init__.py has changed
Binary file BSseeker2/bs_align/._bs_align_utils.py has changed
Binary file BSseeker2/bs_align/._bs_pair_end.py has changed
Binary file BSseeker2/bs_align/._bs_rrbs.py has changed
Binary file BSseeker2/bs_align/._bs_single_end.py has changed
Binary file BSseeker2/bs_align/._output.py has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_align/__init__.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,1 @@
+__author__ = 'pf'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_align/bs_align_utils.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,385 @@
+from bs_utils.utils import *
+import re
+
+
+BAM_MATCH = 0
+BAM_INS = 1
+BAM_DEL = 2
+BAM_SOFTCLIP = 4
+
+CIGAR_OPS = {'M' : BAM_MATCH, 'I' : BAM_INS, 'D' : BAM_DEL, 'S' : BAM_SOFTCLIP}
+
+
+def N_MIS(r,g):
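+    # Count mismatches between read r and genome g, ignoring N on either
+    # side; a T in the read over a C in the genome is bisulfite conversion,
+    # not a mismatch.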
+    mismatches = 0
+    if len(r)==len(g):
+        for i in xrange(len(r)):
+            if r[i] != g[i] and r[i] != "N" and g[i] != "N" and not(r[i] == 'T' and g[i] == 'C'):
+                mismatches += 1
+    return mismatches
+
+
+#----------------------------------------------------------------
+
+"""
+Example:
+========
+Read :       ACCGCGTTGATCGAGTACGTACGTGGGTC
+Adapter :    ....................ACGTGGGTCCCG
+========
+
+no_mismatch : the maximum number allowed for mismatches
+
+Algorithm: (allowing 1 mismatch)
+========
+-Step 1: 
+  ACCGCGTTGATCGAGTACGTACGTGGGTC
+  ||XX
+  ACGTGGGTCCCG
+-Step 2: 
+  ACCGCGTTGATCGAGTACGTACGTGGGTC
+   X||X
+  .ACGTGGGTCCCG
+-Step 3: 
+  ACCGCGTTGATCGAGTACGTACGTGGGTC
+    XX
+  ..ACGTGGGTCCCG
+-Step ...
+-Step N: 
+  ACCGCGTTGATCGAGTACGTACGTGGGTC
+                      |||||||||
+  ....................ACGTGGGTCCCG
+Success & return!
+========
+
+"""
+
+def RemoveAdapter ( read, adapter, no_mismatch ) :
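+    # Slide the read against the adapter: at the first offset i where the
+    # adapter (or its prefix, if the read ends first) matches with at most
+    # no_mismatch mismatches, trim the read at i; otherwise return it intact.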
+    lr = len(read)
+    la = len(adapter)
+    for i in xrange( lr - no_mismatch ) :
+        read_pos = i
+        adapter_pos = 0
+        count_no_mis = 0
+        while (adapter_pos < la) and (read_pos < lr) :
+            if (read[read_pos] == adapter[adapter_pos]) :
+                read_pos = read_pos + 1
+                adapter_pos = adapter_pos + 1
+            else :
+                count_no_mis = count_no_mis + 1
+                if count_no_mis > no_mismatch :
+                    break
+                else :
+                    read_pos = read_pos + 1
+                    adapter_pos = adapter_pos + 1
+        # while_end
+
+        if adapter_pos == la or read_pos == lr :
+            return read[:i]
+    # for_end
+    return read
+
+
+def Remove_5end_Adapter ( read, adapter, no_mismatch) :
+    lr = len(read)
+    la = len(adapter)
+    for i in xrange (la - no_mismatch) :
+        read_pos = 0
+        adapter_pos = i
+        count_no_mis = 0
+        while (adapter_pos < la) and (read_pos < lr) :
+            if (read[read_pos] == adapter[adapter_pos]) :
+                adapter_pos = adapter_pos + 1
+                read_pos = read_pos + 1
+            else :
+                count_no_mis = count_no_mis + 1
+                if count_no_mis > no_mismatch :
+                    break
+                else : 
+                    read_pos = read_pos + 1
+                    adapter_pos = adapter_pos + 1
+        # while_end
+        if adapter_pos == la :
+            return read[(la-i):]
+    # for_end
+    return read
+
+
+def next_nuc(seq, pos, n):
+    """ Returns the nucleotide that is n places from pos in seq. Skips gap symbols.
+    """
+    i = pos + 1
+    while i < len(seq):
+        if seq[i] != '-':
+            n -= 1
+            if n == 0: break
+        i += 1
+    if i < len(seq) :
+        return seq[i]
+    else :
+        return 'N'
+
+
+
+def methy_seq(read, genome):
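+    # Build the XM-style methylation call string: X/Y/Z = methylated
+    # CG/CHG/CHH, x/y/z = unmethylated, "-" = no cytosine call at this position.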
+    H = ['A', 'C', 'T']
+    m_seq = []
+    xx = "-"
+    for i in xrange(len(read)):
+
+        if genome[i] == '-':
+            continue
+
+        elif read[i] != 'C' and read[i] != 'T':
+            xx = "-"
+
+        elif read[i] == "T" and genome[i] == "C": #(unmethylated):
+            nn1 = next_nuc(genome, i, 1)
+            if nn1 == "G":
+                xx = "x"
+            elif nn1 in H :
+                nn2 = next_nuc(genome, i, 2)
+                if nn2 == "G":
+                    xx = "y"
+                elif nn2 in H :
+                    xx = "z"
+
+        elif read[i] == "C" and genome[i] == "C": #(methylated):
+            nn1 = next_nuc(genome, i, 1)
+
+            if nn1 == "G":
+                xx = "X"
+            elif nn1 in H :
+                nn2 = next_nuc(genome, i, 2)
+
+                if nn2 == "G":
+                    xx = "Y"
+                elif nn2 in H:
+                    xx = "Z"
+        else:
+            xx = "-"
+        m_seq.append(xx)
+
+    return ''.join(m_seq)
+
+def mcounts(mseq, mlst, ulst):
+    out_mlst=[mlst[0]+mseq.count("X"), mlst[1]+mseq.count("Y"), mlst[2]+mseq.count("Z")]
+    out_ulst=[ulst[0]+mseq.count("x"), ulst[1]+mseq.count("y"), ulst[2]+mseq.count("z")]
+    return out_mlst, out_ulst
+
+
+
+def process_aligner_output(filename, pair_end = False):
+
+    #m = re.search(r'-('+'|'.join(supported_aligners) +')-TMP', filename)
+    m = re.search(r'-('+'|'.join(supported_aligners) +')-.*TMP', filename)
+    if m is None:
+        error('The temporary folder path should contain the name of one of the supported aligners: ' + filename)
+
+    format = m.group(1)
+    try :
+        input = open(filename)
+    except IOError:
+        print "[Error] Cannot open file %s" % filename
+        exit(-1)
+
+    QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL = range(11)
+    def parse_SAM(line):
+        buf = line.split()
+        # print buf
+        flag = int(buf[FLAG])
+
+        # skip reads that are not mapped
+        # skip reads that have probability of being non-unique higher than 1/10
+        if flag & 0x4 : # or int(buf[MAPQ]) < 10:
+            return None, None, None, None, None, None
+        # print "format = ", format
+        if format == BOWTIE:
+            mismatches = int([buf[i][5:] for i in xrange(11, len(buf)) if buf[i][:5] == 'NM:i:'][0]) # get the edit distance
+        # --- bug fixed ------
+        elif format == BOWTIE2:
+            if re.search(r'(.)*-e2e-TMP(.*)', filename) is None : # local mode
+                mismatches = 1-int([buf[i][5:] for i in xrange(11, len(buf)) if buf[i][:5] == 'AS:i:'][0])
+                # print "====local=====\n"
+                ## bowtie2 use AS tag (score) to evaluate the mapping. The higher, the better.
+            else : # end-to-end mode
+                # print "end-to-end\n"
+                mismatches = int([buf[i][5:] for i in xrange(11, len(buf)) if buf[i][:5] == 'XM:i:'][0])
+        # --- Weilong ---------
+        elif format == SOAP:
+            mismatches = 1 - int(buf[MAPQ])
+            # mismatches = 1/float(buf[MAPQ])
+            ## downstream might round (0,1) to 0, so use integer instead
+            ## fixed by Weilong
+        elif format == RMAP:
+            # chr16   75728107        75728147        read45  9       -
+            # chr16   67934919        67934959        read45  9       -
+            mismatches = int(buf[4])
+
+        return (buf[QNAME], # read ID
+                buf[RNAME], # reference ID
+                int(buf[POS]) - 1, # position, 0 based (SAM is 1 based)
+                mismatches,    # number of mismatches
+                parse_cigar(buf[CIGAR]), # the parsed cigar string
+                flag & 0x40 # true if it is the first mate in a pair, false if it is the second mate
+                )
+
+    SOAP_QNAME, SOAP_SEQ, SOAP_QUAL, SOAP_NHITS, SOAP_AB, SOAP_LEN, SOAP_STRAND, SOAP_CHR, SOAP_LOCATION, SOAP_MISMATCHES = range(10)
+    def parse_SOAP(line):
+        buf = line.split()
+        return (buf[SOAP_QNAME],
+                buf[SOAP_CHR],
+                int(buf[SOAP_LOCATION]) - 1,
+                int(buf[SOAP_MISMATCHES]),
+                buf[SOAP_AB],
+                buf[SOAP_STRAND],
+                parse_cigar(buf[SOAP_LEN]+'M')
+            )
+
+        # chr16   75728107        75728147        read45  9       -
+    RMAP_CHR, RMAP_START, RMAP_END, RMAP_QNAME, RMAP_MISMATCH, RMAP_STRAND = range(6)
+    def parse_RMAP(line):
+        buf = line.split()
+        return ( buf[RMAP_QNAME],
+                 buf[RMAP_CHR],
+                 int(buf[RMAP_START]), # to check -1 or not
+                 int(buf[RMAP_END]) - int(buf[RMAP_START]) + 1,
+                 int(buf[RMAP_MISMATCH]),
+                 buf[RMAP_STRAND]
+        )
+
+    if format == BOWTIE or format == BOWTIE2:
+        if pair_end:
+            for line in input:
+                header1, chr1, location1, no_mismatch1, cigar1,        _ = parse_SAM(line)
+                header2,    _, location2, no_mismatch2, cigar2, mate_no2 = parse_SAM(input.next())
+
+
+                if header1 and header2:
+                    # flip the location info if the second mate comes first in the alignment file
+                    if mate_no2:
+                        location1, location2 = location2, location1
+                        cigar1, cigar2 = cigar2, cigar1
+
+
+                    yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2
+        else:
+            for line in input:
+                header, chr, location, no_mismatch, cigar, _ = parse_SAM(line)
+                if header is not None:
+                    yield header, chr, location, no_mismatch, cigar
+    elif format == SOAP:
+        if pair_end:
+            for line in input:
+                header1, chr1, location1, no_mismatch1, mate1, strand1, cigar1 = parse_SOAP(line)
+                header2, _   , location2, no_mismatch2,     _, strand2, cigar2 = parse_SOAP(input.next())
+
+                if mate1 == 'b':
+                    location1, location2 = location2, location1
+                    strand1, strand2 = strand2, strand1
+                    cigar1, cigar2 = cigar2, cigar1
+
+
+                if header1 and header2 and strand1 == '+' and strand2 == '-':
+                    yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2
+
+        else:
+            for line in input:
+                header, chr, location, no_mismatch, _, strand, cigar = parse_SOAP(line)
+                if header and strand == '+':
+                    yield header, chr, location, no_mismatch, cigar
+    elif format == RMAP :
+        if pair_end :
+            todo = 0
+            # to do
+        else :
+            for line in input:
+                header, chr, location, read_len, no_mismatch, strand = parse_RMAP(line)
+                cigar = str(read_len) + "M"
+                yield header, chr, location, no_mismatch, cigar
+
+    input.close()
+
+
+def parse_cigar(cigar_string):
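+    # e.g. "3S10M2D5M" -> [(BAM_SOFTCLIP, 3), (BAM_MATCH, 10),
+    #                      (BAM_DEL, 2), (BAM_MATCH, 5)]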
+    i = 0
+    prev_i = 0
+    cigar = []
+    while i < len(cigar_string):
+        if cigar_string[i] in CIGAR_OPS:
+            cigar.append((CIGAR_OPS[cigar_string[i]], int(cigar_string[prev_i:i])))
+            prev_i = i + 1
+        i += 1
+    return cigar
+
+def get_read_start_end_and_genome_length(cigar):
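+    # e.g. [(BAM_SOFTCLIP, 3), (BAM_MATCH, 10), (BAM_INS, 2), (BAM_MATCH, 5)]
+    #      -> r_start = 3, r_end = 20, g_len = 15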
+    r_start = cigar[0][1] if cigar[0][0] == BAM_SOFTCLIP else 0
+    r_end = r_start
+    g_len = 0
+    for edit_op, count in cigar:
+        if edit_op == BAM_MATCH:
+            r_end += count
+            g_len += count
+        elif edit_op == BAM_INS:
+            r_end += count
+        elif edit_op == BAM_DEL:
+            g_len += count
+    return r_start, r_end, g_len # return the start and end in the read and the length of the genomic sequence
+    # r_start : start position on the read
+    # r_end   : end position on the read
+    # g_len   : length of the mapped region on genome
+
+
+def cigar_to_alignment(cigar, read_seq, genome_seq):
+    """ Reconstruct the pairwise alignment based on the CIGAR string and the two sequences
+    """
+
+    # reconstruct the alignment
+    r_pos = cigar[0][1] if cigar[0][0] == BAM_SOFTCLIP else 0
+    g_pos = 0
+    r_aln = ''
+    g_aln = ''
+    for edit_op, count in cigar:
+        if edit_op == BAM_MATCH:
+            r_aln += read_seq[r_pos : r_pos + count]
+            g_aln += genome_seq[g_pos : g_pos + count]
+            r_pos += count
+            g_pos += count
+        elif edit_op == BAM_DEL:
+            r_aln += '-'*count
+            g_aln += genome_seq[g_pos : g_pos + count]
+            g_pos += count
+        elif edit_op == BAM_INS:
+            r_aln += read_seq[r_pos : r_pos + count]
+            g_aln += '-'*count
+            r_pos += count
+
+    return r_aln, g_aln
+
+
+
+# return sequence is [start, end), not include 'end'
+def get_genomic_sequence(genome, start, end, strand = '+'):
+    if strand != '+' and strand != '-' :
+        print "[Bug] get_genomic_sequence input should be \'+\' or \'-\'."
+        exit(-1)
+    if start > 1:
+        prev = genome[start-2:start]
+    elif start == 1:
+        prev = 'N'+genome[0]
+    else:
+        prev = 'NN'
+
+    if end < len(genome) - 1:
+        next = genome[end: end + 2]
+    elif end == len(genome) - 1:
+        next = genome[end] + 'N'
+    else:
+        next = 'NN'
+    origin_genome = genome[start:end]
+
+    if strand == '-':
+        # reverse complement everything if strand is '-'
+        revc = reverse_compl_seq('%s%s%s' % (prev, origin_genome, next))
+        prev, origin_genome, next = revc[:2], revc[2:-2], revc[-2:]
+
+    return origin_genome, next, '%s_%s_%s' % (prev, origin_genome, next)
+    # next : the next two nucleotides on the 3' side; the third return value
+    # is the XG-style string with 2bp of genomic context on each end
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_align/bs_pair_end.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,1011 @@
+import fileinput, os, random, math
+from bs_utils.utils import *
+from bs_align_utils import *
+#----------------------------------------------------------------
+def extract_mapping(ali_file):
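+    # Group alignments by read header. A read is stored as a unique hit when
+    # it has a single alignment, or when its best (fewest-mismatch) alignment
+    # is strictly better than its worst one; otherwise it is flagged non-unique.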
+    unique_hits = {}
+    non_unique_hits = {}
+    header0 = ""
+    family = []
+    for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in process_aligner_output(ali_file, pair_end = True):
+        #------------------------
+        if header != header0:
+
+            # --- output ----
+            if len(family) == 1:
+                unique_hits[header0] = family[0]
+            elif len(family) > 1:
+                min_lst = min(family, key = lambda x: x[0])
+                max_lst = max(family, key = lambda x: x[0])
+
+                if min_lst[0] < max_lst[0]:
+                    unique_hits[header0] = min_lst
+                else:
+                    non_unique_hits[header0] = min_lst[0]
+
+            header0 = header
+            family = []
+        family.append((no_mismatch, chr, location1, cigar1, location2, cigar2))
+
+    if len(family) == 1:
+        unique_hits[header0] = family[0]
+    elif len(family) > 1:
+        min_lst = min(family, key = lambda x: x[0])
+        max_lst = max(family, key = lambda x: x[0])
+
+        if min_lst[0] < max_lst[0]:
+            unique_hits[header0] = min_lst
+        else:
+            non_unique_hits[header0] = min_lst[0]
+
+    return unique_hits, non_unique_hits
+
+def _extract_mapping(ali_file):
+    U = {}
+    R = {}
+    header0 = ""
+    family = []
+    for line in fileinput.input(ali_file):
+        l = line.split()
+        header = l[0][:-2]
+        chr = str(l[1])
+        location = int(l[2])
+        #no_hits=int(l[4])
+        #-------- mismatches -----------
+        if len(l) == 4:
+            no_mismatch = 0
+        elif len(l) == 5:
+            no_mismatch = l[4].count(":")
+        else:
+            print "[Warning] Unexpected line format: ", l
+        #------------------------
+        if header != header0:
+            #--------------------
+            if header0 != "":
+                # --- output ----
+                if len(family) == 1:
+                    U[header0] = family[0]
+                else:
+                    if family[0][0] < family[1][0]:
+                        U[header0] = family[0]
+                    elif family[1][0] < family[0][0]:
+                        U[header0] = family[1]
+                    else:
+                        R[header0] = family[0][0]
+                family=[]
+                # ---------------
+            header0 = header
+            family = [[no_mismatch, chr, location]]
+            member = 1
+        elif header == header0:
+            if member == 1:
+                family[-1][0] += no_mismatch
+                family[-1].append(location)
+                member = 2
+            elif member == 2:
+                family.append([no_mismatch, chr, location])
+                member = 1
+        #------------------------------
+
+    fileinput.close()
+    return U, R
+
+
+#----------------------------------------------------------------
+
+def bs_pair_end(main_read_file_1,
+                main_read_file_2,
+                asktag,
+                adapter_file,
+                cut1,
+                cut2, # add cut3 and cut4?
+                no_small_lines,
+                max_mismatch_no,
+                aligner_command,
+                db_path,
+                tmp_path,
+                outfile, XS_pct, XS_count, show_multiple_hit=False):
+
+
+    #----------------------------------------------------------------
+    adapter=""
+    adapterA=""
+    adapterB=""
+    if adapter_file !="":
+        try :
+            adapter_inf=open(adapter_file,"r")
+            if asktag=="N": #<--- directional library
+                adapter=adapter_inf.readline()
+                adapter_inf.close()
+                adapter=adapter.rstrip("\n")
+            elif asktag=="Y":#<--- un-directional library
+                adapterA=adapter_inf.readline()
+                adapterB=adapter_inf.readline()
+                adapter_inf.close()
+                adapterA=adapterA.rstrip("\n")
+                adapterB=adapterB.rstrip("\n")
+        except IOError :
+            print "[Error] Cannot find adapter file : %s" % adapter_file
+            exit(-1)
+
+
+    #----------------------------------------------------------------
+
+    logm("End 1 filename: %s"% main_read_file_1  )
+    logm("End 2 filename: %s"% main_read_file_2  )
+    logm("The first base (for mapping): %d"% cut1  )
+    logm("The last base (for mapping): %d"% cut2  )
+
+    logm("-------------------------------- " )
+    logm("Un-directional library: %s" % asktag  )
+    logm("Path for short reads aligner: %s"% aligner_command + '\n')
+    logm("Reference genome library path: %s"% db_path  )
+    logm("Number of mismatches allowed: %s"% max_mismatch_no  )
+
+    if adapter_file !="":
+        if asktag=="Y":
+            logm("Adapters to be removed from 3' of the reads:" )
+            logm("-- A: %s" % adapterA  )
+            logm("-- B: %s" % adapterB  )
+        elif asktag=="N":
+            logm("Adapter to be removed from 3' of the reads:" )
+            logm("-- %s" % adapter  )
+
+    logm("-------------------------------- " )
+
+    #----------------------------------------------------------------
+
+    # helper method to join fname with tmp_path
+    tmp_d = lambda fname: os.path.join(tmp_path, fname)
+
+    db_d = lambda fname: os.path.join(db_path, fname)
+
+
+    #----------------------------------------------------------------
+    # splitting the 2 big read files
+
+    input_fname1 = os.path.split(main_read_file_1)[1]
+    input_fname2 = os.path.split(main_read_file_2)[1]
+
+    # TODO: run these in parallel with a subprocess
+    split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines)
+    split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines)
+
+    dirList=os.listdir(tmp_path)
+    my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)),
+                   sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList)))
+
+
+
+    #---- Stats ------------------------------------------------------------
+    all_raw_reads=0
+    all_trimmed=0
+    all_mapped=0
+    all_mapped_passed=0
+
+    all_unmapped=0
+
+    numbers_premapped_lst=[0,0,0,0]
+    numbers_mapped_lst=[0,0,0,0]
+
+    mC_lst=[0,0,0]
+    uC_lst=[0,0,0]
+
+    no_my_files=0
+
+    #----------------------------------------------------------------
+    print "== Start mapping =="
+
+    for read_file_1, read_file_2 in my_files:
+
+        no_my_files+=1
+
+        random_id=".tmp-"+str(random.randint(1000000,9999999))
+
+        original_bs_reads_1 = {}
+        original_bs_reads_2 = {}
+        original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2]
+
+
+        if asktag == "Y" :
+
+            #----------------------------------------------------------------
+            outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id)
+            outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id)
+            outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id)
+            outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id)
+
+            try :
+                read_inf = open(tmp_d(read_file_1),"r")
+            except IOError :
+                print "[Error] Cannot open file : %s" % tmp_d(read_file_1)
+                exit(-1)
+            oneline = read_inf.readline()
+            l = oneline.split()
+            input_format = ""
+
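+            # Format detection is heuristic and based on the first line only:
+            #   '@'                   -> FASTQ (4 lines per record)
+            #   '>'                   -> FASTA (2 lines per record)
+            #   11 whitespace fields  -> Illumina qseq
+            #   one field, no '>'     -> plain sequences, one per line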
+            if oneline[0]=="@":	# Illumina GAII FastQ (Lister et al Nature 2009)
+                input_format="fastq"
+                n_fastq=0
+            elif len(l)==1 and oneline[0]!=">": 	# pure sequences
+                input_format="seq"
+            elif len(l)==11:	# Illumina GAII qseq file
+                input_format="qseq"
+            elif oneline[0]==">":	# fasta
+                input_format="fasta"
+                n_fasta=0
+            read_inf.close()
+
+            print "Detected data format: %s" % input_format
+
+            #----------------------------------------------------------------
+            read_file_list   = [read_file_1, read_file_2]
+            outfile_FCT_list = [outfile_1FCT, outfile_2FCT]
+            outfile_RCT_list = [outfile_1RCT, outfile_2RCT]
+            n_list = [0, 0]
+
+
+            for f in range(2):
+                read_file = read_file_list[f]
+                outf_FCT = open(outfile_FCT_list[f], 'w')
+                outf_RCT = open(outfile_RCT_list[f], 'w')
+                original_bs_reads = original_bs_reads_lst[f]
+                n = n_list[f]
+
+                seq_id = ""
+                seq = ""
+                seq_ready = "N"
+                for line in fileinput.input(tmp_d(read_file)):
+                    l=line.split()
+                    if input_format=="seq":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[0]
+                        seq_ready="Y"
+                    elif input_format=="fastq":
+                        m_fastq=math.fmod(n_fastq,4)
+                        n_fastq+=1
+                        seq_ready="N"
+                        if m_fastq==0:
+                            n+=1
+                            seq_id=str(n)
+                            seq_id=seq_id.zfill(12)
+                            seq=""
+                        elif m_fastq==1:
+                            seq=l[0]
+                            seq_ready="Y"
+                        else:
+                            seq=""
+                    elif input_format=="qseq":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[8]
+                        seq_ready="Y"
+                    elif input_format=="fasta":
+                        m_fasta=math.fmod(n_fasta,2)
+                        n_fasta+=1
+                        seq_ready="N"
+                        if m_fasta==0:
+                            n+=1
+                            seq_id=l[0][1:]
+                            seq_id=seq_id.zfill(17)
+                            seq=""
+                        elif m_fasta==1:
+                            seq=l[0]
+                            seq_ready="Y"
+                        else:
+                            seq=""
+                    #----------------------------------------------------------------
+                    if seq_ready=="Y":
+                        seq=seq[cut1-1:cut2] #------- keep cycles cut1..cut2 (e.g. bases 1..52 of a 72 bp read with "-e 52")
+                        seq=seq.upper()
+                        seq=seq.replace(".","N")
+
+                        #-- stripping BS adapter from the 3' end of the read --------------------------------------------------------------
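+                        # Seed-and-verify trimming: the first 6 bp of the adapter serve
+                        # as a seed; if the seed occurs in the read and the read's
+                        # remainder from the seed onward is a substring of the adapter,
+                        # the read is truncated at the seed.  Illustrative (hypothetical)
+                        # example: with adapterA beginning "AGATCG", a read ending
+                        # "...AGATCG" is cut just before that seed.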
+                        if (adapterA !="") and (adapterB !=""):
+                            signature=adapterA[:6]
+                            if signature in seq:
+                                signature_pos=seq.index(signature)
+                                if seq[signature_pos:] in adapterA:
+                                    seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
+                                    all_trimmed+=1
+                            else:
+                                signature=adapterB[:6]
+                                if signature in seq:
+                                    #print seq_id,seq,signature;
+                                    signature_pos=seq.index(signature)
+                                    if seq[signature_pos:] in adapterB:
+                                        seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
+                                        all_trimmed+=1
+
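+                        # Reads trimmed to 4 bp or fewer are masked to all-N rather than
+                        # dropped, so they simply fail to align while record numbering
+                        # stays in step between end 1 and end 2.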
+                        if len(seq) <= 4:
+                            seq = "N" * (cut2-cut1+1)
+                        #---------  trimmed_raw_BS_read  ------------------
+                        original_bs_reads[seq_id] = seq
+
+                        #---------  FW_C2T  ------------------
+                        outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T")))
+                        #---------  RC_G2A  ------------------
+                        outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))
+
+                n_list[f]=n
+
+                outf_FCT.close()
+                outf_RCT.close()
+
+                fileinput.close()
+
+
+            #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
+            all_raw_reads+=n
+
+            #--------------------------------------------------------------------------------
+            # Bowtie mapping
+            #--------------------------------------------------------------------------------
+            WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
+            WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
+
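+            # Four aligner runs cover both read orders on both converted strands:
+            #   fr = end1 C->T paired with end2 G->A; rf = end2 C->T paired with
+            #   end1 G->A; each order is mapped against the Watson (W_C2T) and
+            #   the Crick (C_C2T) index.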
+            run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                                      'input_file_1' : outfile_1FCT,
+                                                      'input_file_2' : outfile_2RCT,
+                                                      'output_file' : WC2T_fr},
+
+                             aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                                      'input_file_1' : outfile_1FCT,
+                                                      'input_file_2' : outfile_2RCT,
+                                                      'output_file' : CC2T_fr},
+
+                             aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                                      'input_file_1' : outfile_2FCT,
+                                                      'input_file_2' : outfile_1RCT,
+                                                      'output_file' : WC2T_rf},
+
+                             aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                                      'input_file_1' : outfile_2FCT,
+                                                      'input_file_2' : outfile_1RCT,
+                                                      'output_file' : CC2T_rf}])
+
+
+            delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT)
+
+
+            #--------------------------------------------------------------------------------
+            # Post processing
+            #--------------------------------------------------------------------------------
+
+
+            FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
+            FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf)
+            RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
+            RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf)
+
+            delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf)
+
+            #----------------------------------------------------------------
+            # get uniq-hit reads
+            #----------------------------------------------------------------
+            Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys())
+
+            Unique_FW_fr_C2T=set() # +
+            Unique_FW_rf_C2T=set() # +
+            Unique_RC_fr_C2T=set() # -
+            Unique_RC_rf_C2T=set() # -
+            Multiple_hits=set()
+
+
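+            # Uniqueness vote: for each read pair collect eight mismatch values
+            # (four unique-hit dicts, then four multiple-hit dicts; 99 if absent).
+            # A pair is kept only when the minimum is unique and falls in one of
+            # the first four slots; ties, or a minimum coming from a multiple-hit
+            # dict, send the pair to Multiple_hits.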
+            for x in Union_set:
+                _list=[]
+                for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]:
+                    mis_lst=d.get(x,[99])
+                    mis=int(mis_lst[0])
+                    _list.append(mis)
+                for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]:
+                    mis=d.get(x,99)
+                    _list.append(mis)
+                mini=min(_list)
+                if _list.count(mini)==1:
+                    mini_index=_list.index(mini)
+                    if mini_index==0:
+                        Unique_FW_fr_C2T.add(x)
+                    elif mini_index==1:
+                        Unique_FW_rf_C2T.add(x)
+                    elif mini_index==2:
+                        Unique_RC_fr_C2T.add(x)
+                    elif mini_index==3:
+                        Unique_RC_rf_C2T.add(x)
+                    else :
+                        Multiple_hits.add(x)
+                else :
+                    Multiple_hits.add(x)
+
+            # write reads rejected by Multiple Hits to file
+            if show_multiple_hit :
+                outf_MH=open("Multiple_hit.fa",'w')
+                for i in Multiple_hits :
+                    outf_MH.write(">%s\n" % i)
+                    outf_MH.write("%s\n" % original_bs_reads[i])
+                outf_MH.close()
+
+            del Union_set
+            del FW_C2T_fr_R
+            del FW_C2T_rf_R
+            del RC_C2T_fr_R
+            del RC_C2T_rf_R
+
+            FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
+            FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T]
+            RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
+            RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T]
+
+            FW_C2T_fr_uniq_lst.sort()
+            FW_C2T_rf_uniq_lst.sort()
+            RC_C2T_fr_uniq_lst.sort()
+            RC_C2T_rf_uniq_lst.sort()
+            FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
+            FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst]
+            RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
+            RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst]
+            #----------------------------------------------------------------
+
+            numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
+            numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T)
+            numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T)
+            numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T)
+
+            del Unique_FW_fr_C2T
+            del Unique_FW_rf_C2T
+            del Unique_RC_fr_C2T
+            del Unique_RC_rf_C2T
+
+            #----------------------------------------------------------------
+
+            nn = 0
+            for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U),
+                                            (FW_C2T_rf_uniq_lst,FW_C2T_rf_U),
+                                            (RC_C2T_fr_uniq_lst,RC_C2T_fr_U),
+                                            (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]:
+                nn += 1
+
+                mapped_chr0 = ""
+                for header in ali_unique_lst:
+
+                    _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
+
+                    #-------------------------------------
+                    if mapped_chr != mapped_chr0:
+                        my_gseq=deserialize(db_d(mapped_chr))
+                        chr_length=len(my_gseq)
+                        mapped_chr0=mapped_chr
+                    #-------------------------------------
+                    if nn == 1 or nn == 3:
+                        original_BS_1 = original_bs_reads_1[header]
+                        original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
+                    else:
+                        original_BS_1 = original_bs_reads_2[header]
+                        original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
+
+                    r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
+                    r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
+
+                    all_mapped += 1
+
+                    if nn == 1: 							# FW-RC mapped to + strand:
+
+                        FR = "+FR"
+
+#                        mapped_location_1 += 1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "+"
+
+#                        mapped_location_2 += 1
+#                        origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "+"
+
+                    elif nn==2: 							# RC-FW mapped to + strand:
+
+#                        original_BS_1 = original_bs_reads_2[header]
+#                        original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
+                        FR = "+RF"
+#                        mapped_location_1 += 1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "+"
+
+#                        mapped_location_2 += 1
+#                        origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "+"
+
+
+                    elif nn==3: 						# FW-RC mapped to - strand:
+#                        original_BS_1=original_bs_reads_1[header]
+#                        original_BS_2=reverse_compl_seq(original_bs_reads_2[header])
+
+                        FR = "-FR"
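+                        # Hits on the C_C2T index are reported in Crick coordinates;
+                        # convert back to Watson coordinates, e.g. chr_length=100,
+                        # mapped_location=10, g_len=20  ->  100 - 10 - 20 = 70.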
+                        mapped_location_1 = chr_length - mapped_location_1 - g_len_1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "-"
+
+                        mapped_location_2 = chr_length - mapped_location_2 - g_len_2
+#                        origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1 ])
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "-"
+
+                    elif nn==4: 						# RC-FW mapped to - strand:
+#                        original_BS_1=original_bs_reads_2[header]
+#                        original_BS_2=reverse_compl_seq(original_bs_reads_1[header])
+
+                        FR = "-RF"
+                        mapped_location_1 = chr_length - mapped_location_1 - g_len_1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "-"
+
+                        mapped_location_2 = chr_length - mapped_location_2 - g_len_2
+#                        origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1])
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "-"
+
+
+
+
+                    origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
+                    origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
+
+                    r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
+                    r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
+
+
+                    N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
+                    N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
+
+
+                    if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) :
+                        all_mapped_passed += 1
+                        numbers_mapped_lst[nn-1] += 1
+                        #---- unmapped -------------------------
+                        del original_bs_reads_1[header]
+                        del original_bs_reads_2[header]
+                        #---------------------------------------
+
+#                        output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:]
+#                        output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:]
+
+                        methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
+                        methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
+
+                        mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst)
+                        mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)
+
+
+                        # 
+                        #---XS FILTER----------------
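+                        # XS filter: flag a read as likely unconverted when, among the
+                        # CH calls counted below ('y'/'z' unmethylated, 'Y'/'Z'
+                        # methylated, per the nCH/nmCH names), more than XS_count are
+                        # methylated and they exceed the fraction XS_pct.  E.g. with
+                        # XS_count=3, XS_pct=0.5: nmCH=4, nCH=2 gives 4/6 > 0.5, so XS=1.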
+                        #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
+                        XS_1 = 0
+                        nCH_1 = methy_1.count('y') + methy_1.count('z')
+                        nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
+                        if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
+                            XS_1 = 1
+                        #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
+                        XS_2 = 0
+                        nCH_2 = methy_2.count('y') + methy_2.count('z')
+                        nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
+                        if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
+                            XS_2 = 1
+
+
+                        outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
+                            output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
+                        outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
+                            output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
+
+            print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
+            #----------------------------------------------------------------
+            #	output unmapped pairs
+            #----------------------------------------------------------------
+
+            unmapped_lst=original_bs_reads_1.keys()
+            unmapped_lst.sort()
+
+#            for u in unmapped_lst:
+#                outf_u1.write("%s\n"%original_bs_reads_1[u])
+#                outf_u2.write("%s\n"%original_bs_reads_2[u])
+
+            all_unmapped += len(unmapped_lst)
+
+
+        elif asktag=="N":
+
+            #----------------------------------------------------------------
+            outfile_1FCT= tmp_d('Trimmed_FCT_1.fa'+random_id)
+            outfile_2FCT= tmp_d('Trimmed_FCT_2.fa'+random_id)
+
+            try :
+                read_inf=open(tmp_d(read_file_1),"r")
+            except IOError :
+                print "[Error] Cannot open file : %s" % tmp_d(read_file_1)
+                exit(-1)
+
+            oneline=read_inf.readline()
+            l=oneline.split()
+            input_format=""
+
+            #if len(l)==5: # old solexa format
+            #	input_format="old Solexa Seq file"
+
+            if oneline[0]=="@":	# Illumina GAII FastQ (Lister et al Nature 2009)
+                input_format="FastQ"
+                n_fastq=0
+            elif len(l)==1 and oneline[0]!=">": 	# pure sequences
+                input_format="list of sequences"
+            elif len(l)==11:	# Illumina GAII qseq file
+                input_format="Illumina GAII qseq file"
+            elif oneline[0]==">":	# fasta
+                input_format="fasta"
+                n_fasta=0
+
+            read_inf.close()
+
+            print "Detected data format: %s" % input_format
+
+            #----------------------------------------------------------------
+            read_file_list=[read_file_1,read_file_2]
+            outfile_FCT_list=[outfile_1FCT,outfile_2FCT]
+            n_list=[0,0]
+
+            for f in range(2):
+                read_file=read_file_list[f]
+                outf_FCT=open(outfile_FCT_list[f],'w')
+                original_bs_reads = original_bs_reads_lst[f]
+                n=n_list[f]
+
+                seq_id=""
+                seq=""
+                seq_ready="N"
+                for line in fileinput.input(tmp_d(read_file)):
+                    l=line.split()
+                    if input_format=="old Solexa Seq file":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[4]
+                        seq_ready="Y"
+                    elif input_format=="list of sequences":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[0]
+                        seq_ready="Y"
+                    elif input_format=="FastQ":
+                        m_fastq=math.fmod(n_fastq,4)
+                        n_fastq+=1
+                        seq_ready="N"
+                        if m_fastq==0:
+                            n+=1
+                            seq_id=str(n)
+                            seq_id=seq_id.zfill(12)
+                            seq=""
+                        elif m_fastq==1:
+                            seq=l[0]
+                            seq_ready="Y"
+                        else:
+                            seq=""
+                    elif input_format=="Illumina GAII qseq file":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[8]
+                        seq_ready="Y"
+                    elif input_format=="fasta":
+                        m_fasta=math.fmod(n_fasta,2)
+                        n_fasta+=1
+                        seq_ready="N"
+                        if m_fasta==0:
+                            n+=1
+                            seq_id=l[0][1:]
+                            seq_id=seq_id.zfill(17)
+                            seq=""
+                        elif m_fasta==1:
+                            seq=l[0]
+                            seq_ready="Y"
+                        else:
+                            seq=""
+                    #----------------------------------------------------------------
+                    if seq_ready=="Y":
+                        seq=seq[cut1-1:cut2] #<---------------------- keep cycles cut1..cut2 (e.g. bases 1..52 of a 72 bp read with "-e 52")
+                        seq=seq.upper()
+                        seq=seq.replace(".","N")
+
+                        #-- stripping BS adapter from the 3' end of the read --------------------------------------------------------------
+                        # directional libraries carry a single adapter (see the log above)
+                        if adapter != "":
+                            signature=adapter[:6]
+                            if signature in seq:
+                                signature_pos=seq.index(signature)
+                                if seq[signature_pos:] in adapter:
+                                    seq=seq[:signature_pos]
+                                    all_trimmed+=1
+
+                        if len(seq) <= 4:
+                            seq = "N" * (cut2-cut1+1)
+                        #---------  trimmed_raw_BS_read  ------------------
+                        original_bs_reads[seq_id] = seq
+
+                        #---------  FW_C2T  ------------------
+                        if f==0:
+                            outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T")))
+                        elif f==1:
+                            outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T")))
+
+
+                n_list[f]=n
+                outf_FCT.close()
+
+                fileinput.close()
+
+
+            #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
+            all_raw_reads+=n
+
+            #--------------------------------------------------------------------------------
+            # Bowtie mapping
+            #--------------------------------------------------------------------------------
+            WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
+
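+            # Directional libraries: end 1 always derives from a C->T converted
+            # strand, so only the fr order is aligned (end 2 was already
+            # reverse-complemented before C->T at write time, see above).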
+            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                         'input_file_1' : outfile_1FCT,
+                                         'input_file_2' : outfile_2FCT,
+                                         'output_file' : WC2T_fr},
+
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                         'input_file_1' : outfile_1FCT,
+                                         'input_file_2' : outfile_2FCT,
+                                         'output_file' : CC2T_fr} ])
+
+
+            delete_files(outfile_1FCT, outfile_2FCT)
+
+            #--------------------------------------------------------------------------------
+            # Post processing
+            #--------------------------------------------------------------------------------
+
+            FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
+            RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
+
+            #----------------------------------------------------------------
+            # get uniq-hit reads
+            #----------------------------------------------------------------
+            Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys())
+
+            Unique_FW_fr_C2T = set() # +
+            Unique_RC_fr_C2T = set() # -
+            Multiple_hits=set()
+
+
+            for x in Union_set:
+                _list = []
+                for d in [FW_C2T_fr_U, RC_C2T_fr_U]:
+                    mis_lst = d.get(x,[99])
+                    mis = int(mis_lst[0])
+                    _list.append(mis)
+                for d in [FW_C2T_fr_R, RC_C2T_fr_R]:
+                    mis = d.get(x,99)
+                    _list.append(mis)
+                mini = min(_list)
+                if _list.count(mini) == 1:
+                    mini_index = _list.index(mini)
+                    if mini_index == 0:
+                        Unique_FW_fr_C2T.add(x)
+                    elif mini_index == 1:
+                        Unique_RC_fr_C2T.add(x)
+                    else :
+                        Multiple_hits.add(x)
+                else :
+                    Multiple_hits.add(x)
+
+            # write reads rejected by Multiple Hits to file
+            if show_multiple_hit :
+                outf_MH=open("Multiple_hit.fa",'w')
+                for i in Multiple_hits :
+                    outf_MH.write(">%s\n" % i)
+                    outf_MH.write("%s\n" % original_bs_reads[i])
+                outf_MH.close()
+
+            FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
+            RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
+
+            FW_C2T_fr_uniq_lst.sort()
+            RC_C2T_fr_uniq_lst.sort()
+            FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
+            RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
+            #----------------------------------------------------------------
+
+            numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
+            numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T)
+
+
+            #----------------------------------------------------------------
+
+            nn = 0
+            for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_C2T_fr_U)]:
+                nn += 1
+                mapped_chr0 = ""
+                for header in ali_unique_lst:
+                    _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
+
+
+                    #-------------------------------------
+                    if mapped_chr != mapped_chr0:
+                        my_gseq = deserialize(db_d(mapped_chr))
+                        chr_length = len(my_gseq)
+                        mapped_chr0 = mapped_chr
+                    #-------------------------------------
+
+                    original_BS_1 = original_bs_reads_1[header]
+                    original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
+
+                    r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
+                    r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
+
+                    all_mapped += 1
+
+                    if nn == 1: 							# FW-RC mapped to + strand:
+                        FR = "+FR"
+#                        mapped_location_1 += 1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "+"
+
+#                        mapped_location_2 += 1
+#                        origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "+"
+
+                    elif nn == 2: 						# FW-RC mapped to - strand:
+
+                        FR="-FR"
+                        mapped_location_1 = chr_length - mapped_location_1 - g_len_1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "-"
+
+                        mapped_location_2 = chr_length - mapped_location_2 - g_len_2
+#                        origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1])
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "-"
+
+
+
+                    origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
+                    origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
+
+
+                    r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
+                    r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
+
+                    N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
+                    N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
+
+                    if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no):
+
+                        numbers_mapped_lst[nn-1] += 1
+                        all_mapped_passed += 1
+
+                        #---- unmapped -------------------------
+                        del original_bs_reads_1[header]
+                        del original_bs_reads_2[header]
+
+                        #---------------------------------------
+#                        output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:]
+#                        output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:]
+                        methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
+                        methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
+                        mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst)
+                        mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst)
+
+                        # 
+                        #---XS FILTER----------------
+                        #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
+                        XS_1 = 0
+                        nCH_1 = methy_1.count('y') + methy_1.count('z')
+                        nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
+                        if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
+                            XS_1 = 1
+                        #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
+                        XS_2 = 0
+                        nCH_2 = methy_2.count('y') + methy_2.count('z')
+                        nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
+                        if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
+                            XS_2 = 1
+
+
+                        outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
+                                      output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
+                        outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
+                                      output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
+
+            print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
+            #----------------------------------------------------------------
+            #	output unmapped pairs
+            #----------------------------------------------------------------
+
+            unmapped_lst=original_bs_reads_1.keys()
+            unmapped_lst.sort()
+
+#            for u in unmapped_lst:
+#                outf_u1.write("%s\n"%(original_bs_reads_1[u]))
+#                outf_u2.write("%s\n"%(original_bs_reads_2[u]) )
+
+            all_unmapped+=len(unmapped_lst)
+
+
+    #==================================================================================================
+#    outf.close()
+#
+#    outf_u1.close()
+#    outf_u2.close()
+    delete_files(tmp_path)
+
+    logm("-------------------------------- " )
+    logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) )
+    logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n")
+
+    if all_raw_reads >0:
+
+        logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n")
+        if asktag=="Y":
+            logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
+            logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
+            logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
+            logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
+        elif asktag=="N":
+            logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
+            logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
+
+
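+        # Note: Multiple_hits is rebuilt for every input chunk, so this count
+        # covers only the last chunk processed.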
+        logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
+        logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
+        if asktag=="Y":
+            logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
+            logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) )
+            logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) )
+            logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) )
+        elif asktag=="N":
+            logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
+            logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) )
+
+        logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
+        logm("O Unmapped read pairs: %d"% all_unmapped+"\n")
+
+
+        n_CG=mC_lst[0]+uC_lst[0]
+        n_CHG=mC_lst[1]+uC_lst[1]
+        n_CHH=mC_lst[2]+uC_lst[2]
+
+        logm("-------------------------------- " )
+        logm("Methylated C in mapped reads " )
+        logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) )
+        logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) )
+        logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) )
+
+    logm("----------------------------------------------" )
+    logm("------------------- END ----------------------" )
+    elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_align/bs_rrbs.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,1041 @@
+import fileinput, random, math, os.path
+from bs_index.rrbs_build import FWD_MAPPABLE_REGIONS, REV_MAPPABLE_REGIONS
+from bs_utils.utils import *
+
+from bs_align.bs_single_end import extract_mapping
+from bs_align_utils import *
+
+def my_mappable_region(chr_regions, mapped_location, FR): # start_position (first C), end_position (last G), serial, sequence
+    #print len(chr_regions)
+    out_serial = 0
+    out_start = -1
+    out_end = -1
+    #print "mapped_location:", mapped_location
+    if FR == "+FW" or FR == "-RC":
+        my_location = str(mapped_location)
+        if my_location in chr_regions:
+            my_lst = chr_regions[my_location]
+            out_start = int(my_location)
+            out_end = my_lst[0]
+            out_serial = my_lst[1]
+        #else :
+        #    print "[For debug]: +FW location %s cannot be found" % my_location
+    elif FR == "-FW" or FR == "+RC":
+        my_location = str(mapped_location)
+        if my_location in chr_regions:
+            my_lst = chr_regions[my_location]
+            out_end = int(my_location)
+            out_start = my_lst[0]
+            out_serial = my_lst[1]
+        #else :
+        #    print "[For debug]: -FW location %s cannot be found" % my_location
+
+    return out_serial, out_start, out_end
+
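+# chr_regions (as used above) maps the stringified coordinate of one fragment
+# boundary to [other_boundary, serial]: for "+FW"/"-RC" the key is the fragment
+# start and my_lst[0] its end; for "-FW"/"+RC" the key is the fragment end and
+# my_lst[0] its start.  A return value of (0, -1, -1) means "not found".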
+
+#----------------------------------------------------------------
+
+def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines, max_mismatch_no,
+            aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, adapter_mismatch, cut_format="C-CGG",
+            show_multiple_hit=False):
+    #----------------------------------------------------------------
+    # For double enzyme: cut_format="C-CGG,A-CTG"; ApeKI: "G^CWGC"
+    #cut_context = re.sub("-", "", cut_format)
+    # Ex. cut_format="C-CGG,AT-CG,G-CWGC"
+    """
+    Map RRBS reads against the reduced-representation (mappable-region) index.
+
+    :param main_read_file: input read file (FastQ, FastA, qseq or plain sequences)
+    :param asktag: "Y" for un-directional libraries, "N" for directional ones
+    :param adapter_file: file holding the adapter sequence ("" disables trimming)
+    :param cut_s: first cycle of each read kept for mapping (1-based)
+    :param cut_e: last cycle of each read kept for mapping
+    :param no_small_lines: number of lines per temporary split file
+    :param max_mismatch_no: maximum number of mismatches allowed per read
+    :param aligner_command: command-line template for the short-read aligner
+    :param db_path: path of the reference genome index library
+    :param tmp_path: directory used for temporary files
+    :param outfile: output writer used to store accepted alignments
+    :param XS_pct: fraction of methylated CH calls above which a read is XS-flagged
+    :param XS_count: minimum number of methylated CH calls for the XS flag
+    :param adapter_mismatch: number of mismatches tolerated when matching the adapter
+    :param cut_format: enzyme site(s), e.g. "C-CGG" (comma-separated for double digestion)
+    :param show_multiple_hit: if True, write multiple-hit reads to "Multiple_hit.fa"
+    """
+    cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC']
+    cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC']
+    cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G']
+    cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC']
+    cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5]
+    min_cut5_len = min([len(i) for i in cut5_context])
+    #print cut_format_lst
+    #print cut_format
+    #print cut5_context
+
+    cut_tag_lst = Enumerate_C_to_CT(cut_format_lst) # ['G-TTGC', 'AT-TG', 'G-CAGT', 'T-CGG', 'G-TAGC', 'C-TGG', 'G-CAGC', 'G-CTGC', 'AT-CG', 'T-TGG', 'G-TTGT', 'G-TAGT', 'C-CGG', 'G-CTGT']
+    cut5_tag_lst = [re.match(r'(.*)\-(.*)', i).group(1) for i in cut_tag_lst]
+    cut3_tag_lst = [re.match(r'(.*)\-(.*)', i).group(2) for i in cut_tag_lst]
+    check_pattern = [ i[-2:]+"_"+j for i,j in zip(cut5_tag_lst, cut3_tag_lst) ]
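+    # Each check_pattern entry joins the last two (or fewer) bases of a 5' tag
+    # to its 3' tag with "_", the separator that marks the read boundary in
+    # output_genome; e.g. "C-CGG" yields the pattern "C_CGG".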
+
+    #print "======="
+    #print cut_tag_lst
+    #print cut3_tag_lst
+    #print cut5_tag_lst
+    #print check_pattern
+
+    # set region[gx,gy] for checking_genome_context
+    gx = [ 0 if j>2 else 2-j for j in [len(i) for i in cut5_tag_lst] ] # [XC-CGG]
+    gy = [ 3+len(i) for i in cut3_tag_lst ]
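+    # gx/gy bound the slice of output_genome compared against check_pattern.
+    # output_genome is laid out as: 2 bp upstream flank + "_" + aligned genome
+    # + "_" + 2 bp downstream flank.  E.g. for "C-CGG": len("C")=1 -> gx=2-1=1,
+    # gy=3+len("CGG")=6, so output_genome[1:6] is the last flank base, "_", and
+    # the first three read bases, matched against "C_CGG".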
+
+
+    #----------------------------------------------------------------
+
+    # helper method to join fname with tmp_path
+    tmp_d = lambda fname: os.path.join(tmp_path, fname)
+    db_d = lambda fname: os.path.join(db_path, fname)
+
+    MAX_TRY = 500 # For finding the serial_no
+    whole_adapter_seq = ""
+    #----------------------------------------------------------------
+    adapter_seq=""
+    if adapter_file:
+        try :
+            adapter_inf = open(adapter_file,"r")
+            whole_adapter_seq = adapter_inf.readline().strip()
+            adapter_seq = whole_adapter_seq[0:10] # only use first 10bp of adapter
+            adapter_inf.close()
+        except IOError:
+            print "[Error] Cannot find adapter file : %s !" % adapter_file
+            exit(-1)
+
+    logm("I Read filename: %s" % main_read_file)
+    logm("I The last cycle (for mapping): %d" % cut_e )
+    logm("I Path for short reads aligner: %s" % aligner_command )
+    logm("I Reference genome library path: %s" % db_path )
+    logm("I Number of mismatches allowed: %s" % max_mismatch_no)
+    logm("I Adapter seq: %s" % whole_adapter_seq)
+    logm("----------------------------------------------")
+
+    #----------------------------------------------------------------
+    all_raw_reads=0
+    all_tagged=0
+    all_tagged_trimmed=0
+    all_mapped=0
+    all_mapped_passed=0
+    n_cut_tag_lst={}
+    #print cut3_tag_lst
+    for x in cut3_tag_lst:
+        n_cut_tag_lst[x]=0
+
+    mC_lst=[0,0,0]
+    uC_lst=[0,0,0]
+
+    no_my_files=0
+
+    num_mapped_FW_C2T = 0
+    num_mapped_RC_C2T = 0
+    num_mapped_FW_G2A = 0
+    num_mapped_RC_G2A = 0
+
+    #===============================================
+    # directional sequencing
+    #===============================================
+
+    if asktag=="N" :
+        #----------------------------------------------------------------
+        logm("== Start mapping ==")
+
+        input_fname = os.path.split(main_read_file)[1]
+        for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
+
+            logm("Processing read file: %s" % read_file)
+            original_bs_reads = {}
+            no_my_files+=1
+            random_id = ".tmp-"+str(random.randint(1000000,9999999))
+            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
+
+            outf2=open(outfile2,'w')
+
+            #--- Checking input format ------------------------------------------
+            try :
+                read_inf=open(read_file,"r")
+            except IOError:
+                print "[Error] Cannot open input file : %s" % read_file
+                exit(-1)
+
+            oneline=read_inf.readline()
+            l=oneline.split()
+            n_fastq=0
+            n_fasta=0
+            input_format=""
+            if oneline[0]=="@":	# FastQ
+                input_format="fastq"
+            elif len(l)==1 and oneline[0]!=">": # pure sequences
+                input_format="seq"
+            elif len(l)==11: # Illumina qseq
+                input_format="qseq"
+            elif oneline[0]==">": # fasta
+                input_format="fasta"
+            read_inf.close()
+
+            #----------------------------------------------------------------
+            seq_id=""
+            seq=""
+            seq_ready="N"
+            for line in fileinput.input(read_file):
+                l=line.split()
+
+                if input_format=="seq":
+                    all_raw_reads+=1
+                    seq_id=str(all_raw_reads)
+                    seq_id=seq_id.zfill(12)
+                    seq=l[0]
+                    seq_ready="Y"
+                elif input_format=="fastq":
+                    m_fastq=math.fmod(n_fastq,4)
+                    n_fastq+=1
+                    seq_ready="N"
+                    if m_fastq==0:
+                        all_raw_reads+=1
+                        seq_id=str(all_raw_reads)
+                        seq_id=seq_id.zfill(12)
+                        seq=""
+                    elif m_fastq==1:
+                        seq=l[0]
+                        seq_ready="Y"
+                    else:
+                        seq=""
+                elif input_format=="qseq":
+                    all_raw_reads+=1
+                    seq_id=str(all_raw_reads)
+                    seq_id=seq_id.zfill(12)
+                    seq=l[8]
+                    seq_ready="Y"
+                elif input_format=="fasta":
+                    m_fasta=math.fmod(n_fasta,2)
+                    n_fasta+=1
+                    seq_ready="N"
+                    if m_fasta==0:
+                        all_raw_reads+=1
+                        seq_id=l[0][1:]
+                        seq=""
+                    elif m_fasta==1:
+                        seq=l[0]
+                        seq_ready="Y"
+                    else:
+                        seq=""
+                #---------------------------------------------------------------
+                if seq_ready=="Y":
+                    # Normalize the characters
+                    seq=seq.upper().replace(".","N")
+
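+                    # Tally reads beginning with any C->T-converted 3' cut-site tag
+                    # (e.g. "CGG" or "TGG" for C-CGG); reads without a tag are
+                    # kept anyway (see below).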
+                    read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
+                    if len(read_tag) > 0 :
+                        all_tagged += 1
+                        for i in read_tag :
+                            n_cut_tag_lst[i] += 1
+
+                    seq = seq[(cut_s-1):cut_e] # cut_s starts from cycle 1 by default
+
+                    #-- Trimming adapter sequence ---
+                    if adapter_seq != "" :
+                        new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
+                        if len(new_read) < len(seq) :
+                            all_tagged_trimmed += 1
+                        seq = new_read
+                    if len(seq) <= 4 :
+                        seq = "N" * (cut_e - cut_s)
+
+                    # all reads will be considered, regardless of tags
+                    #---------  trimmed_raw_BS_read and qscore ------------------
+                    original_bs_reads[seq_id] = seq
+                    #---------  FW_C2T  ------------------
+                    outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
+            fileinput.close()
+
+            outf2.close()
+
+            delete_files(read_file)
+            logm("Processing input is done")
+            #--------------------------------------------------------------------------------
+
+            # mapping
+            #--------------------------------------------------------------------------------
+            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
+
+            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                                   'input_file' : outfile2,
+                                                   'output_file' : WC2T},
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                                   'input_file' : outfile2,
+                                                   'output_file' : CC2T} ])
+
+            logm("Aligning reads is done")
+
+            delete_files(outfile2)
+
+            #--------------------------------------------------------------------------------
+            # Post processing
+            #--------------------------------------------------------------------------------
+
+            FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
+            RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
+            logm("Extracting alignments is done")
+
+            #----------------------------------------------------------------
+            # get uniq-hit reads
+            #----------------------------------------------------------------
+            Union_set=set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())
+
+            Unique_FW_C2T=set() # +
+            Unique_RC_C2T=set() # -
+            Multiple_hits=set()
+
+
+            for x in Union_set:
+                _list=[]
+                for dx in [FW_C2T_U, RC_C2T_U]:
+                    mis_lst=dx.get(x,[99])
+                    mis=int(mis_lst[0])
+                    _list.append(mis)
+                for dx in [FW_C2T_R, RC_C2T_R]:
+                    mis=dx.get(x,99)
+                    _list.append(mis)
+                mini=min(_list)
+                if _list.count(mini)==1:
+                    mini_index=_list.index(mini)
+                    if mini_index==0:
+                        Unique_FW_C2T.add(x)
+                    elif mini_index==1:
+                        Unique_RC_C2T.add(x)
+                    else :
+                        Multiple_hits.add(x)
+                else :
+                    Multiple_hits.add(x)
+            # write reads rejected by Multiple Hits to file
+            if show_multiple_hit :
+                outf_MH=open("Multiple_hit.fa",'w')
+                for i in Multiple_hits :
+                    outf_MH.write(">%s\n" % i)
+                    outf_MH.write("%s\n" % original_bs_reads[i])
+                outf_MH.close()
+
+            del Union_set
+            del FW_C2T_R
+            del RC_C2T_R
+
+            FW_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
+            RC_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
+            FW_uniq_lst.sort()
+            RC_uniq_lst.sort()
+            FW_uniq_lst=[x[1] for x in FW_uniq_lst]
+            RC_uniq_lst=[x[1] for x in RC_uniq_lst]
+
+            del Unique_FW_C2T
+            del Unique_RC_C2T
+
+            #----------------------------------------------------------------
+            # Post-filtering reads
+
+            # ---- FW  ----
+            FW_regions = dict()
+            gseq = dict()
+            chr_length = dict()
+            for header in FW_uniq_lst :
+                _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
+                original_BS = original_bs_reads[header]
+                if mapped_chr not in FW_regions :
+                    FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
+                if mapped_chr not in gseq :
+                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
+                    chr_length[mapped_chr] = len(gseq[mapped_chr])
+
+                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
+                all_mapped+=1
+                FR = "+FW"
+                mapped_strand = "+"
+                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
+                                                                             mapped_location,
+                                                                             mapped_location + g_len,
+                                                                             mapped_strand)
+                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
+                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
+
+                if len(r_aln) == len(g_aln) :
+                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
+                    if True in checking_genome_context :
+                        try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
+                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
+                                try_pos, FR)
+                    if my_region_serial == 0 : # still be 0
+                        # some reads carry no tag at the 5' end; search the upstream sequence for the fragment boundary
+                        # print "[For debug]: FW read has no tags"
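+                        # Fallback scan: step back one bp at a time, up to MAX_TRY
+                        # positions, until a recorded fragment start is found.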
+                        try_count = 0
+                        try_pos = mapped_location - min_cut5_len + 1
+                        while my_region_serial == 0 and try_count < MAX_TRY :
+                            my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
+                                                              try_pos, FR)
+                            try_pos -= 1
+                            try_count += 1
+
+                        #if my_region_serial == 0 :
+                        #    print "[For debug]: chr=", mapped_chr
+                        #    print "[For debug]: +FW read still can not find fragment serial"
+                        # Tip: sometimes "my_region_serial" is still 0 ...
+
+
+                    N_mismatch = N_MIS(r_aln, g_aln)
+                    if N_mismatch <= int(max_mismatch_no) :
+                        all_mapped_passed += 1
+                        methy = methy_seq(r_aln, g_aln + next2bp)
+                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
+                        #---XS FILTER----------------
+                        XS = 0
+                        nCH = methy.count('y') + methy.count('z')
+                        nmCH = methy.count('Y') + methy.count('Z')
+                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
+                            XS = 1
+                        num_mapped_FW_C2T += 1
+                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
+                                      mapped_location, cigar, original_BS, methy, XS,
+                                      output_genome = output_genome,
+                                      rrbs = True,
+                                      my_region_serial = my_region_serial,
+                                      my_region_start = my_region_start,
+                                      my_region_end = my_region_end)
+                else :
+                    print "[For debug]: reads not in same lengths"
+
+            #print "start RC"
+            # ---- RC ----
+            RC_regions = dict()
+            for header in RC_uniq_lst :
+                _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
+                original_BS = original_bs_reads[header]
+                if mapped_chr not in RC_regions :
+                    RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
+                if mapped_chr not in gseq :
+                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
+                    chr_length[mapped_chr] = len(gseq[mapped_chr])
+
+                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
+                mapped_location = chr_length[mapped_chr] - mapped_location - g_len
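+                # coordinate-flip sketch: a hit reported at position 10 with
+                # g_len 20 on the 100 bp reverse-complemented reference maps
+                # back to forward coordinate 100 - 10 - 20 = 70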
+                all_mapped+=1
+                FR = "-FW"
+                mapped_strand = "-"
+                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
+                                                                             mapped_location,
+                                                                             mapped_location + g_len,
+                                                                             mapped_strand)
+                #checking_genome_context = (output_genome[gx:gy] == check_pattern)
+                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
+                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
+
+                if len(r_aln) == len(g_aln) : # and checking_genome_context:
+                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
+                    if True in checking_genome_context :
+                        try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
+                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
+                                                                           try_pos , FR)
+                    if my_region_serial == 0 : # still 0: the tag lookup found no fragment
+                        # in some cases the read carries no usable tag; search the
+                        # upstream sequence for one
+                        #print "[For debug]: RC read has no tags"
+                        try_count = 0
+                        try_pos = mapped_location + g_len + min_cut5_len - 2
+                        while my_region_serial == 0 and try_count < MAX_TRY :
+                            my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
+                                                                               try_pos, FR)
+                            try_pos += 1
+                            try_count += 1
+
+                        #if my_region_serial == 0 :
+                        #    print "[For debug]: chr=", mapped_chr
+                        #    print "[For debug]: -FW read still cannot find fragment serial"
+
+
+                    N_mismatch = N_MIS(r_aln, g_aln)
+                    if N_mismatch <= int(max_mismatch_no) :
+                        all_mapped_passed += 1
+                        methy = methy_seq(r_aln, g_aln + next2bp)
+                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
+                        #---XS FILTER----------------
+                        XS = 0
+                        nCH = methy.count('y') + methy.count('z')
+                        nmCH = methy.count('Y') + methy.count('Z')
+                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
+                            XS = 1
+                        num_mapped_RC_C2T += 1
+                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
+                                      mapped_location, cigar, original_BS, methy, XS,
+                                      output_genome = output_genome,
+                                      rrbs = True,
+                                      my_region_serial = my_region_serial,
+                                      my_region_start = my_region_start,
+                                      my_region_end = my_region_end)
+                else :
+                    print "[For debug]: reads not in same lengths"
+
+
+            # Finished both FW and RC
+            logm("Done: %s (%d) \n" % (read_file, no_my_files))
+            print "--> %s (%d) "%(read_file, no_my_files)
+            del original_bs_reads
+            delete_files(WC2T, CC2T)
+
+    # End of directional library
+
+
+    # ====================================================
+    #  un-directional library
+    # ====================================================
+
+    elif asktag=="Y" :
+        #----------------------------------------------------------------
+        logm("== Start mapping ==")
+
+        input_fname = os.path.split(main_read_file)[1]
+        for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
+
+            logm("Processing read file: %s" % read_file)
+            original_bs_reads = {}
+            no_my_files+=1
+            random_id = ".tmp-"+str(random.randint(1000000,9999999))
+            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
+            outfile3=tmp_d('Trimmed_G2A.fa'+random_id)
+
+            outf2=open(outfile2,'w')
+            outf3=open(outfile3,'w')
+
+            #--- Checking input format ------------------------------------------
+            try :
+                read_inf=open(read_file,"r")
+            except IOError:
+                print "[Error] Cannot open input file : %s" % read_file
+                exit(-1)
+
+            oneline=read_inf.readline()
+            l=oneline.split()
+            n_fastq=0
+            n_fasta=0
+            input_format=""
+            if oneline[0]=="@":	# FastQ
+                input_format="fastq"
+            elif len(l)==1 and oneline[0]!=">": # pure sequences
+                input_format="seq"
+            elif len(l)==11: # Illumina qseq
+                input_format="qseq"
+            elif oneline[0]==">": # fasta
+                input_format="fasta"
+            read_inf.close()
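+            # first-line heuristics used above, as a sketch:
+            #   starts with '@'             -> fastq (4 lines per record)
+            #   one field, not '>' or '@'   -> raw sequence, one read per line
+            #   11 whitespace-split fields  -> Illumina qseq (read in field 9)
+            #   starts with '>'             -> fasta (2 lines per record here)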
+
+            #----------------------------------------------------------------
+            seq_id = ""
+            seq = ""
+            seq_ready="N"
+            for line in fileinput.input(read_file):
+                l=line.split()
+
+                if input_format == "seq":
+                    all_raw_reads+=1
+                    seq_id=str(all_raw_reads)
+                    seq_id=seq_id.zfill(12)
+                    seq=l[0]
+                    seq_ready="Y"
+                elif input_format=="fastq":
+                    m_fastq=math.fmod(n_fastq,4)
+                    n_fastq+=1
+                    seq_ready="N"
+                    if m_fastq==0:
+                        all_raw_reads+=1
+                        seq_id=str(all_raw_reads)
+                        seq_id=seq_id.zfill(12)
+                        seq=""
+                    elif m_fastq==1:
+                        seq=l[0]
+                        seq_ready="Y"
+                    else:
+                        seq=""
+                elif input_format=="qseq":
+                    all_raw_reads+=1
+                    seq_id=str(all_raw_reads)
+                    seq_id=seq_id.zfill(12)
+                    seq=l[8]
+                    seq_ready="Y"
+                elif input_format=="fasta":
+                    m_fasta=math.fmod(n_fasta,2)
+                    n_fasta+=1
+                    seq_ready="N"
+                    if m_fasta==0:
+                        all_raw_reads+=1
+                        seq_id=l[0][1:]
+                        seq=""
+                    elif m_fasta==1:
+                        seq=l[0]
+                        seq_ready="Y"
+                    else:
+                        seq=""
+                    #---------------------------------------------------------------
+                if seq_ready=="Y":
+                    # Normalize the characters
+                    seq=seq.upper().replace(".","N")
+
+                    read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
+                    if len(read_tag) > 0 :
+                        all_tagged += 1
+                        for i in read_tag :
+                            n_cut_tag_lst[i] += 1
+
+                    seq = seq[(cut_s-1):cut_e] # cut_s starts from cycle 1 by default
+
+                    #-- Trimming adapter sequence ---
+                    if adapter_seq != "" :
+                        new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
+                        if len(new_read) < len(seq) :
+                            all_tagged_trimmed += 1
+                        seq = new_read
+                    if len(seq) <= 4 :
+                        seq = "N" * (cut_e - cut_s)
+
+                    # all reads will be considered, regardless of tags
+                    #---------  trimmed_raw_BS_read and qscore ------------------
+                    original_bs_reads[seq_id] = seq
+                    #---------  FW_C2T  ------------------
+                    outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
+                    #---------  RC_G2A  ------------------
+                    outf3.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))
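+                    # e.g. read "ACGTC" is written as "ATGTT" to the C2T stream
+                    # and as "ACATC" to the G2A stream; aligning in the reduced
+                    # alphabet makes mapping insensitive to bisulfite C->T changes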
+            fileinput.close()
+
+            outf2.close()
+            outf3.close()
+
+            delete_files(read_file)
+            logm("Processing input is done")
+            #--------------------------------------------------------------------------------
+
+            # mapping
+            #--------------------------------------------------------------------------------
+            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
+            WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
+            CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
+
+            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                                 'input_file' : outfile2,
+                                                 'output_file' : WC2T},
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                                 'input_file' : outfile2,
+                                                 'output_file' : CC2T},
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
+                                                 'input_file' : outfile3,
+                                                 'output_file' : WG2A},
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
+                                                 'input_file' : outfile3,
+                                                 'output_file' : CG2A} ])
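+            # each command is rendered by '%'-substitution into aligner_command;
+            # with a hypothetical template such as
+            #   "bowtie -f %(reference_genome)s %(input_file)s %(output_file)s"
+            # the four jobs map the converted reads against the four converted
+            # indexes (W/C x C2T/G2A) in parallel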
+
+            logm("Aligning reads is done")
+
+            delete_files(outfile2, outfile3)
+
+            #--------------------------------------------------------------------------------
+            # Post processing
+            #--------------------------------------------------------------------------------
+
+            FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
+            RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)
+
+            FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
+            RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
+
+            logm("Extracting alignments is done")
+
+            #----------------------------------------------------------------
+            # get unique-hit reads
+            #----------------------------------------------------------------
+            Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
+
+            Unique_FW_C2T=set() # +
+            Unique_RC_G2A=set() # +
+            Unique_FW_G2A=set() # -
+            Unique_RC_C2T=set() # -
+            Multiple_hits=set()
+
+            for x in Union_set:
+                _list=[]
+                for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
+                    mis_lst=dx.get(x,[99])
+                    mis=int(mis_lst[0])
+                    _list.append(mis)
+                for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
+                    mis=dx.get(x,99)
+                    _list.append(mis)
+                mini=min(_list)
+                if _list.count(mini) == 1:
+                    mini_index=_list.index(mini)
+                    if mini_index == 0:
+                        Unique_FW_C2T.add(x)
+                    elif mini_index == 1:
+                        Unique_RC_G2A.add(x)
+                    elif mini_index == 2:
+                        Unique_FW_G2A.add(x)
+                    elif mini_index == 3:
+                        Unique_RC_C2T.add(x)
+                    else :
+                        Multiple_hits.add(x)
+                else :
+                    Multiple_hits.add(x)
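+            # sketch of the selection above: a read present only in FW_C2T_U
+            # with 1 mismatch yields _list = [1, 99, 99, 99, 99, 99, 99, 99];
+            # the minimum occurs once, at index 0, so the read is a unique +FW
+            # hit; a minimum at index 4..7 (the repeat-hit dictionaries) or a
+            # tied minimum marks the read as a multiple hit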
+            # write reads rejected by Multiple Hits to file
+            if show_multiple_hit :
+                outf_MH=open("Multiple_hit.fa",'w')
+                for i in Multiple_hits :
+                    outf_MH.write(">%s\n" % i)
+                    outf_MH.write("%s\n" % original_bs_reads[i])
+                outf_MH.close()
+
+            del Union_set
+            del FW_C2T_R
+            del FW_G2A_R
+            del RC_C2T_R
+            del RC_G2A_R
+
+            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
+            FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
+            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
+            RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
+            FW_C2T_uniq_lst.sort()
+            RC_C2T_uniq_lst.sort()
+            FW_G2A_uniq_lst.sort()
+            RC_G2A_uniq_lst.sort()
+            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
+            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
+            FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
+            RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
+
+            del Unique_FW_C2T
+            del Unique_FW_G2A
+            del Unique_RC_C2T
+            del Unique_RC_G2A
+
+
+            #----------------------------------------------------------------
+            # Post-filtering reads
+            # ---- FW_C2T  ---- undirectional
+            FW_regions = dict()
+            gseq = dict()
+            chr_length = dict()
+            for header in FW_C2T_uniq_lst :
+                _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
+                original_BS = original_bs_reads[header]
+                if mapped_chr not in FW_regions :
+                    FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
+                if mapped_chr not in gseq :
+                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
+                    chr_length[mapped_chr] = len(gseq[mapped_chr])
+
+                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
+                all_mapped+=1
+                FR = "+FW"
+                mapped_strand = "+"
+                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
+                                                                             mapped_location,
+                                                                             mapped_location + g_len,
+                                                                             mapped_strand)
+                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
+                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
+
+                if len(r_aln) == len(g_aln) :
+                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
+                    if True in checking_genome_context :
+                        try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
+                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
+                                try_pos, FR)
+                    if my_region_serial == 0 : # still 0: the tag lookup found no fragment
+                        # in some cases the read carries no usable tag; search the
+                        # upstream sequence for one
+                        # print "[For debug]: FW read has no tags"
+                        try_count = 0
+                        try_pos = mapped_location - min_cut5_len + 1
+                        while my_region_serial == 0 and try_count < MAX_TRY :
+                            my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
+                                                              try_pos, FR)
+                            try_pos -= 1
+                            try_count += 1
+
+                    N_mismatch = N_MIS(r_aln, g_aln)
+                    if N_mismatch <= int(max_mismatch_no) :
+                        all_mapped_passed += 1
+                        methy = methy_seq(r_aln, g_aln + next2bp)
+                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
+                        #---XS FILTER----------------
+                        XS = 0
+                        nCH = methy.count('y') + methy.count('z')
+                        nmCH = methy.count('Y') + methy.count('Z')
+                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
+                            XS = 1
+                        num_mapped_FW_C2T += 1
+                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
+                                      mapped_location, cigar, original_BS, methy, XS,
+                                      output_genome = output_genome,
+                                      rrbs = True,
+                                      my_region_serial = my_region_serial,
+                                      my_region_start = my_region_start,
+                                      my_region_end = my_region_end)
+                else :
+                    print "[For debug]: reads not in same lengths"
+
+
+            # ---- RC_C2T ---- undirectional
+            RC_regions = dict()
+            for header in RC_C2T_uniq_lst :
+                _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
+                original_BS = original_bs_reads[header]
+                if mapped_chr not in RC_regions :
+                    RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
+                if mapped_chr not in gseq :
+                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
+                    chr_length[mapped_chr] = len(gseq[mapped_chr])
+
+                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
+                mapped_location = chr_length[mapped_chr] - mapped_location - g_len
+                all_mapped+=1
+                FR = "-FW"
+                mapped_strand = "-"
+                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
+                                                                             mapped_location,
+                                                                             mapped_location + g_len,
+                                                                             mapped_strand)
+                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
+                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
+
+                if len(r_aln) == len(g_aln) : # and checking_genome_context:
+                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
+                    if True in checking_genome_context :
+                        try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
+                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
+                                                                           try_pos , FR)
+                    if my_region_serial == 0 : # still 0: the tag lookup found no fragment
+                        # in some cases the read carries no usable tag; search the
+                        # upstream sequence for one
+                        #print "[For debug]: RC read has no tags"
+                        try_count = 0
+                        try_pos = mapped_location + g_len + min_cut5_len - 2
+                        while my_region_serial == 0 and try_count < MAX_TRY :
+                            my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
+                                                                               try_pos, FR)
+                            try_pos += 1
+                            try_count += 1
+
+                    N_mismatch = N_MIS(r_aln, g_aln)
+                    if N_mismatch <= int(max_mismatch_no) :
+                        all_mapped_passed += 1
+                        methy = methy_seq(r_aln, g_aln + next2bp)
+                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
+                        #---XS FILTER----------------
+                        XS = 0
+                        nCH = methy.count('y') + methy.count('z')
+                        nmCH = methy.count('Y') + methy.count('Z')
+                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
+                            XS = 1
+                        num_mapped_RC_C2T += 1
+                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
+                                      mapped_location, cigar, original_BS, methy, XS,
+                                      output_genome = output_genome,
+                                      rrbs = True,
+                                      my_region_serial = my_region_serial,
+                                      my_region_start = my_region_start,
+                                      my_region_end = my_region_end)
+
+                else :
+                    print "[For debug]: reads not in same lengths"
+
+
+            # ---- FW_G2A  ---- undirectional
+            FW_regions = dict()
+            gseq = dict()
+            chr_length = dict()
+            for header in FW_G2A_uniq_lst :
+                _, mapped_chr, mapped_location, cigar = FW_G2A_U[header]
+                original_BS = original_bs_reads[header]
+                if mapped_chr not in FW_regions :
+                    FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
+                if mapped_chr not in gseq :
+                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
+                    chr_length[mapped_chr] = len(gseq[mapped_chr])
+                cigar = list(reversed(cigar))
+
+                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
+                all_mapped+=1
+                FR = "-RC"
+                mapped_strand = "-"
+                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
+                                                                             mapped_location,
+                                                                             mapped_location + g_len,
+                                                                             mapped_strand)
+                original_BS = reverse_compl_seq(original_BS)  # for RC reads
+                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
+                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
+
+                if len(r_aln) == len(g_aln) :
+                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
+                    if True in checking_genome_context :
+                        try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
+                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
+                                                            try_pos, FR)
+                    if my_region_serial == 0 : # still 0: the tag lookup found no fragment
+                        # in some cases the read carries no usable tag; search the
+                        # upstream sequence for one
+                        #print "[For debug]: FW read has no tags"
+                        try_count = 0
+                        try_pos = mapped_location - min_cut5_len + 1
+                        while my_region_serial == 0 and try_count < MAX_TRY :
+                            my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
+                                            try_pos, FR)
+                            try_pos += 1
+                            try_count += 1
+                        #if my_region_serial == 0 :
+                        #    print "[For debug]: chr=", mapped_chr
+                        #    print "[For debug]: FW_G2A read still can not find fragment serial"
+                        # Tip: sometimes "my_region_serial" is still 0 ...
+
+
+                    N_mismatch = N_MIS(r_aln, g_aln)
+                    if N_mismatch <= int(max_mismatch_no) :
+                        all_mapped_passed += 1
+                        methy = methy_seq(r_aln, g_aln + next2bp)
+                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
+                        #---XS FILTER----------------
+                        XS = 0
+                        nCH = methy.count('y') + methy.count('z')
+                        nmCH = methy.count('Y') + methy.count('Z')
+                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
+                            XS = 1
+                        num_mapped_FW_G2A += 1
+                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
+                                      mapped_location, cigar, original_BS, methy, XS,
+                                      output_genome = output_genome,
+                                      rrbs = True,
+                                      my_region_serial = my_region_serial,
+                                      my_region_start = my_region_start,
+                                      my_region_end = my_region_end)
+                else :
+                    print "[For debug]: reads not in same lengths"
+
+
+            # ---- RC_G2A ---- undirectional
+            RC_regions = dict()
+            for header in RC_G2A_uniq_lst :
+                _, mapped_chr, mapped_location, cigar = RC_G2A_U[header]
+                original_BS = original_bs_reads[header]
+                if mapped_chr not in RC_regions :
+                    RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
+                if mapped_chr not in gseq :
+                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
+                    chr_length[mapped_chr] = len(gseq[mapped_chr])
+                cigar = list(reversed(cigar))
+
+                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
+                mapped_location = chr_length[mapped_chr] - mapped_location - g_len
+                all_mapped+=1
+                FR = "+RC"
+                mapped_strand = "+"
+                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
+                                                                             mapped_location,
+                                                                             mapped_location + g_len,
+                                                                             mapped_strand)
+                original_BS = reverse_compl_seq(original_BS)  # for RC reads
+                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
+                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
+
+                if len(r_aln) == len(g_aln) : # and checking_genome_context:
+                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
+                    if True in checking_genome_context :
+                        try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
+                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
+                                                                           try_pos, FR)
+                    if my_region_serial == 0 : # still 0: the tag lookup found no fragment
+                        # in some cases the read carries no usable tag; search the
+                        # upstream sequence for one
+                        #print "[For debug]: RC read has no tags"
+                        try_count = 0
+                        try_pos = mapped_location + g_len + min_cut5_len - 2
+                        while my_region_serial == 0 and try_count < MAX_TRY :
+                            my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
+                                                    try_pos, FR)
+                            try_pos += 1
+                            try_count += 1
+
+                        #if my_region_serial == 0 :
+                        #    print "[For debug]: chr=", mapped_chr
+                        #    print "[For debug]: RC_C2A read still cannot find fragment serial"
+
+
+                    N_mismatch = N_MIS(r_aln, g_aln)
+                    if N_mismatch <= int(max_mismatch_no) :
+                        all_mapped_passed += 1
+                        methy = methy_seq(r_aln, g_aln + next2bp)
+                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
+                        #---XS FILTER----------------
+                        XS = 0
+                        nCH = methy.count('y') + methy.count('z')
+                        nmCH = methy.count('Y') + methy.count('Z')
+                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
+                            XS = 1
+                        num_mapped_RC_G2A += 1
+                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
+                                      mapped_location, cigar, original_BS, methy, XS,
+                                      output_genome = output_genome,
+                                      rrbs = True,
+                                      my_region_serial = my_region_serial,
+                                      my_region_start = my_region_start,
+                                      my_region_end = my_region_end)
+                else :
+                    print "[For debug]: reads not in same lengths"
+
+
+
+            # Finished both FW and RC
+            logm("Done: %s (%d) \n" % (read_file, no_my_files))
+            print "--> %s (%d) "%(read_file, no_my_files)
+            del original_bs_reads
+            delete_files(WC2T, CC2T, WG2A, CG2A)
+
+
+
+    # End of un-directional library
+
+    delete_files(tmp_path)
+
+
+    logm("O Number of raw reads: %d "% all_raw_reads)
+    if all_raw_reads >0:
+        logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
+        for kk in range(len(n_cut_tag_lst)):
+            logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
+        logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed)
+        logm("O Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
+        logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped)
+
+        logm("O ------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
+        logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))
+
+        if asktag=="Y": # undiretional
+            logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
+            logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) )
+            logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
+            logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) )
+            # note: the counter name 'num_mapped_RC_G2A' does not literally match
+            # the strand described in the message above
+        elif asktag=="N": # directional
+            logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
+            logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
+
+        n_CG=mC_lst[0]+uC_lst[0]
+        n_CHG=mC_lst[1]+uC_lst[1]
+        n_CHH=mC_lst[2]+uC_lst[2]
+
+        logm("----------------------------------------------")
+        logm("M Methylated C in mapped reads ")
+        logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
+        logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
+        logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
+    logm("----------------------------------------------")
+    logm("------------------- END ----------------------")
+
+    elapsed(main_read_file)
+
+    close_log()
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_align/bs_single_end.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,732 @@
+import fileinput, os, time, random, math
+from bs_utils.utils import *
+from bs_align_utils import *
+
+#----------------------------------------------------------------
+# Read the mapped results and return dictionaries of unique-hit and
+# multiple-hit reads. The function assumes the aligner reports at most
+# 2 hits per read in a single file.
+def extract_mapping(ali_file):
+    unique_hits = {}
+    non_unique_hits = {}
+
+    header0 = ""
+    lst = []
+
+    for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file):
+        #------------------------------
+        if header != header0:
+            #---------- output -----------
+            if len(lst) == 1:
+                unique_hits[header0] = lst[0]      # (no_mismatch, chr, location, cigar)
+            elif len(lst) > 1:
+                min_lst = min(lst, key = lambda x: x[0])
+                max_lst = max(lst, key = lambda x: x[0])
+
+                if min_lst[0] < max_lst[0]:
+                    unique_hits[header0] = min_lst
+                else:
+                    non_unique_hits[header0] = min_lst[0]
+                    #print "multiple hit", header, chr, location, no_mismatch, cigar # test
+            header0 = header
+            lst = [(no_mismatch, chr, location, cigar)]
+        else: # header == header0, same header (read id)
+            lst.append((no_mismatch, chr, location, cigar))
+
+    if len(lst) == 1:
+        unique_hits[header0] = lst[0]      # (no_mismatch, chr, location, cigar)
+    elif len(lst) > 1:
+        min_lst = min(lst, key = lambda x: x[0])
+        max_lst = max(lst, key = lambda x: x[0])
+
+        if min_lst[0] < max_lst[0]:
+            unique_hits[header0] = min_lst
+        else:
+            non_unique_hits[header0] = min_lst[0]
+
+
+    return unique_hits, non_unique_hits
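+# Shape of the returned values, as a sketch: for a read "r1" reported once
+# and a read "r2" reported twice with equal mismatch counts,
+#     unique_hits     == {"r1": (no_mismatch, chr, location, cigar)}
+#     non_unique_hits == {"r2": no_mismatch}
+# with fields as yielded by process_aligner_output().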
+
+
+def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines,
+                  max_mismatch_no, aligner_command, db_path, tmp_path, outfile,
+                  XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
+    #----------------------------------------------------------------
+    # adapter : strand-specific or not
+    adapter=""
+    adapter_fw=""
+    adapter_rc=""
+    if adapter_file !="":
+        try :
+            adapter_inf=open(adapter_file,"r")
+        except IOError:
+            print "[Error] Cannot open adapter file : %s" % adapter_file
+            exit(-1)
+        if asktag == "N": #<--- directional library
+            adapter=adapter_inf.readline()
+            adapter_inf.close()
+            adapter=adapter.rstrip("\n")
+        elif asktag == "Y":#<--- un-directional library
+            adapter_fw=adapter_inf.readline()
+            adapter_rc=adapter_inf.readline()
+            adapter_inf.close()
+            adapter_fw=adapter_fw.rstrip("\n")
+            adapter_rc=adapter_rc.rstrip("\n")
+        adapter_inf.close()
+    #----------------------------------------------------------------
+
+
+
+    #----------------------------------------------------------------
+    logm("Read filename: %s"% main_read_file )
+    logm("Un-directional library: %s" % asktag )
+    logm("The first base (for mapping): %d" % cut1)
+    logm("The last base (for mapping): %d" % cut2)
+    logm("Max. lines per mapping: %d"% no_small_lines)
+    logm("Aligner: %s" % aligner_command)
+    logm("Reference genome library path: %s" % db_path )
+    logm("Number of mismatches allowed: %s" % max_mismatch_no )
+    if adapter_file !="":
+        if asktag=="N":
+            logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n")))
+        elif asktag=="Y":
+            logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) )
+            logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) )
+    #----------------------------------------------------------------
+
+    # helper method to join fname with tmp_path
+    tmp_d = lambda fname: os.path.join(tmp_path, fname)
+
+    db_d = lambda fname:  os.path.join(db_path, fname)
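+    # e.g. tmp_d('reads-s-1') == os.path.join(tmp_path, 'reads-s-1');
+    # db_d resolves serialized chromosomes and indexes under db_path the same way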
+
+    #----------------------------------------------------------------
+    # splitting the big read file
+
+    input_fname = os.path.split(main_read_file)[1]
+
+#    split_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines)
+#    my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path)
+#                                            if splitted_file.startswith("%s-s-" % input_fname))
+
+    #---- Stats ------------------------------------------------------------
+    all_raw_reads=0
+    all_trimed=0
+    all_mapped=0
+    all_mapped_passed=0
+
+    numbers_premapped_lst=[0,0,0,0]
+    numbers_mapped_lst=[0,0,0,0]
+
+    mC_lst=[0,0,0]
+    uC_lst=[0,0,0]
+
+
+    no_my_files=0
+
+    #----------------------------------------------------------------
+    logm("== Start mapping ==")
+
+    for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
+#    for read_file in my_files:
+        original_bs_reads = {}
+        no_my_files+=1
+        random_id = ".tmp-"+str(random.randint(1000000,9999999))
+
+        #-------------------------------------------------------------------
+        # undirectional sequencing
+        #-------------------------------------------------------------------
+        if asktag=="Y":  
+
+            #----------------------------------------------------------------
+            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
+            outfile3=tmp_d('Trimmed_G2A.fa'+random_id)
+
+            outf2=open(outfile2,'w')
+            outf3=open(outfile3,'w')
+
+            #----------------------------------------------------------------
+            # detect format of input file
+            try :
+                read_inf=open(read_file,"r")
+            except IOError :
+                print "[Error] Cannot open input file : %s" % read_file
+                exit(-1)
+
+            oneline=read_inf.readline()
+            l=oneline.split()
+            input_format=""
+            if oneline[0]=="@":	# fastq
+                input_format="fastq"
+                n_fastq=0
+            elif len(l)==1 and oneline[0]!=">": # pure sequences
+                input_format="seq"
+            elif len(l)==11: # qseq
+                input_format="qseq"
+            elif oneline[0]==">":	# fasta
+                input_format="fasta"
+                n_fasta=0
+            read_inf.close()
+
+            #----------------------------------------------------------------
+            # read sequence, remove adapter and convert 
+            read_id=""
+            seq=""
+            seq_ready="N"
+            for line in fileinput.input(read_file):
+                l=line.split()
+
+                if input_format=="seq":
+                    all_raw_reads+=1
+                    read_id=str(all_raw_reads)
+                    read_id=read_id.zfill(12)
+                    seq=l[0]
+                    seq_ready="Y"
+                elif input_format=="fastq":
+                    m_fastq=math.fmod(n_fastq,4)
+                    n_fastq+=1
+                    seq_ready="N"
+                    if m_fastq==0:
+                        all_raw_reads+=1
+                        read_id=str(all_raw_reads)
+                        read_id=read_id.zfill(12)
+                        seq=""
+                    elif m_fastq==1:
+                        seq=l[0]
+                        seq_ready="Y"
+                    else:
+                        seq=""
+                elif input_format=="qseq":
+                    all_raw_reads+=1
+                    read_id=str(all_raw_reads)
+                    read_id=read_id.zfill(12)
+                    seq=l[8]
+                    seq_ready="Y"
+                elif input_format=="fasta":
+                    m_fasta=math.fmod(n_fasta,2)
+                    n_fasta+=1
+                    seq_ready="N"
+                    if m_fasta==0:
+                        all_raw_reads+=1
+                        #read_id=str(all_raw_reads)
+                        read_id=l[0][1:]
+                        seq=""
+                    elif m_fasta==1:
+                        seq=l[0]
+                        seq_ready="Y"
+                    else:
+                        seq=""
+
+                #----------------------------------------------------------------
+                if seq_ready=="Y":
+                    seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72  -e 52
+                    seq=seq.upper()
+                    seq=seq.replace(".","N")
+
+                    # stripping the BS adapter from the 3' end of the read
+                    if (adapter_fw !="") and (adapter_rc !="") :
+                        new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch)
+                        new_read = Remove_5end_Adapter(new_read, adapter_rc)
+                        if len(new_read) < len(seq) :
+                            all_trimed += 1
+                        seq = new_read
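+                    # in an un-directional library a read may carry the FW
+                    # adapter at its 3' end or, for an RC read, the RC adapter
+                    # at its 5' end, so both orientations are checked above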
+
+                    if len(seq)<=4:
+                        seq=''.join(["N" for x in xrange(cut2-cut1+1)])
+
+                    #---------  trimmed_raw_BS_read  ------------------
+                    original_bs_reads[read_id] = seq
+
+                    #---------  FW_C2T  ------------------
+                    outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
+                    #---------  RC_G2A  ------------------
+                    outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
+
+            fileinput.close()
+
+            outf2.close()
+            outf3.close()
+
+            delete_files(read_file)
+
+            #--------------------------------------------------------------------------------
+            # Bowtie mapping
+            #--------------------------------------------------------------------------------
+            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
+            WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
+            CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
+
+        #    print aligner_command % {'int_no_mismatches' : int_no_mismatches,
+        #                             'reference_genome' : os.path.join(db_path,'W_C2T'),
+        #                             'input_file' : outfile2,
+        #                             'output_file' : WC2T}
+
+            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                                   'input_file' : outfile2,
+                                                   'output_file' : WC2T},
+
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                                   'input_file' : outfile2,
+                                                   'output_file' : CC2T},
+
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
+                                                   'input_file' : outfile3,
+                                                   'output_file' : WG2A},
+
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
+                                                   'input_file' : outfile3,
+                                                   'output_file' : CG2A} ])
+
+
+            delete_files(outfile2, outfile3)
+
+
+            #--------------------------------------------------------------------------------
+            # Post processing
+            #--------------------------------------------------------------------------------
+
+            FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
+            RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)
+
+            FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
+            RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
+
+            #----------------------------------------------------------------
+            # get unique-hit reads
+            #----------------------------------------------------------------
+            Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
+
+            Unique_FW_C2T=set() # +
+            Unique_RC_G2A=set() # +
+            Unique_FW_G2A=set() # -
+            Unique_RC_C2T=set() # -
+            Multiple_hits=set()
+
+
+            for x in Union_set:
+                _list=[]
+                for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
+                    mis_lst=d.get(x,[99])
+                    mis=int(mis_lst[0])
+                    _list.append(mis)
+                for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
+                    mis=d.get(x,99) 
+                    _list.append(mis)
+                mini=min(_list)
+                if _list.count(mini) == 1:
+                    mini_index=_list.index(mini)
+                    if mini_index == 0:
+                        Unique_FW_C2T.add(x)
+                    elif mini_index == 1:
+                        Unique_RC_G2A.add(x)
+                    elif mini_index == 2:
+                        Unique_FW_G2A.add(x)
+                    elif mini_index == 3:
+                        Unique_RC_C2T.add(x)
+                    # mini_index 4..7 falls in the repeat-hit dictionaries, indicating multiple hits
+                    else :
+                        Multiple_hits.add(x)
+                else :
+                    Multiple_hits.add(x)
+            # write reads rejected by Multiple Hits to file
+            if show_multiple_hit :
+                outf_MH=open("Multiple_hit.fa",'w')
+                for i in Multiple_hits :
+                    outf_MH.write(">%s\n" % i)
+                    outf_MH.write("%s\n" % original_bs_reads[i])
+                outf_MH.close()
+
+            del Union_set
+            del FW_C2T_R
+            del FW_G2A_R
+            del RC_C2T_R
+            del RC_G2A_R
+
+            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
+            FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
+            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
+            RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
+            FW_C2T_uniq_lst.sort()
+            RC_C2T_uniq_lst.sort()
+            FW_G2A_uniq_lst.sort()
+            RC_G2A_uniq_lst.sort()
+            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
+            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
+            FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
+            RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
+
+            #----------------------------------------------------------------
+            # count the unique hits per strand before releasing the sets;
+            # referencing them after 'del' would raise a NameError
+            numbers_premapped_lst[0] += len(Unique_FW_C2T)
+            numbers_premapped_lst[1] += len(Unique_RC_G2A)
+            numbers_premapped_lst[2] += len(Unique_FW_G2A)
+            numbers_premapped_lst[3] += len(Unique_RC_C2T)
+
+            del Unique_FW_C2T
+            del Unique_FW_G2A
+            del Unique_RC_C2T
+            del Unique_RC_G2A
+
+
+            #----------------------------------------------------------------
+
+            nn=0
+            gseq = dict()
+            chr_length = dict()
+            for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),
+                                            (RC_G2A_uniq_lst,RC_G2A_U),
+                                            (FW_G2A_uniq_lst,FW_G2A_U),
+                                            (RC_C2T_uniq_lst,RC_C2T_U)]:
+                nn += 1
+                mapped_chr0 = ""
+
+                for header in ali_unique_lst:
+
+                    _, mapped_chr, mapped_location, cigar = ali_dic[header]
+
+                    original_BS = original_bs_reads[header]
+                    #-------------------------------------
+                    if mapped_chr not in gseq:
+                        gseq[mapped_chr] =  deserialize(db_d(mapped_chr))
+                        chr_length[mapped_chr] = len(gseq[mapped_chr])
+
+                    if nn == 2 or nn == 3:
+                        cigar = list(reversed(cigar))
+                    r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
+
+
+                    all_mapped += 1
+
+                    if nn == 1: # +FW mapped to + strand:
+                        FR = "+FW"
+                        mapped_strand="+"
+
+                    elif nn == 2:  # +RC mapped to + strand:
+                        FR = "+RC" # RC reads from -RC reflecting the methylation status on Watson strand (+)
+                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len
+                        mapped_strand = "+"
+                        original_BS = reverse_compl_seq(original_BS)  # for RC reads
+
+                    elif nn == 3:  						# -RC mapped to - strand:
+                        mapped_strand = "-"
+                        FR = "-RC" # RC reads from +RC reflecting the methylation status on Crick strand (-)
+                        original_BS = reverse_compl_seq(original_BS)  # for RC reads
+
+                    elif nn == 4: 						# -FW mapped to - strand:
+                        mapped_strand = "-"
+                        FR = "-FW"
+                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len
+
+                    origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
+
+                    r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
+
+
+                    if len(r_aln)==len(g_aln):
+                        N_mismatch = N_MIS(r_aln, g_aln)
+                        if N_mismatch <= int(max_mismatch_no):
+                            numbers_mapped_lst[nn-1] += 1
+                            all_mapped_passed += 1
+                            methy = methy_seq(r_aln, g_aln + next2bp)
+                            mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
+
+                            #---XS FILTER----------------
+                            XS = 0
+                            nCH = methy.count('y') + methy.count('z')
+                            nmCH = methy.count('Y') + methy.count('Z')
+                            if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
+                                XS = 1
+
+                            outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
+
+            #----------------------------------------------------------------
+            logm("--> %s (%d) "%(read_file, no_my_files))
+            delete_files(WC2T, WG2A, CC2T, CG2A)
+
+
+
+        #--------------------------------------------------------------------
+        # directional sequencing
+        #--------------------------------------------------------------------
+
+        if asktag=="N":  
+            #----------------------------------------------------------------
+            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
+            outf2=open(outfile2,'w')
+
+            n=0
+            #----------------------------------------------------------------
+            try :
+                read_inf=open(read_file,"r")
+            except IOError :
+                print "[Error] Cannot open input file : %s" % read_file
+                exit(-1)
+
+            oneline=read_inf.readline()
+            l=oneline.split()
+            input_format=""
+            if oneline[0]=="@":	# FastQ
+                input_format="fastq"
+                n_fastq=0
+            elif len(l)==1 and oneline[0]!=">": # pure sequences
+                input_format="seq"
+            elif len(l)==11: # Illumina GAII qseq file
+                input_format="qseq"
+            elif oneline[0]==">":	# fasta
+                input_format="fasta"
+                n_fasta=0
+            read_inf.close()
+            #print "detected data format: %s"%(input_format)
+            #----------------------------------------------------------------
+            read_id=""
+            seq=""
+            seq_ready="N"
+            for line in fileinput.input(read_file):
+                l=line.split()
+                if input_format=="seq":
+                    all_raw_reads+=1
+                    read_id=str(all_raw_reads)
+                    read_id=read_id.zfill(12)
+                    seq=l[0]
+                    seq_ready="Y"
+                elif input_format=="fastq":
+                    m_fastq=math.fmod(n_fastq,4)
+                    n_fastq+=1
+                    seq_ready="N"
+                    if m_fastq==0:
+                        all_raw_reads+=1
+                        read_id=str(all_raw_reads)
+                        read_id=read_id.zfill(12)
+                        seq=""
+                    elif m_fastq==1:
+                        seq=l[0]
+                        seq_ready="Y"
+                    else:
+                        seq=""
+                elif input_format=="qseq":
+                    all_raw_reads+=1
+                    read_id=str(all_raw_reads)
+                    read_id=read_id.zfill(12)
+                    seq=l[8]
+                    seq_ready="Y"
+                elif input_format=="fasta":
+                    m_fasta=math.fmod(n_fasta,2)
+                    n_fasta+=1
+                    seq_ready="N"
+                    if m_fasta==0:
+                        all_raw_reads+=1
+                        read_id=l[0][1:]
+                        seq=""
+                    elif m_fasta==1:
+                        seq=l[0]
+                        seq_ready="Y"
+                    else:
+                        seq=""
+
+                #--------------------------------
+                if seq_ready=="Y":
+                    seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72  -e 52
+                    seq=seq.upper()
+                    seq=seq.replace(".","N")
+
+                    #-- stripping adapter from the 3' end of the read -------
+                    if adapter != "":
+                        new_read = RemoveAdapter(seq, adapter, adapter_mismatch)
+                        if len(new_read) < len(seq) :
+                            all_trimed += 1
+                        seq = new_read
+
+                    if len(seq)<=4:
+                        seq = "N" * (cut2-cut1+1)
+
+                    #---------  trimmed_raw_BS_read  ------------------
+                    original_bs_reads[read_id] = seq
+
+
+                    #---------  FW_C2T  ------------------
+                    outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
+
+            fileinput.close()
+
+            outf2.close()
+            delete_files(read_file)
+
+            #--------------------------------------------------------------------------------
+            # Bowtie mapping
+            #--------------------------------------------------------------------------------
+            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
+
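+            # map the C->T converted reads against both converted references
+            # (Watson W_C2T and Crick C_C2T) in parallel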
+            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                                  'input_file' : outfile2,
+                                                  'output_file' : WC2T},
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                                  'input_file' : outfile2,
+                                                  'output_file' : CC2T} ])
+
+            delete_files(outfile2)
+
+            #--------------------------------------------------------------------------------
+            # Post processing
+            #--------------------------------------------------------------------------------
+
+
+            FW_C2T_U, FW_C2T_R = extract_mapping(WC2T)
+            RC_C2T_U, RC_C2T_R = extract_mapping(CC2T)
+
+            #----------------------------------------------------------------
+            # get uniq-hit reads
+            #----------------------------------------------------------------
+            Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())
+
+            Unique_FW_C2T = set() # +
+            Unique_RC_C2T = set() # -
+            Multiple_hits=set()
+            # a read is kept as unique only if exactly one of the four mismatch
+            # counts (from FW_C2T_U, RC_C2T_U, FW_C2T_R, RC_C2T_R) is minimal
+            # and it comes from one of the unique dictionaries; ties and repeat
+            # hits go to Multiple_hits
+
+            for x in Union_set:
+                _list=[]
+                for d in [FW_C2T_U,RC_C2T_U]:
+                    mis_lst=d.get(x,[99])
+                    mis=int(mis_lst[0])
+                    _list.append(mis)
+                for d in [FW_C2T_R,RC_C2T_R]:
+                    mis=d.get(x,99)
+                    _list.append(mis)
+                mini=min(_list)
+                #print _list
+                if _list.count(mini)==1:
+                    mini_index=_list.index(mini)
+                    if mini_index==0:
+                        Unique_FW_C2T.add(x)
+                    elif mini_index==1:
+                        Unique_RC_C2T.add(x)
+                    else:
+                        Multiple_hits.add(x)
+                else :
+                    Multiple_hits.add(x)
+            # write reads rejected by Multiple Hits to file
+            if show_multiple_hit :
+                outf_MH=open("Multiple_hit.fa",'w')
+                for i in Multiple_hits :
+                    outf_MH.write(">%s\n" % i)
+                    outf_MH.write("%s\n" % original_bs_reads[i])
+                outf_MH.close()
+
+
+
+            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
+            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
+            FW_C2T_uniq_lst.sort()
+            RC_C2T_uniq_lst.sort()
+            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
+            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
+
+
+            #----------------------------------------------------------------
+
+            numbers_premapped_lst[0] += len(Unique_FW_C2T)
+            numbers_premapped_lst[1] += len(Unique_RC_C2T)
+
+            #----------------------------------------------------------------
+
+            nn = 0
+            gseq = dict()
+            chr_length = dict()
+            for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]:
+                nn += 1
+                mapped_chr0 = ""
+                for header in ali_unique_lst:
+                    _, mapped_chr, mapped_location, cigar = ali_dic[header]
+                    original_BS = original_bs_reads[header]
+                    #-------------------------------------
+                    if mapped_chr not in gseq :
+                        gseq[mapped_chr] = deserialize(db_d(mapped_chr))
+                        chr_length[mapped_chr] = len(gseq[mapped_chr])
+                    #if mapped_chr != mapped_chr0:
+                    #    my_gseq = deserialize(db_d(mapped_chr))
+                    #    chr_length = len(my_gseq)
+                    #    mapped_chr0 = mapped_chr
+                    #-------------------------------------
+
+                    r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
+
+                    all_mapped+=1
+                    if nn == 1: 	# +FW mapped to + strand:
+                        FR = "+FW"
+                        mapped_strand = "+"
+                    elif nn == 2: 	# -FW mapped to - strand:
+                        mapped_strand = "-"
+                        FR = "-FW"
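+                        # alignments against the reverse-complemented (Crick)
+                        # reference are reported in Crick coordinates; convert
+                        # the position back onto the Watson strand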
+                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len
+
+
+                    origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
+                    r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
+
+                    if len(r_aln) == len(g_aln):
+                        N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides
+                        if N_mismatch <= int(max_mismatch_no):
+                            numbers_mapped_lst[nn-1] += 1
+                            all_mapped_passed += 1
+                            methy = methy_seq(r_aln, g_aln+next)
+                            mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
+
+                            #---XS FILTER----------------
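+                            # XS=1 flags a read that looks incompletely bisulfite-
+                            # converted: its methylated-CH calls exceed XS_count in
+                            # number and XS_pct as a fraction of all CH calls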
+                            XS = 0
+                            nCH = methy.count('y') + methy.count('z')
+                            nmCH = methy.count('Y') + methy.count('Z')
+                            if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
+                                XS = 1
+
+                            outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
+
+            #----------------------------------------------------------------
+            logm("--> %s (%d) "%(read_file,no_my_files))
+            delete_files(WC2T, CC2T)
+
+
+    #----------------------------------------------------------------
+
+#    outf.close()
+    delete_files(tmp_path)
+
+    logm("Number of raw reads: %d \n"% all_raw_reads)
+    if all_raw_reads >0:
+        logm("Number of reads having adapter removed: %d \n" % all_trimed )
+        logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
+        logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped)
+        if asktag=="Y":
+            logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
+            logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
+            logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
+            logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
+        elif asktag=="N":
+            logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
+            logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
+
+        logm("Post-filtering %d uniquely aligned reads with mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
+        if asktag=="Y":
+            logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
+            logm(" ---- %7d RC reads mapped to Watson strand"%(numbers_mapped_lst[1]) )
+            logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[2]) )
+            logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) )
+        elif asktag=="N":
+            logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
+            logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) )
+        logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
+
+        n_CG=mC_lst[0]+uC_lst[0]
+        n_CHG=mC_lst[1]+uC_lst[1]
+        n_CHH=mC_lst[2]+uC_lst[2]
+
+        logm("----------------------------------------------" )
+        logm("Methylated C in mapped reads ")
+
+        logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
+        logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
+        logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
+
+    logm("------------------- END --------------------" )
+    elapsed("=== END %s ===" % main_read_file)
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_align/output.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,83 @@
+try :
+    import pysam
+except ImportError :
+    print "[Error] It seems that you haven't install \"pysam\" package.. Please do it before you run this script."
+    exit(-1)
+
+import sys
+from bs_align_utils import *
+
+BAM = 'bam'
+SAM = 'sam'
+BS_SEEKER1 = 'bs_seeker1'
+
+formats = [BAM, SAM, BS_SEEKER1]
+
+class outfile:
+    def __init__(self, filename, format, chrom_len, cmd_line, suppress_SAM_header):
+        self.filename = filename
+        self.format = format
+        self.chrom_ids = dict((k, i) for i, k in enumerate(sorted(chrom_len)))
+
+        if format == BS_SEEKER1:
+            self.f = open(filename, 'w')
+        elif format in [SAM, BAM]:
+            header = { 'HD' : { 'VN': '1.0'},
+                       'SQ' : [ {'LN' : chrom_len[c], 'SN' : c} for c in sorted(chrom_len) ],
+                       'PG' : [ { 'ID' : 1, 'PN' : 'BS Seeker 2', 'CL' : cmd_line} ]
+                     }
+            self.f = pysam.Samfile(filename, 'w' + ('b' if format == BAM else ('' if suppress_SAM_header else 'h')), header = header)
+
+
+
+    def close(self):
+        self.f.close()
+
+    def store(self, qname, N_mismatch, FR, refname, strand, pos, cigar, original_BS, methy, STEVE, rnext = -1, pnext = -1, qual = None, output_genome = None,
+              rrbs = False, my_region_serial = -1, my_region_start = 0, my_region_end = 0):
+
+        if self.format == BS_SEEKER1:
+
+            # remove the soft clipped bases from the read
+            # this is done for backwards compatibility with the old format
+            r_start, r_end, _ = get_read_start_end_and_genome_length(cigar)
+            original_BS = original_BS[r_start : r_end]
+
+            if rrbs:
+                self.f.write('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\n' % (qname, N_mismatch, FR, refname, strand, str(pos+1).zfill(10), output_genome, original_BS, methy, STEVE))
+            else:
+                self.f.write('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n' % (qname, N_mismatch, FR, refname, strand, str(pos+1).zfill(10), output_genome, original_BS, methy, my_region_serial, my_region_start, my_region_end, STEVE))
+
+
+        elif self.format == BAM or self.format == SAM:
+
+            a = pysam.AlignedRead()
+            a.qname = qname
+            a.seq = original_BS if strand == '+' else reverse_compl_seq(original_BS)
+            a.flag =  0x10 if strand == '-' else 0
+            a.tid = self.chrom_ids[refname]
+            a.pos = pos
+            a.mapq = 255
+            a.cigar = cigar if strand == '+' else list(reversed(cigar))
+            a.rnext = rnext if rnext == -1 else self.chrom_ids[rnext]
+            a.pnext = pnext
+            a.qual= qual
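+            # custom tags written below: XO = read orientation, XS = the
+            # incomplete-conversion flag (STEVE), NM = mismatch count,
+            # XM = methylation call string, XG = genomic sequence;
+            # YR/YS/YE (RRBS only) = fragment serial, start and end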
+            if rrbs:
+                a.tags = (('XO', FR),
+                          ('XS', STEVE),
+                          ('NM', N_mismatch),
+                          ('XM', methy),
+                          ('XG', output_genome),
+                          ('YR', my_region_serial),
+                          ('YS', my_region_start),
+                          ('YE', my_region_end)
+                          )
+
+            else:
+                a.tags = (('XO', FR),
+                          ('XS', STEVE),
+                          ('NM', N_mismatch),
+                          ('XM', methy),
+                          ('XG', output_genome))
+
+            self.f.write(a)
Binary file BSseeker2/bs_index/.___init__.py has changed
Binary file BSseeker2/bs_index/._rrbs_build.py has changed
Binary file BSseeker2/bs_index/._wg_build.py has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_index/__init__.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,1 @@
+__author__ = 'pf'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_index/rrbs_build.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,194 @@
+import os
+
+from bs_utils.utils import *
+
+
+FWD_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.fwd_mappable_regions'
+REV_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.rev_mappable_regions'
+
+
+# example: T-CGA
+
+def rrbs_build(fasta_file, build_command, ref_path, low_bound, up_bound, aligner, cut_format="C-CGG"):
+    # ref_path is a string that contains the directory where the reference genomes are stored with
+    # the input fasta filename appended
+
+    cut_format = cut_format.upper() # Ex. "C-CGG,C-TAG"; MspI&ApeKI:"G^CWGC"
+    if cut_format == "C-CGG" :
+        ref_path = os.path.join(ref_path,
+            os.path.split(fasta_file)[1] + '_rrbs_%d_%d' % (low_bound, up_bound) +'_' + aligner)
+    else :
+        ref_path = os.path.join(ref_path,
+            os.path.split(fasta_file)[1] + '_rrbs_%s_%d_%d' % ( cut_format.replace("-","").replace(",","-"), low_bound, up_bound) +'_' + aligner)
+
+    ref_p = lambda filename: os.path.join(ref_path, filename)
+
+    clear_dir(ref_path)
+    open_log(ref_p('log'))
+
+    refd = {}
+    w_c2t = open(ref_p('W_C2T.fa'),'w')
+    c_c2t = open(ref_p('C_C2T.fa'),'w')
+
+    w_g2a = open(ref_p('W_G2A.fa'),'w')
+    c_g2a = open(ref_p('C_G2A.fa'),'w')
+
+    mappable_regions_output_file = open(ref_p("RRBS_mappable_regions.txt"),"w")
+
+    all_L = 0
+    all_mappable_length = 0
+    all_unmappable_length = 0
+
+    no_mappable_region = 0
+    total_chromosomes = 0
+
+#    cut_context = re.sub("-", "", cut_format).split(",")
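+    # expand IUPAC ambiguity codes into all concrete recognition sequences,
+    # e.g. "GCWGC" -> ["GCAGC", "GCTGC"], so plain string matching works below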
+    cut_context = EnumerateIUPAC(cut_format.replace("-","").split(","))
+    cut_len = [len(i) for i in cut_context]
+    cut_len_max = max(cut_len)
+
+
+    for chrom_id, chrom_seq in read_fasta(fasta_file):
+        total_chromosomes += 1
+        refd[chrom_id] = len(chrom_seq)
+
+        fwd_chr_regions = {}
+        rev_chr_regions = {}
+
+        L = len(chrom_seq)
+        XXXX_sites = []
+        XXXX_XXXX = []
+
+        #-- collect all "XXXX" sites ---------------------------------
+        i = 1
+        while i <= L - cut_len_max:
+            j = 0
+            while j < len(cut_len) :
+                if chrom_seq[i : i + cut_len[j]] == cut_context[j]:
+                    XXXX_sites.append( (i, i + cut_len[j] - 1) ) # add (1st position, last position)
+                j += 1
+            i += 1
+
+        #-- find "XXXX" pairs that are within the length of fragment ----
+        for j in xrange(len(XXXX_sites) - 1):
+            DD = (XXXX_sites[j+1][0] - XXXX_sites[j][1]) - 1 # NOT including both XXXX; DD: fragment length
+            if low_bound <= DD <= up_bound:
+                XXXX_XXXX.append([XXXX_sites[j][0], XXXX_sites[j+1][1]]) # leftmost <--> rightmost
+                mappable_seq = chrom_seq[XXXX_sites[j][0] : XXXX_sites[j+1][1] + 1]
+                no_mappable_region += 1
+
+                fwd_chr_regions[str(XXXX_sites[j][0])] = [XXXX_sites[j+1][1], no_mappable_region]
+                rev_chr_regions[str(XXXX_sites[j+1][1])] = [XXXX_sites[j][0], no_mappable_region]
+
+                # chromosome, serial, start_position, end_position, sequence
+                mappable_regions_output_file.write("%s\t%d\t%d\t%d\t%s\n"%(chrom_id, no_mappable_region,
+                                            XXXX_sites[j][0], XXXX_sites[j+1][1], mappable_seq))
+        # storing region information to file
+        # format: A[left_CCGG_pos]=[right_CCGG_pos, number_of_mappable_region]
+        serialize(fwd_chr_regions, ref_p(FWD_MAPPABLE_REGIONS(chrom_id)))
+        serialize(rev_chr_regions, ref_p(REV_MAPPABLE_REGIONS(chrom_id)))
+
+        #-----------------------------------
+        # mask the genome
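+        # walk the chromosome once, copying bases that fall inside an eligible
+        # fragment (cut sites included) and replacing everything else with '-',
+        # so only the retained RRBS fragments remain mappable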
+        _map_seq = []
+        mappable_length = 0
+        unmappable_length = 0
+        m = 0
+        mark = False
+        while m < L: # for every nucleotide in chr
+            if len(XXXX_XXXX) > 0:
+                pair = XXXX_XXXX[0]
+                p1 = pair[0]  # left end of fragment
+                p2 = pair[1]  # right end of fragment
+                if p1 <= m < p2 + 1 :
+                    _map_seq.append(chrom_seq[m])
+                    mappable_length+=1
+                    mark = True
+                else :
+                    if not mark:
+                        _map_seq.append("-")
+                        unmappable_length += 1
+                    else: # the last eligible base
+                        _ = XXXX_XXXX.pop(0)
+                        if len(XXXX_XXXX)>0:
+                            pair = XXXX_XXXX[0]
+                            p1 = pair[0]
+                            p2 = pair[1]
+                            if  p1 <= m < p2 + 1:
+                                _map_seq.append(chrom_seq[m])
+                                mappable_length += 1
+                                mark = True
+                            else:
+                                _map_seq.append("-")
+                                unmappable_length += 1
+                                mark = False
+                        else:
+                            _map_seq.append("-")
+                            unmappable_length+=1
+                            mark = False
+            else:
+                if not mark:
+                    _map_seq.append("-")
+                    unmappable_length+=1
+                else:
+                    _map_seq.append("-")
+                    unmappable_length+=1
+                    mark = False
+
+            m+=1
+
+        #-----------------------------------
+
+        chrom_seq = ''.join(_map_seq)
+        serialize(chrom_seq, ref_p(chrom_id))
+
+        w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
+        w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
+
+        chrom_seq = reverse_compl_seq(chrom_seq)
+
+        c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
+        c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
+
+        #-----------------------------------
+        logm("# %s : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)"%(chrom_id,
+                                                                      L,
+                                                                      unmappable_length,
+                                                                      mappable_length,
+                                                                      float(mappable_length)/L) )
+        all_L += L
+        all_mappable_length += mappable_length
+        all_unmappable_length += unmappable_length
+
+        elapsed('Finished initial pre-processing of ' + chrom_id)
+
+
+    for outf in [w_c2t, c_c2t, w_g2a, c_g2a]:
+        outf.close()
+
+
+    logm("# Total %d chromosome(s) : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)" %(total_chromosomes,
+                                                                                       all_L,
+                                                                                       all_unmappable_length,
+                                                                                       all_mappable_length,
+                                                                                       float(all_mappable_length)/all_L) )
+    logm("# eligible fragments : %d" % no_mappable_region )
+
+    serialize(refd, ref_p("refname"))
+
+    mappable_regions_output_file.close()
+    elapsed('Storing mappable regions and genome')
+
+    #---------------- bowtie-build -------------------------------------------
+
+    # append ref_path to all elements of to_bowtie
+    to_bowtie = map(lambda f: os.path.join(ref_path, f), ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A'])
+
+    run_in_parallel([(build_command % { 'fname' : fname }, fname + '.log') for fname in to_bowtie])
+
+    elapsed('Index building')
+    # delete all fasta files
+    delete_files( f +'.fa' for f in to_bowtie)
+
+    elapsed('END')
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_index/wg_build.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,55 @@
+from bs_utils.utils import *
+
+
+def wg_build(fasta_file, build_command, ref_path, aligner):
+
+    # ref_path is a string that contains the directory where the reference genomes are stored with
+    # the input fasta filename appended
+    ref_path = os.path.join(ref_path,
+                            os.path.split(fasta_file)[1] + '_'+aligner)
+
+    clear_dir(ref_path)
+    #---------------------------------------------------------------
+    # 1. First get the complementary genome (also do the reverse)
+    # 2. Then do CT and GA conversions
+    #---------------------------------------------------------------
+
+    open_log(os.path.join(ref_path, 'log'))
+    refd = {}
+    w_c2t = open(os.path.join(ref_path, 'W_C2T.fa'),'w')
+    c_c2t = open(os.path.join(ref_path, 'C_C2T.fa'),'w')
+
+    w_g2a = open(os.path.join(ref_path, 'W_G2A.fa'),'w')
+    c_g2a = open(os.path.join(ref_path, 'C_G2A.fa'),'w')
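+    # four converted references are written and indexed: Watson C->T,
+    # Watson G->A, and the same two conversions of the reverse-complemented
+    # (Crick) strand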
+    for chrom_id, chrom_seq in read_fasta(fasta_file):
+        serialize(chrom_seq, os.path.join(ref_path, chrom_id))
+        refd[chrom_id] = len(chrom_seq)
+
+        w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
+        w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
+
+        chrom_seq = reverse_compl_seq(chrom_seq)
+
+        c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
+        c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
+
+        elapsed('Preprocessing '+chrom_id)
+
+    for outf in [w_c2t, c_c2t, w_g2a, c_g2a]:
+        outf.close()
+
+    serialize(refd, os.path.join(ref_path,"refname"))
+    elapsed('Genome preprocessing')
+    # append ref_path to all elements of to_bowtie
+    to_bowtie = map(lambda f: os.path.join(ref_path, f), ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A'])
+
+    # start bowtie-build for all converted genomes and wait for the processes to finish
+
+    run_in_parallel([(build_command % { 'fname' : fname }, fname+'.log') for fname in to_bowtie])
+
+    # delete fasta files of converted genomes
+    if aligner != "rmap" :
+        delete_files(f+'.fa' for f in to_bowtie)
+
+    elapsed('Done')
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_seeker2-align.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,349 @@
+#!/usr/bin/python
+
+from optparse import OptionParser, OptionGroup
+import re
+import tempfile
+from bs_align import output
+from bs_align.bs_pair_end import *
+from bs_align.bs_single_end import *
+from bs_align.bs_rrbs import *
+from bs_utils.utils import *
+
+
+if __name__ == '__main__':
+
+    parser = OptionParser()
+    # option group 1
+    opt_group = OptionGroup(parser, "For single end reads")
+    opt_group.add_option("-i", "--input", type="string", dest="infilename",help="Input your read file name (FORMAT: sequences, fastq, qseq,fasta)", metavar="INFILE")
+    parser.add_option_group(opt_group)
+
+    # option group 2
+    opt_group = OptionGroup(parser, "For pair end reads")
+    opt_group.add_option("-1", "--input_1", type="string", dest="infilename_1",help="Input your read file end 1 (FORMAT: sequences, qseq, fasta, fastq)", metavar="FILE")
+    opt_group.add_option("-2", "--input_2", type="string", dest="infilename_2",help="Input your read file end 2 (FORMAT: sequences, qseq, fasta, fastq)", metavar="FILE")
+    opt_group.add_option("--minins",type = "int",dest = "min_insert_size", help="The minimum insert size for valid paired-end alignments [Default: %default]", default = -1)
+    opt_group.add_option("--maxins",type = "int",dest = "max_insert_size", help="The maximum insert size for valid paired-end alignments [Default: %default]", default = 400)
+    parser.add_option_group(opt_group)
+
+    # option group 3
+    opt_group = OptionGroup(parser, "Reduced Representation Bisulfite Sequencing Options")
+    opt_group.add_option("-r", "--rrbs", action="store_true", dest="rrbs", default = False, help = 'Process reads from Reduced Representation Bisulfite Sequencing experiments')
+    opt_group.add_option("-c", "--cut-site", type="string",dest="cut_format", help="Cutting sites of restriction enzyme. Ex: MspI(C-CGG), Mael:(C-TAG), double-enzyme MspI&Mael:(C-CGG,C-TAG). [Default: %default]", metavar="pattern", default = "C-CGG")
+    opt_group.add_option("-L", "--low", type = "int", dest="rrbs_low_bound", help="lower bound of fragment length (excluding C-CGG ends) [Default: %default]", default = 40)
+    opt_group.add_option("-U", "--up", type = "int", dest="rrbs_up_bound", help="upper bound of fragment length (excluding C-CGG ends) [Default: %default]", default = 500)
+    parser.add_option_group(opt_group)
+
+    # option group 4
+    opt_group = OptionGroup(parser, "General options")
+    opt_group.add_option("-t", "--tag", type="string", dest="taginfo",help="[Y]es for undirectional lib, [N]o for directional [Default: %default]", metavar="TAG", default = 'N')
+    opt_group.add_option("-s","--start_base",type = "int",dest = "cutnumber1", help="The first base of your read to be mapped [Default: %default]", default = 1)
+    opt_group.add_option("-e","--end_base",type = "int",dest = "cutnumber2", help="The last cycle number of your read to be mapped [Default: %default]", default = 200)
+    opt_group.add_option("-a", "--adapter", type="string", dest="adapter_file",help="Input text file of your adaptor sequences (to be trimed from the 3'end of the reads). Input 1 seq for dir. lib., 2 seqs for undir. lib. One line per sequence", metavar="FILE", default = '')
+    opt_group.add_option("--am",type = "int",dest = "adapter_mismatch", help="Number of mismatches allowed in adaptor [Default: %default]", default = 0)
+    opt_group.add_option("-g", "--genome", type="string", dest="genome",help="Name of the reference genome (the same as the reference genome file in the preprocessing step) [ex. chr21_hg18.fa]")
+    opt_group.add_option("-m", "--mismatches",type = "int", dest="int_no_mismatches",help="Number of mismatches in one read [Default: %default]", default = 4)
+    opt_group.add_option("--aligner", dest="aligner",help="Aligner program to perform the analisys: " + ', '.join(supported_aligners) + " [Default: %default]", metavar="ALIGNER", default = BOWTIE2)
+    opt_group.add_option("-p", "--path", dest="aligner_path", help="Path to the aligner program. Defaults: " +' '*70+ '\t'.join(('%s: %s '+' '*70) % (al, aligner_path[al]) for al in sorted(supported_aligners)),
+        metavar="PATH"
+    )
+    opt_group.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path)
+    opt_group.add_option("-l", "--split_line",type = "int", dest="no_split",help="Number of lines per split (the read file will be split into small files for mapping. The result will be merged. [Default: %default]", default = 4000000)
+    opt_group.add_option("-o", "--output", type="string", dest="outfilename",help="The name of output file [INFILE.bs(se|pe|rrbs)]", metavar="OUTFILE")
+    opt_group.add_option("-f", "--output-format", type="string", dest="output_format",help="Output format: "+', '.join(output.formats)+" [Default: %default]", metavar="FORMAT", default = output.BAM)
+    opt_group.add_option("--no-header", action="store_true", dest="no_SAM_header",help="Suppress SAM header lines [Default: %default]", default = False)
+    opt_group.add_option("--temp_dir", type="string", dest="temp_dir",help="The path to your temporary directory [Default: %default]", metavar="PATH", default = tempfile.gettempdir())
+    opt_group.add_option("--XS",type = "string", dest="XS_filter",help="Filter definition for tag XS, format X,Y. X=0.8 and y=5 indicate that for one read, if #(mCH sites)/#(all CH sites)>0.8 and #(mCH sites)>5, then tag XS=1; or else tag XS=0. [Default: %default]", default = "0.5,5") # added by weilong
+    opt_group.add_option("--multiple-hit", action="store_true", dest="Output_multiple_hit", default = False, help = 'Output reads with multiple hits to file\"Multiple_hit.fa\"')
+
+    opt_group.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False)
+
+    parser.add_option_group(opt_group)
+
+    # option group 5
+    opt_group = OptionGroup(parser, "Aligner Options",
+        "You may specify any additional options for the aligner. You just have to prefix them with " +
+        ', '.join('%s for %s' % (aligner_options_prefixes[aligner], aligner) for aligner in supported_aligners)+
+        ', and BS Seeker will pass them on. For example: --bt-p 4 will increase the number of threads for bowtie to 4, '
+        '--bt--tryhard will instruct bowtie to try as hard as possible to find valid alignments when they exist, and so on. '
+        'Be sure that you know what you are doing when using these options! Also, we don\'t do any validation on the values.')
+    parser.add_option_group(opt_group)
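+    # example invocation (file names are illustrative):
+    #   python bs_seeker2-align.py -i reads.fastq -g genome.fa \
+    #       --aligner bowtie2 -d /path/to/index -o reads.bam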
+
+
+    #----------------------------------------------------------------
+    # separate aligner options from BS Seeker options
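+    # arguments carrying an aligner-specific prefix (e.g. --bt-p for bowtie)
+    # are collected with the prefix rewritten to a single '-'; everything
+    # else is handed to the BS-Seeker option parser below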
+    aligner_options = {}
+    bs_seeker_options = []
+    i = 1
+    while i < len(sys.argv):
+        arg = sys.argv[i]
+        m = re.match(r'^%s' % '|'.join('(%s)'% aligner_options_prefixes[al] for al in supported_aligners), arg)
+        if m:
+            a_opt = arg.replace(m.group(0),'-',1)
+            aligner_options[a_opt] = []
+            while i + 1 < len(sys.argv) and sys.argv[i+1][0] != '-':
+                aligner_options[a_opt].append(sys.argv[i+1])
+                i += 1
+            if len(aligner_options[a_opt]) == 0: # if it is a key-only option
+                aligner_options[a_opt] = True
+        else:
+            bs_seeker_options.append(arg)
+        i += 1
+
+
+    (options, args) = parser.parse_args(args = bs_seeker_options)
+
+
+    # if no options were given by the user, print help and exit
+    if len(sys.argv) == 1:
+        parser.print_help()
+        exit(0)
+
+    if options.version :
+        show_version()
+        exit (-1)
+    else :
+        show_version()
+
+    # check parameters
+    # input read files
+    if options.infilename and (options.infilename_1 or options.infilename_2):
+        error('-i and [-1|-2] options are mutually exclusive. You should use only one of them.')
+
+    if not (options.infilename or (options.infilename_1 and options.infilename_2)):
+        error('You should set either -i or -1 and -2 options.')
+    # -t, directional / un-directional library
+    asktag=str(options.taginfo).upper()
+    if asktag not in 'YN':
+        error('-t option should be either Y or N, not %s' % asktag)
+    # --aligner
+    if options.aligner not in supported_aligners:
+        error('--aligner option should be: %s' % ', '.join(supported_aligners)+'.')
+    # path for aligner
+    aligner_exec = os.path.expanduser( os.path.join(options.aligner_path or aligner_path[options.aligner], options.aligner) )
+    # mismatches allowed: bowtie 1, built-in parameter '-m'; bowtie 2, post-filter parameter
+    # the number of mismatches should be no greater than the read length
+    int_no_mismatches=min(options.int_no_mismatches, options.cutnumber2-options.cutnumber1)
+    str_no_mismatches=str(int_no_mismatches)
+    # -g
+    if options.genome is None:
+        error('-g is a required option')
+    genome = os.path.split(options.genome)[1]
+    genome_subdir = genome
+
+    # try to guess the location of the reference genome for RRBS
+    if options.rrbs:
+        if options.rrbs_low_bound and options.rrbs_up_bound:
+            if options.cut_format == "C-CGG" :
+                genome_subdir += '_rrbs_%d_%d'  % (options.rrbs_low_bound, options.rrbs_up_bound)
+            else :
+                genome_subdir += '_rrbs_%s_%d_%d'  % ( re.sub(",","-",re.sub("-", "", options.cut_format)), options.rrbs_low_bound, options.rrbs_up_bound)
+        else:
+            possible_refs = filter(lambda dir: dir.startswith(genome+'_rrbs_'), os.listdir(options.dbpath))
+            if len(possible_refs) == 1:
+                genome_subdir = possible_refs[0]
+            else:
+                error('Cannot unambiguously locate the reference genome for RRBS. '
+                      'Please specify the options \"--low\" and \"--up\" that you used at the index-building step.\n'
+                      'Possible choices are:\n' + '\n'.join([pr.split('_rrbs_')[-1].replace('_',', ') for pr in possible_refs]))
+
+    db_path = os.path.expanduser(os.path.join(options.dbpath, genome_subdir + '_' + options.aligner))
+
+    if not os.path.isdir(db_path):
+        error('Index DIR \"' + genome_subdir + '..\" cannot be found in ' + options.dbpath +'.\n\tPlease run bs_seeker2-build.py '
+                            'to create it with the correct parameters for -g, -r, --low, --up and --aligner.')
+
+    # handle aligner options
+    #
+
+    # default aligner options
+    aligner_options_defaults = {
+                                BOWTIE  : { '-e'              : 40*int_no_mismatches,
+                                            '--nomaqround'    : True,
+                                            '--norc'          : True,
+                                            '-k'              : 2,
+                                            # -k=2; report two best hits, and filter by error rates
+                                            '--quiet'         : True,
+                                            '--best'          : True,
+#                                            '--suppress'      : '2,5,6',
+                                            '--sam'           : True,
+                                            '--sam-nohead'    : True,
+                                            '-p'              : 2
+                                },
+                                BOWTIE2 : {
+                                            #'-M'              : 5,
+                                            '--norc'          : True,
+                                            '--quiet'         : True,
+                                            '-p'              : 2,
+                                            '--sam-nohead'    : True,
+                                            # run bowtie2 in local mode by default
+                                            '--local' : '--end-to-end' not in aligner_options,
+                                            #'--mm'            : True,
+                                            '-k'              : 2
+                                },
+                                SOAP    : { '-v' : int_no_mismatches,
+                                            '-p' : 2,
+                                            '-r' : 2,
+                                            '-M' : 4
+                                          },
+                                RMAP    : { '-M' : 2
+                                            # to do # control for only mapping on + strand
+                                          }
+
+                                }
+
+    if '--end-to-end' not in aligner_options:
+        aligner_options_defaults[BOWTIE2].update({'-D' : 50})
+        #aligner_options_defaults[BOWTIE2].update({'-D' : 50, '-R': 3, '-N': 0, '-L': 15, '-i' : 'S,1,0.50'})
+    else:
+        aligner_options_defaults[BOWTIE2].update({'-D' : 50, '-L': 15, '--score-min': 'L,-0.6,-0.6' })
+
+    aligner_options = dict(aligner_options_defaults[options.aligner], **aligner_options)
+
+    aligner_options_string = lambda : ' %s ' % (' '.join(opt_key +
+                                                         (' ' + ' '.join(map(str,opt_val)) # join all values if the value is an array
+                                                          if type(opt_val) is list else
+                                                                ('' if type(opt_val) is bool and opt_val # output an empty string if it is a key-only option
+                                                                 else ' ' +str(opt_val)) # output the value if it is a single value
+                                                         )
+                                                        for opt_key, opt_val in aligner_options.iteritems() if opt_val not in [None, False]))
+
+
+#    tmp_path = (options.outfilename or options.infilename or options.infilename_1) +'-'+ options.aligner+ '-TMP'
+#    clear_dir(tmp_path)
+
+    if options.output_format not in output.formats:
+        error('Output format should be one of: ' + ', '.join(output.formats))
+
+    if options.outfilename:
+        outfilename = options.outfilename
+        logfilename = outfilename
+    elif options.infilename is not None:
+        logfilename = options.infilename+'_'+ ('rr' if options.rrbs else '') + 'bsse'
+        outfilename = logfilename + '.' + options.output_format
+    else:
+        logfilename = options.infilename_1+'_'+ ('rr' if options.rrbs else '') + 'bspe'
+        outfilename = logfilename + '.' + options.output_format
+
+    outfilename = os.path.expanduser(outfilename)
+    logfilename = os.path.expanduser(logfilename)
+    outfile = output.outfile(outfilename, options.output_format, deserialize(os.path.join(db_path, 'refname')), ' '.join(sys.argv), options.no_SAM_header)
+
+    open_log(logfilename+'.bs_seeker2_log')
+
+    aligner_title = options.aligner
+    if options.aligner == BOWTIE2 :
+        if '--end-to-end' in aligner_options :
+            aligner_title = aligner_title + "-e2e"
+        else:
+            aligner_title = aligner_title + "-local"
+
+    tmp_path = tempfile.mkdtemp(prefix='bs_seeker2_%s_-%s-TMP-' % (os.path.split(outfilename)[1], aligner_title ), dir = options.temp_dir)
+
+
+    (XS_x, XS_y) = options.XS_filter.split(",")
+    XS_pct = float(XS_x)
+    XS_count = int(XS_y)
+    logm('Filter for tag XS: #(mCH)/#(all CH)>%f and #(mCH)>%d' % (XS_pct, XS_count))
+
+
+    logm('Temporary directory: %s' % tmp_path)
+    logm('Reduced Representation Bisulfite Sequencing: %s' % str(options.rrbs))
+    if options.infilename is not None:
+        logm('Single end')
+
+        aligner_command = aligner_exec  + aligner_options_string() + \
+                              { BOWTIE   : ' %(reference_genome)s  -f %(input_file)s %(output_file)s',
+                                BOWTIE2  : ' -x %(reference_genome)s -f -U %(input_file)s -S %(output_file)s',
+                                SOAP     : ' -D %(reference_genome)s.fa.index -o %(output_file)s -a %(input_file)s',
+                                RMAP     : ' -c %(reference_genome)s.fa -o %(output_file)s %(input_file)s'
+                              }[options.aligner]
+        logm ('Aligner command: %s' % aligner_command)
+        # single end reads
+        if options.rrbs: # RRBS scan
+            bs_rrbs(options.infilename,
+                    asktag,
+            #        options.rrbs_taginfo,
+                    options.adapter_file,
+                    options.cutnumber1,
+                    options.cutnumber2,
+                    options.no_split,
+                    str_no_mismatches,
+                    aligner_command,
+                    db_path,
+                    tmp_path,
+                    outfile,
+                    XS_pct,
+                    XS_count,
+                    options.adapter_mismatch,
+                    options.cut_format,
+                    options.Output_multiple_hit
+                    )
+        else: # Normal single end scan
+            bs_single_end(  options.infilename,
+                            asktag,
+                            options.adapter_file,
+                            options.cutnumber1,
+                            options.cutnumber2,
+                            options.no_split,
+                            str_no_mismatches,
+                            aligner_command,
+                            db_path,
+                            tmp_path,
+                            outfile,
+                            XS_pct,
+                            XS_count,
+                            options.adapter_mismatch,
+                            options.Output_multiple_hit
+                            )
+    else:
+        logm('Pair end')
+        # pair end specific default options
+        aligner_options = dict({BOWTIE: {'--ff'  : asktag == 'N',
+                                         '--fr'  : asktag == 'Y',
+                                         '-X'    : options.max_insert_size,
+                                         '-I'    : options.min_insert_size if options.min_insert_size > 0 else None
+                                },
+                                BOWTIE2 : {
+                                         '--ff'  : asktag == 'N',
+                                         '--fr'  : asktag == 'Y',
+                                         '-X'    : options.max_insert_size,
+                                         '-I'    : options.min_insert_size if options.min_insert_size > 0 else None,
+                                         '--no-discordant'  : True,
+                                         '--no-mixed'       : True
+                                },
+                                SOAP: {
+                                        '-x' : options.max_insert_size,
+                                        '-m' : options.min_insert_size if options.min_insert_size > 0 else 100
+                                }}[options.aligner],
+                               # integrating 'rmappe' differs from the other aligners
+                                **aligner_options)
+
+        aligner_command = aligner_exec + aligner_options_string() + \
+                              { BOWTIE   : ' %(reference_genome)s  -f -1 %(input_file_1)s -2 %(input_file_2)s %(output_file)s',
+                                BOWTIE2  : ' -x %(reference_genome)s  -f -1 %(input_file_1)s -2 %(input_file_2)s -S %(output_file)s',
+                                SOAP     : ' -D %(reference_genome)s.fa.index -o %(output_file)s -a %(input_file_1)s -b %(input_file_2)s -2 %(output_file)s.unpaired' #,
+                              #  RMAP     : #  rmappe, also paste two inputs into one file.
+                             }[options.aligner]
+
+        logm('Aligner command: %s' % aligner_command)
+
+        bs_pair_end(options.infilename_1,
+                    options.infilename_2,
+                    asktag,
+                    options.adapter_file,
+                    options.cutnumber1,
+                    options.cutnumber2,
+                    options.no_split,
+                    str_no_mismatches,
+                    aligner_command,
+                    db_path,
+                    tmp_path,
+                    outfile,
+                    XS_pct,
+                    XS_count,
+                    options.Output_multiple_hit
+             )
+
+    outfile.close()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_seeker2-build.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,87 @@
+#!/usr/bin/python
+
+import os
+from optparse import OptionParser, OptionGroup
+from bs_index.wg_build import *
+from bs_index.rrbs_build import *
+from bs_utils.utils import *
+
+
+if __name__ == '__main__':
+
+    parser = OptionParser()
+
+    parser.add_option("-f", "--file", dest="filename", help="Input your reference genome file (fasta)", metavar="FILE")
+    parser.add_option("--aligner", dest="aligner", help="Aligner program to perform the analysis: " + ', '.join(supported_aligners) + " [Default: %default]", metavar="ALIGNER", default = BOWTIE2)
+    parser.add_option("-p", "--path", dest="aligner_path", help="Path to the aligner program. Defaults: " +' '*70+ '\t'.join(('%s: %s '+' '*70) % (al, aligner_path[al]) for al in sorted(supported_aligners)),
+                  metavar="PATH")
+    parser.add_option("-d", "--db", type="string", dest="dbpath", help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]", metavar="DBPATH", default = reference_genome_path)
+
+    parser.add_option("-v", "--version", action="store_true", dest="version", help="show version of BS-Seeker2", default=False)
+
+    # RRBS options
+    rrbs_opts = OptionGroup(parser, "Reduced Representation Bisulfite Sequencing Options",
+                                "Use these options in conjunction with -r [--rrbs]")
+    rrbs_opts.add_option("-r", "--rrbs", action="store_true", dest="rrbs", help = 'Build index specially for Reduced Representation Bisulfite Sequencing experiments. Genome other than certain fragments will be masked. [Default: %default]', default = False)
+    rrbs_opts.add_option("-l", "--low",type= "int", dest="low_bound", help="lower bound of fragment length (excluding recognition sequence such as C-CGG) [Default: %default]", default = 40)
+    rrbs_opts.add_option("-u", "--up", type= "int", dest="up_bound", help="upper bound of fragment length (excluding recognition sequence such as C-CGG ends) [Default: %default]", default = 500)
+    rrbs_opts.add_option("-c", "--cut-site", type= "string", dest="cut_format", help="Cut sites of restriction enzyme. Ex: MspI(C-CGG), Mael:(C-TAG), double-enzyme MspI&Mael:(C-CGG,C-TAG). [Default: %default]", default = "C-CGG")
+    parser.add_option_group(rrbs_opts)
+
+
+    (options, args) = parser.parse_args()
+
+    # if no options were given by the user, print help and exit
+    if len(sys.argv) == 1:
+        parser.print_help()
+        exit(0)
+
+    if options.version :
+        show_version()
+        exit (-1)
+    else :
+        show_version()
+
+    rrbs = options.rrbs
+
+    if options.filename is None:
+        error('Fasta file for the reference genome must be supplied')
+
+    fasta_file=os.path.expanduser(options.filename)
+
+    if not os.path.isfile(fasta_file):
+        error('%s cannot be found' % fasta_file)
+
+    if options.aligner not in supported_aligners:
+        error('--aligner option should be: %s' % ', '.join(supported_aligners)+'.')
+
+    builder_exec = os.path.join(options.aligner_path or aligner_path[options.aligner],
+                                {BOWTIE   : 'bowtie-build',
+                                 BOWTIE2  : 'bowtie2-build',
+                                 SOAP     : '2bwt-builder',
+                                 RMAP     : '' # do nothing
+                                }[options.aligner])
+
+    build_command = builder_exec + { BOWTIE   : ' -f %(fname)s.fa %(fname)s',
+                                     BOWTIE2  : ' -f %(fname)s.fa %(fname)s',
+                                     SOAP     : ' %(fname)s.fa'
+                                   }[options.aligner]
+
+
+    print "Reference genome file: %s" % fasta_file
+    print "Reduced Representation Bisulfite Sequencing: %s" % rrbs
+    print "Builder path: %s" % builder_exec
+    #---------------------------------------------------------------
+
+    ref_path = options.dbpath
+
+    if os.path.exists(ref_path):
+        if not os.path.isdir(ref_path):
+            error("%s must be a directory. Please, delete it or change the -d option." % ref_path)
+    else:
+        os.mkdir(ref_path)
+
+    if rrbs: # RRBS preprocessing
+        rrbs_build(fasta_file, build_command, ref_path, options.low_bound, options.up_bound, options.aligner, options.cut_format)
+    else: # Whole genome preprocessing
+        wg_build(fasta_file, build_command, ref_path, options.aligner)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_seeker2-call_methylation.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,205 @@
+#!/usr/bin/python
+
+from optparse import OptionParser, OptionGroup
+from bs_utils.utils import *
+
+try :
+    import pysam
+except ImportError :
+    print "[Error] Cannot import \"pysam\" package. Have you installed it?"
+    exit(-1)
+
+import gzip
+
+def context_calling(seq, position):
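+    # classify the cytosine at 'position' as CG / CHG / CHH (H = A, C or T);
+    # a 'G' on the Watson strand is a cytosine on Crick, so its context is
+    # read in the opposite direction and complemented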
+
+    word=seq[position]
+    word=word.upper()
+
+    context="--"
+    context_CH="--"
+    if position + 2 < len(seq) and position - 2 >= 0:
+
+        if word == "C":
+            word2 = seq[position+1]
+            context_CH = word + word2
+            if word2 == "G":
+                context = "CG"
+            elif word2 in ['A','C','T']:
+                word3 = seq[position+2]
+                if word3 == "G":
+                    context = "CHG"
+                elif word3 in ['A','C','T']:
+                    context="CHH"
+
+        elif word == "G":
+            word2 = seq[position-1]
+            context_CH = word + word2
+            context_CH = context_CH.translate(string.maketrans("ATCG", "TAGC"))
+            if word2 == "C":
+                context = "CG"
+            elif word2 in ['A','G','T']:
+                word3 = seq[position-2]
+                if word3 == "C":
+                    context = "CHG"
+                elif word3 in ['A','G','T']:
+                    context = "CHH"
+
+    return word, context, context_CH
+
+
+
+if __name__ == '__main__':
+
+    parser = OptionParser()
+    parser.add_option("-i", "--input", type="string", dest="infilename",help="BAM output from bs_seeker2-align.py", metavar="INFILE")
+    parser.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path)
+    parser.add_option("-o", "--output-prefix", type="string", dest="output_prefix",help="The output prefix to create ATCGmap and wiggle files [INFILE]", metavar="OUTFILE")
+
+    parser.add_option("--wig", type="string", dest="wig_file",help="The output .wig file [INFILE.wig]", metavar="OUTFILE")
+    parser.add_option("--CGmap", type="string", dest="CGmap_file",help="The output .CGmap file [INFILE.CGmap]", metavar="OUTFILE")
+    parser.add_option("--ATCGmap", type="string", dest="ATCGmap_file",help="The output .ATCGmap file [INFILE.ATCGmap]", metavar="OUTFILE")
+
+    parser.add_option("-x", "--rm-SX", action="store_true", dest="RM_SX",help="Removed reads with tag \'XS:i:1\', which would be considered as not fully converted by bisulfite treatment [Default: %default]", default = False)
+    parser.add_option("--txt", action="store_true", dest="text",help="Show CGmap and ATCGmap in .gz [Default: %default]", default = False)
+
+    parser.add_option("-r", "--read-no",type = "int", dest="read_no",help="The least number of reads covering one site to be shown in wig file [Default: %default]", default = 1)
+    parser.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False)
+
+    (options, args) = parser.parse_args()
+
+
+    # if no options were given by the user, print help and exit
+    if len(sys.argv) == 1:
+        parser.print_help()
+        exit(0)
+
+    if options.version :
+        show_version()
+        exit (-1)
+    else :
+        show_version()
+
+
+    if options.infilename is None:
+        error('-i option is required')
+    if not os.path.isfile(options.infilename):
+        error('Cannot find input file: %s' % options.infilename)
+
+    open_log(options.infilename+'.call_methylation_log')
+    db_d = lambda fname:  os.path.join( os.path.expanduser(options.dbpath), fname) # bug fixed, weilong
+
+    logm('sorting BS-Seeker alignments')
+    sorted_input_filename = options.infilename+'_sorted'
+    pysam.sort(options.infilename, sorted_input_filename)
+    sorted_input_filename += '.bam'
+    logm('indexing sorted alignments')
+    pysam.index(sorted_input_filename)
+
+    logm('calculating methylation levels')
+    if options.text :
+        ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap')
+        ATCGmap = open(ATCGmap_fname, 'w')
+
+        CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap')
+        CGmap = open(CGmap_fname, 'w')
+    else :
+        ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap.gz')
+        ATCGmap = gzip.open(ATCGmap_fname, 'wb')
+
+        CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap.gz')
+        CGmap = gzip.open(CGmap_fname, 'wb')
+
+    wiggle_fname = options.wig_file or ((options.output_prefix or options.infilename) + '.wig')
+    wiggle = open(wiggle_fname, 'w')
+
+    sorted_input = pysam.Samfile(sorted_input_filename, 'rb')
+    
+    chrom = None
+    nucs = ['A', 'T', 'C', 'G', 'N']
+    ATCG_fwd = dict((n, 0) for n in nucs)
+    ATCG_rev = dict((n, 0) for n in nucs)
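+    # walk every covered reference position; per column, tally read bases by
+    # strand, then derive the methylation level from C/T counts on forward-
+    # strand reads (Watson cytosines) or G/A counts on reverse-strand reads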
+    for col in sorted_input.pileup():
+        col_chrom = sorted_input.getrname(col.tid)
+        if chrom != col_chrom:
+            chrom = col_chrom
+            chrom_seq = deserialize(db_d(chrom))
+            wiggle.write('variableStep chrom=%s\n' % chrom)
+
+        for n in nucs:
+            ATCG_fwd[n] = 0
+            ATCG_rev[n] = 0
+
+        nuc, context, subcontext = context_calling(chrom_seq, col.pos)
+        total_reads = 0
+
+
+
+        for pr in col.pileups:
+        #     print pr
+             if (not pr.indel) : # skip indels
+                #if ( (options.RM_SX) and (pr.alignment.tags[1][1] == 1) ):
+                ##=== Fixed error reported by Roberto
+                #print options.RM_SX,  dict(pr.alignment.tags)["XS"]
+                #if ( (options.RM_SX) and (dict(pr.alignment.tags)["XS"] == 1) ):
+
+                if ( (options.RM_SX) and (dict(pr.alignment.tags).get("XS",0) == 1) ):
+                    # print "Debug: ", options.RM_SX, pr.alignment.tags[1]
+                    # when need to filter and read with tag (XS==1), then remove the reads
+                    continue
+
+                if pr.qpos >= len(pr.alignment.seq):
+                    print 'WARNING: read %s has an invalid alignment. Discarding...' % pr.alignment.qname
+                    continue
+                read_nuc = pr.alignment.seq[pr.qpos]
+         #       print "read_nuc=", read_nuc
+                if pr.alignment.is_reverse:
+                    ATCG_rev[read_nuc] += 1
+                else:
+                    ATCG_fwd[read_nuc] += 1
+
+                if read_nuc != 'N':
+                    total_reads += 1
+
+        cnts = lambda d: '\t'.join(str(d[n]) for n in nucs)
+        fwd_counts = cnts(ATCG_fwd)
+        rev_counts = cnts(ATCG_rev)
+
+        meth_level = None
+        meth_cytosines = 0
+        unmeth_cytosines = 0
+
+        if nuc == 'C':
+            # plus strand: take the ratio of C's to T's from reads that come from the forward strand
+            meth_cytosines = ATCG_fwd['C']
+            unmeth_cytosines = ATCG_fwd['T']
+
+        elif nuc == 'G':
+            # minus strand: take the ratio of G's to A's from reads that come from the reverse strand
+            meth_cytosines = ATCG_rev['G']
+            unmeth_cytosines = ATCG_rev['A']
+
+        if meth_cytosines + unmeth_cytosines > 0:
+            meth_level = float(meth_cytosines)/(meth_cytosines + unmeth_cytosines)
+
+        pos = col.pos + 1
+
+        meth_level_string = str(meth_level) if meth_level is not None else 'na'
+        ATCGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(fwd_counts)s\t%(rev_counts)s\t%(meth_level_string)s\n' % locals())
+#
+        all_cytosines = meth_cytosines + unmeth_cytosines 
+        if (meth_level is not None) and (all_cytosines >= options.read_no):
+        #    print all_cytosines
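+            # wiggle convention: positive levels for Watson-strand cytosines,
+            # negative for Crick-strand cytosines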
+            if nuc == 'C':
+                wiggle.write('%d\t%f\n' % (pos, meth_level))
+            else :
+                wiggle.write('%d\t-%f\n' % (pos, meth_level))
+            CGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(meth_level_string)s\t%(meth_cytosines)s\t%(all_cytosines)s\n' % locals())
+    ATCGmap.close()
+    CGmap.close()
+    wiggle.close()
+
+    logm('Wiggle: %s'% wiggle_fname)
+    logm('ATCGMap: %s' % ATCGmap_fname)
+    logm('CGmap: %s' % CGmap_fname)
+
Binary file BSseeker2/bs_utils/.___init__.py has changed
Binary file BSseeker2/bs_utils/._utils.py has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_utils/__init__.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,1 @@
+__author__ = 'pf'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_utils/utils.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,332 @@
+import gzip
+import os
+
+import datetime
+import re
+import shlex
+import shutil
+import string
+import subprocess
+import types
+#from itertools import izip
+
+import marshal
+import sys
+
+
+
+_rc_trans = string.maketrans('ACGT', 'TGCA')
+def reverse_compl_seq(strseq):
+    return strseq.translate(_rc_trans)[::-1]
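+# Example: reverse_compl_seq('ATCG') == 'CGAT'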
+
+
+
+def show_version() :
+    print ""
+    print "     BS-Seeker2 v2.0.3 - May 19, 2013     "
+    print ""
+
+
+
+"""
+IUPAC nucleotide code Base
+    A	Adenine
+    C	Cytosine
+    G	Guanine
+    T (or U)	Thymine (or Uracil)
+    R	A or G
+    Y	C or T
+    S	G or C
+    W	A or T
+    K	G or T
+    M	A or C
+    B	C or G or T
+    D	A or G or T
+    H	A or C or T
+    V	A or C or G
+    N	any base
+    . or -	gap
+"""
+
+
+def IUPAC ( nuc ) :
+    if nuc == 'R' :
+        return ('A','G')
+    elif nuc == 'Y' :
+        return ('C', 'T')
+    elif nuc == 'S' :
+        return ('G', 'C')
+    elif nuc == 'W' :
+        return ('A', 'T')
+    elif nuc == 'K' :
+        return ('G','T')
+    elif nuc == 'M' :
+        return ('A','C')
+    elif nuc == 'B' :
+        return ('C', 'G', 'T')
+    elif nuc == 'D' :
+        return ('A', 'G', 'T')
+    elif nuc == 'H' :
+        return ('A', 'C', 'T')
+    elif nuc == 'V' :
+        return ('A', 'C', 'G')
+    elif nuc == 'N' :
+        return ('A', 'C', 'G', 'T')
+    else :
+        return (nuc,)
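+# Example: IUPAC('R') == ('A', 'G'); a plain base is returned as a
+# one-element tuple, e.g. IUPAC('A') == ('A',).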
+
+
+def uniq(inlist):
+    # order preserving
+    uniques = []
+    for item in inlist:
+        if item not in uniques:
+            uniques.append(item)
+    return uniques
+
+from itertools import product
+
+def EnumerateIUPAC ( context_lst ) :
+    tag_list = []
+    for one_context in context_lst :
+        for m in product(*[ IUPAC(i) for i in one_context ]) :
+            tag_list.append(''.join(m))
+    return uniq(tag_list)
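+# Example: EnumerateIUPAC(['CRG']) == ['CAG', 'CGG']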
+
+
+# Example: for cut3_context_lst = ["CGG"]
+# returns the list ["CGG", "TGG"]:
+# every C is wild-carded to both C and T.
+def Enumerate_C_to_CT ( cut3_context_lst ) :
+    tag_list = []
+    for context in cut3_context_lst :
+        for m in product(*[ i if i != 'C' else ('C', 'T') for i in context ]) :
+            tag_list.append(''.join(m))
+    return uniq(tag_list)
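+# A context with several C's expands combinatorially, e.g.
+# Enumerate_C_to_CT(['CCGG']) == ['CCGG', 'CTGG', 'TCGG', 'TTGG']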
+
+#-------------------------------------------------------------------------------------
+
+# Set reasonable defaults for locating the aligners.
+def find_location(program):
+    def is_exe(fpath):
+        return os.path.exists(fpath) and os.access(fpath, os.X_OK)
+    for path in os.environ["PATH"].split(os.pathsep):
+        if is_exe(os.path.join(path, program)):
+            return path
+
+    return None
+
+BOWTIE = 'bowtie'
+BOWTIE2 = 'bowtie2'
+SOAP = 'soap'
+RMAP = 'rmap'
+
+supported_aligners = [
+                      BOWTIE,
+                      BOWTIE2,
+                      SOAP,
+                      RMAP
+                    ]
+
+aligner_options_prefixes = { BOWTIE  : '--bt-',
+                             BOWTIE2 : '--bt2-',
+                             SOAP    : '--soap-',
+                             RMAP    : '--rmap-' }
+
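+# Locate each supported aligner on $PATH; if it is not found there, fall back
+# to a conventional install directory under the user's home.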
+aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or default_path))
+                    for aligner, default_path in
+                           [(BOWTIE,'~/bowtie-0.12.7/'),
+                            (BOWTIE2, '~/bowtie-0.12.7/'),
+                            (SOAP, '~/soap2.21release/'),
+                            (RMAP, '~/rmap_v2.05/bin')
+                            ])
+
+
+reference_genome_path = os.path.join(os.path.split(__file__)[0], 'reference_genomes')
+
+
+
+def error(msg):
+    print >> sys.stderr, 'ERROR: %s' % msg
+    exit(1)
+
+
+global_stime = datetime.datetime.now()
+def elapsed(msg = None):
+    print "[%s]" % msg if msg is not None else "+", "Last:" , datetime.datetime.now() - elapsed.stime, '\tTotal:', datetime.datetime.now() - global_stime
+
+    elapsed.stime = datetime.datetime.now()
+
+elapsed.stime = datetime.datetime.now()
+
+
+def open_log(fname):
+    open_log.logfile = open(fname, 'w', 1)
+
+def logm(message):
+    log_message = "[%s] %s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message)
+    print log_message,
+    open_log.logfile.write(log_message)
+
+def close_log():
+    open_log.logfile.close()
+
+
+
+
+def clear_dir(path):
+    """ If path does not exist, it creates a new directory.
+        If path points to a directory, it deletes all of its content.
+        If path points to a file, it raises an exception."""
+
+    if os.path.exists(path):
+        if not os.path.isdir(path):
+            error("%s is a file. Please, delete it manually!" % path)
+        else:
+            for the_file in os.listdir(path):
+                file_path = os.path.join(path, the_file)
+                try:
+                    if os.path.isfile(file_path):
+                        os.unlink(file_path)
+                    elif os.path.isdir(file_path):
+                        shutil.rmtree(file_path)
+                except Exception, e:
+                    print e
+    else:
+        os.mkdir(path)
+
+
+def delete_files(*filenames):
+    """ Deletes a number of files. filenames can contain generator expressions and/or lists, too. """
+
+    for fname in filenames:
+        if type(fname) in [list, types.GeneratorType]:
+            delete_files(*list(fname))
+        elif os.path.isdir(fname):
+            shutil.rmtree(fname)
+        else:
+            os.remove(fname)
+
+def split_file(filename, output_prefix, nlines):
+    """ Splits a file into chunks of nlines lines (equivalent to UNIX split -l). """
+    fno = 0
+    lno = 0
+    INPUT = open(filename, 'r')
+    output = None
+    for l in INPUT:
+        if lno == 0:
+            fno += 1
+            if output is not None: output.close()
+            output = open('%s%d' % (output_prefix, fno), 'w')
+            lno = nlines
+        output.write(l)
+        lno -= 1
+    if output is not None:
+        output.close()
+    INPUT.close()
+
+def isplit_file(filename, output_prefix, nlines):
+    """ Splits a file into chunks of nlines lines (equivalent to UNIX split -l),
+        yielding each chunk's filename as soon as the chunk is complete. """
+    fno = 0
+    lno = 0
+    try :
+        input = (gzip.open if filename.endswith('.gz') else open)(filename, 'r')
+    except IOError :
+        print "[Error] Cannot find file : %s !" % filename
+        exit(-1)
+    output = None
+    output_fname = None
+    for l in input:
+        if lno == 0:
+            if output is not None:
+                output.close()
+                yield output_fname
+            fno += 1
+            output_fname = '%s%d' % (output_prefix, fno)
+            output = open(output_fname, 'w')
+            lno = nlines
+        output.write(l)
+        lno -= 1
+    input.close()
+    if output is not None:
+        output.close()
+        yield output_fname
+
+def read_fasta(fasta_file):
+    """ Iterates over all sequences in a fasta file. One at a time, without reading the whole file into the main memory.
+    """
+
+    try :
+        input = (gzip.open if fasta_file.endswith('.gz') else open)(fasta_file)
+    except IOError:
+        print "[Error] Cannot find fasta file : %s !" % fasta_file
+        exit(-1)
+    sanitize = re.compile(r'[^ACTGN]')
+    sanitize_seq_id = re.compile(r'[^A-Za-z0-9]')
+
+    chrom_seq = []
+    chrom_id = None
+    seen_ids = set()
+
+    for line in input:
+        if line[0] == '>':
+            if chrom_id is not None:
+                yield chrom_id, ''.join(chrom_seq)
+
+            chrom_id = sanitize_seq_id.sub('_', line.split()[0][1:])
+
+            if chrom_id in seen_ids:
+                error('BS Seeker found identical sequence ids (id: %s) in the fasta file: %s. Please, make sure that all sequence ids are unique and contain only alphanumeric characters: A-Za-z0-9_' % (chrom_id, fasta_file))
+            seen_ids.add(chrom_id)
+
+            chrom_seq = []
+
+        else:
+            chrom_seq.append(sanitize.sub('N', line.strip().upper()))
+
+    input.close()
+
+    if chrom_id is not None:
+        yield chrom_id, ''.join(chrom_seq)
+
+
+def serialize(obj, filename):
+    """ Be careful what you serialize and deserialize! marshal.load is not secure! """
+    output = open(filename + '.data', 'wb')
+    marshal.dump(obj, output)
+    output.close()
+
+def deserialize(filename):
+    """ Be careful what you serialize and deserialize! marshal.load is not secure! """
+    try:
+        input = open(filename + '.data', 'rb')
+    except IOError:
+        print "\n[Error]:\n\t Cannot find file: %s.data" % filename
+        exit(-1)
+
+    obj = marshal.load(input)
+    input.close()
+    return obj
+
+
+
+def run_in_parallel(commands):
+
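+    # Normalize each entry to a (command, stdout) pair: a (cmd, filename)
+    # tuple redirects that command's stdout to the named file.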
+    commands = [(cmd[0], open(cmd[1], 'w')) if type(cmd) is tuple else (cmd, None) for cmd in commands]
+
+    logm('Starting commands:\n' + '\n'.join([cmd for cmd, stdout in commands]))
+    for i, proc in enumerate([subprocess.Popen(args = shlex.split(cmd), stdout = stdout) for cmd, stdout in commands]):
+        return_code = proc.wait()
+        logm('Finished: ' + commands[i][0])
+        if return_code != 0:
+            error("'%s' exited with error code %d. Please check the log files." % (commands[i][0], return_code))
+    for _, stdout in commands:
+        if stdout is not None:
+            stdout.close()
Binary file BSseeker2/galaxy/.___init__.py has changed
Binary file BSseeker2/galaxy/._bs_seeker2_wrapper.py has changed
Binary file BSseeker2/galaxy/._bs_seeker2_wrapper.xml has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/galaxy/__init__.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,1 @@
+__author__ = 'pf'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/galaxy/bs_seeker2_wrapper.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,123 @@
+import tempfile
+
+__author__ = 'pf'
+from subprocess import Popen
+from collections import defaultdict
+import sys, shutil, os, re
+
+
+
+BUILD = 'build'
+ALIGN = 'align'
+CALL_METHYLATION = 'call_methylation'
+
+EXEC = 'exec'
+EXEC_PATH = EXEC+'-path'
+ARG_TYPES = [BUILD, ALIGN, CALL_METHYLATION, EXEC]
+
+USAGE = """
+%(script)s is a wrapper script for bs_seeker2-build.py and bs_seeker2-align.py that is intended to be used with the Galaxy web platform.
+It takes command line parameters and runs bs_seeker2-align.py, first running bs_seeker2-build.py if necessary.
+
+The parameters that are related to bs_seeker2-build.py must be prefixed with --%(build_tag)s.
+The parameters that are related to bs_seeker2-align.py must be prefixed with --%(align_tag)s.
+Additionally, the path to BS-Seeker2 has to be specified via the --%(exec_path)s option.
+
+For example:
+
+    python %(script)s --%(exec_path)s /mnt/Data/UCLA/Matteo/BS-Seeker --build-f data/arabidopsis/genome/Arabidopsis.fa --align-i data/arabidopsis/BS6_N1try2L7_seq.txt.fa --align-o data/arabidopsis/BS6_N1try2L7_seq.txt.fa.test_output
+
+This will build an index of the genome in Arabidopsis.fa, put the index in a temporary directory, and run bs_seeker2-align.py
+against the newly created index. I.e. the following two commands will be run in a shell:
+
+    python /mnt/Data/UCLA/Matteo/BS-Seeker/bs_seeker2-build.py --db /tmp/tmpg8Eq1o -f /mnt/Data/UCLA/Matteo/bck_BS-Seeker/data/arabidopsis/genome/Arabidopsis.fa
+
+    python /mnt/Data/UCLA/Matteo/BS-Seeker/bs_seeker2-align.py --db /tmp/tmpg8Eq1o -o /mnt/Data/UCLA/Matteo/bck_BS-Seeker/data/arabidopsis/BS6_N1try2L7_seq.txt.fa.test_output -i /mnt/Data/UCLA/Matteo/bck_BS-Seeker/data/arabidopsis/BS6_N1try2L7_seq.txt.fa -g Arabidopsis.fa
+
+
+The temporary directory will be deleted after the wrapper exits.
+
+
+If no options related to bs_seeker2-build are passed, no genome index will be built and the corresponding pre-built genome index will be used
+instead. No temporary files and directories will be created.
+
+""" % { 'script' : os.path.split(__file__)[1], 'build_tag' :BUILD, 'align_tag' : ALIGN, 'exec_path' : EXEC_PATH }
+
+
+def error(msg):
+    print >> sys.stderr, 'ERROR: %s' % msg
+    exit(1)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 1:
+        error('No parameters\n\n'+USAGE)
+
+
+    # Parse command line arguments
+    args = defaultdict(dict)
+    arg_key = None
+    arg_val = None
+    arg_type = None
+
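+    # Arguments follow the scheme --<type><option> <value>. For example,
+    # "--align-i reads.fa" becomes args['align']['-i'] = 'reads.fa', and
+    # "--build--rrbs" becomes args['build']['--rrbs'] = ''.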
+    for arg in sys.argv[1:]:
+        if arg.startswith('--'):
+            try:
+                arg_type, arg_key = re.match(r'--(\w+)(.*)', arg).groups()
+                if arg_type not in ARG_TYPES:
+                    raise Exception("Bad argument: %s. arg_type (%s) must be one of: %s." % (arg, arg_type, ', '.join(ARG_TYPES)))
+                if not arg_key or arg_key[0] != '-':
+                    raise Exception("Bad argument: %s. arg_key (%s) must start with - or --." % (arg, arg_key))
+            except Exception, e:
+                error(str(e) + '\n\n' + USAGE)
+            args[arg_type][arg_key] = ''
+        else:
+            args[arg_type][arg_key] = arg
+
+    path_to_bs_seeker = args.get('exec', {'-path' : None})['-path'] # return None when exec not found
+    if path_to_bs_seeker is None:
+        error('You have to specify the path to BS-Seeker2 via --%s\n\n' % EXEC_PATH + USAGE)
+
+    tempdir = None
+    def run_prog(prog, params):
+        cwd, _ = os.path.split(__file__)
+        cmd = 'python %(prog)s %(params)s' % {
+                   'prog'   : os.path.join(cwd, prog),
+                   'params' : ' '.join('%s %s' % (arg_key, arg_val) for arg_key, arg_val in params.items())
+                   }
+        print 'exec:', cmd
+
+        return_code = Popen(args = cmd, shell = True).wait()
+        if return_code:
+            if tempdir:
+                shutil.rmtree(tempdir)
+            error("%s exit with error code %d" % (prog, return_code))
+    tempdir = tempfile.mkdtemp()
+
+    # bs_seeker2-build
+    if BUILD in args:
+        args[BUILD]['--db'] = tempdir
+        args[ALIGN]['--db'] = tempdir
+        run_prog(os.path.join(path_to_bs_seeker, 'bs_seeker2-build.py'), args[BUILD])
+
+    # bs_seeker2-align
+    args[ALIGN]['--temp_dir'] = tempdir
+    run_prog(os.path.join(path_to_bs_seeker, 'bs_seeker2-align.py'), args[ALIGN])
+
+
+    def getopt(h, k1, k2, default):
+        return h.get(k1, h.get(k2, default))
+    # bs_seeker2-call_methylation
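+    # The methylation caller needs the genome index directory the aligner
+    # used, composed below as:
+    #   <align --db>/<genome basename>[_rrbs_<low>_<up>]_<aligner>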
+    args[CALL_METHYLATION].update({  '-i'    : args[ALIGN]['--output'],
+                                     '--db'  : os.path.join(args[ALIGN]['--db'],
+                                    os.path.split(  getopt(args[ALIGN],'-g', '--genome', None))[1] +
+                                    ('_rrbs_%s_%s' % (getopt(args[ALIGN], '-l', '--low', '40'),
+                                                      getopt(args[ALIGN], '-u', '--up', '500'))
+                                     if len(set(['-r', '--rrbs']) & set(args[ALIGN])) > 0 else '') +
+
+                                    '_' + args[ALIGN]['--aligner'])
+                                    })
+    run_prog(os.path.join(path_to_bs_seeker, 'bs_seeker2-call_methylation.py'), args[CALL_METHYLATION])
+
+    if tempdir:
+        shutil.rmtree(tempdir)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/galaxy/bs_seeker2_wrapper.xml	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,255 @@
+<tool id="bs_seeker_wrapper" name="BS-Seeker2" version="2.0.0">
+  <requirements><requirement type='package'>bs_seeker2</requirement></requirements>
+  <description>Versatile aligner for bisulfite sequencing data</description>
+  <command interpreter="python">
+      bs_seeker2_wrapper.py
+      ### define exec path
+      ###    --exec-path "/u/home/galaxy/galaxy/GalaxyTools/bin"
+      ### [Please change the following path to your local directory]
+          --exec-path "/Users/weilong/Documents/program/BSseeker2"
+      ### output
+          --align--output $align_output
+          --call_methylation--wig $call_methylation_wig
+          --call_methylation--CGmap $call_methylation_CGmap
+          --call_methylation--ATCGmap $call_methylation_ATCGmap
+          --call_methylation--txt
+
+      #if $singlePaired.sPaired == "paired"
+          --align--input_1 $input1
+          --align--input_2 $singlePaired.input2
+      #end if
+
+
+      ### aligner
+      --align--aligner ${choosealigner.aligner}
+
+      ### Index from history or built-in
+      #if $choosealigner.rrbsFragments.refGenomeSource.genomeSource == "history"
+          --build--file ${choosealigner.rrbsFragments.refGenomeSource.ownFile}
+          --build--aligner ${choosealigner.aligner}
+          --align-g ${choosealigner.rrbsFragments.refGenomeSource.ownFile}
+          --align--db ${choosealigner.rrbsFragments.refGenomeSource.ownFile}
+      #else if $choosealigner.rrbsFragments.refGenomeSource.genomeSource == "indexed"
+          --align--db ${choosealigner.rrbsFragments.refGenomeSource.index.fields.path}
+          --align-g ${choosealigner.rrbsFragments.refGenomeSource.index.fields.path}/${choosealigner.rrbsFragments.refGenomeSource.index.fields.dbkey}.fa
+
+      #end if
+
+      ### RRBS or WGBS
+      #if $choosealigner.rrbsFragments.Fragmented == "Yes"
+          #if $choosealigner.rrbsFragments.refGenomeSource.genomeSource == "history"
+            --build--rrbs
+            --build--low ${choosealigner.rrbsFragments.lowerBound}
+            --build--up ${choosealigner.rrbsFragments.upperBound}
+          #end if
+          --align--rrbs
+          --align--low ${choosealigner.rrbsFragments.lowerBound}
+          --align--up ${choosealigner.rrbsFragments.upperBound}
+      #end if
+
+
+
+      ### Inputs
+      #if $singlePaired.sPaired == "single"
+          --align-i $input1
+      #end if
+
+      ### Library type
+          --align-t $tag
+
+      ### other general options
+      #if $sParams.sSettingsType == "preSet"
+          --align--start_base 1
+          --align--end_base 200
+          --align--mis 4
+      #end if
+
+      ### adapter information
+      #if $adapterInfo.useAdapter == "Yes"
+          --align--adapter ${adapterInfo.adapter_file}
+      #end if
+
+      #if $sParams.sSettingsType == "full"
+          --align--start_base ${sParams.start_base}
+          --align--end_base ${sParams.end_base}
+          --align--mis ${sParams.num_mismatch}
+      #end if
+
+  </command>
+  <inputs>
+     <param format="fastq,fasta,qseq" name="input1" type="data" label="Input your read file" help="reads file in fastq, qseq or fasta format" />
+     <conditional name="singlePaired">
+        <param name="sPaired" type="select" label="Is this library mate-paired?">
+          <option value="single">Single-end</option>
+          <option value="paired">Paired-end</option>
+        </param>
+        <when value="paired">
+          <param format="fastq,fasta,qseq" name="input2" type="data" label="Input your read file 2" help="reads in fastq, qseq or fasta format" />
+          <param name="min_ins_distance" type="integer" value="-1" label=" Minimum insert size for valid paired-end alignments" />
+          <param name="max_ins_distance" type="integer" value="400" label="Maximum insert size for valid paired-end alignments" />
+        </when>
+     </conditional> 
+     <param name="tag" type="select" label="Type of libraries">
+        <option value="N">directional libraries</option>
+        <option value="Y">undirectional libraries</option>
+     </param>
+     <conditional name="choosealigner">
+         <param name="aligner" type="select" label="Short reads aligner">
+             <option value="bowtie">bowtie</option>
+             <option value="bowtie2">bowtie2</option>
+         </param>
+         <when value="bowtie">
+             <conditional name="rrbsFragments">
+                 <param name="Fragmented" type="select" label="RRBS-seq reads" help="">
+                     <option value="No">No</option>
+                     <option value="Yes">Yes</option>
+                 </param>
+                 <when value="Yes">
+                     <param name="lowerBound" type="integer" value="40" label="The lower bound for RRBS fragments" help="Default: 40" />
+                     <param name="upperBound" type="integer" value="500" label="The upper bound for RRBS fragments" help="Default: 500" />
+                     <conditional name="refGenomeSource">
+                         <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="">
+                             <option value="indexed">Use a built-in index</option>
+                             <option value="history">Use one from the history</option>
+                         </param>
+                         <when value="indexed">
+                             <param name="index" type="select" label="Select a reference genome (RRBS, bowtie)">
+                                 <options from_data_table="bs_seeker2_indexes_RRBS_bowtie">
+                                     <filter type="sort_by" column="2"/>
+                                     <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                                 </options>
+                             </param>
+                         </when>
+                         <when value="history">
+                             <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+                         </when>
+                     </conditional>
+                 </when>
+                 <when value="No">
+                     <conditional name="refGenomeSource">
+                         <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="">
+                             <option value="indexed">Use a built-in index</option>
+                             <option value="history">Use one from the history</option>
+                         </param>
+                         <when value="indexed">
+                             <param name="index" type="select" label="Select a reference genome (WGBS, bowtie)">
+                                 <options from_data_table="bs_seeker2_indexes_WGBS_bowtie">
+                                     <filter type="sort_by" column="2"/>
+                                     <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                                 </options>
+                             </param>
+                         </when>
+                         <when value="history">
+                             <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+                         </when>
+                     </conditional>
+                 </when>
+             </conditional>
+         </when>
+
+         <when value="bowtie2">
+             <conditional name="rrbsFragments">
+                 <param name="Fragmented" type="select" label="RRBS-seq reads" help="">
+                     <option value="No">No</option>
+                     <option value="Yes">Yes</option>
+                 </param>
+                 <when value="Yes">
+                     <param name="lowerBound" type="integer" value="40" label="The lower bound for RRBS fragments" help="Default: 40" />
+                     <param name="upperBound" type="integer" value="500" label="The upper bound for RRBS fragments" help="Default: 500" />
+                     <conditional name="refGenomeSource">
+                         <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="">
+                             <option value="indexed">Use a built-in index</option>
+                             <option value="history">Use one from the history</option>
+                         </param>
+                         <when value="indexed">
+                             <param name="index" type="select" label="Select a reference genome (RRBS, bowtie2)">
+                                 <options from_data_table="bs_seeker2_indexes_RRBS_bowtie2">
+                                     <filter type="sort_by" column="2"/>
+                                     <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                                 </options>
+                             </param>
+                         </when>
+                         <when value="history">
+                             <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+                         </when>
+                     </conditional>
+                 </when>
+                 <when value="No">
+                     <conditional name="refGenomeSource">
+                         <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="">
+                             <option value="indexed">Use a built-in index</option>
+                             <option value="history">Use one from the history</option>
+                         </param>
+                         <when value="indexed">
+                             <param name="index" type="select" label="Select a reference genome (WGBS, bowtie2)">
+                                 <options from_data_table="bs_seeker2_indexes_WGBS_bowtie2">
+                                     <filter type="sort_by" column="2"/>
+                                     <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                                 </options>
+                             </param>
+                         </when>
+                         <when value="history">
+                             <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" />
+                         </when>
+                     </conditional>
+                 </when>
+             </conditional>
+         </when>
+     </conditional>
+     <conditional name="adapterInfo">
+        <param name="useAdapter" type="select" label="adapter sequence">
+           <option value="noAdapter">No</option>
+           <option value="withAdapter">Yes</option>
+        </param>
+        <when value="withAdapter">
+           <param format="txt" name="adapter_file" type="data" label="Input file of your adaptor sequences" help="Input text file of your adaptor sequences" />
+        </when>
+     </conditional>
+
+     <conditional name="sParams">
+       <param name="sSettingsType" type="select" label="BS Seeker2 settings to use" help="You can use the default settings or set customer values for the BS Seeker2 parameters.">
+         <option value="preSet">User Defaults</option>
+         <option value="full">Full parameter list</option>
+       </param>
+       <when value="preSet" /> 
+       <when value="full">
+           <param name="start_base" type="integer" value="1" label="The start base of the read to be mapped" help="" />
+           <param name="end_base" type="integer" value="200" label="The end base of the read to be mapped" help="" />
+
+           <param name="num_mismatch" type="integer" value="4" label="Number of mismatches" help="(INT) Default: 4" />
+       </when>   
+     </conditional>
+
+</inputs>
+
+  <outputs>
+    <data format="bam" name="align_output"  label="BAM Alignments"> </data>
+    <data format="wig" name="call_methylation_wig"  label="Methylation Levels"> </data>
+    <data format="tabular" name="call_methylation_CGmap"  label="CGmap file"> </data>
+    <data format="tabular" name="call_methylation_ATCGmap"  label="ATCGmap file"> </data>
+
+  </outputs>
+  <help>
+**What it does**
+
+BS-Seeker2 is a seamless pipeline for mapping bisulfite sequencing data and generating a detailed DNA methylome. BS-Seeker2 improves mappability by using local alignment, and is tailored for RRBS libraries by building a special index, giving higher efficiency and accuracy. This is the Galaxy version of BS-Seeker2.
+
+------
+
+**Resources**
+
+The homepage for BS-Seeker2 is http://pellegrini.mcdb.ucla.edu/BS_Seeker2/.
+
+For more information about BS-Seeker2, please refer to https://github.com/BSSeeker/BSseeker2.
+
+------
+
+**Example**
+
+- Adapter file::
+
+      AGATCGGAAGAGCACACGTC
+
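+- CGmap output (tab-separated; columns are chromosome, nucleotide, position,
+  context, subcontext, methylation level, methylated counts, total counts;
+  the values below are illustrative only)::
+
+      chr1    G    3541    CG    CG    0.8    4    5
+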
+
+  </help>
+</tool>