changeset 0:06f8460885ff

migrate from GitHub
author yutaka-saito
date Sun, 19 Apr 2015 20:51:13 +0900
parents
children 20930a8f700b
files bin/bsf-call bin/bsfcall.py bin/bsfcall.pyc bin/fastq-interleave bin/last-bisulfite-paired.sh bin/last-bisulfite.sh bin/last-dotplot bin/last-map-probs bin/last-merge-batches bin/last-pair-probs bin/last-postmask bin/last-split bin/lastal bin/lastdb bin/lastex bin/maf-convert bin/maf-cull bin/maf-join bin/maf-sort bin/maf-swap bin/parallel-fasta bin/parallel-fastq bsf-call_wrapper.pl bsf-call_wrapper.xml example-data/1.fastq.gz example-data/2.fastq.gz example-data/chrX.sub.fa.gz tool-data/bsf-call_indexes.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml
diffstat 30 files changed, 4149 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/bsf-call	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+"""
+Bisulfighter::bsf-call
+
+Bisulfighter (http://epigenome.cbrc.jp/bisulfighter)
+by National Institute of Advanced Industrial Science and Technology (AIST)
+is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
+http://creativecommons.org/licenses/by-nc-sa/3.0/
+"""
+
+__version__= "1.3"
+
+from optparse import OptionParser
+import os
+import sys
+import re
+
+prog = 'bsf-call'
+usage = """%prog [options] refgenome read1 read2 ...
+  example: %prog -o experiment.txt hg38.fa paired-sample1-1.fastq,paired-sample1-2.fastq"""
+description = "A mapping of the read bisulfite treated by LAST, to detect methylated cytosine (mC) of the results, and outputs the detection result to the file."
+
+op = OptionParser(prog=prog, usage=usage, description=description, version="%s-%s" % (prog, __version__))
+
+op.add_option("-c", "--coverage", type="int", default=5, metavar="C",
+              help="threshold of read coverate (default: %default)")
+
+# op.add_option("-d", "--pe-direction", type="string", default="ff",
+#               help="direction of paired-end probes: ff, fr, rf, rr (default: %default)")
+# 
+op.add_option("-l", "--lower-bound", type="float", default=0.05, metavar="L",
+              help="threshold of mC ratio (default: %default)")
+
+op.add_option("-p", "--multi-thread", type="int", default=1, metavar="P",
+              help="number of threads (default: %default)")
+
+op.add_option("-s", "", type="int", default=150, metavar="S",
+              help="threshold of the alignment score at filtering (default: %default)")
+
+op.add_option("-m", "", type="float", default=1e-9, metavar="M",
+              help="threshold of the mismap probability at filtering (default: %default)")
+
+# op.add_option("", "--last", type="string", default="", metavar="OPT1,OPT2,...",
+#               help="options for LAST (lastal command)")
+# 
+op.add_option("-o", "", type="string", default="bsf-call.out", metavar="FILE",
+              help="output file (default: bsf-call.out)")
+
+op.add_option("-W", "", type="string", default="./bsfwork", metavar="WORKDIR",
+              help="work directory (default: ./bsfwork)")
+
+op.add_option("", "--work-auto", action="store_true", dest="auto_create_work_dir", default=False,
+              help="create work directory automatically")
+
+# op.add_option("-n", "", action="store_true", dest="use_cluster", default=False,
+#               help="run bsf-call on pc cluster")
+# 
+# op.add_option("-q", "", type="string", default="", metavar="QUEUE_LIST",
+#               help="queue list")
+# 
+op.add_option("-M", "", type="string", metavar="MAPPING_DIR",
+              help="mapping result directory")
+
+op.add_option("-T", "", type="string", metavar="LOCAL_DIR",
+              help="local directory")
+
+# op.add_option("-r", "", type="string", default="100M", metavar="SPLIT_READ_SIZE",
+#               help="split read size")
+# 
+# op.add_option("", "--bam", action="store_true", dest="read_bam", default=False,
+#               help="read BAM file for mC detection")
+# 
+# op.add_option("", "--sam", action="store_true", dest="read_sam", default=False,
+#               help="read SAM file for mC detection")
+# 
+# op.add_option("-z", "--compress-prog", type="string", dest="z", metavar="COMPRESS_PROG", default="bzip2", 
+#               help="compression program")
+
+options, args = op.parse_args()
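+
+# Example invocation (illustrative; file names are hypothetical):
+#   bsf-call -c 10 -l 0.01 -p 4 -o sample.txt hg38.fa sample_1.fastq,sample_2.fastq
+# parses to options.coverage == 10, options.lower_bound == 0.01,
+# options.multi_thread == 4, options.o == "sample.txt", and
+# args == ["hg38.fa", "sample_1.fastq,sample_2.fastq"].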
+
+errors = []
+
+work_dir = None
+if options.W:
+    work_dir = options.W
+else:
+    if not options.auto_create_work_dir:
+        work_dir = "bsfwork"
+
+if options.M:
+    if len(args) < 1:
+        op.error("\n  Reference genome is not specified.")
+
+    for result_dir in options.M.split(","):
+        if not os.path.exists(result_dir):
+            errors.append("Mapping result directory: '%s' does not exist." % options.M)
+
+    # if options.read_bam and options.read_sam:
+    #     errors.append("--bam and --sam cannot be placed simultaneously.")
+
+    ref_genome = args[0]
+    reads = None
+else:
+    if len(args) < 2:
+        op.error("\n  Reference genome and read sequence is not specified.")
+
+    # if options.read_bam:
+    #     errors.append("--bam option is specified but -M option is not specified.")
+
+    # if options.read_sam:
+    #     errors.append("--sam option is specified but -M option is not specified.")
+
+    ref_genome = args[0]
+    reads = args[1:]
+
+    for read_files in reads:
+        for read_file in read_files.split(','):
+            if not os.path.exists(read_file):
+                errors.append("Read file: '%s' does not exists." % read_file)
+
+    if work_dir and os.path.exists(work_dir):
+        errors.append("Working directory: '%s' already exists." % work_dir)
+
+if not os.path.exists(ref_genome):
+    errors.append("Reference genome: '%s' does not exists." % ref_genome)
+
+# if options.read_bam or options.read_sam:
+#     try:
+#         import pysam
+#     except:
+#         errors.append("--bam or --sam is specified but pysam is not installed.")
+
+if len(errors) > 0:
+    op.error("\n  " + "\n  ".join(errors))
+
+cmd_opts = {}
+cmd_opts["coverage"] = options.coverage
+# cmd_opts["pe_direction"] = options.pe_direction
+cmd_opts["num_threads"] = options.multi_thread
+cmd_opts["lower_bound"] = options.lower_bound
+cmd_opts["aln_score_thres"] = options.s
+cmd_opts["aln_mismap_prob_thres"] = options.m
+cmd_opts["output"] = options.o
+# cmd_opts["last_opts"] = options.last
+cmd_opts["work_dir"] = work_dir
+# cmd_opts["use_cluster"] = options.use_cluster
+# cmd_opts["queue_list"] = options.q
+cmd_opts["mapping_dir"] = options.M
+cmd_opts["local_dir"] = options.T
+# cmd_opts["split_read_size"] = options.r
+# cmd_opts["read_bam"] = options.read_bam
+# cmd_opts["read_sam"] = options.read_sam
+# cmd_opts["compress_prog"] = options.z
+
+try:
+    sys.path.append('.')
+    import bsfcall
+except ImportError:
+    errors.append("\"import bsfcall\" failed. Please make sure bsfcall.py is in your Python library path.")
+
+import subprocess
+
+# if not checkRunnable('last-map-probs'):
+#     sys.exit(1)
+# if not checkRunnable('last-pair-probs'):
+#     sys.exit(1)
+# if not checkRunnable(options.z):
+#     sys.exit(1)
+
+if len(errors) > 0:
+    op.error("\n  " + "\n  ".join(errors))
+
+# if cmd_opts["use_cluster"]:
+#     cmd_opts["num_threads"] = 1
+#     bsf_call = bsfcall.BsfCallCluster(ref_genome, reads, cmd_opts)
+# else:
+#     bsf_call = bsfcall.BsfCall(ref_genome, reads, cmd_opts)
+bsf_call = bsfcall.BsfCall(ref_genome, reads, cmd_opts)
+
+bsf_call.execute()
+
+
+def checkRunnable(cmd):
+    for outline, errline in runProcess(cmd.split()):
+        if re.match(r'error: runProcess:', errline) is not None:
+            print >>sys.stderr, '"%s" is not found.' % cmd
+            return False
+    return True
+
+def runProcess(exe):
+    try:
+        p = subprocess.Popen(exe, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except OSError:
+        yield (None, "error: runProcess: '%s' failed." % ' '.join(exe))
+        return
+    while True:
+        retcode = p.poll()
+        if retcode is not None:
+            break
+        outline = p.stdout.readline()
+        errline = p.stderr.readline()
+        yield (outline, errline)
+
+sys.exit(0)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/bsfcall.py	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,1876 @@
+#!/usr/bin/env python
+"""
+Bisulfighter::bsf-call
+
+Bisulfighter (http://epigenome.cbrc.jp/bisulfighter)
+by National Institute of Advanced Industrial Science and Technology (AIST)
+is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
+http://creativecommons.org/licenses/by-nc-sa/3.0/
+"""
+
+__version__= "1.3"
+
+import sys
+import os
+import glob
+import threading
+import subprocess
+import Queue
+from datetime import datetime
+import hashlib
+from string import maketrans
+import gzip
+import bz2
+import zipfile
+import logging
+from time import sleep
+from shutil import copy
+import re
+# import pysam
+
+class BsfCallBase(object):
+    """
+    base class for BsfCall, LastExecutor and McDetector.
+    defines functions that are shared by the subclasses.
+    """
+
+    def splitFilePath(self, filePath):
+        """
+        split a file path into directory path, file name (with extension),
+        base name (without extension), extension, and compression suffix.
+        a trailing compression extension ('.gz', '.gzip', '.bz2', '.bzip2',
+        '.zip') is stripped off and returned separately.
+        """
+        dir_name, file_name = os.path.split(filePath)
+        base_name, ext = os.path.splitext(file_name)
+        prog = None
+        if ext in ('.gz', '.gzip', '.bz2', '.bzip2', '.zip'):
+            prog = ext[1:]
+            base_name, ext = os.path.splitext(base_name)
+        if len(ext) > 1:
+            ext = ext[1:]
+        return (dir_name, file_name, base_name, ext, prog)
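+    # Example (illustrative): splitFilePath("/data/sample_1.fastq.gz") returns
+    # ("/data", "sample_1.fastq.gz", "sample_1", "fastq", "gz"); without a
+    # compression suffix the last element (prog) is None.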
+
+
+    def readNameByReadFile(self, readFilePath):
+        """
+        get read name by read file path.
+        """
+        dir_name, file_name, read_name, ext, prog = self.splitFilePath(readFilePath)
+        return read_name
+
+
+    def secondReadFilePathByFirstReadFilePath(self, readFile, secondReadType = None):
+        """
+        get the second read file path from the first read file path.
+        if the first read file path is '/path/to/read_file_1.fastq', the
+        second read file path is '/path/to/read_file_2.fastq'.
+        if secondReadType is specified, it is used as the extension of the
+        second read file.
+        """
+        fpath = ""
+        dir_name, file_name, basename, ext, prog = self.splitFilePath(readFile)
+        if secondReadType:
+            ext = ".%s" % secondReadType
+        if prog is not None:
+            fpath = "%s/%s2%s.%s" % (dir_name, basename[0:-1], ext, prog)
+        else:
+            fpath = "%s/%s2%s" % (dir_name, basename[0:-1], ext)
+        return fpath
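+    # Example (illustrative): for "/data/sample_1.fastq" this returns
+    # "/data/sample_2.fastq", and for "/data/sample_1.fastq.gz" it returns
+    # "/data/sample_2.fastq.gz".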
+
+
+    def pairedEndReadNumbers(self):
+        return (1, 2)
+
+
+    def clearGap(self, seq):
+        return seq.replace("-", "")
+
+
+    def complementStartPosition(self, genomeLen, subseqStart, subseqLen):
+        return genomeLen - subseqStart - subseqLen
+
+
+    def complementSeq(self, seq):
+        return seq.translate(maketrans("ATGCatgc", "TACGtacg"))[::-1]
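+    # Example (illustrative): complementSeq("ATGC") returns "GCAT", i.e. the
+    # reverse complement of the input sequence.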
+
+
+    def mcContextType(self, genomeSeq, cBasePos, strand='+'):
+        """
+        get mC context type (CG, CHG, CHH) by genome sequence and C base position.
+        if no mC context found, return None.
+        """
+
+        try:
+            if strand == '+':
+                if genomeSeq[cBasePos + 1] == "G":
+                    return "CG"
+                else:
+                    if genomeSeq[cBasePos + 2] == "G":
+                        return "CHG"
+                    else:
+                        return "CHH"
+            elif strand == '-':
+                if genomeSeq[cBasePos - 1] == "C":
+                    return "CG"
+                else:
+                    if genomeSeq[cBasePos - 2] == "C":
+                        return "CHG"
+                    else:
+                        return "CHH"
+            return None
+        except IndexError:
+            return None
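+    # Examples (illustrative): on the + strand, mcContextType("ACGTT", 1) is
+    # "CG" (the C at index 1 is followed by G), mcContextType("ACTGA", 1) is
+    # "CHG", and mcContextType("ACTTA", 1) is "CHH".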
+
+
+    def chrSort(self, a, b):
+        return cmp(a, b)
+
+
+    def strands(self):
+        return ("+", "-")
+
+
+    def mcContextTypes(self):
+        return ("CG", "CHG", "CHH")
+
+
+    def bzip2File(self, filePath, wait = True, log = False):
+        """
+        compress a file with bzip2. if the wait argument is False, this
+        function returns immediately without waiting for the bzip2 process
+        to complete.
+        """
+
+        if log:
+            logging.info("bzip2 start: %s" % filePath)
+
+        dirpath, fname = os.path.split(filePath)
+        cmd = "bzip2 %s" % fname
+        p = subprocess.Popen(cmd, shell = True, cwd = dirpath)
+        if wait:
+            p.wait()
+
+        if log:
+            logging.info("bzip2 done: %s" % filePath)
+
+
+    def isGzipFile(self, filePath):
+        return filePath[-3:] == ".gz" or filePath[-5:] == ".gzip"
+
+
+    def isBzip2File(self, filePath):
+        return filePath[-4:] == ".bz2" or filePath[-6:] == ".bzip2"
+
+
+    def isZipFile(self, filePath):
+        return filePath[-4:] == ".zip"
+
+
+    def isMafFile(self, filePath):
+        f = open(filePath, "rb")
+        data = f.read(1)
+        f.close()
+        if re.match("[\x20-\x7E]", data):
+            f = open(filePath, "r")
+            first_line = f.readline()
+            if first_line[0:5] != "track" and first_line[0] != "#" and first_line[0:2] != "a ":
+                f.close()
+                return False
+
+            if first_line[0:2] == "a ":
+                cnt = 2
+            else:
+                cnt = 1
+
+            while True:
+                line = f.readline()
+                if line == "":
+                    break
+                if line[0] == "#" or line.strip() == "":
+                    continue
+
+                if cnt == 1 and line[0:2] != "a ":
+                    f.close()
+                    return False
+
+                if cnt == 2 and line[0:2] != "s ":
+                    f.close()
+                    return False
+
+                if cnt == 3:
+                    f.close()
+                    if line[0:2] == "s ":
+                        return True
+                    else:
+                        return False
+
+                cnt += 1
+
+            f.close()
+            return False
+        else:
+            return False
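+    # For reference, a minimal MAF alignment block has this shape
+    # (illustrative values):
+    #   a score=120 mismap=1e-10
+    #   s chrX  1000 50 + 155270560 ACGT...
+    #   s read1    0 50 +        50 ACGT...
+    # isMafFile() accepts a file whose first block follows this layout.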
+
+
+    def isBamFile(self, filePath):
+        bgzf_magic = b"\x1f\x8b\x08\x04"
+
+        f = open(filePath, "rb")
+        data = f.read(4)
+        f.close()
+
+        return data == bgzf_magic
+
+
+    def isSamFile(self, filePath):
+        f = open(filePath, "rb")
+        data = f.read(1)
+        if data == "@":
+            tag = f.read(2)
+            if tag == "HD" or tag == "SQ" or tag == "RG" or tag == "CO":
+                f.close()
+                return True
+
+        f.seek(0)
+        data = f.read(1)
+        f.close()
+        if re.match("[\x20-\x7E]", data):
+            f = open(filePath, "r")
+            line = f.readline()
+            f.close()
+            return len(line.split("\t")) > 10
+        else:
+            return False
+
+
+    def scriptDir(self):
+        return os.path.dirname(os.path.abspath(sys.argv[0]))
+
+
+    def chrnoFromFastaDescription(self, description):
+        """
+        get chromosome number by fasta description line.
+        """
+
+        return description.strip()[1:].strip()
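+    # Example (illustrative): chrnoFromFastaDescription(">chrX assembled")
+    # returns "chrX assembled", i.e. everything after '>' with surrounding
+    # whitespace stripped.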
+
+
+    def chrsByRefGenome(self, refGenome):
+        """
+        get all chromosome numbers that are included in the reference genome.
+        """
+
+        chrs = []
+        for line in open(refGenome, "r"):
+            line = line.strip()
+            if line[0] == ">":
+                chrs.append(self.chrnoFromFastaDescription(line))
+
+        return chrs
+
+
+    def readRefGenome(self, refGenome, refGenomeBuf, refGenomeChr):
+        """
+        read the reference genome fasta file.
+        """
+
+        logging.info("BsfCallBase::readRefGenome: %s" % refGenome)
+        chr = None
+        buf = []
+        fin = open(refGenome, 'r')
+        for line in fin:
+            if line[0] == '>':
+                chr = self.chrnoFromFastaDescription(line)
+                logging.info("BsfCallBase::readRefGenome: chr=%s" % chr)
+                if len(buf) > 0:
+                    refGenomeBuf[refGenomeChr[-1]]=''.join(buf)
+                    del buf[:]
+                refGenomeChr.append(chr)
+            elif chr != None:
+                buf.append(line.strip().upper())
+            else:
+                logging.fatal("BsfCallBase::readRefGenome: the specified reference genome file \"%s\" is malformed." % refGenome)
+        fin.close()
+        refGenomeBuf[refGenomeChr[-1]]=''.join(buf)
+        logging.info("BsfCallBase::readRefGenome: done.")
+        return
+
+
+    def lastalOpts(self, lastOpt):
+        """
+        get options for lastal by bsf-call --last option
+        """
+
+        return " ".join(lastOpt.split(","))
+
+
+    def mergeOpts(self):
+        return ""
+
+
+    def filterOpts(self, mismapProb, scoreThres, isPairedEnd):
+        """
+        get the filtering options that are passed to last-map-probs or
+        last-pair-probs.
+        """
+
+        option = ""
+        if isPairedEnd:
+            option = "-m%s" % str(mismapProb)
+        else:
+            option = "-s%d -m%s" % (scoreThres, str(mismapProb))
+
+        return option
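+    # Example (illustrative): filterOpts(1e-9, 150, False) returns
+    # "-s150 -m1e-09" for single-end reads; for paired-end reads only the
+    # mismap threshold "-m1e-09" is passed.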
+
+
+    def isPairedEnd(self, readAttr):
+        return self.pairedEndReadNumbers()[1] in readAttr
+
+
+    def jobIdByQsubOutput(self, qsubOutput):
+        """
+        get job id submitted by qsub command and its output.
+        """
+
+        fields = qsubOutput.strip().split()
+
+        return fields[2]
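+    # Example (illustrative): for a typical Grid Engine message such as
+    # 'Your job 12345 ("mapping-s.sh") has been submitted', the job id is
+    # the third whitespace-separated field, "12345".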
+
+
+    def waitForSubmitJobs(self, jobIds, checkInterval = 10):
+        """
+        wait for all jobs that have been submitted with qsub command to finish.
+        """
+
+        error_jobs = []
+        while True:
+            all_done = True
+            qstat = os.popen("qstat")
+            for line in qstat:
+                fields = line.strip().split()
+                if fields[0] in jobIds:
+                    if fields[4].find('E') > 0:
+                        if fields[0] not in error_jobs:
+                            logging.fatal("Error has occurred: Job ID=%s" % fields[0])
+                            error_jobs.append(fields[0])
+                    else:
+                        all_done = False
+                        break
+            qstat.close()
+
+            if all_done:
+                break
+            else:
+                sleep(checkInterval)
+
+        return
+
+
+    def logJobSubmit(self, msg, jobId, cmd = None):
+        logging.info("Submit job: %s --> Job ID = %s" % (msg, jobId))
+        if cmd:
+            logging.info(cmd)
+
+
+    def bamMapq2Mismap(self, mapq):
+        return pow(0.1, (float(mapq) / 10))
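+    # Example (illustrative): bamMapq2Mismap(30) == 0.001, since a BAM MAPQ
+    # of 30 encodes a mismap probability of 10^(-30/10).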
+
+
+    def getAllMappingResultFiles(self, resultDirs):
+        mapping_result_files = []
+
+        for result_dir in resultDirs:
+            for root, dirs, files in os.walk(result_dir):
+                for filename in files:
+                    logging.info("McDetector::getAllMappingResultFiles: %s" % filename)
+                    mapping_result_file = os.path.join(root, filename)
+                    mapping_result_files.append(mapping_result_file)
+        
+        return mapping_result_files
+                        
+
+class BsfCall(BsfCallBase):
+    """
+    class to execute bsf-call process.
+    """
+
+    def __init__(self, refGenome, readFilePaths, cmdOpts):
+        """
+        constructor of BsfCall
+        """
+
+        self.refGenome = refGenome
+        self.readFilePaths = readFilePaths
+        self.reads = []
+        self.opts = cmdOpts
+
+        self.dataDir = None
+        self.genomeDir = None
+        self.mcContextDir = None
+
+        self.readInFh1 = None
+        self.readInFh2 = None
+        self.numReads = {1: 0, 2: 0}
+
+        self.mappingResultDirs = []
+        self.mappingResultFiles = []
+
+        self.setDataDir()
+        self.setLogger()
+
+        logging.info("bsf-call start.")
+        self.logOption()
+
+        # self.numReadsPerFile = self.sizeForSplitRead(self.opts["split_read_size"])
+
+        if self.opts["mapping_dir"]:
+            self.opts["only_mcdetection"] = True
+        else:
+            self.opts["only_mcdetection"] = False
+
+
+    def execute(self):
+        """
+        execute bsf-call process.
+        """
+
+        try:
+            if self.opts["mapping_dir"]:
+                # Only mc detection
+                self.mappingResultDirs = self.opts["mapping_dir"].split(",")
+                self.mappingResultFiles = self.getAllMappingResultFiles(self.mappingResultDirs)
+                self.opts["mapping_result_files"] = self.mappingResultFiles
+            else:
+                self.makeIndexFile()
+                self.prepareForReads()
+                self.mappingResultDirs = self.processMapping()
+
+            logging.debug("BsfCall:execute: mapping result directories are: %s" % ','.join(self.mappingResultDirs))
+            self.processMcDetection(self.mappingResultDirs, self.opts["local_dir"])
+
+            logging.info("bsf-call done.")
+        except:
+            logging.exception("Exception has occurred.")
+
+
+    def processMapping(self):
+        """
+        run read mapping and filtering process.
+        """
+
+        logging.info("Mapping and filtering process start.")
+
+        result_dirs = []
+        for read_attr in self.reads:
+            self.runLast(read_attr)
+            result_dirs.append(read_attr["results_dir"])
+
+        logging.info("Mapping and filtering process done.")
+
+        return result_dirs
+
+
+    def processMcDetection(self, resultDirs, localDir = None):
+        """
+        run mC detection process.
+        """
+
+        logging.info("mC detection process start.")
+
+        mc_detector = McDetector(self.refGenome, resultDirs, self.mcContextDir, self.opts)
+        mc_detector.execute(self.opts["output"], self.opts["num_threads"])
+
+        logging.info("mC detection process done.")
+
+
+    def setDataDir(self):
+        """
+        create directories to store the files of the bsf-call process.
+        """
+
+        if self.opts["work_dir"]:
+            self.dataDir = self.opts["work_dir"]
+        else:
+            self.dataDir = self.autoWorkDir()
+
+        self.mcContextDir = "%s/mc_contexts" % self.dataDir
+
+        if not os.path.exists(self.dataDir):
+            os.makedirs(self.dataDir)
+
+        if not os.path.exists(self.mcContextDir):
+            os.makedirs(self.mcContextDir)
+
+
+    def setLogger(self):
+        """
+        create logger to store the logs of the bsf-call process.
+        """
+
+        log_level = logging.INFO
+        # log_level = logging.DEBUG
+
+        log_file = "%s/bsf-call.log" % self.dataDir
+        file_logger = logging.FileHandler(filename=log_file)
+
+        file_logger.setLevel(log_level)
+        file_logger.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+
+        logging.getLogger().addHandler(file_logger)
+        logging.getLogger().setLevel(log_level)
+
+
+    def logOption(self):
+        """
+        output bsf-call option values and arguments to the log.
+        """
+
+        if self.opts["mapping_dir"]:
+            logging.info("Mapping result directory is specified. Only mC detection is executed.")
+            logging.info("  Mapping result directory: %s" % self.opts["mapping_dir"])
+            logging.info("  Reference genome: %s" % self.refGenome)
+            # logging.info("  Read BAM file: %s" % ("Yes" if self.opts["read_bam"] else "No"))
+            # logging.info("  Read SAM file: %s" % ("Yes" if self.opts["read_sam"] else "No"))
+        else:
+            logging.info("Reference genome: %s" % self.refGenome)
+            logging.info("Read files: %s" % self.readFilePaths)
+            logging.info("Working directory: %s" % self.dataDir)
+            logging.info("Options:")
+            logging.info("  Threshold of the alignment score at filtering: %d" % self.opts["aln_score_thres"])
+            # logging.info("  Paired-end direction: %s" % self.opts["pe_direction"])
+            # logging.info("  Options for LAST: %s" % self.opts["last_opts"])
+
+        logging.info("  Threshold of read coverate: %d" % self.opts["coverage"])
+        logging.info("  Threshold of mC ratio: %s" % str(self.opts["lower_bound"]))
+        logging.info("  Threshold of the mismap probability at filtering: %s" % str(self.opts["aln_mismap_prob_thres"]))
+        logging.info("  Working directory: %s" % self.dataDir)
+        logging.info("  Local directory: %s" % self.opts["local_dir"])
+        logging.info("  Output file: %s" % (self.opts["output"] if self.opts["output"] else "(stdout)"))
+        # logging.info("  Use cluster: %s" % ("Yes" if self.opts["use_cluster"] else "No"))
+        # logging.info("  Queue name: %s" % self.opts["queue_list"])
+        logging.info("  Number of threads: %d" % self.opts["num_threads"])
+        # logging.info("  Split read size: %s" % self.opts["split_read_size"])
+
+
+    def prepareForReads(self):
+        """
+        create directories to store split reads and result files.
+        """
+
+        for read_no, read_path in enumerate(self.readFilePaths):
+            readNo = read_no + 1
+            logging.info("Preparations for a read file start: %d: %s" % (readNo, read_path))
+
+            data_dir = "%s/%d" % (self.dataDir, readNo)
+            read = {"base_dir": data_dir, "path": read_path, "reads_dir": data_dir + "/reads", "results_dir": data_dir + "/results"}
+
+            if not os.path.exists(data_dir):
+                os.makedirs(data_dir)
+                os.makedirs(read["reads_dir"])
+                os.makedirs(read["results_dir"])
+
+            pe_no = self.pairedEndReadNumbers()[0]
+            for readpath in read_path.split(","):
+                dir_name, file_name, base_name, ext, prog = self.splitFilePath(readpath)
+                file_type = self.checkReadFileType(readpath)
+                read[pe_no] = {"name": base_name, "fname": file_name, "type": file_type, "path": readpath}
+                pe_no += 1
+
+            is_paired_end = self.isPairedEnd(read)
+            logging.info("Paired-end: %s" % is_paired_end)
+            logging.info("  Forward: %s" % read[1]["path"])
+            if is_paired_end:
+                logging.info("  Reverse: %s" % read[2]["path"])
+
+            logging.info("Preparations for a read file done")
+            self.reads.append(read)
+        return
+
+
+    def runLast(self, readAttr):
+        """
+        run the LAST programs for read mapping and filtering.
+        """
+
+        is_paired_end = self.isPairedEnd(readAttr)
+        filter_option = self.filterOpts(self.opts["aln_mismap_prob_thres"], self.opts["aln_score_thres"], is_paired_end)
+
+        if is_paired_end:
+            logging.info('BsfCall::runLast: PairedEnd')
+            last_exec = LastExecutorPairedEnd(self.refGenome, self.dataDir, readAttr["reads_dir"], readAttr["results_dir"], self.opts["num_threads"])
+        else:
+            logging.info('BsfCall::runLast: Not PairedEnd')
+            last_exec = LastExecutorSingle(self.refGenome, self.dataDir, readAttr["reads_dir"], readAttr["results_dir"])
+
+        # last_exec.execute(readAttr, self.opts["num_threads"], self.lastalOpts(self.opts["last_opts"]), self.mergeOpts(), filter_option)
+        # last_exec.execute(readAttr, 1, self.lastalOpts(self.opts["last_opts"]), self.mergeOpts(), filter_option)
+        last_exec.execute(readAttr, 1, "", self.mergeOpts(), filter_option)
+
+
+    def makeIndexFile(self):
+        """
+        create index file of reference genome.
+        """
+
+        directions = []
+        if not os.path.exists("%s.f.prj" % self.refGenome):
+            directions.append("f")
+        if not os.path.exists("%s.r.prj" % self.refGenome):
+            directions.append("r")
+
+        if len(directions) > 0:
+            logging.info("Make index file start.")
+            last_executor = LastExecutor(self.refGenome, self.dataDir)
+            last_executor.lastdb(directions, self.opts["num_threads"] > 1)
+            logging.info("Make index file done.")
+
+
+    def checkReadFileType(self, readFilePath):
+        """
+        get read file type.
+        """
+
+        name, ext = os.path.splitext(readFilePath)
+        if ext == ".gz":
+            name, ext = os.path.splitext(name)
+
+        if len(ext) > 1:
+            ext = ext[1:]
+
+        file_type = None
+
+        if ext == "sra" or "fastq" or "fasta":
+            file_type = ext
+        elif ext == "fa":
+            file_type = "fasta"
+        else:
+            f = open(readFilePath, "r")
+            first_char = f.read(1)
+            if first_char == "@":
+                file_type = "fastq"
+            elif first_char == ">":
+                file_type = "fasta"
+            else:
+                file_type = "sra"
+            f.close()
+
+        return file_type
+
+
+    def splitedReadFilePath(self, outputDir, start, end, readDirection, ext):
+        """
+        get the path of a split read file.
+        """
+
+        return "%s/%010d-%010d_%d.%s" % (outputDir, start, end, readDirection, ext)
+
+
+    def fastqDumpedFilePath(self, outputDir, readName, readDirection = None):
+        """
+        get output file path for fastq-dump command.
+        """
+
+        path = "%s/%s" % (outputDir, readName)
+        if readDirection:
+            path += "_%d" % readDirection 
+
+        return path + ".fastq"
+
+
+    def autoWorkDir(self):
+        """
+        get working directory path determined automatically.
+        """
+
+        now = datetime.now()
+
+        s = ",".join(self.readFilePaths) + self.refGenome
+        for key, value in self.opts.items():
+            s += ("%s:%s" % (key, value))
+        h = hashlib.md5(s).hexdigest()
+
+        return "%s-%06d-%s" % (now.strftime("%Y%m%d-%H%M%S"), now.microsecond, h[0:16])
+        
+
+class BsfCallCluster(BsfCall):
+    """
+    class to execute bsf-call process on pc cluster.
+    """
+
+    def __init__(self, refGenome, readFilePaths, cmdOpts):
+        """
+        constructor of BsfCallCluster
+        """
+
+        BsfCall.__init__(self, refGenome, readFilePaths, cmdOpts)
+
+        self.lastExecutor = LastExecutorCluster(self.refGenome, self.opts)
+        self.mappingJobIds = []
+
+
+    def processMapping(self):
+        """
+        when bsf-call runs on a cluster, a mapping and filtering job is
+        submitted each time a read file is split, so this function only
+        waits for all submitted jobs to finish.
+        """
+
+        logging.info("Waiting for all jobs to finish.")
+
+        self.waitForSubmitJobs(self.mappingJobIds)
+
+        logging.info("Mapping and filtering process done.")
+
+        return self.lastExecutor.resultDirs
+
+
+    def processMcDetection(self, resultDirs, localDir = None):
+        """
+        submit one mC detection job per chromosome to the cluster.
+        after all jobs have finished, output the mC detection result.
+        """
+
+        logging.info("mC detection process start.")
+
+        chrs = self.chrsByRefGenome(self.refGenome)
+        job_ids = []
+        for chr_no in chrs:
+            job_id = self.submitMcDetectionJob(resultDirs, chr_no)
+            job_ids.append(job_id)
+            sleep(1)
+        logging.info("Submitted jobs: %s" % ",".join(job_ids))
+
+        self.waitForSubmitJobs(job_ids)
+
+        mc_detector = McDetector(self.refGenome, resultDirs, self.mcContextDir, self.opts)        
+
+        mc_detector.chrs = chrs
+        mc_detector.output(self.opts["output"])
+
+        logging.info("mC detection process done.")
+
+
+    def submitMcDetectionJob(self, resultDirs, chrNo):
+        """
+        submit mC detection process job to the cluster.
+        """
+
+        argv = self.qsubRemoteCommandArgv(resultDirs, chrNo)
+        cmd = self.qsubCommand(chrNo, " ".join(map((lambda s: '"' + s + '"'), argv)))
+
+        qsub = os.popen(cmd)
+        out = qsub.read()
+        qsub.close()
+
+        job_id = self.jobIdByQsubOutput(out)
+        self.logJobSubmit("mC detection job: %s: Mapping result directories: %s" % (chrNo, resultDirs), job_id)
+
+        return job_id
+
+
+    def qsubRemoteCommandArgv(self, resultDirs, chrNo):
+        """
+        get arguments for mC detection program (mc-detector.py).
+        """
+
+        argv = []
+
+        argv.append(self.refGenome)
+        argv.append(",".join(resultDirs))
+        argv.append(self.mcContextDir)
+        argv.append(chrNo)
+        argv.append(str(self.opts["only_mcdetection"]))
+        argv.append(str(self.opts["lower_bound"]))
+        argv.append(str(self.opts["coverage"]))
+        argv.append(str(self.opts["aln_mismap_prob_thres"]))
+
+        # if self.opts["read_bam"]:
+        #     argv.append("BAM")
+        # elif self.opts["read_sam"]:
+        #     argv.append("SAM")
+        # else:
+        #     argv.append("MAF")
+        argv.append("MAF")
+            
+        if self.opts["local_dir"]:
+            argv.append(self.opts["local_dir"])
+
+        return argv
+
+
+    def qsubCommand(self, chrNo, cmdArgs):
+        """
+        get qsub command to submit mC detection job.
+        """
+
+        remote_cmd = os.path.join(self.scriptDir(), "mc-detector.py")
+        out_file = os.path.join(self.mcContextDir, "mc-detector-%s.out" % chrNo)
+        err_file = os.path.join(self.mcContextDir, "mc-detector-%s.err" % chrNo)
+
+        if self.opts["queue_list"]:
+            cmd = "qsub -o %s -e %s -q %s -pe openmpi %d -cwd -b y %s %s" % (out_file, err_file, self.opts["queue_list"], self.opts['num_threads'], remote_cmd, cmdArgs)
+        else:
+            cmd = "qsub -o %s -e %s -pe openmpi %d -cwd -b y %s %s" % (out_file, err_file, self.opts['num_threads'], remote_cmd, cmdArgs)
+
+        return cmd
+
+
+    def afterProcessSplitRead(self, readFile, readAttr = None):
+        """
+        called after one split read file has been written.
+        when bsf-call runs on a pc cluster, the mapping and filtering job
+        for that file is submitted at this point.
+        """
+
+        if self.isPairedEnd(readAttr):
+            fpath, ext = os.path.splitext(readFile)
+            if fpath[-1:] == "2":
+                read_file = "%s1.%s" % (fpath[0:-1], readAttr[1]["type"])
+                job_id = self.lastExecutor.executeOneRead(readFile, readAttr)
+                self.mappingJobIds.append(job_id)
+        else:
+            job_id = self.lastExecutor.executeOneRead(readFile, readAttr)
+            self.mappingJobIds.append(job_id)
+
+
+class LastExecutor(BsfCallBase):
+    """
+    class that runs the LAST programs for read mapping and filtering.
+    """
+
+    def __init__(self, refGenome, baseDir = ".", readsDir = None, resultsDir = None, numThreads = 1):
+        """
+        constructor of LastExecutor
+        """
+
+        self.refGenome = refGenome
+        self.baseDir = baseDir
+        self.readsDir = readsDir
+        self.resultsDir = resultsDir
+        self.queue = Queue.Queue()
+        self.lock = threading.Lock()
+        self.numThreads = numThreads
+
+
+    def execute(self, readAttr, numThreads = 1, lastalOpts = "", mergeOpts = "", filterOpts = ""):
+        """
+        enqueue all split read files to the queue.
+        create and start threads to execute read mapping and filtering process.
+        wait for all threads to finish.
+        """
+
+        self.enqueue(readAttr)
+
+        if self.queue.qsize()==0:
+            logging.fatal("LastExecutor::execute: Error: queue size=%d" % self.queue.qsize())
+            sys.exit(1)
+
+        logging.info("Queued %d files." % self.queue.qsize())
+
+        threads = []
+        for i in range(numThreads):
+            t = threading.Thread(target=self.worker, args=(readAttr, lastalOpts, mergeOpts, filterOpts))
+            t.daemon = True
+            threads.append(t)
+            t.start()
+
+        i = 1
+        for thread in threads:
+            logging.info("Waiting for %d th thread." % i)
+            thread.join()
+            logging.info("Joined %d th thread." % i)
+            i+=1
+
+
+    def worker(self, readAttr, lastalOpts, mergeOpts, filterOpts):
+        """
+        thread worker that executes LAST.
+        dequeues read file paths from the queue and runs the read mapping
+        and filtering process for each of them.
+        """
+        if self.queue.empty():
+            logging.info("LastExecutor::worker: Queue is empty.")
+        while True:
+            try:
+                fpath = self.queue.get_nowait()
+            except Queue.Empty:
+                break
+            self.runLast(fpath, readAttr, lastalOpts, mergeOpts, filterOpts)
+        return
+
+        
+    def runLast(self, readFile, readAttr, lastalOpts, mergeOpts, filterOpts, rmInFiles = True):
+        """
+        execute the LAST programs for read mapping and filtering.
+        """
+
+        cmd = self.batchCmd(readFile, readAttr, lastalOpts, mergeOpts, filterOpts, rmInFiles)
+        logging.info("LastExecutor::runLast: command=%s" % cmd)
+        p = subprocess.Popen(cmd, shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
+        out, error_msg = p.communicate()
+        if len(error_msg) > 0:
+            logging.fatal(error_msg)
+
+
+    def enqueue(self, readAttr):
+        """
+        enqueue all split read files to the queue.
+        """
+
+        # logging.info("%s/*_1.%s.bz2" % (self.readsDir, readAttr[1]["type"]))
+        # for read_file in glob.glob("%s/*_1.%s" % (self.readsDir, readAttr[1]["type"])):
+        #     self.queue.put(read_file)
+        self.queue.put(readAttr[1]["path"])
+
+
+    def lastdb(self, directions, parallel = False):
+        """
+        execute lastdb command to create index file of reference genome.
+        """
+
+        cmds = []
+        for direction in directions:
+            cmds.append(self.lastdbCmd(direction))
+
+        if parallel:
+            processes = []
+            for cmd in cmds:
+                logging.info(cmd)
+                p = subprocess.Popen(cmd, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
+                out = p.stdout
+                processes.append({"p": p, "out": out})
+            for process in processes:
+                process["p"].wait()
+                out_data = process["out"].read()
+                if len(out_data) > 0:
+                    logging.info(out_data)
+        else:
+            for cmd in cmds:
+                logging.info(cmd)
+                p = subprocess.Popen(cmd, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
+                out = p.stdout
+                p.wait()
+                out_data = out.read()
+                if len(out_data) > 0:
+                    logging.info(out_data)
+
+
+    def lastdbCmd(self, direction):
+        """
+        get lastdb command to create index file of reference genome.
+        """
+        # return "lastdb -w2 -u bisulfite_%s.seed %s.%s %s" % (direction, self.refGenome, direction, self.refGenome)
+        return "lastdb -uBIS%s %s.%s %s" % (direction.upper(), self.refGenome, direction, self.refGenome)
+
+
+    def lastalBsCmd(self, readFile, opts = ""):
+        """
+        get the lastal command to map single-end reads.
+        """
+        # s_opt = self.lastalSopt(direction)
+        read_name = self.readNameByReadFile(readFile)
+        return "TMPDIR=%s last-bisulfite.sh %s.%s %s.%s %s > %s" % (self.readsDir, self.refGenome, 'f', self.refGenome, 'r', readFile, self.mappingResultFilePath(read_name))
+
+    def lastalBsPairCmd(self, readFile1, readFile2, opts = ""):
+        """
+        get the lastal command to map paired-end reads.
+        """
+        # s_opt = self.lastalSopt(direction)
+        read_name = self.readNameByReadFile(readFile1)
+        return "PARALLEL=-j%d TMPDIR=%s last-bisulfite-paired.sh %s.%s %s.%s %s %s > %s" % (self.numThreads, self.readsDir, self.refGenome, 'f', self.refGenome, 'r', readFile1, readFile2, self.mappingResultFilePath(read_name))
+
+
+    # def mergeCmd(self, forwardFile, reverseFile, outputFile, opts = "", rmInFiles = True):
+    def mergeCmd(self, inputFile, outputFile, opts = "", rmInFiles = True):
+        """
+        get command to merge lastal output.
+        """
+        cmd = "last-merge-batches %s > %s" % (inputFile, outputFile)
+        if rmInFiles:
+            cmd += "; rm %s" % inputFile
+        return cmd
+
+
+    # def mappingAndMergeCmd(self, readFile, lastalOpts = "", mergeOpts = "", rmInFiles = True):
+    def mappingPairCmd(self, readFile1, readFile2, lastalOpts = "", mergeOpts = "", rmInFiles = True):
+        """
+        get read mapping and filtering command.
+        """
+        read_name = self.readNameByReadFile(readFile1)
+        n, ext = os.path.splitext(readFile1)
+        if ext == ".gz":
+            n, ext = os.path.splitext(n)
+        lastal_qopt = self.lastalQopt(ext[1:])
+        lastal_opt = "%s %s" % (lastalOpts, lastal_qopt)
+        mapping_file = self.mappingResultFilePath(read_name)
+
+        return self.lastalBsPairCmd(readFile1, readFile2, lastal_opt)
+
+
+    # def mappingResultFilePath(self, readName, direction):
+    def mappingResultFilePath(self, readName):
+        """
+        get read mapping result file path.
+        """
+
+        return "%s/%s.maf" % (self.resultsDir, readName)
+
+
+    def mergeResultFilePath(self, readName):
+        """
+        get merge result file path.
+        """
+
+        return "%s/%s.merge.maf" % (self.resultsDir, readName)
+
+
+    def filterResultFilePath(self, readName):
+        """
+        get filtering result file path.
+        """
+
+        return "%s/%s.maf" % (self.resultsDir, readName)
+
+
+    def lastalSopt(self, direction):
+        """
+        get -s option for lastal.
+        """
+
+        opt = ""
+        if direction == "f":
+            opt = "-s1"
+        elif direction == "r":
+            opt = "-s0"
+
+        return opt
+
+
+    def lastalQopt(self, fileType):
+        """
+        get -Q option for lastal.
+        """
+
+        opt = ""
+        if fileType == "fasta":
+            opt = "-Q0"
+        elif fileType == "fastq":
+            opt = "-Q1"
+
+        return opt
+
+
+class LastExecutorCluster(LastExecutor):
+    """
+    class to run LAST programs on pc cluster.
+    """
+
+    def __init__(self, refGenome, bsfCallOpts):
+        """
+        constructor of LastExecutorCluster
+        """
+
+        self.refGenome = refGenome
+        self.opts = bsfCallOpts
+
+        self.resultDirs = []
+
+
+    def executeOneRead(self, readFile, readAttr):
+        """
+        execute read mapping and filtering process with specified read.
+        on pc cluster, submit mapping and filtering job and return job id.
+        """
+
+        if readAttr["results_dir"] not in self.resultDirs:
+            self.resultDirs.append(readAttr["results_dir"])
+
+        lastal_opts = self.lastalOpts(self.opts["last_opts"])
+        merge_opts = self.mergeOpts()
+        filter_opts = self.filterOpts(self.opts["aln_mismap_prob_thres"], self.opts["aln_score_thres"], self.isPairedEnd(readAttr))
+        job_id = self.submitJob(readFile, readAttr, lastal_opts, merge_opts, filter_opts, self.opts["queue_list"])
+
+        return job_id
+
+
+    def submitJob(self, readFile, readAttr, lastalOpts = "", mergeOpts = "", filterOpts = "", queueName = None):
+        """
+        submit read mapping and filtering process job.
+        """
+
+        job_id = None
+
+        read_name = self.readNameByReadFile(readFile)[0:-2]
+        out_file = self.qsubStdoutFilePath(readAttr["base_dir"], read_name)
+        err_file = self.qsubStderrFilePath(readAttr["base_dir"], read_name)
+        remote_cmd = self.remoteCommand(readAttr)
+        remote_cmd_args = " ".join(map((lambda s: '"' + s + '"'), self.remoteCommandArgv(read_name, readAttr, lastalOpts, filterOpts)))
+
+        if queueName:
+            cmd = "qsub -o %s -e %s -q %s -cwd %s %s" % (out_file, err_file, queueName, remote_cmd, remote_cmd_args)
+        else:
+            cmd = "qsub -o %s -e %s -cwd %s %s" % (out_file, err_file, remote_cmd, remote_cmd_args)
+
+        qsub = os.popen(cmd)
+        out = qsub.read()
+        qsub.close()
+
+        job_id = self.jobIdByQsubOutput(out)
+
+        dir_path, file_name, base_name, ext, prog = self.splitFilePath(readFile)
+        self.logJobSubmit("Mapping and filtering: read: %s/%s" % (dir_path, base_name[0:-2]), job_id)
+
+        return job_id
+
+
+    def remoteCommand(self, readAttr):
+        """
+        get the path of the mapping and filtering script to submit via qsub.
+        """
+
+        if self.isPairedEnd(readAttr):
+            return os.path.join(self.scriptDir(), "mapping-p.sh")           
+        else:
+            return os.path.join(self.scriptDir(), "mapping-s.sh")
+
+
+    def remoteCommandArgv(self, readName, readAttr, lastalOpts, filterOpts):
+        """
+        get read mapping and filtering command arguments.
+        """
+
+        argv = []
+
+        argv.append(readAttr["reads_dir"])
+        argv.append(readAttr["results_dir"])
+        argv.append(self.refGenome)
+        argv.append(filterOpts)
+
+        argv.append(readName)
+        argv.append(readAttr[1]["type"])
+        argv.append("%s %s" % (lastalOpts, self.lastalQopt(readAttr[1]["type"])))
+
+        if self.isPairedEnd(readAttr):
+            argv.append(readName)
+            argv.append(readAttr[2]["type"])
+            argv.append("%s %s" % (lastalOpts, self.lastalQopt(readAttr[2]["type"])))
+
+        return argv
+
+
+    def qsubStdoutFilePath(self, dirPath, readName):
+        """
+        get qsub command standard output file path.
+        """
+
+        return "%s/mapping_%s.out" % (dirPath, readName)
+
+
+    def qsubStderrFilePath(self, dirPath, readName):
+        """
+        get qsub command standard error file path.
+        """
+
+        return "%s/mapping_%s.err" % (dirPath, readName)
+
+
+class LastExecutorSingle(LastExecutor):
+    """
+    class that runs the LAST programs for single-end read mapping and filtering.
+    """
+
+    def __init__(self, refGenome, baseDir, readsDir, resultsDir):
+        """
+        constructor of LastExecutorSingle
+        """
+
+        LastExecutor.__init__(self, refGenome, baseDir, readsDir, resultsDir)
+
+
+    def batchCmd(self, readFile1, readAttr, lastalOpts = "", mergeOpts = "", filterOpts = "", rmInFiles = True):
+        """
+        get the batch command for read mapping and filtering.
+        """
+        read_name = self.readNameByReadFile(readFile1)
+        return "TMPDIR=%s last-bisulfite.sh %s.f %s.r %s > %s/%s.maf" % (self.readsDir, self.refGenome, self.refGenome, readFile1, self.resultsDir, read_name)
+        # cmds = []
+        # cmds.append(self.mappingAndMergeCmd(readFile, lastalOpts, mergeOpts, rmInFiles))
+        # cmds.append(self.mappingPairCmd(readFile1, readFile2, lastalOpts, mergeOpts, rmInFiles))
+        # cmds.append(self.filterCmd(self.mergeResultFilePath(read_name), out_file, filterOpts, rmInFiles))
+        # cmds.append(self.filterCmd(self.mappingResultFilePath(read_name), out_file, filterOpts, rmInFiles))
+        # cmds.append("bzip2 %s" % out_file)
+        # return "; ".join(cmds)
+
+
+    def filterCmd(self, inputFile, outputFile, opts = "", rmInFile = True):
+        """
+        get filter command.
+        """
+
+        cmd = "last-map-probs %s %s > %s" % (opts, inputFile, outputFile)
+        if rmInFile:
+            cmd += "; rm %s" % inputFile
+
+        return cmd
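+    # Example (illustrative): filterCmd("in.maf", "out.maf", "-s150 -m1e-09")
+    # returns "last-map-probs -s150 -m1e-09 in.maf > out.maf; rm in.maf".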
+
+
+class LastExecutorPairedEnd(LastExecutor):
+    """
+    class that runs the LAST programs for paired-end read mapping and filtering.
+    """
+
+    def __init__(self, refGenome, baseDir, readsDir, resultsDir, numThreads):
+        """
+        constructor of LastExecutorPairedEnd
+        """
+        LastExecutor.__init__(self, refGenome, baseDir, readsDir, resultsDir, numThreads)
+
+
+    def batchCmd(self, readFile1, readAttr, lastalOpts = "", mergeOpts = "", filterOpts = "", rmInFiles = True):
+        """
+        get the batch command for read mapping and filtering.
+        """
+        # readFile2 = self.secondReadFilePathByFirstReadFilePath(readFile1, readAttr[2]["type"])
+        # read_name1 = self.readNameByReadFile(readFile1)
+        # read_name2 = self.readNameByReadFile(read_files[1])
+        # merge_result_file = "%s %s" % (self.mergeResultFilePath(read_name1), self.mergeResultFilePath(read_name2))
+        # mapping_result_file = "%s %s" % (self.mappingResultFilePath(read_name1), self.mappingResultFilePath(read_name2))
+        # out_file = self.filterResultFilePath(read_name1[0:-2])
+        # return "TMPDIR=%s last-bisulfite-paired.sh %s.f %s.r %s %s > %s" % (self.baseDir, self.refGenome, self.refGenome, readFile1, readFile2, out_file)
+        read_name1 = self.readNameByReadFile(readAttr[1]["path"])
+        read_name2 = self.readNameByReadFile(readAttr[2]["path"])
+        return "PARALLEL=-j%d TMPDIR=%s last-bisulfite-paired.sh %s.f %s.r %s %s > %s/%s,%s.maf" % (self.numThreads, self.readsDir, self.refGenome, self.refGenome, readAttr[1]["path"], readAttr[2]["path"], self.resultsDir, read_name1, read_name2)
+
+
+    def filterCmd(self, inputFile, outputFile, opts = "", rmInFile = True):
+        """
+        get filter command.
+        """
+
+        cmd = "last-pair-probs %s %s > %s" % (opts, inputFile, outputFile)
+        if rmInFile:
+            cmd += "; rm %s" % inputFile
+
+        return cmd
+
+
+class McDetector(BsfCallBase):
+    """
+    class to execute mC detection process.
+    """
+
+    def __init__(self, refGenome, resultDirs, mcContextDir, options):
+        """
+        constructor of McDetector
+        """
+
+        self.refGenome = refGenome
+        self.refGenomeBuf = {}
+        self.refGenomeChr = []
+
+        self.mappingResultDirs = resultDirs
+        self.mcContextDir = mcContextDir
+        self.lowerBound = options["lower_bound"]
+        self.coverageThreshold = options["coverage"]
+        self.onlyMcDetection = options["only_mcdetection"]
+
+        self.opts = options
+
+        self.mappingResultFiles = []
+
+        if self.onlyMcDetection:
+            self.mismapThreshold = options["aln_mismap_prob_thres"]
+            # self.readBam = options["read_bam"]
+            # self.readSam = options["read_sam"]
+            self.readBam = False
+            self.readSam = False
+            if "mapping_result_files" in options:
+                self.mappingResultFiles = options["mapping_result_files"]
+            else:
+                self.mappingResultFiles = self.getAllMappingResultFiles(resultDirs)
+        else:
+            self.mismapThreshold = 1e-10
+            self.readBam = False
+            self.readSam = False
+            self.mappingResultFiles = self.getAllMappingResultFiles(resultDirs)
+
+        if len(self.mappingResultFiles)==0:
+            logging.fatal("McDetector::__init__: error: no mapping result file found.")
+            sys.exit(1)
+        
+        if options["local_dir"]:
+            self.localDir = options["local_dir"]
+        else:
+            self.localDir = mcContextDir
+
+        self.readRefGenome(self.refGenome, self.refGenomeBuf, self.refGenomeChr)
+
+        logging.debug("McDetector::__init__: mappingResultDirs=%s" % ','.join(self.mappingResultDirs))
+
+
+    def execute(self, outputFile, numWorkers = 1):
+        """
+        execute mC detection process and output result.
+        """
+
+        self.process()
+        self.output(outputFile)
+
+
+    def process(self):
+        """
+        execute mC detection process.
+        """
+
+        logging.info("mC detection process start")
+
+        if len(self.mappingResultFiles)==0:
+            logging.fatal("McDetector::process: error: no mapping result found.")
+            sys.exit(1)
+
+        for mapping_result_file in self.mappingResultFiles:
+            logging.info("Parsing mapping result file: %s" % mapping_result_file)
+            if self.onlyMcDetection:
+                if not (self.readBam or self.readSam):
+                    if self.isGzipFile(mapping_result_file) or self.isMafFile(mapping_result_file):
+                        self.processMafFile(mapping_result_file)
+                else:
+                    # BAM or SAM
+                    if self.readBam and self.isBamFile(mapping_result_file):
+                        self.processSamFile(mapping_result_file)
+                    elif self.readSam and self.isSamFile(mapping_result_file):
+                        self.processSamFile(mapping_result_file)
+            else:
+                self.processMafFile(mapping_result_file)
+
+        logging.info("Parsing mapping result file done")
+
+        if self.mcContextDir != self.localDir:
+            # copy the locally written mC context files back to the work directory
+            for bsf_file in glob.glob("%s/*.bsf" % self.localDir):
+                copy(bsf_file, self.mcContextDir)
+
+
+    def output(self, outputFile):
+        """
+        output mC detection result.
+        """
+
+        logging.info("McDetector::output: outputFile=%s" % outputFile)
+        popen_args = ['sort', '-k1']
+        bsf_files = glob.glob("%s/*.bsf" % self.localDir)
+        if len(bsf_files)==0:
+            logging.fatal("McDetector::output: no *.bsf files found in %s" % self.localDir)
+            sys.exit(1)
+        for bsf_file in bsf_files:
+            if os.path.getsize(bsf_file) > 0:
+                popen_args.append(bsf_file)
+                logging.info("McDetector::output: added bsf_file=\"%s\"" % bsf_file)
+        logging.debug("McDetector::output: popen_args=%s" % ' '.join(popen_args))
+        fout = open(outputFile, 'w')
+        if len(popen_args) > 2:
+            pipe = subprocess.Popen(popen_args, stdout=subprocess.PIPE)
+            block = []
+            for line in pipe.stdout:
+                (chr, ctx_pos, strand, mc_ctx, base, conf) = line.strip().split("\t")
+                ctx_pos = int(ctx_pos)
+                conf = float(conf)
+                if len(block)==0 or block[-1][0]==chr:
+                    block.append([chr, ctx_pos, strand, mc_ctx, base, conf])
+                else:
+                    self.outputOneChrBlock(fout, block)
+                    del block[:]
+                    block.append([chr, ctx_pos, strand, mc_ctx, base, conf])
+            if len(block) > 0:
+                self.outputOneChrBlock(fout, block)
+            pipe.stdout.close()
+        else:
+            logging.info("McDetector::output: no result files found.")
+        fout.close()
+
+
+    def outputOneChrBlock(self, fout, block):
+        """
+        output mC detection result for one chromosome.
+        """
+
+        chr = block[0][0]
+        logging.info("McDetector::outputOneChrBlock: chr=%s" % chr)
+        mc_contexts = {}
+        for b in sorted(block, key=lambda rec: rec[1]):
+            try:
+                ctx_pos, strand, mc_ctx, base, conf = b[1:]
+                if not ctx_pos in mc_contexts:
+                    mc_contexts[ctx_pos] = {}
+                if not strand in mc_contexts[ctx_pos]:
+                    mc_contexts[ctx_pos][strand] = {}
+                if not mc_ctx in mc_contexts[ctx_pos][strand]:
+                    mc_contexts[ctx_pos][strand][mc_ctx] = []
+                mc_contexts[ctx_pos][strand][mc_ctx].append([base, conf])
+            except ValueError, e:
+                logging.warning("McDetector::outputOneChrBlock: value error: %s -> %s" % (b, e.args[0]))
+
+        num_entry = 0
+        if len(mc_contexts.keys()) > 0:
+            for pos in sorted(mc_contexts.keys()):
+                for strand in mc_contexts[pos].keys():
+                    for mc_ctx in mc_contexts[pos][strand].keys():
+                        coverage, mc_ratio = self.calcCoverage(mc_contexts[pos][strand][mc_ctx], strand)
+                        if coverage >= self.coverageThreshold and mc_ratio >= self.lowerBound:
+                            fout.write("%s\t%d\t%s\t%s\t%g\t%d\n" % (chr, pos, strand, mc_ctx, mc_ratio, coverage))
+                            num_entry += 1
+                        else:
+                            logging.info("McDetect::outputOneChrBlock: rejected: chr=%s pos=%d strand=%s coverage=%g mc_ratio=%g" % (chr, pos, strand, coverage, mc_ratio))
+            # self.bzip2File(fpath, False)
+        logging.info("McDetector::outputOneChrBlock: number of entries %s" % num_entry)
+
+
+    def processMafFile(self, resultFile):
+        """
+        read mapping result file, and output mC context data file.
+        """
+
+        file_name = self.splitFilePath(resultFile)[1]
+        outputFile = "%s/%s.bsf" % (self.localDir, file_name)
+        try:
+            fin = open(resultFile, 'r')
+            fout = open(outputFile, 'w')
+            block = []
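+            # a LAST maf block for fastq input has 4 lines:
+            # "a" (scores), "s" (reference), "s" (read), "q" (qualities)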
+            for line in fin:
+                if line[0] == '#' or line[0] == 'p' or line[0] == "\n":
+                    continue
+                if line[0] == 'a' or line[0] == 's' or line[0] == 'q':
+                    block.append(line.strip())
+                if len(block)==4:
+                    if block[0][0]=='a' and block[1][0]=='s' and block[2][0]=='s' and block[3][0]=='q':
+                        mismap_prob = float(block[0].split('=')[2])
+                        if mismap_prob <= self.mismapThreshold:
+                            b1 = block[1].split()
+                            chr = b1[1]
+                            ref_seq = b1[-1].upper()
+                            ref_start = int(b1[2]) # 0-based position
+                            read_seq = block[2].split()[-1]
+                            read_len = len(read_seq)
+                            read_qual = block[3].split()[-1]
+                            strand = self.findStrandFromAlignment(chr, ref_start, read_seq, read_len)
+                            # strand = block[2].split()[4]
+                            if strand == '+' or strand == '-':
+                                lines = self.extractMcContextsByOneRead(chr, strand, mismap_prob, ref_seq, ref_start, read_seq, read_qual, read_len)
+                                for line in lines:
+                                    fout.write(line)
+                                logging.debug("processMafFile: a maf block(%s) is successfully captured." % strand)
+                            elif strand == '+-':
+                                for st in ('+', '-'):
+                                    lines = self.extractMcContextsByOneRead(chr, st, mismap_prob, ref_seq, ref_start, read_seq, read_qual, read_len)
+                                    for line in lines:
+                                        fout.write(line)
+                                logging.debug("processMafFile: a maf block(%s) is successfully captured." % strand)
+                            else:
+                                logging.debug("processMafFile: a maf block does not show strand-specific info.")
+                        else:
+                            logging.info("processMafFile: alignment \"%s\" has greater mismap prob. than the threshold." % block[0])
+                        del block[:]
+                    else:
+                        logging.fatal("processMafFile: error: unexpected malformed maf block is found.")
+                        logging.fatal("block 1: \"%s\"\n" % block[0])
+                        logging.fatal("block 2: \"%s\"\n" % block[1])
+                        logging.fatal("block 3: \"%s\"\n" % block[2])
+                        logging.fatal("block 4: \"%s\"\n" % block[3])
+                        sys.exit(1)
+            fin.close()
+            fout.close()
+
+            if len(block) > 0:
+                logging.fatal("McDetect::processMafFile: error: possible malformed MAF file.")
+                for b in block:
+                    logging.fatal(b)
+                sys.exit(1)
+        except IOError:
+            logging.fatal("McDetect::processMafFile: error: unable to read a MAF file \"%s\"" % resultFile)
+            sys.exit(1)
+        return
+
+
+    def processSamFile(self, samFile):
+        """
+        read mapping BAM/SAM file, and output mC context data file.
+        """
+
+        logging.info("Process BAM/SAM file start: %s" % samFile)
+
+        samfile = None
+        try:
+            if self.readBam:
+                samfile = pysam.Samfile(samFile, "rb")
+            else:
+                samfile = pysam.Samfile(samFile, "r")
+
+            counter = 1
+            for aln in samfile.fetch(until_eof = True):
+                samaln = SamAlnParser(samfile, aln)
+                samaln.setRefGenome(self.targetChr, self.targetSeqD, self.targetSeqLen)
+                samaln.parse()
+
+                chr = samaln.referenceName()
+
+                if (chr is None) or (chr != self.targetChr):
+                    continue
+
+                if aln.mapq:
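+                    # SAM MAPQ is -10*log10(P(mapping position is wrong)),
+                    # i.e. an encoded mismap probability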
+                    mismap = self.bamMapq2Mismap(aln.mapq)
+                    if mismap - self.mismapThreshold > 0:
+                        continue
+
+                read_seq = samaln.alnReadSeq.replace("t", "C")
+                read_qual = aln.qual
+                # the buffered variant below matches this argument list and
+                # stores its results via self.mcDetectData / outputMcContextData
+                self.__extractMcContextsByOneRead(chr, samaln.strand, 0, samaln.alnRefSeq.upper(), samaln.alnRefStart, self.targetSeqLen, read_seq, read_qual)
+
+                counter += 1
+                if counter > 500000:
+                    self.outputMcContextData()
+                    counter = 1
+
+            samfile.close()
+            self.outputMcContextData()
+        except Exception, e:
+            logging.fatal(samFile)
+            logging.fatal("  %s" % str(type(e)))
+            logging.fatal("  %s" % str(e))
+            if samfile:
+                samfile.close()
+
+    def extractMcContextsByOneRead(self, chr, strand, mismapProb, refSeq, refStart, readSeq, readQual, readLen):
+        """
+        extract mC contexts from one read.
+        """
+
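+        # each emitted line: chr, 0-based position, strand, mC context type,
+        # observed read base, and confidence = (1 - mismap) * (1 - base error)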
+        lines = []
+        if strand == '+':
+            real_pos = refStart
+            for i in range(readLen):
+                if readSeq[i] == '-':
+                    continue
+                # if refSeq[i] == 'C' and (readSeq[i] == 'C' or readSeq[i] == 'T'):
+                if refSeq[i] == 'C':
+                    baseConf = self.qualityCharToErrorProb(readQual[i])
+                    ctx_type = self.mcContextType(self.refGenomeBuf[chr], real_pos, strand)
+                    line = "%s\t%d\t%s\t%s\t%s\t%g\n" % (chr, real_pos, strand, ctx_type, readSeq[i], (1-mismapProb) * (1-baseConf))
+                    # logging.debug("extractMcContextsByOneRead: line=%s" % line)
+                    lines.append(line)
+                real_pos += 1
+        if strand == '-':
+            real_pos = refStart
+            for i in range(readLen):
+                if readSeq[i] == '-':
+                    continue
+                # if refSeq[i] == 'G' and (readSeq[i] == 'G' or readSeq[i] == 'A'):
+                if refSeq[i] == 'G':
+                    baseConf = self.qualityCharToErrorProb(readQual[i])
+                    ctx_type = self.mcContextType(self.refGenomeBuf[chr], real_pos, strand)
+                    # store the raw read base; calcCoverage counts G/A on the minus strand
+                    line = "%s\t%d\t%s\t%s\t%s\t%g\n" % (chr, real_pos, strand, ctx_type, readSeq[i], (1-mismapProb) * (1-baseConf))
+                    # logging.debug("extractMcContextsByOneRead: line=%s" % line)
+                    lines.append(line)
+                real_pos += 1
+        return lines
+
+
+    def complementaryBase(self, base):
+        """
+        compute a complementary base.
+        """
+
+        if base == 'A': return 'T'
+        if base == 'C': return 'G'
+        if base == 'G': return 'C'
+        if base == 'T': return 'A'
+        if base == 'N': return 'N'
+        if base == 'a': return 't'
+        if base == 'c': return 'g'
+        if base == 'g': return 'c'
+        if base == 't': return 'a'
+        if base == 'n': return 'n'
+        logging.fatal("McDetector::complementaryBase: unexpected base \"%s\"" % base)
+        sys.exit(1)
+
+
+    def findStrandFromAlignment(self, chr, ref_start, read_seq, read_len):
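+        """
+        infer the bisulfite strand of a read: C->T mismatches support '+',
+        G->A mismatches support '-'.  Note this maps alignment column i to
+        reference position ref_start + i, i.e. it assumes a gap-free
+        reference row.
+        """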
+        plus_sup = 0
+        minus_sup = 0
+        other_mismatch = 0
+        base_size = 0
+        ref_seq = self.refGenomeBuf[chr]
+        for i in range(read_len):
+            j = ref_start + i
+            if ref_seq[j]=='C' and read_seq[i]=='T':
+                plus_sup += 1
+                base_size += 1
+            elif ref_seq[j]=='G' and read_seq[i]=='A':
+                minus_sup += 1
+                base_size += 1
+            elif (ref_seq[j]!='-' and read_seq[i]!='-') and ref_seq[j]!=read_seq[i]:
+                other_mismatch += 1
+                base_size += 1
+        if base_size==0:
+            return '.'
+        mismatch_rate = float(other_mismatch)/float(base_size)
+        # if plus_sup > minus_sup and plus_sup > other_mismatch:
+        # if plus_sup > minus_sup and mismatch_rate < 0.05:
+        if plus_sup > minus_sup:
+            return '+'
+        # if minus_sup > plus_sup and minus_sup > other_mismatch:
+        # if minus_sup > plus_sup and mismatch_rate < 0.05:
+        if minus_sup > plus_sup:
+            return '-'
+        if plus_sup == minus_sup:
+            return '+-'
+        return '.'
+
+
+    def __extractMcContextsByOneRead(self, chr, strand, mismapProb, refSeq, refStart, refSeqLen, readSeq, readQual):
+        """
+        extract mC contexts from one read.
+        """
+
+        logging.debug("extractMcContextsByOneRead(%s, %s, %g, %s, %d, %d, %s, %s)" % (chr, strand, mismapProb, refSeq, refStart, refSeqLen, readSeq, readQual))
+
+        nogap_refseq = self.clearGap(refSeq)
+        bases = list(refSeq)
+        last_pos = len(nogap_refseq) - 1
+        pos = -1
+        while True:
+            try:
+                pos = bases.index("C", pos + 1)
+                num_gaps = refSeq.count("-", 0, pos)
+                real_pos = pos - num_gaps
+                ctx_type = self.mcContextType(nogap_refseq, real_pos)
+                ctx_pos = refStart + real_pos
+                if ctx_type is None:
+                    if strand == "+":
+                        ctx_type = self.mcContextType(self.targetSeqD, ctx_pos)
+                    elif strand == "-":
+                        ctx_type = self.mcContextType(self.targetSeqC, ctx_pos)
+                if ctx_type is None:
+                    continue
+                if strand == "-":
+                    ctx_pos = refSeqLen - ctx_pos - 1
+                line = "%d\t%s\t%s\t%s\n" % (ctx_pos, strand, ctx_type, readSeq[pos])
+                base_name = self.mcContextFileBaseName(ctx_pos)
+                if base_name not in self.mcDetectData:
+                    self.mcDetectData[base_name] = []
+                self.mcDetectData[base_name].append(line)
+            except IndexError:
+                logging.debug("extractMcContextsByOneRead#IndexError: %s %d %s %s %s %d" % (chr, ctx_pos, strand, ctx_type, readSeq, pos))
+            except ValueError:
+                break
+
+    def qualityCharToErrorProb(self, qualityChar):
+        """
+        convert FASTQ (Sanger) quality character to error probability.
+        """
+        return 10**((ord('!')-ord(qualityChar))*0.1)
+
+    def outputMcContextData(self):
+        """
+        output mC context data to file.
+        """
+
+        logging.info("Output mC detection data start.")
+
+        for base_name in self.mcDetectData.keys():
+            fpath = self.mcContextFilePathByName(base_name)
+            logging.debug("%s: %d" % (fpath, len(self.mcDetectData[base_name])))
+            fo = open(fpath, "a")
+            fo.write("".join(self.mcDetectData[base_name]))
+            fo.close()
+        self.mcDetectData.clear()
+
+        logging.info("Output mC detection data done.")
+
+
+    def mcContextHash(self):
+        h = {}
+        for strand in self.strands():
+            h[strand] = {}
+            for mc_ctx in self.mcContextTypes():
+                h[strand][mc_ctx] = {}
+
+        return h
+
+
+    def isC(self, seq):
+        return seq[0:1].upper() == "C"
+
+
+    def isT(self, seq):
+        return seq[0:1].upper() == "T"
+
+
+    def calcCoverage(self, seqAry, strand):
+        """
+        count the number of C and T (or G and A), calculate mC ratio.
+        """
+
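+        # e.g. [['C',0.9], ['T',0.8], ['C',1.0]] on '+' gives coverage 3 and
+        # mC ratio (0.9+1.0)/(0.9+0.8+1.0) ~= 0.70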
+        num_c = 0.0
+        num_t = 0.0
+        num_all = 0
+        (C, T) = ('C', 'T')
+        if strand == '-':
+            (C, T) = ('G', 'A')
+        for i in seqAry:
+            num_all += 1
+            if i[0] == C:
+                num_c += i[1]
+            elif i[0] == T:
+                num_t += i[1]
+
+        num_ct = num_c + num_t
+        if num_all == 0 or num_ct == 0:
+            return (0, 0)
+        # return (num_all, num_c/num_all)
+        return (num_all, num_c/num_ct)
+
+
+    def mafMismapValue(self, aLine):
+        """
+        get mismap value by maf "a" line.
+        """
+
+        mismap_fields = filter((lambda s: s.startswith('mismap=')), aLine.split())
+        if len(mismap_fields) > 0:
+            return float(mismap_fields[0][7:])
+        else:
+            return None
+
+
+    def mcContextFilePath(self, pos):
+        """
+        get mC context data file path with specified position.
+        """
+
+        return self.mcContextFilePathByName(self.mcContextFileBaseName(pos))
+
+
+    def mcContextFilePathByName(self, name):
+        """
+        get mC context data file path with specified name.
+        """
+
+        return "%s/%s/%s.tsv" % (self.localDir, self.targetChr, name)
+
+
+    def mcContextFileBaseName(self, pos):
+        """
+        get mC context data file name with specified chromosome number.
+        """
+
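+        # e.g. position 1234567 falls in the 12th 100-kb bucket -> "0000000012"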
+        return "%010d" % (int(pos) / 100000)
+
+
+    def mcContextLocalFilePath(self, chrNo):
+        """
+        get local mC context data file path with specified chromosome number.
+        """
+
+        return "%s/%s.tsv" % (self.localDir, chrNo)
+
+
+    def mcContextGlobalFilePath(self, chrNo):
+        """
+        get global mC context data file path with specified chromosome number.
+        """
+
+        return "%s/%s.tsv" % (self.mcContextDir, chrNo)
+
+
+class SamAlnParser(BsfCallBase):
+
+    def __init__(self, samfile, aln):
+        self.samfile = samfile
+        self.aln = aln
+
+        self.refName = None
+        self.refSeq = None
+        self.refSeqLen = None
+
+        self.strand = None
+
+        self.alnRefStart = None
+        self.alnRefSeq = None
+        self.alnRefSeqLen = None
+
+        self.alnReadSeq = None
+
+
+    def setRefGenome(self, name, sequence, length = None):
+        if length is None:
+            length = len(sequence)
+
+        self.refName = name
+        self.refSeq = sequence
+        self.refSeqLen = length
+
+
+    def referenceName(self):
+        if self.aln.tid >= 0:
+            return self.samfile.getrname(self.aln.tid)
+        else:
+            return None
+
+
+    def getStrand(self):
+        if self.aln.is_reverse:
+            return "-"
+        else:
+            return "+"
+
+
+    def parseCigar(self, cigar):
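+        # e.g. "10M2I5M" -> [{"op": "M", "num": 10}, {"op": "I", "num": 2}, {"op": "M", "num": 5}]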
+        return [{"op": v[-1:], "num": int(v[:-1])} for v in re.findall('\d+[A-Z=]', cigar)]
+
+
+    def alignmentSequences(self, refSeqPos, readSeq, cigars):
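+        """
+        reconstruct the gapped reference/read alignment strings implied by
+        the CIGAR operations, starting at refSeqPos on the reference.
+        """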
+        refs = []
+
+        reads = []
+        read_pos = 0
+
+        for cigar in cigars:
+            if cigar["op"] == "M":
+                refs.append(self.refSeq[refSeqPos:refSeqPos+cigar["num"]])
+                refSeqPos += cigar["num"]
+                reads.append(readSeq[read_pos:read_pos+cigar["num"]])
+                read_pos += cigar["num"]
+            elif cigar["op"] == "I":
+                refs.append("-" * cigar["num"])
+                reads.append(readSeq[read_pos:read_pos+cigar["num"]])
+                read_pos += cigar["num"]
+            elif cigar["op"] == "P":
+                refs.append("-" * cigar["num"])
+                reads.append("-" * cigar["num"])
+            elif cigar["op"] == "D" or cigar["op"] == "N":
+                rest_refseq += cigar["num"]
+                reads.append("-" * cigar["num"])
+            elif cigar["op"] == "S":
+                read_pos += cigar["num"]
+
+        return {"reference": "".join(refs), "read": "".join(reads)}
+
+
+    def parse(self):
+        self.strand = self.getStrand()
+
+        self.alnRefStart = self.aln.pos
+        read_seq = self.aln.seq
+        cigars = self.parseCigar(self.aln.cigarstring)
+        alignment = self.alignmentSequences(self.alnRefStart, read_seq, cigars)
+
+        if self.strand == "+":
+            self.alnRefSeq = alignment["reference"]
+            self.alnReadSeq = alignment["read"]
+        elif self.strand == "-":
+            nogap_refseq = self.clearGap(alignment["reference"])
+            self.alnRefStart =  self.complementStartPosition(self.refSeqLen, self.alnRefStart, len(nogap_refseq))
+            self.alnRefSeq = self.complementSeq(alignment["reference"])
+            self.alnReadSeq = self.complementSeq(alignment["read"])
+
Binary file bin/bsfcall.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/fastq-interleave	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,12 @@
+#! /bin/bash
+
+# Reads 2 fastq files, and writes them interleaved.
+# Assumes 1 fastq per 4 lines, i.e. no line wrapping.
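+# Usage: fastq-interleave reads1.fastq reads2.fastq > interleaved.fastq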
+
+paste <(cat "$1" | paste - - - -) <(cat "$2" | paste - - - -) | tr '\t' '\n'
+
+# Is this better?
+#paste <(zcat -f "$1"|paste - - - -) <(zcat -f "$2"|paste - - - -)|tr '\t' '\n'
+
+# This does not interpret "-" as stdin:
+#paste <(paste - - - - < "$1") <(paste - - - - < "$2") | tr '\t' '\n'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/last-bisulfite-paired.sh	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,54 @@
+#! /bin/sh
+
+# Align paired bisulfite-converted DNA reads to a genome.
+
+# This assumes that reads1.fastq are all from the converted strand
+# (i.e. they have C->T conversions) and reads2.fastq are all from the
+# reverse-complement (i.e. they have G->A conversions).
+
+# "GNU parallel" needs to be installed.
+
+[ $# -eq 4 ] || {
+    cat <<EOF
+Typical usage:
+
+  lastdb -uBISF my_f mygenome.fa
+  lastdb -uBISR my_r mygenome.fa
+
+  $(basename $0) my_f my_r reads1.fastq reads2.fastq > results.maf
+
+EOF
+    exit 2
+}
+
+# Try to get the LAST programs into the PATH, if they aren't already:
+PATH=$PATH:$(dirname $0)/../src
+
+tmp=${TMPDIR-/tmp}/$$
+trap 'rm -f $tmp.*' EXIT
+
+cat > $tmp.script << 'EOF'
+t=$1.$$
+
+lastal -pBISF -s1 -Q1 -e120 -i1 "$2" "$4" > $t.t1f
+lastal -pBISR -s0 -Q1 -e120 -i1 "$3" "$4" > $t.t1r
+last-merge-batches $t.t1f $t.t1r > $t.t1
+rm $t.t1f $t.t1r
+
+lastal -pBISF -s0 -Q1 -e120 -i1 "$2" "$5" > $t.t2f
+lastal -pBISR -s1 -Q1 -e120 -i1 "$3" "$5" > $t.t2r
+last-merge-batches $t.t2f $t.t2r > $t.t2
+rm $t.t2f $t.t2r
+
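+# undo the in-silico bisulfite conversion in the read lines (t -> C, a -> G):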
+last-pair-probs -m0.1 $t.t1 $t.t2 |
+perl -F'(\s+)' -ane '$F[12] =~ y/ta/CG/ if /^s/ and $s++ % 2; print @F'
+rm $t.t1 $t.t2
+EOF
+
+# Convert C to t, and all other letters to uppercase:
+perl -pe 'y/Cca-z/ttA-Z/ if $. % 4 == 2' "$3" | split -l400000 -a5 - $tmp.1
+
+# Convert G to a, and all other letters to uppercase:
+perl -pe 'y/Gga-z/aaA-Z/ if $. % 4 == 2' "$4" | split -l400000 -a5 - $tmp.2
+
+parallel --gnu --xapply sh $tmp.script $tmp "$1" "$2" ::: $tmp.1* ::: $tmp.2*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/last-bisulfite.sh	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,37 @@
+#! /bin/sh
+
+# Align bisulfite-converted DNA reads to a genome.
+
+# This assumes that the reads are all from the converted strand
+# (i.e. they have C->T conversions, not G->A conversions).
+
+[ $# -gt 1 ] || {
+    cat <<EOF
+Typical usage:
+
+  lastdb -uBISF my_f mygenome.fa
+  lastdb -uBISR my_r mygenome.fa
+
+  $(basename $0) my_f my_r reads.fastq > results.maf
+
+EOF
+    exit 2
+}
+my_f=$1
+my_r=$2
+shift 2
+
+# Try to get the LAST programs into the PATH, if they aren't already:
+PATH=$PATH:$(dirname $0)/../src
+
+tmp=${TMPDIR-/tmp}/$$
+trap 'rm -f $tmp.*' EXIT
+
+# Convert C to t, and all other letters to uppercase:
+perl -pe 'y/Cca-z/ttA-Z/ if $. % 4 == 2' "$@" > "$tmp".q
+
+lastal -pBISF -s1 -Q1 -e120 "$my_f" "$tmp".q > "$tmp".f
+lastal -pBISR -s0 -Q1 -e120 "$my_r" "$tmp".q > "$tmp".r
+
+last-merge-batches "$tmp".f "$tmp".r | last-split -m0.1 |
+perl -F'(\s+)' -ane '$F[12] =~ y/ta/CG/ if /^s/ and $s++ % 2; print @F'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/last-dotplot	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,291 @@
+#! /usr/bin/env python
+
+# Read pair-wise alignments in MAF or LAST tabular format: write an
+# "Oxford grid", a.k.a. dotplot.
+
+# TODO: Currently, pixels with zero aligned nt-pairs are white, and
+# pixels with one or more aligned nt-pairs are black.  This can look
+# too crowded for large genome alignments.  I tried shading each pixel
+# according to the number of aligned nt-pairs within it, but the
+# result is too faint.  How can this be done better?
+
+import fileinput, itertools, optparse, os, re, sys
+
+# Try to make PIL/PILLOW work:
+try: from PIL import Image, ImageDraw, ImageFont, ImageColor
+except ImportError: import Image, ImageDraw, ImageFont, ImageColor
+
+my_name = os.path.basename(sys.argv[0])
+usage = """
+  %prog --help
+  %prog [options] last-tabular-output dotplot.png
+  %prog [options] last-tabular-output dotplot.gif
+  etc."""
+parser = optparse.OptionParser(usage=usage)
+# Replace "width" & "height" with a single "length" option?
+parser.add_option("-x", "--width", type="int", dest="width", default=1000,
+                  help="maximum width in pixels (default: %default)")
+parser.add_option("-y", "--height", type="int", dest="height", default=1000,
+                  help="maximum height in pixels (default: %default)")
+parser.add_option("-f", "--fontfile", dest="fontfile",
+                  help="TrueType or OpenType font file")
+parser.add_option("-s", "--fontsize", type="int", dest="fontsize", default=11,
+                  help="TrueType or OpenType font size (default: %default)")
+parser.add_option("-c", "--forwardcolor", dest="forwardcolor", default="red",
+                  help="Color for forward alignments (default: %default)")
+parser.add_option("-r", "--reversecolor", dest="reversecolor", default="blue",
+                  help="Color for reverse alignments (default: %default)")
+(opts, args) = parser.parse_args()
+if len(args) != 2: parser.error("2 arguments needed")
+
+if opts.fontfile:  font = ImageFont.truetype(opts.fontfile, opts.fontsize)
+else:              font = ImageFont.load_default()
+
+# Make these options too?
+text_color = "black"
+background_color = "white"
+pix_tween_seqs = 2  # number of border pixels between sequences
+border_shade = 239, 239, 239  # the shade of grey to use for border pixels
+label_space = 5     # minimum number of pixels between axis labels
+
+image_mode = 'RGB'
+forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
+reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
+overlap_color = tuple([(i+j)//2 for i, j in zip(forward_color, reverse_color)])
+
+def tabBlocks(beg1, beg2, blocks):
+    '''Get the gapless blocks of an alignment, from LAST tabular format.'''
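+    # e.g. blocks "12,3:2,8" means a 12-column gapless block, a gap of 3
+    # letters in seq1 and 2 in seq2, then an 8-column gapless block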
+    for i in blocks.split(","):
+        if ":" in i:
+            x, y = i.split(":")
+            beg1 += int(x)
+            beg2 += int(y)
+        else:
+            size = int(i)
+            yield beg1, beg2, size
+            beg1 += size
+            beg2 += size
+
+def mafBlocks(beg1, beg2, seq1, seq2):
+    '''Get the gapless blocks of an alignment, from MAF format.'''
+    size = 0
+    for x, y in itertools.izip(seq1, seq2):
+        if x == "-":
+            if size:
+                yield beg1, beg2, size
+                beg1 += size
+                beg2 += size
+                size = 0
+            beg2 += 1
+        elif y == "-":
+            if size:
+                yield beg1, beg2, size
+                beg1 += size
+                beg2 += size
+                size = 0
+            beg1 += 1
+        else:
+            size += 1
+    if size: yield beg1, beg2, size
+
+def alignmentInput(lines):
+    '''Get alignments and sequence lengths, from MAF or tabular format.'''
+    mafCount = 0
+    for line in lines:
+        w = line.split()
+        if line[0].isdigit():  # tabular format
+            chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
+            if w[4] == "-": beg1 -= seqlen1
+            chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
+            if w[9] == "-": beg2 -= seqlen2
+            blocks = tabBlocks(beg1, beg2, w[11])
+            yield chr1, seqlen1, chr2, seqlen2, blocks
+        elif line[0] == "s":  # MAF format
+            if mafCount == 0:
+                chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
+                if w[4] == "-": beg1 -= seqlen1
+                mafCount = 1
+            else:
+                chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
+                if w[4] == "-": beg2 -= seqlen2
+                blocks = mafBlocks(beg1, beg2, seq1, seq2)
+                yield chr1, seqlen1, chr2, seqlen2, blocks
+                mafCount = 0
+
+def readAlignments(lines):
+    '''Get alignments and sequence lengths, from MAF or tabular format.'''
+    alignments = []
+    seqLengths1 = {}
+    seqLengths2 = {}
+    for chr1, seqlen1, chr2, seqlen2, blocks in alignmentInput(lines):
+        aln = chr1, chr2, blocks
+        alignments.append(aln)
+        seqLengths1[chr1] = seqlen1
+        seqLengths2[chr2] = seqlen2
+    return alignments, seqLengths1, seqLengths2
+
+sys.stderr.write(my_name + ": reading alignments...\n")
+input = fileinput.input(args[0])
+alignments, seq_size_dic1, seq_size_dic2 = readAlignments(input)
+sys.stderr.write(my_name + ": done\n")
+
+if not alignments:
+    sys.exit(my_name + ": there are no alignments")
+
+def natural_sort_key(my_string):
+    '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
+    parts = re.split(r'(\d+)', my_string)
+    parts[1::2] = map(int, parts[1::2])
+    return parts
+
+def get_text_sizes(my_strings):
+    '''Get widths & heights, in pixels, of some strings.'''
+    if opts.fontsize == 0: return [(0, 0) for i in my_strings]
+    image_size = 1, 1
+    im = Image.new(image_mode, image_size)
+    draw = ImageDraw.Draw(im)
+    return [draw.textsize(i, font=font) for i in my_strings]
+
+def get_seq_info(seq_size_dic):
+    '''Return miscellaneous information about the sequences.'''
+    seq_names = seq_size_dic.keys()
+    seq_names.sort(key=natural_sort_key)
+    seq_sizes = [seq_size_dic[i] for i in seq_names]
+    name_sizes = get_text_sizes(seq_names)
+    margin = max(zip(*name_sizes)[1])  # maximum text height
+    return seq_names, seq_sizes, name_sizes, margin
+
+seq_names1, seq_sizes1, name_sizes1, margin1 = get_seq_info(seq_size_dic1)
+seq_names2, seq_sizes2, name_sizes2, margin2 = get_seq_info(seq_size_dic2)
+
+def div_ceil(x, y):
+    '''Return x / y rounded up.'''
+    q, r = divmod(x, y)
+    return q + (r != 0)
+
+def tot_seq_pix(seq_sizes, bp_per_pix):
+    '''Return the total pixels needed for sequences of the given sizes.'''
+    return sum([div_ceil(i, bp_per_pix) for i in seq_sizes])
+
+def get_bp_per_pix(seq_sizes, pix_limit):
+    '''Get the minimum bp-per-pixel that fits in the size limit.'''
+    seq_num = len(seq_sizes)
+    seq_pix_limit = pix_limit - pix_tween_seqs * (seq_num - 1)
+    if seq_pix_limit < seq_num:
+        sys.exit(my_name + ": can't fit the image: too many sequences?")
+    lower_bound = div_ceil(sum(seq_sizes), seq_pix_limit)
+    for bp_per_pix in itertools.count(lower_bound):  # slow linear search
+        if tot_seq_pix(seq_sizes, bp_per_pix) <= seq_pix_limit: break
+    return bp_per_pix
+
+sys.stderr.write(my_name + ": choosing bp per pixel...\n")
+bp_per_pix1 = get_bp_per_pix(seq_sizes1, opts.width  - margin1)
+bp_per_pix2 = get_bp_per_pix(seq_sizes2, opts.height - margin2)
+bp_per_pix = max(bp_per_pix1, bp_per_pix2)
+sys.stderr.write(my_name + ": bp per pixel = " + str(bp_per_pix) + "\n")
+
+def get_seq_starts(seq_pix, pix_tween_seqs, margin):
+    '''Get the start pixel for each sequence.'''
+    seq_starts = []
+    pix_tot = margin - pix_tween_seqs
+    for i in seq_pix:
+        pix_tot += pix_tween_seqs
+        seq_starts.append(pix_tot)
+        pix_tot += i
+    return seq_starts
+
+def get_pix_info(seq_sizes, margin):
+    '''Return pixel information about the sequences.'''
+    seq_pix = [div_ceil(i, bp_per_pix) for i in seq_sizes]
+    seq_starts = get_seq_starts(seq_pix, pix_tween_seqs, margin)
+    tot_pix = seq_starts[-1] + seq_pix[-1]
+    return seq_pix, seq_starts, tot_pix
+
+seq_pix1, seq_starts1, width  = get_pix_info(seq_sizes1, margin1)
+seq_pix2, seq_starts2, height = get_pix_info(seq_sizes2, margin2)
+seq_start_dic1 = dict(zip(seq_names1, seq_starts1))
+seq_start_dic2 = dict(zip(seq_names2, seq_starts2))
+hits = [0] * (width * height)  # the image data
+
+sys.stderr.write(my_name + ": processing alignments...\n")
+for seq1, seq2, blocks in alignments:
+    seq_start1 = seq_start_dic1[seq1]
+    seq_start2 = seq_start_dic2[seq2]
+    my_start = seq_start2 * width + seq_start1
+    for beg1, beg2, size in blocks:
+        end1 = beg1 + size
+        end2 = beg2 + size
+        if beg1 >= 0: j = xrange(beg1, end1)
+        else:         j = xrange(-1 - beg1, -1 - end1, -1)
+        if beg2 >= 0: k = xrange(beg2, end2)
+        else:         k = xrange(-1 - beg2, -1 - end2, -1)
+        if beg2 >= 0: store_value = 1
+        else:         store_value = 2
+        for real_pos1, real_pos2 in itertools.izip(j, k):
+            pix1 = real_pos1 // bp_per_pix
+            pix2 = real_pos2 // bp_per_pix
+            hits[my_start + pix2 * width + pix1] |= store_value
+sys.stderr.write(my_name + ": done\n")
+
+def make_label(text, text_size, range_start, range_size):
+    '''Return an axis label with endpoint & sort-order information.'''
+    text_width  = text_size[0]
+    label_start = range_start + (range_size - text_width) // 2
+    label_end   = label_start + text_width
+    sort_key    = text_width - range_size
+    return sort_key, label_start, label_end, text
+
+def get_nonoverlapping_labels(labels):
+    '''Get a subset of non-overlapping axis labels, greedily.'''
+    nonoverlapping_labels = []
+    for i in labels:
+        if True not in [i[1] < j[2] + label_space and j[1] < i[2] + label_space
+                        for j in nonoverlapping_labels]:
+            nonoverlapping_labels.append(i)
+    return nonoverlapping_labels
+
+def get_axis_image(seq_names, name_sizes, seq_starts, seq_pix):
+    '''Make an image of axis labels.'''
+    min_pos = seq_starts[0]
+    max_pos = seq_starts[-1] + seq_pix[-1]
+    height = max(zip(*name_sizes)[1])
+    labels = [make_label(i, j, k, l) for i, j, k, l in
+              zip(seq_names, name_sizes, seq_starts, seq_pix)]
+    labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos]
+    labels.sort()
+    labels = get_nonoverlapping_labels(labels)
+    image_size = max_pos, height
+    im = Image.new(image_mode, image_size, border_shade)
+    draw = ImageDraw.Draw(im)
+    for i in labels:
+        position = i[1], 0
+        draw.text(position, i[3], font=font, fill=text_color)
+    return im
+
+image_size = width, height
+im = Image.new(image_mode, image_size, background_color)
+
+for i in range(height):
+    for j in range(width):
+        store_value = hits[i * width + j]
+        xy = j, i
+        if   store_value == 1: im.putpixel(xy, forward_color)
+        elif store_value == 2: im.putpixel(xy, reverse_color)
+        elif store_value == 3: im.putpixel(xy, overlap_color)
+
+if opts.fontsize != 0:
+    axis1 = get_axis_image(seq_names1, name_sizes1, seq_starts1, seq_pix1)
+    axis2 = get_axis_image(seq_names2, name_sizes2, seq_starts2, seq_pix2)
+    axis2 = axis2.rotate(270)
+    im.paste(axis1, (0, 0))
+    im.paste(axis2, (0, 0))
+
+for i in seq_starts1[1:]:
+    box = i - pix_tween_seqs, margin2, i, height
+    im.paste(border_shade, box)
+
+for i in seq_starts2[1:]:
+    box = margin1, i - pix_tween_seqs, width, i
+    im.paste(border_shade, box)
+
+im.save(args[1])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/last-map-probs	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,125 @@
+#! /usr/bin/env python
+
+# Copyright 2010, 2011, 2012, 2014 Martin C. Frith
+
+# Read query-genome alignments: write them along with the probability
+# that each alignment is not the true mapping of its query.  These
+# probabilities make the risky assumption that one of the alignments
+# reported for each query is correct.
+
+import sys, os, fileinput, math, optparse, signal
+
+def logsum(numbers):
+    """Adds numbers, in log space, to avoid overflow."""
+    m = max(numbers)
+    s = sum(math.exp(i - m) for i in numbers)
+    return m + math.log(s)
+
+def mafScore(aLine):
+    for word in aLine.split():
+        if word.startswith("score="):
+            return float(word[6:])
+    raise Exception("found an alignment without a score")
+
+def namesAndScores(lines):
+    queryNames = []
+    scores = []
+    for line in lines:
+        if line[0] == "a":  # faster than line.startswith("a")
+            s = mafScore(line)
+            scores.append(s)
+            sLineCount = 0
+        elif line[0] == "s":
+            sLineCount += 1
+            if sLineCount == 2: queryNames.append(line.split(None, 2)[1])
+        elif line[0].isdigit():  # we have an alignment in tabular format
+            w = line.split(None, 7)
+            scores.append(float(w[0]))
+            queryNames.append(w[6])
+    return queryNames, scores
+
+def scoreTotals(queryNames, scores, temperature):
+    queryLists = {}
+    for n, s in zip(queryNames, scores):
+        queryLists.setdefault(n, []).append(s / temperature)
+    return dict((k, logsum(v)) for k, v in queryLists.iteritems())
+
+def writeOneBatch(lines, queryNames, scores, denominators, opts, temperature):
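+    # mismap prob = 1 - exp(s/T) / sum_j exp(s_j/T), with the sum over all
+    # alignments of the same query (denominators holds it in log space)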
+    isWanted = True
+    i = 0
+    for line in lines:
+        if line[0] == "a":
+            s = scores[i]
+            p = 1.0 - math.exp(s / temperature - denominators[queryNames[i]])
+            i += 1
+            if s < opts.score or p > opts.mismap:
+                isWanted = False
+            else:
+                newLineEnd = " mismap=%.3g\n" % p
+                line = line.rstrip() + newLineEnd
+        elif line[0].isdigit():  # we have an alignment in tabular format
+            s = scores[i]
+            p = 1.0 - math.exp(s / temperature - denominators[queryNames[i]])
+            i += 1
+            if s < opts.score or p > opts.mismap: continue
+            newLineEnd = "\t%.3g\n" % p
+            line = line.rstrip() + newLineEnd
+        if isWanted: print line,
+        if line.isspace(): isWanted = True  # reset at end of maf paragraph
+
+def doOneBatch(lines, opts, temperature):
+    queryNames, scores = namesAndScores(lines)
+    denominators = scoreTotals(queryNames, scores, temperature)
+    writeOneBatch(lines, queryNames, scores, denominators, opts, temperature)
+
+def readHeaderOrDie(lines):
+    t = 0.0
+    e = -1
+    for line in lines:
+        if line.startswith("#") or line.isspace():
+            print line,
+            for i in line.split():
+                if i.startswith("t="): t = float(i[2:])
+                elif i.startswith("e="): e = int(i[2:])
+            if t > 0 and e >= 0: break
+        else:
+            raise Exception("I need a header with t= and e=")
+    return t, e
+
+def lastMapProbs(opts, args):
+    f = fileinput.input(args)
+    temperature, e = readHeaderOrDie(f)
+    if opts.score < 0: opts.score = e + round(temperature * math.log(1000))
+    lines = []
+
+    for line in f:
+        if line.startswith("# batch"):
+            doOneBatch(lines, opts, temperature)
+            lines = []
+        lines.append(line)
+    doOneBatch(lines, opts, temperature)
+
+if __name__ == "__main__":
+    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
+
+    usage = """
+  %prog --help
+  %prog [options] lastal-alignments"""
+
+    description = "Calculate a mismap probability for each alignment.  This is the probability that the alignment does not reflect the origin of the query sequence, assuming that one reported alignment does reflect the origin of each query."
+
+    op = optparse.OptionParser(usage=usage, description=description)
+    op.add_option("-m", "--mismap", type="float", default=0.01, metavar="M",
+                  help="don't write alignments with mismap probability > M (default: %default)")
+    op.add_option("-s", "--score", type="float", default=-1, metavar="S",
+                  help="don't write alignments with score < S (default: e+t*ln[1000])")
+    (opts, args) = op.parse_args()
+    if not args and sys.stdin.isatty():
+        op.print_help()
+        op.exit()
+
+    try: lastMapProbs(opts, args)
+    except KeyboardInterrupt: pass  # avoid silly error message
+    except Exception, e:
+        prog = os.path.basename(sys.argv[0])
+        sys.exit(prog + ": error: " + str(e))
Binary file bin/last-merge-batches has changed
Binary file bin/last-pair-probs has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/last-postmask	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,105 @@
+#! /usr/bin/env python
+
+# Copyright 2014 Martin C. Frith
+
+# Read MAF-format alignments, and write those that have a segment with
+# score >= threshold, with gentle masking of lowercase letters.  There
+# must be a lastal header with score parameters.
+
+# Gentle masking is described in: MC Frith, PLoS One 2011;6(12):e28819
+# "Gentle masking of low-complexity sequences improves homology search"
+
+# Limitations: doesn't (yet) handle sequence quality data,
+# frameshifts, or generalized affine gaps.
+
+import fileinput, itertools, optparse, os, signal, sys
+
+def getScoreMatrix(rowHeads, colHeads, matrix, deleteCost, insertCost):
+    defaultScore = min(map(min, matrix))
+    scoreMatrix = [[defaultScore for i in range(128)] for j in range(128)]
+    for i, x in enumerate(rowHeads):
+        for j, y in enumerate(colHeads):
+            xu = ord(x.upper())
+            xl = ord(x.lower())
+            yu = ord(y.upper())
+            yl = ord(y.lower())
+            score = matrix[i][j]
+            maskScore = min(score, 0)
+            scoreMatrix[xu][yu] = score
+            scoreMatrix[xu][yl] = maskScore
+            scoreMatrix[xl][yu] = maskScore
+            scoreMatrix[xl][yl] = maskScore
+    for i in range(128):
+        scoreMatrix[i][ord("-")] = -deleteCost
+        scoreMatrix[ord("-")][i] = -insertCost
+    return scoreMatrix
+
+def isGoodAlignment(seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
+    """Does the alignment have a segment with score >= minScore?"""
+    r, q = seqs
+    score = 0
+    xOld = " "
+    yOld = " "
+    for x, y in itertools.izip(r, q):
+        score += scoreMatrix[ord(x)][ord(y)]
+        if score >= minScore: return True
+        if x == "-" and xOld != "-": score -= insOpenCost
+        if y == "-" and yOld != "-": score -= delOpenCost
+        if score < 0: score = 0
+        xOld = x
+        yOld = y
+    return False
+
+def printIfGood(maf, seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
+    if isGoodAlignment(seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
+        for line in maf:
+            print line,
+        print
+
+def lastPostmask(args):
+    scoreMatrix = []
+    maf = []
+    seqs = []
+
+    for line in fileinput.input(args):
+        if line[0] == "#":
+            print line,
+            w = line.split()
+            for i in w:
+                if i.startswith("a="): aDel = int(i[2:])
+                if i.startswith("b="): bDel = int(i[2:])
+                if i.startswith("A="): aIns = int(i[2:])
+                if i.startswith("B="): bIns = int(i[2:])
+                if i.startswith("e="): minScore = int(i[2:])
+            if len(w) > 1 and max(map(len, w)) == 1:
+                colHeads = w[1:]
+                rowHeads = []
+                matrix = []
+            elif len(w) > 2 and len(w[1]) == 1:
+                rowHeads.append(w[1])
+                matrix.append(map(int, w[2:]))
+        elif line.isspace():
+            if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
+            maf = []
+            seqs = []
+        else:
+            if not scoreMatrix:
+                scoreMatrix = getScoreMatrix(rowHeads, colHeads, matrix,
+                                             bDel, bIns)
+            maf.append(line)
+            if line[0] == "s": seqs.append(line.split()[6])
+    if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
+
+if __name__ == "__main__":
+    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
+
+    usage = "%prog in.maf > out.maf"
+    description = "Get alignments that have a segment with score >= threshold, with gentle masking of lowercase letters."
+    op = optparse.OptionParser(usage=usage, description=description)
+    (opts, args) = op.parse_args()
+
+    try: lastPostmask(args)
+    except KeyboardInterrupt: pass  # avoid silly error message
+    except Exception, e:
+        prog = os.path.basename(sys.argv[0])
+        sys.exit(prog + ": error: " + str(e))
Binary file bin/last-split has changed
Binary file bin/lastal has changed
Binary file bin/lastdb has changed
Binary file bin/lastex has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/maf-convert	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,674 @@
+#! /usr/bin/env python
+# Copyright 2010, 2011, 2013, 2014 Martin C. Frith
+# Read MAF-format alignments: write them in other formats.
+# Seems to work with Python 2.x, x>=4
+
+# By "MAF" we mean "multiple alignment format" described in the UCSC
+# Genome FAQ, not e.g. "MIRA assembly format".
+
+from itertools import *
+import sys, os, fileinput, math, operator, optparse, signal, string
+
+def maxlen(s):
+    return max(map(len, s))
+
+def joined(things, delimiter):
+    return delimiter.join(map(str, things))
+
+identityTranslation = string.maketrans("", "")
+def deleted(myString, deleteChars):
+    return myString.translate(identityTranslation, deleteChars)
+
+def quantify(iterable, pred=bool):
+    """Count how many times the predicate is true."""
+    return sum(map(pred, iterable))
+
+class Maf:
+    def __init__(self, aLine, sLines, qLines, pLines):
+        self.namesAndValues = dict(i.split("=") for i in aLine[1:])
+
+        if not sLines: raise Exception("empty alignment")
+        cols = zip(*sLines)
+        self.seqNames = cols[1]
+        self.alnStarts = map(int, cols[2])
+        self.alnSizes = map(int, cols[3])
+        self.strands = cols[4]
+        self.seqSizes = map(int, cols[5])
+        self.alnStrings = cols[6]
+
+        self.qLines = qLines
+        self.pLines = pLines
+
+def dieUnlessPairwise(maf):
+    if len(maf.alnStrings) != 2:
+        raise Exception("pairwise alignments only, please")
+
+def insertionCounts(alnString):
+    gaps = alnString.count("-")
+    forwardFrameshifts = alnString.count("\\")
+    reverseFrameshifts = alnString.count("/")
+    letters = len(alnString) - gaps - forwardFrameshifts - reverseFrameshifts
+    return letters, forwardFrameshifts, reverseFrameshifts
+
+def coordinatesPerLetter(alnString, alnSize):
+    letters, forwardShifts, reverseShifts = insertionCounts(alnString)
+    if forwardShifts or reverseShifts or letters < alnSize: return 3
+    else: return 1
+
+def mafLetterSizes(maf):
+    return map(coordinatesPerLetter, maf.alnStrings, maf.alnSizes)
+
+def alnLetterSizes(sLines):
+    return [coordinatesPerLetter(i[6], int(i[3])) for i in sLines]
+
+def isTranslated(sLines):
+    for i in sLines:
+        alnString = i[6]
+        if "\\" in alnString or "/" in alnString: return True
+        if len(alnString) - alnString.count("-") < int(i[3]): return True
+    return False
+
+def insertionSize(alnString, letterSize):
+    """Get the length of sequence included in the alnString."""
+    letters, forwardShifts, reverseShifts = insertionCounts(alnString)
+    return letters * letterSize + forwardShifts - reverseShifts
+
+def symbolSize(symbol, letterSize):
+    if symbol == "-": return 0
+    if symbol == "\\": return 1
+    if symbol == "/": return -1
+    return letterSize
+
+def isMatch(alignmentColumn):
+    # No special treatment of ambiguous bases/residues: same as NCBI BLAST.
+    first = alignmentColumn[0].upper()
+    for i in alignmentColumn[1:]:
+        if i.upper() != first: return False
+    return True
+
+def isGapless(alignmentColumn):
+    return "-" not in alignmentColumn
+
+def matchAndInsertSizes(alignmentColumns, letterSizes):
+    """Get sizes of gapless blocks, and of the inserts between them."""
+    for k, v in groupby(alignmentColumns, isGapless):
+        if k:
+            sizeOfGaplessBlock = len(list(v))
+            yield str(sizeOfGaplessBlock)
+        else:
+            blockRows = izip(*v)
+            blockRows = imap(''.join, blockRows)
+            insertSizes = imap(insertionSize, blockRows, letterSizes)
+            yield joined(insertSizes, ":")
+
+##### Routines for converting to AXT format: #####
+
+axtCounter = count()
+
+def writeAxt(maf):
+    if maf.strands[0] != "+":
+        raise Exception("for AXT, the 1st strand in each alignment must be +")
+
+    # Convert to AXT's 1-based coordinates:
+    alnStarts = imap(operator.add, maf.alnStarts, repeat(1))
+    alnEnds = imap(operator.add, maf.alnStarts, maf.alnSizes)
+
+    rows = zip(maf.seqNames, alnStarts, alnEnds, maf.strands)
+    head, tail = rows[0], rows[1:]
+
+    outWords = []
+    outWords.append(axtCounter.next())
+    outWords.extend(head[:3])
+    outWords.extend(chain(*tail))
+    outWords.append(maf.namesAndValues["score"])
+
+    print joined(outWords, " ")
+    for i in maf.alnStrings: print i
+    print  # print a blank line at the end
+
+##### Routines for converting to tabular format: #####
+
+def writeTab(maf):
+    aLine, sLines, qLines, pLines = maf
+
+    score = "0"
+    expect = None
+    mismap = None
+    for i in aLine:
+        if   i.startswith("score="): score = i[6:]
+        elif i.startswith("expect="): expect = i[7:]
+        elif i.startswith("mismap="): mismap = i[7:]
+
+    outWords = []
+    outWords.append(score)
+
+    for i in sLines: outWords.extend(i[1:6])
+
+    alnStrings = [i[6] for i in sLines]
+    alignmentColumns = zip(*alnStrings)
+    letterSizes = alnLetterSizes(sLines)
+    gapStrings = matchAndInsertSizes(alignmentColumns, letterSizes)
+    gapWord = ",".join(gapStrings)
+    outWords.append(gapWord)
+
+    if expect: outWords.append(expect)
+    if mismap: outWords.append(mismap)
+
+    print "\t".join(outWords)
+
+##### Routines for converting to PSL format: #####
+
+def pslBlocks(alignmentColumns, alnStarts, letterSizes):
+    """Get sizes and start coordinates of gapless blocks in an alignment."""
+    start1, start2 = alnStarts
+    letterSize1, letterSize2 = letterSizes
+    size = 0
+    for x, y in alignmentColumns:
+        if x == "-":
+            if size:
+                yield size, start1, start2
+                start1 += size * letterSize1
+                start2 += size * letterSize2
+                size = 0
+            start2 += symbolSize(y, letterSize2)
+        elif y == "-":
+            if size:
+                yield size, start1, start2
+                start1 += size * letterSize1
+                start2 += size * letterSize2
+                size = 0
+            start1 += symbolSize(x, letterSize1)
+        else:
+            size += 1
+    if size: yield size, start1, start2
+
+def pslCommaString(things):
+    # UCSC software seems to prefer a trailing comma
+    return joined(things, ",") + ","
+
+def gapRunCount(letters):
+    """Get the number of runs of gap characters."""
+    uniqLetters = map(operator.itemgetter(0), groupby(letters))
+    return uniqLetters.count("-")
+
+def pslEndpoints(alnStart, alnSize, strand, seqSize):
+    alnEnd = alnStart + alnSize
+    if strand == "+": return alnStart, alnEnd
+    else: return seqSize - alnEnd, seqSize - alnStart
+
+def caseSensitivePairwiseMatchCounts(columns, isProtein):
+    # repMatches is always zero
+    # for proteins, nCount is always zero, because that's what BLATv34 does
+    standardBases = "ACGTU"
+    matches = mismatches = repMatches = nCount = 0
+    for i in columns:
+        if "-" in i: continue
+        x, y = i
+        if x in standardBases and y in standardBases or isProtein:
+            if x == y: matches += 1
+            else: mismatches += 1
+        else: nCount += 1
+    return matches, mismatches, repMatches, nCount
+
+def writePsl(maf, isProtein):
+    dieUnlessPairwise(maf)
+
+    alnStrings = map(str.upper, maf.alnStrings)
+    alignmentColumns = zip(*alnStrings)
+    letterSizes = mafLetterSizes(maf)
+
+    matchCounts = caseSensitivePairwiseMatchCounts(alignmentColumns, isProtein)
+    matches, mismatches, repMatches, nCount = matchCounts
+    numGaplessColumns = sum(matchCounts)
+
+    qNumInsert = gapRunCount(maf.alnStrings[0])
+    qBaseInsert = maf.alnSizes[1] - numGaplessColumns * letterSizes[1]
+
+    tNumInsert = gapRunCount(maf.alnStrings[1])
+    tBaseInsert = maf.alnSizes[0] - numGaplessColumns * letterSizes[0]
+
+    strand = maf.strands[1]
+    if max(letterSizes) > 1:
+        strand += maf.strands[0]
+    elif maf.strands[0] != "+":
+        raise Exception("for non-translated PSL, the 1st strand in each alignment must be +")
+
+    tName, qName = maf.seqNames
+    tSeqSize, qSeqSize = maf.seqSizes
+
+    rows = zip(maf.alnStarts, maf.alnSizes, maf.strands, maf.seqSizes)
+    tStart, tEnd = pslEndpoints(*rows[0])
+    qStart, qEnd = pslEndpoints(*rows[1])
+
+    blocks = list(pslBlocks(alignmentColumns, maf.alnStarts, letterSizes))
+    blockCount = len(blocks)
+    blockSizes, tStarts, qStarts = map(pslCommaString, zip(*blocks))
+
+    outWords = (matches, mismatches, repMatches, nCount,
+                qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand,
+                qName, qSeqSize, qStart, qEnd, tName, tSeqSize, tStart, tEnd,
+                blockCount, blockSizes, qStarts, tStarts)
+    print joined(outWords, "\t")
+
+##### Routines for converting to SAM format: #####
+
+def readGroupId(readGroupItems):
+    for i in readGroupItems:
+        if i.startswith("ID:"):
+            return i[3:]
+    raise Exception("readgroup must include ID")
+
+def readSequenceLengths(lines):
+    """Read name & length of topmost sequence in each maf block."""
+    sequenceLengths = {}  # an OrderedDict might be nice
+    isSearching = True
+    for line in lines:
+        if line.isspace(): isSearching = True
+        elif isSearching:
+            w = line.split()
+            if w[0] == "s":
+                seqName, seqSize = w[1], w[5]
+                sequenceLengths[seqName] = seqSize
+                isSearching = False
+    return sequenceLengths
+
+def naturalSortKey(s):
+    """Return a key that sorts strings in "natural" order."""
+    return [(str, int)[k]("".join(v)) for k, v in groupby(s, str.isdigit)]
+
+def karyotypicSortKey(s):
+    """Attempt to sort chromosomes in GATK's ridiculous order."""
+    if s == "chrM": return []
+    if s == "MT": return ["~"]
+    return naturalSortKey(s)
+
+def writeSamHeader(sequenceLengths, dictFile, readGroupItems):
+    print "@HD\tVN:1.3\tSO:unsorted"
+    for k in sorted(sequenceLengths, key=karyotypicSortKey):
+        print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k])
+    if dictFile:
+        for i in fileinput.input(dictFile):
+            if i.startswith("@SQ"): print i,
+            elif not i.startswith("@"): break
+    if readGroupItems:
+        print "@RG\t" + "\t".join(readGroupItems)
+
+mapqMissing = "255"
+mapqMaximum = "254"
+mapqMaximumNum = float(mapqMaximum)
+
+def mapqFromProb(probString):
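+    # e.g. "0.001" -> phred 30 -> "30"; "0" maps to the cap "254"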
+    try: p = float(probString)
+    except ValueError: raise Exception("bad probability: " + probString)
+    if p < 0 or p > 1: raise Exception("bad probability: " + probString)
+    if p == 0: return mapqMaximum
+    phred = -10 * math.log(p, 10)
+    if phred >= mapqMaximumNum: return mapqMaximum
+    return str(int(round(phred)))
+
+def cigarParts(beg, alignmentColumns, end):
+    if beg: yield str(beg) + "H"
+
+    # (doesn't handle translated alignments)
+    isActive = True
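+    # pull the first column; the for/else below leaves isActive False when
+    # alignmentColumns is empty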
+    for x, y in alignmentColumns: break
+    else: isActive = False
+    while isActive:
+        size = 1
+        if x == "-":
+            for x, y in alignmentColumns:
+                if x != "-": break
+                size += 1
+            else: isActive = False
+            yield str(size) + "I"
+        elif y == "-":
+            for x, y in alignmentColumns:
+                if y != "-": break
+                size += 1
+            else: isActive = False
+            yield str(size) + "D"
+        else:
+            for x, y in alignmentColumns:
+                if x == "-" or y == "-": break
+                size += 1
+            else: isActive = False
+            yield str(size) + "M"
+
+    if end: yield str(end) + "H"
+
+def writeSam(maf, rg):
+    aLine, sLines, qLines, pLines = maf
+
+    if isTranslated(sLines):
+        raise Exception("this looks like translated DNA - can't convert to SAM format")
+
+    score = None
+    evalue = None
+    mapq = mapqMissing
+    for i in aLine:
+        if i.startswith("score="):
+            v = i[6:]
+            if v.isdigit(): score = "AS:i:" + v  # it must be an integer
+        elif i.startswith("expect="):
+            evalue = "EV:Z:" + i[7:]
+        elif i.startswith("mismap="):
+            mapq = mapqFromProb(i[7:])
+
+    head, tail = sLines[0], sLines[1:]
+
+    s, rName, rStart, rAlnSize, rStrand, rSeqSize, rAlnString = head
+    if rStrand != "+":
+        raise Exception("for SAM, the 1st strand in each alignment must be +")
+    pos = str(int(rStart) + 1)  # convert to 1-based coordinate
+
+    for s, qName, qStart, qAlnSize, qStrand, qSeqSize, qAlnString in tail:
+        alignmentColumns = zip(rAlnString.upper(), qAlnString.upper())
+
+        qStart = int(qStart)
+        qRevStart = int(qSeqSize) - qStart - int(qAlnSize)
+        cigar = "".join(cigarParts(qStart, iter(alignmentColumns), qRevStart))
+
+        seq = deleted(qAlnString, "-")
+
+        qual = "*"
+        if qLines:
+            q, qualityName, qualityString = qLines[0]
+            # don't try to handle multiple alignments for now (YAGNI):
+            if len(qLines) > 1 or len(tail) > 1 or qualityName != qName:
+                raise Exception("can't interpret the quality data")
+            qual = ''.join(j for i, j in izip(qAlnString, qualityString)
+                           if i != "-")
+
+        # It's hard to get all the pair info, so this is very
+        # incomplete, but hopefully good enough.
+        # I'm not sure whether to add 2 and/or 8 to flag.
+        if qName.endswith("/1"):
+            qName = qName[:-2]
+            if qStrand == "+": flag = "99"  # 1 + 2 + 32 + 64
+            else:              flag = "83"  # 1 + 2 + 16 + 64
+        elif qName.endswith("/2"):
+            qName = qName[:-2]
+            if qStrand == "+": flag = "163"  # 1 + 2 + 32 + 128
+            else:              flag = "147"  # 1 + 2 + 16 + 128
+        else:
+            if qStrand == "+": flag = "0"
+            else:              flag = "16"
+
+        editDistance = sum(1 for x, y in alignmentColumns if x != y)
+        # no special treatment of ambiguous bases: might be a minor bug
+        editDistance = "NM:i:" + str(editDistance)
+
+        outWords = [qName, flag, rName, pos, mapq, cigar, "*\t0\t0", seq, qual]
+        outWords.append(editDistance)
+        if score: outWords.append(score)
+        if evalue: outWords.append(evalue)
+        if rg: outWords.append(rg)
+        print "\t".join(outWords)
+
+##### Routines for converting to BLAST-like format: #####
+
+def pairwiseMatchSymbol(alignmentColumn):
+    if isMatch(alignmentColumn): return "|"
+    else: return " "
+
+def strandText(strand):
+    if strand == "+": return "Plus"
+    else: return "Minus"
+
+def blastCoordinate(oneBasedCoordinate, strand, seqSize):
+    if strand == "-":
+        oneBasedCoordinate = seqSize - oneBasedCoordinate + 1
+    return str(oneBasedCoordinate)
+
+def chunker(things, chunkSize):
+    for i in range(0, len(things), chunkSize):
+        yield things[i:i+chunkSize]
+
+def blastChunker(maf, lineSize, alignmentColumns):
+    letterSizes = mafLetterSizes(maf)
+    coords = maf.alnStarts
+    for chunkCols in chunker(alignmentColumns, lineSize):
+        chunkRows = zip(*chunkCols)
+        chunkRows = map(''.join, chunkRows)
+        starts = [i + 1 for i in coords]  # change to 1-based coordinates
+        starts = map(blastCoordinate, starts, maf.strands, maf.seqSizes)
+        increments = map(insertionSize, chunkRows, letterSizes)
+        coords = map(operator.add, coords, increments)
+        ends = map(blastCoordinate, coords, maf.strands, maf.seqSizes)
+        yield chunkCols, chunkRows, starts, ends
+
+def writeBlast(maf, lineSize, oldQueryName):
+    dieUnlessPairwise(maf)
+
+    if maf.seqNames[1] != oldQueryName:
+        print "Query= %s" % maf.seqNames[1]
+        print "         (%s letters)" % maf.seqSizes[1]
+        print
+
+    print ">%s" % maf.seqNames[0]
+    print "          Length = %s" % maf.seqSizes[0]
+    print
+
+    scoreLine = " Score = %s" % maf.namesAndValues["score"]
+    try: scoreLine += ", Expect = %s" % maf.namesAndValues["expect"]
+    except KeyError: pass
+    print scoreLine
+
+    alignmentColumns = zip(*maf.alnStrings)
+
+    alnSize = len(alignmentColumns)
+    matches = quantify(alignmentColumns, isMatch)
+    matchPercent = 100 * matches // alnSize  # round down, like BLAST
+    identLine = " Identities = %s/%s (%s%%)" % (matches, alnSize, matchPercent)
+    gaps = alnSize - quantify(alignmentColumns, isGapless)
+    gapPercent = 100 * gaps // alnSize  # round down, like BLAST
+    if gaps: identLine += ", Gaps = %s/%s (%s%%)" % (gaps, alnSize, gapPercent)
+    print identLine
+
+    strands = map(strandText, maf.strands)
+    print " Strand = %s / %s" % (strands[1], strands[0])
+    print
+
+    for chunk in blastChunker(maf, lineSize, alignmentColumns):
+        cols, rows, starts, ends = chunk
+        startWidth = maxlen(starts)
+        matchSymbols = map(pairwiseMatchSymbol, cols)
+        matchSymbols = ''.join(matchSymbols)
+        print "Query: %-*s %s %s" % (startWidth, starts[1], rows[1], ends[1])
+        print "       %-*s %s"    % (startWidth, " ", matchSymbols)
+        print "Sbjct: %-*s %s %s" % (startWidth, starts[0], rows[0], ends[0])
+        print
+
+##### Routines for converting to HTML format: #####
+
+def writeHtmlHeader():
+    print '''
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html4/strict.dtd">
+<html lang="en"><head>
+<meta http-equiv="Content-type" content="text/html; charset=UTF-8">
+<title>Reliable Alignments</title>
+<style type="text/css">
+/* Try to force monospace, working around browser insanity: */
+pre {font-family: "Courier New", monospace, serif; font-size: 0.8125em}
+.a {background-color: #3333FF}
+.b {background-color: #9933FF}
+.c {background-color: #FF66CC}
+.d {background-color: #FF3333}
+.e {background-color: #FF9933}
+.f {background-color: #FFFF00}
+.key {display:inline; margin-right:2em}
+</style>
+</head><body>
+
+<div style="line-height:1">
+<pre class="key"><span class="a">  </span> prob &gt; 0.999</pre>
+<pre class="key"><span class="b">  </span> prob &gt; 0.99 </pre>
+<pre class="key"><span class="c">  </span> prob &gt; 0.95 </pre>
+<pre class="key"><span class="d">  </span> prob &gt; 0.9  </pre>
+<pre class="key"><span class="e">  </span> prob &gt; 0.5  </pre>
+<pre class="key"><span class="f">  </span> prob &le; 0.5  </pre>
+</div>
+'''
+
+def probabilityClass(probabilityColumn):
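+    # Take the worst (smallest) score in the column; scores are
+    # phred-like, ASCII-encoded with offset 33, i.e. -10*log10(error).
+    # e.g. p >= 30 means error <= 0.001, matching "prob > 0.999" in the
+    # colour key written by writeHtmlHeader.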
+    p = ord(min(probabilityColumn)) - 33
+    if   p >= 30: return 'a'
+    elif p >= 20: return 'b'
+    elif p >= 13: return 'c'
+    elif p >= 10: return 'd'
+    elif p >=  3: return 'e'
+    else: return 'f'
+
+def identicalRuns(s):
+    """Yield (item, start, end) for each run of identical items in s."""
+    beg = 0
+    for k, v in groupby(s):
+        end = beg + len(list(v))
+        yield k, beg, end
+        beg = end
+
+def htmlSpan(text, classRun):
+    key, beg, end = classRun
+    textbit = text[beg:end]
+    if key: return '<span class="%s">%s</span>' % (key, textbit)
+    else: return textbit
+
+def multipleMatchSymbol(alignmentColumn):
+    if isMatch(alignmentColumn): return "*"
+    else: return " "
+
+def writeHtml(maf, lineSize):
+    scoreLine = "Alignment"
+    try:
+        scoreLine += " score=%s" % maf.namesAndValues["score"]
+        scoreLine += ", expect=%s" % maf.namesAndValues["expect"]
+    except KeyError: pass
+    print "<h3>%s:</h3>" % scoreLine
+
+    if maf.pLines:
+        probRows = [i[1] for i in maf.pLines]
+        probCols = izip(*probRows)
+        classes = imap(probabilityClass, probCols)
+    else:
+        classes = repeat(None)
+
+    nameWidth = maxlen(maf.seqNames)
+    alignmentColumns = zip(*maf.alnStrings)
+
+    print '<pre>'
+    for chunk in blastChunker(maf, lineSize, alignmentColumns):
+        cols, rows, starts, ends = chunk
+        startWidth = maxlen(starts)
+        endWidth = maxlen(ends)
+        matchSymbols = map(multipleMatchSymbol, cols)
+        matchSymbols = ''.join(matchSymbols)
+        classChunk = islice(classes, lineSize)
+        classRuns = list(identicalRuns(classChunk))
+        for n, s, r, e in zip(maf.seqNames, starts, rows, ends):
+            spans = [htmlSpan(r, i) for i in classRuns]
+            spans = ''.join(spans)
+            formatParams = nameWidth, n, startWidth, s, spans, endWidth, e
+            print '%-*s %*s %s %*s' % formatParams
+        print ' ' * nameWidth, ' ' * startWidth, matchSymbols
+        print
+    print '</pre>'
+
+##### Routines for reading MAF format: #####
+
+def mafInput(lines, isKeepComments):
+    a = []
+    s = []
+    q = []
+    p = []
+    for i in lines:
+        w = i.split()
+        if not w:
+            if a: yield a, s, q, p
+            a = []
+            s = []
+            q = []
+            p = []
+        elif w[0] == "a":
+            a = w
+        elif w[0] == "s":
+            s.append(w)
+        elif w[0] == "q":
+            q.append(w)
+        elif w[0] == "p":
+            p.append(w)
+        elif i[0] == "#" and isKeepComments:
+            print i,
+    if a: yield a, s, q, p
+
+def isFormat(myString, myFormat):
+    return myFormat.startswith(myString)
+
+def mafConvert(opts, args):
+    format = args[0].lower()
+    if isFormat(format, "sam"):
+        if opts.dictionary: d = readSequenceLengths(fileinput.input(args[1:]))
+        else: d = {}
+        if opts.readgroup:
+            readGroupItems = opts.readgroup.split()
+            rg = "RG:Z:" + readGroupId(readGroupItems)
+        else:
+            readGroupItems = []
+            rg = ""
+        if not opts.noheader: writeSamHeader(d, opts.dictfile, readGroupItems)
+    inputLines = fileinput.input(args[1:])
+    if isFormat(format, "html") and not opts.noheader: writeHtmlHeader()
+    isKeepCommentLines = isFormat(format, "tabular") and not opts.noheader
+    oldQueryName = ""
+    for maf in mafInput(inputLines, isKeepCommentLines):
+        if   isFormat(format, "axt"): writeAxt(Maf(*maf))
+        elif isFormat(format, "blast"):
+            maf = Maf(*maf)
+            writeBlast(maf, opts.linesize, oldQueryName)
+            oldQueryName = maf.seqNames[1]
+        elif isFormat(format, "html"): writeHtml(Maf(*maf), opts.linesize)
+        elif isFormat(format, "psl"): writePsl(Maf(*maf), opts.protein)
+        elif isFormat(format, "sam"): writeSam(maf, rg)
+        elif isFormat(format, "tabular"): writeTab(maf)
+        else: raise Exception("unknown format: " + format)
+    if isFormat(format, "html") and not opts.noheader: print "</body></html>"
+
+if __name__ == "__main__":
+    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
+
+    usage = """
+  %prog --help
+  %prog axt mafFile(s)
+  %prog blast mafFile(s)
+  %prog html mafFile(s)
+  %prog psl mafFile(s)
+  %prog sam mafFile(s)
+  %prog tab mafFile(s)"""
+
+    description = "Read MAF-format alignments & write them in another format."
+
+    op = optparse.OptionParser(usage=usage, description=description)
+    op.add_option("-p", "--protein", action="store_true",
+                  help="assume protein alignments, for psl match counts")
+    op.add_option("-n", "--noheader", action="store_true",
+                  help="omit any header lines from the output")
+    op.add_option("-d", "--dictionary", action="store_true",
+                  help="include dictionary of sequence lengths in sam format")
+    op.add_option("-f", "--dictfile",
+                  help="get sequence dictionary from DICTFILE")
+    op.add_option("-r", "--readgroup",
+                  help="read group info for sam format")
+    op.add_option("-l", "--linesize", type="int", default=60, #metavar="CHARS",
+                  help="line length for blast and html formats (default: %default)")
+    (opts, args) = op.parse_args()
+    if opts.linesize <= 0: op.error("option -l: should be >= 1")
+    if opts.dictionary and opts.dictfile: op.error("can't use both -d and -f")
+    if len(args) < 1: op.error("I need a format-name and some MAF alignments")
+    if opts.dictionary and (len(args) == 1 or "-" in args[1:]):
+        op.error("need file (not pipe) with option -d")
+
+    try: mafConvert(opts, args)
+    except KeyboardInterrupt: pass  # avoid silly error message
+    except Exception, e:
+        prog = os.path.basename(sys.argv[0])
+        sys.exit(prog + ": error: " + str(e))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/maf-cull	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,112 @@
+#! /usr/bin/env python
+
+# Read MAF-format alignments.  Write them, omitting alignments whose
+# coordinates in the top-most sequence are contained in those of >=
+# cullingLimit higher-scoring alignments.
+
+# Alignments on opposite strands are not considered to contain each
+# other.
+
+# The alignments must be sorted by sequence name, then strand, then
+# start coordinate.
+
+# This algorithm is not theoretically optimal, but it is simple and
+# probably fast in practice.  Optimal algorithms are described in:
+# Winnowing sequences from a database search.
+# Berman P, Zhang Z, Wolf YI, Koonin EV, Miller W.
+# J Comput Biol. 2000 Feb-Apr;7(1-2):293-302.
+# (Use a "priority search tree" or an "interval tree".)
+
+# Seems to work with Python 2.x, x>=4
+
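+# Typical usage (assumption: the input was sorted with maf-sort):
+#   maf-sort alns.maf | maf-cull -l 2 > culled.maf
+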
+import fileinput, itertools, operator, optparse, os, signal, sys
+
+# The intervals must have size > 0.
+
+def isFresh(oldInterval, newInterval):
+    return oldInterval.end > newInterval.start
+
+def freshIntervals(storedIntervals, newInterval):
+    """Yields those storedIntervals that overlap newInterval."""
+    return [i for i in storedIntervals if isFresh(i, newInterval)]
+
+def isDominated(dog, queen):
+    return dog.score < queen.score and dog.end <= queen.end
+
+def isWanted(newInterval, storedIntervals, cullingLimit):
+    """Is newInterval dominated by < cullingLimit storedIntervals?"""
+    dominators = (i for i in storedIntervals if isDominated(newInterval, i))
+    return len(list(dominators)) < cullingLimit
+
+# Check that the intervals are sorted by start position, and further
+# sort them in descending order of score.
+def sortedIntervals(intervals):
+    oldStart = ()
+    for k, v in itertools.groupby(intervals, operator.attrgetter("start")):
+        if k < oldStart: raise Exception("the input is not sorted properly")
+        oldStart = k
+        for i in sorted(v, key=operator.attrgetter("score"), reverse=True):
+            yield i
+
+def culledIntervals(intervals, cullingLimit):
+    """Yield intervals contained in < cullingLimit higher-scoring intervals."""
+    storedIntervals = []
+    for i in sortedIntervals(intervals):
+        storedIntervals = freshIntervals(storedIntervals, i)
+        if isWanted(i, storedIntervals, cullingLimit):
+            yield i
+            storedIntervals.append(i)
+
+class Maf:
+    def __init__(self, lines):
+        self.lines = lines
+        try:
+            aLine = lines[0]
+            aWords = aLine.split()
+            scoreGen = (i for i in aWords if i.startswith("score="))
+            scoreWord = scoreGen.next()
+            self.score = float(scoreWord.split("=")[1])
+        except: raise Exception("missing score")
+        try:
+            sLine = lines[1]
+            sWords = sLine.split()
+            seqName = sWords[1]
+            alnStart = int(sWords[2])
+            alnSize = int(sWords[3])
+            strand = sWords[4]
+            self.start = seqName, strand, alnStart
+            self.end = seqName, strand, alnStart + alnSize
+        except: raise Exception("can't interpret the MAF data")
+
+def mafInput(lines):
+    for k, v in itertools.groupby(lines, str.isspace):
+        if not k: yield Maf(list(v))
+
+def filterComments(lines):
+    for i in lines:
+        if i.startswith("#"): print i,
+        else: yield i
+
+def mafCull(opts, args):
+    inputLines = fileinput.input(args)
+    inputMafs = mafInput(filterComments(inputLines))
+    for maf in culledIntervals(inputMafs, opts.limit):
+        for i in maf.lines: print i,
+        print  # blank line after each alignment
+
+if __name__ == "__main__":
+    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
+
+    usage = "%prog [options] my-alignments.maf"
+    description = "Cull alignments whose top-sequence coordinates are contained in LIMIT or more higher-scoring alignments."
+    op = optparse.OptionParser(usage=usage, description=description)
+    op.add_option("-l", "--limit", type="int", default=2,
+                  help="culling limit (default: %default)")
+    (opts, args) = op.parse_args()
+    if opts.limit < 1: op.error("option -l: should be >= 1")
+
+    try: mafCull(opts, args)
+    except KeyboardInterrupt: pass  # avoid silly error message
+    except Exception, e:
+        prog = os.path.basename(sys.argv[0])
+        sys.exit(prog + ": error: " + str(e))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/maf-join	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,244 @@
+#! /usr/bin/env python
+
+# Copyright 2009, 2010, 2011 Martin C. Frith
+
+# Join two or more sets of MAF-format multiple alignments into bigger
+# multiple alignments.  The 'join field' is the top genome, which
+# should be the same for each input.  Each input should be sorted by
+# position in the top genome.
+
+# WARNING: Alignment columns with a gap in the top genome are joined
+# arbitrarily!!!
+
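+# Typical usage (assumption: each input was sorted with maf-sort):
+#   maf-join sorted1.maf sorted2.maf > joined.maf
+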
+import sys, os, fileinput, optparse, signal
+
+signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # stop spurious error message
+
+class peekable:  # Adapted from Python Cookbook 2nd edition
+    """An iterator that supports a peek operation."""
+    def __init__(self, iterable):
+        self.it = iter(iterable)
+        self.cache = []
+    def __iter__(self):
+        return self
+    def next(self):
+        if self.cache: return self.cache.pop()
+        else: return self.it.next()
+    def peek(self):
+        if not self.cache: self.cache.append(self.it.next())
+        return self.cache[0]
+
+def maxLen(things): return max(map(len, things))
+
+class MafBlock:
+    def __init__(self, chr, beg, end, strand, chrSize, seq, prob):
+        self.chr = chr  # chromosome names
+        self.beg = beg  # alignment begin coordinates
+        self.end = end  # alignment end coordinates
+        self.strand = strand
+        self.chrSize = chrSize  # chromosome sizes
+        self.seq = seq  # aligned sequences, including gaps
+        self.prob = prob  # probabilities (may be empty)
+
+    def __nonzero__(self):
+        return len(self.seq) > 0
+
+    def __cmp__(self, other):
+        return cmp(self.chr[:1] + self.beg[:1], other.chr[:1] + other.beg[:1])
+
+    def before(self, other):
+        return (self.chr[0], self.end[0]) <= (other.chr[0], other.beg[0])
+
+    def after(self, other):
+        return (self.chr[0], self.beg[0]) >= (other.chr[0], other.end[0])
+
+    def addLine(self, line):
+        words = line.split()
+        if line.startswith('s'):
+            self.chr.append(words[1])
+            self.beg.append(int(words[2]))
+            self.end.append(int(words[2]) + int(words[3]))
+            self.strand.append(words[4])
+            self.chrSize.append(words[5])
+            self.seq.append(list(words[6]))
+        elif line.startswith('p'):
+            self.prob.append(words[1])
+
+    def write(self):
+        beg = map(str, self.beg)
+        size = [str(e-b) for b, e in zip(self.beg, self.end)]
+        seq = [''.join(i) for i in self.seq]
+        columns = self.chr, beg, size, self.strand, self.chrSize, seq
+        widths = map(maxLen, columns)
+        print 'a'
+        for row in zip(*columns):
+            widthsAndFields = zip(widths, row)
+            field0 = "%-*s" % widthsAndFields[0]  # left-justify
+            fields = ["%*s" % i for i in widthsAndFields[1:]]  # right-justify
+            print 's', field0, ' '.join(fields)
+        pad = ' '.join(' ' * i for i in widths[:-1])
+        for i in self.prob:
+            print 'p', pad, i
+        print  # blank line afterwards
+
+def topSeqBeg(maf): return maf.beg[0]
+def topSeqEnd(maf): return maf.end[0]
+def emptyMaf(): return MafBlock([], [], [], [], [], [], [])
+
+def joinOnFirstItem(x, y):
+    if x[0] != y[0]:
+        raise ValueError('join fields not equal:\n'+str(x[0])+'\n'+str(y[0]))
+    return x + y[1:]
+
+def mafEasyJoin(x, y):
+    '''Join two MAF blocks on the top sequence.'''
+    xJoin = zip(x.chr, x.beg, x.end, x.strand, x.chrSize, x.seq)
+    yJoin = zip(y.chr, y.beg, y.end, y.strand, y.chrSize, y.seq)
+    joined = joinOnFirstItem(xJoin, yJoin)
+    chr, beg, end, strand, chrSize, seq = zip(*joined)
+    prob = x.prob + y.prob
+    return MafBlock(chr, beg, end, strand, chrSize, seq, prob)
+
+def countNonGaps(s): return len(s) - s.count('-')
+
+def nthNonGap(s, n):
+    '''Get the start position of the n-th non-gap.'''
+    for i, x in enumerate(s):
+        if x != '-':
+            if n == 0: return i
+            n -= 1
+    raise ValueError('non-gap not found')
+
+def nthLastNonGap(s, n):
+    '''Get the end position of the n-th last non-gap.'''
+    return len(s) - nthNonGap(s[::-1], n)
+
+def mafSlice(maf, alnBeg, alnEnd):
+    '''Return a slice of a MAF block, using coordinates in the alignment.'''
+    beg = [b + countNonGaps(s[:alnBeg]) for b, s in zip(maf.beg, maf.seq)]
+    end = [e - countNonGaps(s[alnEnd:]) for e, s in zip(maf.end, maf.seq)]
+    seq = [i[alnBeg:alnEnd] for i in maf.seq]
+    prob = [i[alnBeg:alnEnd] for i in maf.prob]
+    return MafBlock(maf.chr, beg, end, maf.strand, maf.chrSize, seq, prob)
+
+def mafSliceTopSeq(maf, newTopSeqBeg, newTopSeqEnd):
+    '''Return a slice of a MAF block, using coordinates in the top sequence.'''
+    lettersFromBeg = newTopSeqBeg - topSeqBeg(maf)
+    lettersFromEnd = topSeqEnd(maf) - newTopSeqEnd
+    alnBeg = nthNonGap(maf.seq[0], lettersFromBeg)
+    alnEnd = nthLastNonGap(maf.seq[0], lettersFromEnd)
+    return mafSlice(maf, alnBeg, alnEnd)
+
+def jumpGaps(sequence, index):
+    '''Return the next index of the sequence where there is a non-gap.'''
+    nextIndex = index
+    while sequence[nextIndex] == '-': nextIndex += 1
+    return nextIndex
+
+def gapsToAdd(sequences):
+    '''Return new gaps and their positions, needed to align the non-gaps.'''
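+    # e.g. for ["ac", "a-c"] this returns [[(1, 1)], []], meaning:
+    # insert one new gap at position 1 of the first sequence, so that
+    # the non-gaps of both sequences line up.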
+    gapInfo = [[] for i in sequences]
+    gapBeg = [0 for i in sequences]
+    try:
+        while True:
+            gapEnd = [jumpGaps(s, p) for s, p in zip(sequences, gapBeg)]
+            gapSize = [e-b for b, e in zip(gapBeg, gapEnd)]
+            maxGapSize = max(gapSize)
+            for s, e, i in zip(gapSize, gapEnd, gapInfo):
+                if s < maxGapSize:
+                    newGap = maxGapSize - s
+                    i.append((newGap, e))
+            gapBeg = [e+1 for e in gapEnd]
+    except IndexError: return gapInfo
+
+def chunksAndGaps(s, gapsAndPositions, oneGap):
+    '''Yield chunks of "s" interspersed with gaps at given positions.'''
+    oldPosition = 0
+    for gapLen, position in gapsAndPositions:
+        yield s[oldPosition:position]
+        yield oneGap * gapLen
+        oldPosition = position
+    yield s[oldPosition:]
+
+def mafAddGaps(maf, gapsAndPositions):
+    '''Add the given gaps at the given positions to a MAF block.'''
+    maf.seq = [sum(chunksAndGaps(i, gapsAndPositions, ['-']), [])
+               for i in maf.seq]
+    maf.prob = [''.join(chunksAndGaps(i, gapsAndPositions, '~'))
+                for i in maf.prob]
+
+def mafJoin(mafs):
+    '''Intersect and join overlapping MAF blocks.'''
+    newTopSeqBeg = max(map(topSeqBeg, mafs))
+    newTopSeqEnd = min(map(topSeqEnd, mafs))
+    mafs = [mafSliceTopSeq(i, newTopSeqBeg, newTopSeqEnd) for i in mafs]
+    topSeqs = [i.seq[0] for i in mafs]
+    gapInfo = gapsToAdd(topSeqs)
+    for maf, gapsAndPositions in zip(mafs, gapInfo):
+        mafAddGaps(maf, gapsAndPositions)
+    return reduce(mafEasyJoin, mafs)
+
+def mafInput(lines):
+    '''Read lines and yield MAF blocks.'''
+    maf = emptyMaf()
+    for line in lines:
+        if line.isspace():
+            if maf: yield maf
+            maf = emptyMaf()
+        else:
+            maf.addLine(line)
+    if maf: yield maf
+
+def sortedMafInput(lines):
+    '''Read lines and yield MAF blocks, checking that they are in order.'''
+    old = emptyMaf()
+    for maf in mafInput(lines):
+        if maf < old: sys.exit(progName + ": MAF blocks not sorted properly")
+        yield maf
+        old = maf
+
+def allOverlaps(sequences, beg, end):
+    '''Yield all combinations of MAF blocks that overlap in the top genome.'''
+    assert beg < end
+    if not sequences:
+        yield ()
+        return
+    for i in sequences[0]:
+        if topSeqEnd(i) <= beg: continue
+        if topSeqBeg(i) >= end: break  # assumes they're sorted by position
+        newBeg = max(beg, topSeqBeg(i))
+        newEnd = min(end, topSeqEnd(i))
+        for j in allOverlaps(sequences[1:], newBeg, newEnd):
+            yield (i,) + j
+
+def nextWindow(window, input, referenceMaf):
+    '''Yield "relevant" MAF blocks, based on overlap with referenceMaf.'''
+    for maf in window:
+        if not maf.before(referenceMaf): yield maf
+    try:
+        while True:
+            maf = input.peek()
+            if maf.after(referenceMaf): break
+            maf = input.next()
+            if not maf.before(referenceMaf): yield maf
+    except StopIteration: pass
+
+def overlappingMafs(sortedMafInputs):
+    '''Yield all combinations of MAF blocks that overlap in the top genome.'''
+    if not sortedMafInputs: return
+    head, tail = sortedMafInputs[0], sortedMafInputs[1:]
+    windows = [[] for t in tail]
+    for h in head:  # iterate over MAF blocks in the first input
+        windows = [list(nextWindow(w, t, h)) for w, t in zip(windows, tail)]
+        for i in allOverlaps(windows, topSeqBeg(h), topSeqEnd(h)):
+            yield (h,) + i
+
+op = optparse.OptionParser(usage="%prog sorted-file1.maf sorted-file2.maf ...")
+(opts, args) = op.parse_args()
+
+progName = os.path.basename(sys.argv[0])
+
+inputs = [peekable(sortedMafInput(fileinput.input(i))) for i in args]
+
+for mafs in overlappingMafs(inputs):
+    mafJoin(mafs).write()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/maf-sort	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,77 @@
+#! /bin/sh
+
+# Sort MAF-format alignments by sequence name, then strand, then start
+# position, then end position, of the top sequence.  Also, merge
+# identical alignments.  Comment lines starting with "#" are written
+# at the top, in unchanged order.  If option "-d" is specified, then
+# alignments that appear only once are omitted (like uniq -d).
+
+# Minor flaws that do not matter for typical MAF input:
+# 1) It might not work if the input includes TABs.
+# 2) Leading whitespace is considered part of the sequence name.  I
+# want to use sort -b, but it seems to be broken in different ways for
+# different versions of sort!
+# 3) Alignments that differ only in whitespace are considered
+# non-identical.
+
+# This script uses perl instead of specialized commands like uniq.
+# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
+# work with long lines.
+
+# Make "sort" use a standard ordering:
+LC_ALL=C
+export LC_ALL
+
+uniqOpt=1
+whichSequence=1
+while getopts hdn: opt
+do
+    case $opt in
+	h)  cat <<EOF
+Usage: $(basename $0) [options] my-alignments.maf
+
+Options:
+  -h  show this help message and exit
+  -d  only print duplicate alignments
+  -n  sort by the n-th sequence (default: 1)
+EOF
+	    exit
+	    ;;
+	d)  uniqOpt=2
+            ;;
+	n)  whichSequence="$OPTARG"
+	    ;;
+    esac
+done
+shift $((OPTIND - 1))
+
+baseField=$((6 * $whichSequence))
+a=$(($baseField - 4))
+a=$a,$a
+b=$(($baseField - 1))
+b=$b,$b
+c=$(($baseField - 3))
+c=$c,$c
+d=$(($baseField - 2))
+d=$d,$d
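+# e.g. with the default -n 1 this gives: sort -k2,2 -k5,5 -k3,3n -k4,4n
+# i.e. name, then strand, then start (numeric), then size (numeric).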
+
+# 1) Add digits to "#" lines, so that sorting won't change their order.
+# 2) Replace spaces, except in "s" lines.
+# 3) Join each alignment into one big line.
+perl -pe '
+s/^#/sprintf("#%.9d",$c++)/e;
+y/ /\a/ unless /^s/;
+y/\n/\b/ if /^\w/;
+' "$@" |
+
+sort -k$a -k$b -k${c}n -k${d}n |  # sort the lines
+
+# Print only the first (or second) of each run of identical lines:
+perl -ne '$c = 0 if $x ne $_; $x = $_; print if ++$c == '$uniqOpt |
+
+# 1) Remove the digits from "#" lines.
+# 2) Restore spaces and newlines.
+perl -pe '
+s/^#.{9}/#/;
+y/\a\b/ \n/;
+'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/maf-swap	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,140 @@
+#! /usr/bin/env python
+
+# Read MAF-format alignments, and write them, after moving the Nth
+# sequence to the top in each alignment.
+
+# Before writing, if the top sequence would be on the - strand, then
+# flip all the strands.  But don't do this if the top sequence is
+# translated DNA.
+
+# Seems to work with Python 2.x, x>=4
+
+import fileinput, itertools, optparse, os, signal, string, sys
+
+def filterComments(lines):
+    for i in lines:
+        if i.startswith("#"): print i,
+        else: yield i
+
+def mafInput(lines):
+    for k, v in itertools.groupby(lines, str.isspace):
+        if not k: yield list(v)
+
+def indexOfNthSequence(mafLines, n):
+    for i, line in enumerate(mafLines):
+        if line.startswith("s"):
+            if n == 1: return i
+            n -= 1
+    raise Exception("encountered an alignment with too few sequences")
+
+def rangeOfNthSequence(mafLines, n):
+    """Get the range of lines associated with the Nth sequence."""
+    start = indexOfNthSequence(mafLines, n)
+    stop = start + 1
+    while stop < len(mafLines):
+        line = mafLines[stop]
+        if not (line.startswith("q") or line.startswith("i")): break
+        stop += 1
+    return start, stop
+
+complement = string.maketrans('ACGTNSWRYKMBDHVacgtnswrykmbdhv',
+                              'TGCANSWYRMKVHDBtgcanswyrmkvhdb')
+# doesn't handle "U" in RNA sequences
+def revcomp(seq):
+    return seq[::-1].translate(complement)
+
+def flippedMafS(words):
+    alnStart = int(words[2])
+    alnSize = int(words[3])
+    strand = words[4]
+    seqSize = int(words[5])
+    alnString = words[6]
+    newStart = seqSize - alnStart - alnSize
+    if strand == "-": newStrand = "+"
+    else:             newStrand = "-"
+    newString = revcomp(alnString)
+    out = words[0], words[1], newStart, alnSize, newStrand, seqSize, newString
+    return map(str, out)
+
+def flippedMafP(words):
+    flippedString = words[1][::-1]
+    return words[:1] + [flippedString]
+
+def flippedMafQ(words):
+    qualityString = words[2]
+    flippedString = qualityString[::-1]
+    return words[:2] + [flippedString]
+
+def flippedMafLine(mafLine):
+    words = mafLine.split()
+    if   words[0] == "s": return flippedMafS(words)
+    elif words[0] == "p": return flippedMafP(words)
+    elif words[0] == "q": return flippedMafQ(words)
+    else: return words
+
+def maxlen(s):
+    return max(map(len, s))
+
+def sLineFieldWidths(mafLines):
+    sLines = (i for i in mafLines if i[0] == "s")
+    sColumns = zip(*sLines)
+    return map(maxlen, sColumns)
+
+def joinedMafS(words, fieldWidths):
+    formatParams = itertools.chain(*zip(fieldWidths, words))
+    return "%*s %-*s %*s %*s %*s %*s %*s\n" % tuple(formatParams)
+
+def joinedMafLine(words, fieldWidths):
+    if words[0] == "s":
+        return joinedMafS(words, fieldWidths)
+    elif words[0] == "q":
+        words = words[:2] + [""] * 4 + words[2:]
+        return joinedMafS(words, fieldWidths)
+    elif words[0] == "p":
+        words = words[:1] + [""] * 5 + words[1:]
+        return joinedMafS(words, fieldWidths)
+    else:
+        return " ".join(words) + "\n"
+
+def flippedMaf(mafLines):
+    flippedLines = map(flippedMafLine, mafLines)
+    fieldWidths = sLineFieldWidths(flippedLines)
+    return (joinedMafLine(i, fieldWidths) for i in flippedLines)
+
+def isCanonicalStrand(mafLine):
+    words = mafLine.split()
+    strand = words[4]
+    if strand == "+": return True
+    alnString = words[6]
+    if "/" in alnString or "\\" in alnString: return True  # frameshifts
+    alnSize = int(words[3])
+    gapCount = alnString.count("-")
+    if len(alnString) - gapCount < alnSize: return True  # translated DNA
+    return False
+
+def mafSwap(opts, args):
+    inputLines = fileinput.input(args)
+    for mafLines in mafInput(filterComments(inputLines)):
+        start, stop = rangeOfNthSequence(mafLines, opts.n)
+        mafLines[1:stop] = mafLines[start:stop] + mafLines[1:start]
+        if not isCanonicalStrand(mafLines[1]):
+            mafLines = flippedMaf(mafLines)
+        for i in mafLines: print i,
+        print  # blank line after each alignment
+
+if __name__ == "__main__":
+    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
+
+    usage = "%prog [options] my-alignments.maf"
+    description = "Change the order of sequences in MAF-format alignments."
+    op = optparse.OptionParser(usage=usage, description=description)
+    op.add_option("-n", type="int", default=2,
+                  help="move the Nth sequence to the top (default: %default)")
+    (opts, args) = op.parse_args()
+    if opts.n < 1: op.error("option -n: should be >= 1")
+
+    try: mafSwap(opts, args)
+    except KeyboardInterrupt: pass  # avoid silly error message
+    except Exception, e:
+        prog = os.path.basename(sys.argv[0])
+        sys.exit(prog + ": error: " + str(e))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/parallel-fasta	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,8 @@
+#! /bin/sh
+
+parallel --gnu --version > /dev/null || exit 1
+
+parallel --gnu --minversion 20130222 > /dev/null ||
+echo $(basename $0): warning: old version of parallel, might be slow 1>&2
+
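+# Example (assumption: a LAST database "mydb" built with lastdb exists):
+#   parallel-fasta "lastal mydb" < reads.fa > alns.maf
+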
+exec parallel --gnu --pipe --recstart '>' "$@"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/parallel-fastq	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,9 @@
+#! /bin/sh
+
+parallel --gnu --version > /dev/null || exit 1
+
+parallel --gnu --minversion 20130222 > /dev/null ||
+echo $(basename $0): warning: old version of parallel, might be slow 1>&2
+
+# use a record size of 8 lines, so that paired sequences stay together:
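+# Example (assumption: interleaved paired reads on stdin, e.g. from
+# fastq-interleave):
+#   fastq-interleave 1.fastq 2.fastq | parallel-fastq "lastal -Q1 mydb" > alns.maf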
+exec parallel --gnu --pipe -L8 "$@"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bsf-call_wrapper.pl	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,51 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use FindBin;
+
+print STDOUT "The tool script is called with:\n", join(" ", ($0, @ARGV)), "\n\n";
+
+my ($idx, $in) = ("", "");
+my $default_option = "-o bsf-call.out -W bsfwork";
+
+my $tooldir = shift(@ARGV);
+$tooldir = $FindBin::Bin;
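+# Note: the TOOLDIR argument shifted off above is discarded; the script
+# locates its own directory via FindBin instead.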
+$ENV{PATH} = "$tooldir/bin:" . $ENV{PATH};
+my $reference_source = shift(@ARGV);
+my $read_end = shift(@ARGV);
+my $gslot = shift(@ARGV);
+#$idx = "$tooldir/data/chrX.sub.fa";
+
+if ($reference_source eq "indexed") {
+    $idx = shift(@ARGV);
+}
+elsif ($reference_source eq "history") {
+    my $own = shift(@ARGV);
+    $idx = "reference.fa";
+    &invoke_command("ln -s $own reference.fa");
+}
+else {
+    die "never reach here\n";
+}
+
+if ($read_end eq "single-end") {
+    $in = shift(@ARGV);
+    &invoke_command("$tooldir/bin/bsf-call $default_option -p $gslot $idx $in");
+}
+elsif ($read_end eq "paired-end") {
+    my $in1 = shift(@ARGV);
+    my $in2 = shift(@ARGV);
+    $in = $in1 . "," . $in2;
+    &invoke_command("$tooldir/bin/bsf-call $default_option -p $gslot $idx $in");
+}
+else {
+    die "never reach here\n";
+}
+
+sub invoke_command {
+    my ($command) = @_;
+    print "invoking: $command\n";
+    system($command) == 0
+        or die "command failed: $command\n";
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bsf-call_wrapper.xml	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,116 @@
+<tool id="bsf-call" name="bsf-call" version="1.0.0">
+  <description>Mapping bisulfite-seq reads and calling methylated cytosines</description>
+<!--
+  <version_command></version_command>
+-->
+
+  <requirements>
+    <requirement type="set_environment">TOOLDIR</requirement>
+  </requirements>
+
+  <command interpreter="perl">
+    bsf-call_wrapper.pl TOOLDIR $reference.source $read.end \${GALAXY_SLOTS:-1} 
+
+    #if $reference.source=="indexed":
+      $reference.index.fields.path
+    #else if $reference.source=="history":
+      $reference.own_file
+    #else 
+
+    #end if
+
+    #if $read.end=="single-end"
+      $in 
+    #else if $read.end=="paired-end"
+      $in1 $in2
+    #else
+      
+    #end if
+  </command>
+
+  <inputs>
+    <conditional name="reference">
+      <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+        <option value="indexed">Use a built-in genome index</option>
+        <option value="history">Use a genome from the history and build index</option>
+      </param>
+      <when value="indexed">
+        <param name="index" type="select" label="Select reference genome">
+          <options from_data_table="bsf-call_indexes">
+            <filter type="sort_by" column="2"/>
+            <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="own_file" type="data" format="fasta" label="Select reference genome"/>
+      </when>
+    </conditional>
+
+    <conditional name="read">
+      <param name="end" type="select" label="Will you use single-end reads or paired-end reads?">
+        <option value="single-end">Single-end reads</option>
+        <option value="paired-end">Paired-end reads</option>
+      </param>
+      <when value="single-end">
+        <param name="in" type="data" format="fastqsanger" label="Single-end reads in fastqsanger format"/>
+      </when>
+      <when value="paired-end">
+        <param name="in1" type="data" format="fastqsanger" label="Paired-end reads 1 in fastqsanger format"/>
+        <param name="in2" type="data" format="fastqsanger" label="Paired-end reads 2 in fastqsanger format"/>
+      </when>
+    </conditional> 
+  </inputs>
+
+  <outputs>
+    <data name="outres" format="tabular" label="${tool.name} on ${on_string}: result" from_work_dir="bsf-call.out"/>
+    <data name="outlog" format="txt" label="${tool.name} on ${on_string}: log" from_work_dir="bsfwork/bsf-call.log"/>
+  </outputs>
+
+  <help>
+**bsf-call**
+
+Mapping bisulfite-seq reads and calling methylated cytosines
+
+------
+
+**Input format**
+
+Inputs are bisulfite-seq reads in fastqsanger format (single-end or paired-end), and a reference genome index (built-in or constructed from your fasta file). 
+
+------
+
+**Output format**
+
+Output is a six-column tab-delimited file::
+
+  Col.| Description
+  ----+--------------------------------------
+  1   | chromosome label (e.g. chr1)
+  2   | genomic position (0-based)
+  3   | strand (+,-)
+  4   | mC context (CG, CHG, CHH)
+  5   | mC rate (float)
+  6   | read coverage
+
+------
+
+**Contact**
+
+Toutai Mituyama
+
+mituyama-toutai AT aist.go.jp
+  </help>
+
+  <citations>
+    <citation type="doi">10.1093/nar/gkt1373</citation>
+  </citations>
+
+</tool>
+
+<!--
+Also note the use of the reserved parameter name GALAXY_DATA_INDEX_DIR - it points to the ~/tool-data directory. 
+ \${GALAXY_SLOTS:-4} 
+
+Number of cores/threads allocated by the job runner or resource manager to the tool for the given job (here 4 is the default number of threads, used when running via a custom runner that does not configure GALAXY_SLOTS, or in an older Galaxy runtime).
+-->
Binary file example-data/1.fastq.gz has changed
Binary file example-data/2.fastq.gz has changed
Binary file example-data/chrX.sub.fa.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/bsf-call_indexes.loc.sample	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,2 @@
+hg19	hg19	hg19	/disk/reference/Bisulfighter/Homo_sapiens/UCSC/hg19/Sequence/BsfcallIndex/genome.fa
+test	test	test	/disk/reference/Bisulfighter/Homo_sapiens/UCSC/hg19/Sequence/BsfcallIndex/chrX.sub.fa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,6 @@
+<tables>
+  <table name="bsf-call_indexes" comment_char="#">
+    <columns>value, dbkey, name, path</columns>
+    <file path="tool-data/bsf-call_indexes.loc"/>
+  </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <set_environment>
+    <environment_variable name="TOOLDIR" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable>   
+  </set_environment>
+</tool_dependency>