Mercurial > repos > richard-burhans > segalign

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/diagonal_partition.py	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+
+"""
+Diagonal partitioning for segment files output by SegAlign.
+
+Usage:
+diagonal_partition <max-segments> <lastz-command>
+"""
+
+import os
+import sys
+
+
+def chunks(lst, n):
+    """Yield successive n-sized chunks from list."""
+    for i in range(0, len(lst), n):
+        yield lst[i: i + n]
+
+
+if __name__ == "__main__":
+
+    DELETE_AFTER_CHUNKING = True
+
+    # input_params = "10000 sad sadsa sad --segments=tmp10.block5.r1239937044.plus.segments dsa sa --strand=plus --output=out.maf sadads 2> logging.err"
+    # sys.argv = [sys.argv[0]] + input_params.split(' ')
+    chunk_size = int(sys.argv[1])  # first parameter contains chunk size
+    params = sys.argv[2:]
+
+    # Parsing command output from SegAlign
+    segment_key = "--segments="
+    segment_index = None
+    input_file = None
+
+    for index, value in enumerate(params):
+        if value[: len(segment_key)] == segment_key:
+            segment_index = index
+            input_file = value[len(segment_key):]
+            break
+    if segment_index is None:
+        print(f"Error: could not segment key {segment_key} in parameters {params}")
+        exit(0)
+
+    if not os.path.isfile(input_file):
+        print(f"Error: File {input_file} does not exist")
+        exit(0)
+
+    if (
+        chunk_size == 0 or sum(1 for _ in open(input_file)) <= chunk_size
+    ):  # no need to sort if number of lines <= chunk_size
+        print(" ".join(params), flush=True)
+        exit(0)
+
+    # Find rest of relevant parameters
+    output_key = "--output="
+    output_index = None
+    output_alignment_file = None
+    output_alignment_file_base = None
+    output_format = None
+
+    strand_key = "--strand="
+    strand_index = None
+    for index, value in enumerate(params):
+        if value[: len(output_key)] == output_key:
+            output_index = index
+            output_alignment_file = value[len(output_key):]
+            output_alignment_file_base, output_format = output_alignment_file.rsplit(
+                ".", 1
+            )
+        if value[: len(strand_key)] == strand_key:
+            strand_index = index
+    if segment_index is None:
+        print(f"Error: could not output key {output_key} in parameters {params}")
+        exit(0)
+    if strand_index is None:
+        print(f"Error: could not output key {strand_key} in parameters {params}")
+        exit(0)
+
+    err_index = -1  # error file is at very end
+    err_name_base = params[-1].split(".err", 1)[0]
+
+    data = {}  # dict of list of tuple (x, y, str)
+
+    direction = None
+    if "plus" in params[strand_index]:
+        direction = "f"
+    elif "minus" in params[strand_index]:
+        direction = "r"
+    else:
+        print(
+            f"Error: could not figure out direction from strand value {params[strand_index]}"
+        )
+        exit(0)
+
+    for line in open(input_file, "r"):
+        if line == "":
+            continue
+        (
+            seq1_name,
+            seq1_start,
+            seq1_end,
+            seq2_name,
+            seq2_start,
+            seq2_end,
+            _dir,
+            score,
+        ) = line.split()
+        # data.append((int(seq1_start), int(seq2_start), line))
+        half_dist = int((int(seq1_end) - int(seq1_start)) // 2)
+        assert int(seq1_end) > int(seq1_start)
+        assert int(seq2_end) > int(seq2_start)
+        seq1_mid = int(seq1_start) + half_dist
+        seq2_mid = int(seq2_start) + half_dist
+        data.setdefault((seq1_name, seq2_name), []).append((seq1_mid, seq2_mid, line))
+
+    # If there are chromosome pairs with segment count <= chunk_size
+    # then no need to sort and split these pairs into separate files.
+    # It is better to keep these pairs in a single segment file.
+    skip_pairs = []  # pairs that have count <= chunk_size.
+    # these will not be sorted
+    if len(data.keys()) > 1:
+        for pair in data.keys():
+            if len(data[pair]) <= chunk_size:
+                skip_pairs.append(pair)
+
+    # sorting for forward segments
+    if direction == "r":
+        for pair in data.keys():
+            if pair not in skip_pairs:
+                data[pair] = sorted(
+                    data[pair], key=lambda coord: (coord[1] - coord[0], coord[0])
+                )
+    # sorting for reverse segments
+    elif direction == "f":
+        for pair in data.keys():
+            if pair not in skip_pairs:
+                data[pair] = sorted(
+                    data[pair], key=lambda coord: (coord[1] + coord[0], coord[0])
+                )
+    else:
+        print(f"INVALID DIRECTION VALUE: {direction}")
+        exit(0)
+
+    # Writing file in chunks
+    ctr = 0
+    for pair in data.keys() - skip_pairs:
+        for chunk in chunks(list(zip(*data[pair]))[2], chunk_size):
+            ctr += 1
+            name_addition = f".split{ctr}"
+            fname = input_file.split(".segments", 1)[0] + name_addition + ".segments"
+
+            with open(fname, "w") as f:
+                f.writelines(chunk)
+            # update segment file in command
+            params[segment_index] = segment_key + fname
+            # update output file in command
+            params[output_index] = (
+                output_key
+                + output_alignment_file_base
+                + name_addition
+                + "."
+                + output_format
+            )
+            # update error file in command
+            params[-1] = err_name_base + name_addition + ".err"
+            print(" ".join(params), flush=True)
+
+    # writing unsorted skipped pairs
+    if len(skip_pairs) > 0:
+        skip_pairs_with_len = sorted(
+            [(len(data[p]), p) for p in skip_pairs]
+        )  # list of tuples of (pair length, pair)
+        aggregated_skip_pairs = []  # list of list of pair names
+        current_count = 0
+        aggregated_skip_pairs.append([])
+        for count, pair in skip_pairs_with_len:
+            if current_count + count <= chunk_size:
+                current_count += count
+                aggregated_skip_pairs[-1].append(pair)
+            else:
+                aggregated_skip_pairs.append([])
+                current_count = count
+                aggregated_skip_pairs[-1].append(pair)
+
+        for aggregate in aggregated_skip_pairs:
+            ctr += 1
+            name_addition = f".split{ctr}"
+            fname = input_file.split(".segments", 1)[0] + name_addition + ".segments"
+            with open(fname, "w") as f:
+                for pair in sorted(aggregate, key=lambda p: (p[1], p[0])):
+                    chunk = list(zip(*data[pair]))[2]
+                    f.writelines(chunk)
+            # update segment file in command
+            params[segment_index] = segment_key + fname
+            # update output file in command
+            params[output_index] = (
+                output_key
+                + output_alignment_file_base
+                + name_addition
+                + "."
+                + output_format
+            )
+            # update error file in command
+            params[-1] = err_name_base + name_addition + ".err"
+            print(" ".join(params), flush=True)
+
+    if DELETE_AFTER_CHUNKING:
+        os.remove(input_file)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gapped_extension_options.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,10 @@
+<macros>
+    <xml name="gapped_extension_options">
+        <section name="gapped_extension_options" expanded="false" title="Gapped Extension Options">
+            <param argument="--nogapped" type="boolean" checked="false" label="Don't perform gapped extension stage"/>
+            <param argument="--ydrop" type="integer" value="9430" label="Y-drop value for gapped extension"/>
+            <param argument="--gappedthresh" type="integer" value="" optional="true" label="Score threshold for gapped alignments"/>
+            <param argument="--notrivial" type="boolean" checked="false" label="Don't output a trivial self-alignment block if the target and query sequences are identical"/>
+        </section>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lastz-cmd.ini	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,94 @@
+[arguments]
+flag_args =
+  allgappedbounds
+  anyornone
+  gapped
+  gfextend
+  markend
+  nocensus
+  noentropy
+  nofilter
+  nogapped
+  nogfextend
+  nomirror
+  norecoverseeds
+  notransition
+  notrivial
+  notwins
+  noytrim
+  recoverseeds
+  self
+  version
+  yasra75
+  yasra85
+  yasra85short
+  yasra90
+  yasra95
+  yasra95short
+  yasra98
+
+str_args =
+  action:query
+  action:target
+  ambiguous
+  ball
+  chores
+  filter
+  format
+  gap
+  hspthresh
+  include
+  match
+  maxwordcount
+  mismatch
+  nochain
+  output
+  outputmasking
+  outputmasking+
+  outputmasking+:soft
+  outputmasking:soft
+  querydepth
+  queryhsplimit
+  rdotplot
+  readgroup
+  scores
+  seed
+  segments
+  show
+  strand
+  targetcapsule
+  twins
+  writecapsule
+  writesegments
+
+int_args =
+  allocate:query
+  allocate:target
+  allocate:traceback
+  exact
+  gappedthresh
+  inner
+  masking
+  queryhspbest
+  step
+  word
+  xdrop
+  ydrop
+
+bool_str_args=
+  census
+  census16
+  census32
+  chain
+  entropy
+  help
+  infer
+  inferonly
+  infscores
+  tableonly
+
+bool_int_args=
+  progress
+  progress+masking
+  transition
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,22 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">segalign-full</requirement>
+            <requirement type="package" version="0.18">bashlex</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@TOOL_VERSION@">0.1.2.1</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.05</token>
+    <xml name="edam_ontology">
+        <edam_operations>
+            <edam_operation>operation_0491</edam_operation>
+        </edam_operations>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1109/SC41405.2020.00043</citation>
+        </citations>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/output_options.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,8 @@
+<macros>
+    <xml name="output_options">
+        <section name="output_options" expanded="false" title="Output Options">
+            <yield/>
+            <param argument="--markend" type="boolean" value="false" label="Write a marker line just before completion"/>
+        </section>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/package_output.py	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,286 @@
+#!/usr/bin/env python
+
+import argparse
+import configparser
+import json
+import os
+import sys
+import tarfile
+import typing
+
+import bashlex
+
+
+class PackageFile:
+    def __init__(
+        self,
+        pathname: str = "data_package.tgz",
+        top_dir: str = "galaxy",
+        data_dir: str = "files",
+        config_file: str = "commands.json",
+    ) -> None:
+        self.pathname: str = os.path.realpath(pathname)
+        self.data_root: str = os.path.join(top_dir, data_dir)
+        self.config_path: str = os.path.join(top_dir, config_file)
+        self.config_file: str = config_file
+        self.tarfile: typing.Optional[tarfile.TarFile] = None
+        self.name_cache: typing.Dict[typing.Any, typing.Any] = {}
+        self.working_dir: str = os.path.realpath(os.getcwd())
+
+    def _initialize(self) -> None:
+        if self.tarfile is None:
+            self.tarfile = tarfile.open(
+                name=self.pathname,
+                mode="w:gz",
+                format=tarfile.GNU_FORMAT,
+                compresslevel=1,
+            )
+
+    def add_config(self, pathname: str) -> None:
+        if self.tarfile is None:
+            self._initialize()
+
+        source_path = os.path.realpath(pathname)
+
+        if self.tarfile is not None:
+            self.tarfile.add(source_path, arcname=self.config_path, recursive=False)
+
+    def add_file(self, pathname: str, arcname: typing.Optional[str] = None) -> None:
+        if self.tarfile is None:
+            self._initialize()
+
+        source_path = os.path.realpath(pathname)
+
+        dest_path = None
+
+        if arcname is None:
+            dest_path = os.path.join(self.data_root, os.path.basename(source_path))
+        else:
+            arc_path = os.path.realpath(arcname)
+            rel_path = os.path.relpath(arc_path, self.working_dir)
+            if not (os.path.isabs(rel_path) or rel_path.startswith("../")):
+                dest_path = os.path.join(self.data_root, rel_path)
+            else:
+                sys.exit("path fail")
+
+        if dest_path is not None:
+            if self.tarfile is not None:
+                if dest_path not in self.name_cache:
+                    try:
+                        self.tarfile.add(
+                            source_path, arcname=dest_path, recursive=False
+                        )
+                    except FileNotFoundError:
+                        sys.exit(f"missing source file {source_path}")
+
+                    self.name_cache[dest_path] = 1
+                    # print(f"added: {dest_path}", flush=True)
+
+    def close(self) -> None:
+        if self.tarfile is not None:
+            self.tarfile.close()
+            self.tarfile = None
+
+
+class bashCommandLineFile:
+    def __init__(
+        self,
+        pathname: str,
+        config: configparser.ConfigParser,
+        package_file: PackageFile,
+    ) -> None:
+        self.pathname: str = pathname
+        self.config = config
+        self.package_file = package_file
+        self.executable: typing.Optional[str] = None
+        self._parse_lines()
+
+    def _parse_lines(self) -> None:
+        with open("commands.json", "w") as ofh:
+            with open(self.pathname) as f:
+                line: str
+                for line in f:
+                    line = line.rstrip("\n")
+                    command_dict = self._parse_line(line)
+                    # we may want to re-write args here
+                    new_args_list = []
+
+                    args_list = command_dict.get("args", [])
+                    for arg in args_list:
+                        if arg.startswith("--target="):
+                            pathname = arg[9:]
+                            new_args_list.append(arg)
+                            if "[" in pathname:
+                                elems = pathname.split("[")
+                                sequence_file = elems.pop(0)
+                                self.package_file.add_file(sequence_file, sequence_file)
+                                for elem in elems:
+                                    if elem.endswith("]"):
+                                        elem = elem[:-1]
+                                        if elem.startswith("subset="):
+                                            subset_file = elem[7:]
+                                            self.package_file.add_file(subset_file)
+
+                        elif arg.startswith("--query="):
+                            pathname = arg[8:]
+                            new_args_list.append(arg)
+                            if "[" in pathname:
+                                elems = pathname.split("[")
+                                sequence_file = elems.pop(0)
+                                self.package_file.add_file(sequence_file, sequence_file)
+                                for elem in elems:
+                                    if elem.endswith("]"):
+                                        elem = elem[:-1]
+                                        if elem.startswith("subset="):
+                                            subset_file = elem[7:]
+                                            self.package_file.add_file(subset_file)
+                        elif arg.startswith("--segments="):
+                            pathname = arg[11:]
+                            new_args_list.append(arg)
+                            self.package_file.add_file(pathname)
+                        elif arg.startswith("--scores="):
+                            pathname = arg[9:]
+                            new_args_list.append("--scores=data/scores.txt")
+                            self.package_file.add_file(pathname, "data/scores.txt")
+                        else:
+                            new_args_list.append(arg)
+
+                    command_dict["args"] = new_args_list
+                    print(json.dumps(command_dict), file=ofh)
+
+        self.package_file.add_config("commands.json")
+
+    def _parse_line(self, line: str) -> typing.Dict[str, typing.Any]:
+        # resolve shell redirects
+        trees: typing.List[typing.Any] = bashlex.parse(line, strictmode=False)  # type: ignore[attr-defined]
+        positions: typing.List[typing.Tuple[int, int]] = []
+
+        for tree in trees:
+            visitor = nodevisitor(positions)
+            visitor.visit(tree)
+
+        # do replacements from the end so the indicies will be correct
+        positions.reverse()
+
+        processed = list(line)
+        for start, end in positions:
+            processed[start:end] = ""
+
+        processed_line: str = "".join(processed)
+
+        command_dict = self._parse_processed_line(processed_line)
+        command_dict["stdin"] = visitor.stdin
+        command_dict["stdout"] = visitor.stdout
+        command_dict["stderr"] = visitor.stderr
+
+        return command_dict
+
+    def _parse_processed_line(self, line: str) -> typing.Dict[str, typing.Any]:
+        argv: typing.List[str] = list(bashlex.split(line))  # type: ignore[attr-defined]
+        self.executable = argv.pop(0)
+
+        parser: argparse.ArgumentParser = argparse.ArgumentParser(add_help=False)
+        if "arguments" in self.config:
+            arguments_section = self.config["arguments"]
+
+            arg: str
+            if "flag_args" in arguments_section:
+                for arg in arguments_section["flag_args"].split():
+                    parser.add_argument(f"--{arg}", action="store_true")
+
+            if "str_args" in arguments_section:
+                for arg in arguments_section["str_args"].split():
+                    parser.add_argument(f"--{arg}", type=str)
+
+            if "bool_str_args" in arguments_section:
+                for arg in arguments_section["bool_str_args"].split():
+                    parser.add_argument(
+                        f"--{arg}", nargs="?", const=True, default=False
+                    )
+
+            if "int_args" in arguments_section:
+                for arg in arguments_section["int_args"].split():
+                    parser.add_argument(f"--{arg}", type=int)
+
+            if "bool_int_args" in arguments_section:
+                for arg in arguments_section["bool_int_args"].split():
+                    parser.add_argument(
+                        f"--{arg}", nargs="?", const=True, default=False
+                    )
+
+        namespace, rest = parser.parse_known_intermixed_args(argv)
+        vars_dict = vars(namespace)
+
+        command_dict: typing.Dict[str, typing.Any] = {
+            "executable": self.executable,
+            "args": [],
+        }
+
+        for var in vars_dict.keys():
+            value = vars_dict[var]
+            if value is not None:
+                if isinstance(value, bool):
+                    if value is True:
+                        command_dict["args"].append(f"--{var}")
+                else:
+                    command_dict["args"].append(f"--{var}={value}")
+
+        if len(rest) >= 0:
+            value = rest.pop(0)
+            command_dict["args"].append(f"--target={value}")
+
+        if len(rest) >= 0:
+            value = rest.pop(0)
+            command_dict["args"].append(f"--query={value}")
+
+        return command_dict
+
+
+class nodevisitor(bashlex.ast.nodevisitor):  # type: ignore[name-defined,misc]
+    def __init__(self, positions: typing.List[typing.Tuple[int, int]]) -> None:
+        self.positions = positions
+        self.stdin = None
+        self.stdout = None
+        self.stderr = None
+
+    def visitredirect(
+        self,
+        n: bashlex.ast.node,  # type: ignore[name-defined]
+        n_input: int,
+        n_type: str,
+        output: typing.Any,
+        heredoc: typing.Any,
+    ) -> None:
+        if isinstance(n_input, int) and 0 <= n_input <= 2:
+            if isinstance(output, bashlex.ast.node) and output.kind == "word":  # type: ignore[attr-defined]
+                self.positions.append(n.pos)
+                if n_input == 0:
+                    self.stdin = output.word
+                elif n_input == 1:
+                    self.stdout = output.word
+                elif n_input == 2:
+                    self.stderr = output.word
+            else:
+                sys.exit(f"oops 1: {type(n_input)}")
+        else:
+            sys.exit(f"oops 2: {type(n_input)}")
+
+    def visitheredoc(self, n: bashlex.ast.node, value: typing.Any) -> None:  # type: ignore[name-defined]
+        pass
+
+
+def main() -> None:
+    our_dirname: str = os.path.dirname(os.path.realpath(__file__))
+    lastz_command_config_file: str = os.path.join(our_dirname, "lastz-cmd.ini")
+
+    config: configparser.ConfigParser = configparser.ConfigParser()
+    config.read(lastz_command_config_file)
+
+    package_file = PackageFile()
+    lastz_command_file = "lastz_commands.txt"
+    bashCommandLineFile(lastz_command_file, config, package_file)
+    package_file.close()
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/run_segalign_diagonal_partition	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+set -o errexit
+set -o nounset
+set -o pipefail
+#set -o xtrace
+
+##
+## parse arguments
+##
+
+SEGALIGN_ARGS=()
+
+MAX_SEGMENT_SIZE=""
+OUTPUT_FILENAME=""
+LASTZ_COMMAND_FILE="lastz_commands.txt"
+HELP=0
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --help)
+            HELP=1
+            shift
+            ;;
+        --max_segments)
+            MAX_SEGMENT_SIZE="$2"
+            shift 2
+            ;;
+        --output)
+            OUTPUT_FILENAME=$(readlink -f "$2")
+            shift 2
+            ;;
+        --tool_directory)
+            TOOL_DIRECTORY="$2"
+            shift 2
+            ;;
+        *)
+            SEGALIGN_ARGS+=("$1")
+            shift
+    esac
+done
+
+set -- "${SEGALIGN_ARGS[@]}"
+
+##
+## check arguments
+##
+
+if [[ $# == 0 || "$HELP" == "1" ]]; then
+    segalign --help
+    exit 0
+fi
+
+if [[ $# -lt 2 ]]; then
+    echo "run_segalign_diagonal_partition: missing target and query sequences" 1>&2
+    exit 1
+fi
+
+ref_path=$(readlink -f "$1")
+test -e "$ref_path" || {
+    echo "run_segalign_diagonal_partition: target file \"$ref_path\" does not exist" 1>&2
+    exit 1
+}
+query_path=$(readlink -f "$2")
+test -e "$query_path" || {
+    echo "run_segalign_diagonal_partition: query file \"$query_path\" does not exist" 1>&2
+    exit 1
+}
+shift 2
+
+DATA_FOLDER="data"
+mkdir -p "$DATA_FOLDER" || {
+    echo "run_segalign_diagonal_partition: cannont create data directory \"$DATA_FOLDER\"" 1>&2
+    exit 1
+}
+
+cd $DATA_FOLDER/..
+echo ""
+echo "Converting fasta files to 2bit format" 1>&2
+
+##
+## convert target and query to 2bit
+##
+faToTwoBit "$ref_path" "$DATA_FOLDER/ref.2bit" || {
+    echo "run_segalign_diagonal_partition: cannot convert \"$ref_path\" to 2bit" 1>&2
+    exit 1
+}
+faToTwoBit "$query_path" "$DATA_FOLDER/query.2bit" || {
+    echo "run_segalign_diagonal_partition: cannont convert \"$ref_path\" to 2bit" 1>&2
+    exit 1
+}
+
+
+
+time {
+    while IFS= read -r line; do
+        "$TOOL_DIRECTORY/diagonal_partition.py" $MAX_SEGMENT_SIZE $line >> $LASTZ_COMMAND_FILE
+        # segalign sort writes out the partitioned segment files to the working
+        # directory and prints the modified lastz commands.
+    done < <(stdbuf -oL segalign $ref_path $query_path "${DATA_FOLDER}/" "$@" ) # segalign begins running in this line,
+} 1>&2 # and every newline written to stdout, get assigned to $line which
+       # gets sent to diagonal_partition for diagonal partitioning
+
+exit 0
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scoring_options.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,25 @@
+<macros>
+    <xml name="scoring_options">
+        <section name="scoring_options" expanded="false" title="Scoring Options">
+            <param argument="--scoring" type="data" value="false" optional="true" format="txt" label="Scoring file in LASTZ format"/>
+            <param name="ambiguous_selector" type="select" label="Ambiguous Nucleotides">
+                <option value="x" selected="true">None</option>
+                <option value="n">N</option>
+                <option value="iupac">IUPAC</option>
+            </param>
+            <conditional name="ambiguous_params">
+                <param name="set_ambiguous_params_selector" type="select" label="Set Ambiguous Nucleotide Scoring Parameters">
+                    <option value="false" selected="true">No</option>
+                    <option value="true">Yes</option>
+                </param>
+                <when value="false">
+                    <!-- Do nothing -->
+                </when>
+                <when value="true">
+                    <param name="ambiguous_reward" type="integer" value="0" label="Ambiguous Nucleotide Reward"/>
+                    <param name="ambiguous_penalty" type="integer" value="0" label="Ambiguous Nucleotide Penalty"/>
+                </when>
+            </conditional>
+        </section>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seeding_options.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,34 @@
+<macros>
+    <xml name="seeding_options">
+        <section name="seeding_options" expanded="false" title="Seeding Options">
+            <conditional name="seed">
+                <param name="seed_selector" type="select" label="Seed patern">
+                    <option value="12of19" selected="true">12of19 (1110100110010101111)</option>
+                    <option value="14of22">14of22 (1110101100110010101111)</option>
+                    <option value="custom">custom</option>
+                </param>
+                <when value="12of19">
+                    <!-- Do nothing -->
+                </when>
+                <when value="14of22">
+                    <!-- Do nothing -->
+                </when>
+                <when value="custom">
+                    <param name="custom_seed" type="text" value="" optional="false" label="Custom seed pattern">
+                        <validator type="empty_field"/>
+                        <validator type="regex" message="Arbitrary pattern of 1s, 0s, and Ts">^[01T]+$</validator>
+                        <sanitizer invalid_char="">
+                            <valid initial="none">
+                                <add value="0"/>
+                                <add value="1"/>
+                                <add value="T"/>
+                            </valid>
+                        </sanitizer>
+                    </param>
+                </when>
+            </conditional>
+            <param argument="--step" type="integer" value="1" label="Offset between the starting positions of successive target words considered for generating seed table"/>
+            <param argument="--notransition" type="boolean" checked="false" label="Don't allow one transition in a seed hit"/>
+        </section>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/segalign.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,246 @@
+<tool id="segalign" name="SegAlign" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>A Scalable GPU System for Pairwise Whole Genome Alignments based on LASTZ's seed-filter-extend paradigm</description>
+    <macros>
+        <import>macros.xml</import>
+        <import>sequence_options.xml</import>
+        <import>scoring_options.xml</import>
+        <import>seeding_options.xml</import>
+        <import>ungapped_extension_options.xml</import>
+        <import>gapped_extension_options.xml</import>
+        <import>output_options.xml</import>
+        <import>segalign_output_options.xml</import>
+        <import>system_options.xml</import>
+    </macros>
+    <expand macro="edam_ontology"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+##
+## https://www.gnu.org/software/coreutils/manual/html_node/nproc-invocation.html
+##
+## If the OMP_NUM_THREADS or OMP_THREAD_LIMIT environment variables
+## are set, then they will determine the minimum and maximum returned
+## value respectively.
+##
+## This is how you tame nproc(1)
+##
+OMP_THREAD_LIMIT=\${GALAXY_SLOTS:-2}
+
+## Mode ----------------------------------------------------------------
+
+#if str($mode.mode_selector) == "segalign"
+    #if str($mode.diagonal_partition_options.diagonal_partition) == "true"
+        #set $segalign_mode = "segalign_diagonal_partition"
+        '$__tool_directory__/run_segalign_diagonal_partition'
+            --tool_directory '$__tool_directory__'
+            --max_segments '$mode.diagonal_partition_options.max_segments'
+    #else
+        #set $segalign_mode = "segalign"
+        run_segalign
+    #end if
+            '$mode.target'
+            '$mode.query'
+#else if str($mode.mode_selector) == "segalign_repeat_masker"
+    #set $segalign_mode = "segalign_repeat_masker"
+        run_segalign_repeat_masker
+            '$mode.seq_file'
+#end if
+
+## Sequence Options ----------------------------------------------------
+
+            --strand '$mode.sequence_options.strand_selector'
+#if $segalign_mode == "segalign_repeat_masker"
+            --neighbor_proportion '$mode.sequence_options.neighbor_proportion'
+#end if
+
+## Scoring Options -----------------------------------------------------
+
+#set $scoring_pathname = str($mode.scoring_options.scoring)
+#if $scoring_pathname != "None":
+            --scoring '$scoring_pathname'
+#end if
+#if str($mode.scoring_options.ambiguous_selector) != "x"
+    #if str($mode.scoring_options.set_ambiguous_params_selector) == "true"
+        #set $argument_value = ','.join($mode.scoring_options.ambiguous_selector, $mode.scoring_options.ambiguous_reward, $mode.scoring_options.ambiguous_penalty)
+            --ambiguous '$argument_value'
+    #else
+            --ambiguous '$ambiguous_selector'
+    #end if
+#end if
+
+## Seeding Options -----------------------------------------------------
+
+#if str($mode.seeding_options.seed.seed_selector) == "custom"
+            --seed '$mode.seeding_options.seed.custom_seed'
+#else
+            --seed '$mode.seeding_options.seed.seed_selector'
+#end if
+            --step '$mode.seeding_options.step'
+#if str($mode.seeding_options.notransition) == "true"
+            --notransition
+#end if
+
+## Ungapped Extension Options ------------------------------------------
+
+            --xdrop '$mode.ungapped_extension_options.xdrop'
+            --hspthresh '$mode.ungapped_extension_options.hspthresh'
+#if str($mode.ungapped_extension_options.noentropy) == "true"
+            --noentropy
+#end if
+
+## Gapped Extension Options --------------------------------------------
+
+#if $segalign_mode != "segalign_repeat_masker"
+    #if str($mode.gapped_extension_options.nogapped) == "true"
+            --nogapped
+    #end if
+            --ydrop '$mode.gapped_extension_options.ydrop'
+    #if str($mode.gapped_extension_options.gappedthresh) != ""
+            --gappedthresh '$mode.gapped_extension_options.gappedthresh'
+    #end if
+    #if str($mode.gapped_extension_options.notrivial) == "true"
+            --notrivial
+    #end if
+#end if
+
+## Output Options -----------------------------------------------------
+
+#if $segalign_mode != "segalign_repeat_masker"
+    #if str($mode.output_options.format.format_selector) == "bam"
+            --format '$mode.output_options.format.bam_options'
+    #else if str($mode.output_options.format.format_selector) == "general_def"
+            --format general-
+    #else if str($mode.output_options.format.format_selector) == "general_full"
+            --format 'general-:${mode.output_options.format.fields}'
+    #else if str($mode.output_options.format.format_selector) == "maf"
+            --format '$mode.output_options.format.maf_type'
+    #else if str($mode.output_options.format.format_selector) == "blastn"
+            --format=BLASTN-
+    #else if str($mode.output_options.format.format_selector) == "differences"
+            --format=differences
+    #end if
+    ## todo :: rplot, bam
+    ##  --action:target=multiple
+    ##  $output_format.rplot
+    ##  .if str( $output_format.out.format ) == "bam":
+    ##      | samtools sort -@\${GALAXY_SLOTS:-2} -T "\${TMPDIR:-.}" -O bam -o '${output}'
+    ##  .else:
+    ##      > '${output}'
+    ##  .end if
+    ##  .if $output_format.rplot:
+    ##      &&
+    ##      Rscript $r_plot > /dev/null 2>&1
+    ##  .end if
+    #if $segalign_mode == "segalign"
+            --output '$segalign_output'
+    #end if
+#else if $segalign_mode == "segalign_repeat_masker"
+            --M '$mode.output_options.M'
+            --output '$segalign_repeat_masker_output'
+#end if
+#if str($mode.output_options.markend) == "true"
+            --markend
+#end if
+
+## System Options -----------------------------------------------------
+
+            --wga_chunk_size '$mode.system_options.wga_chunk_size'
+            --lastz_interval_size '$mode.system_options.lastz_interval_size'
+            --seq_block_size '$mode.system_options.seq_block_size'
+            --num_gpu '$mode.system_options.num_gpu'
+#if str($mode.system_options.debug) == "true"
+            --debug
+#end if
+
+## -------------------------------------------------------------------
+
+#if $segalign_mode == "segalign_diagonal_partition"
+        &&
+        '$__tool_directory__/package_output.py'
+#end if
+
+    ]]></command>
+    <inputs>
+        <conditional name="mode">
+            <param name="mode_selector" type="select" label="Choose the mode">
+                <option value="segalign" selected="true">SegAlign</option>
+                <option value="segalign_repeat_masker">SegAlign repeat masker</option>
+            </param>
+            <when value="segalign">
+                <param name="target" type="data" format="fasta" label="Target sequence file in FASTA format"/>
+                <param name="query" type="data" format="fasta" label="Query sequence file in FASTA format"/>
+                <expand macro="sequence_options"/>
+                <expand macro="scoring_options"/>
+                <expand macro="seeding_options"/>
+                <expand macro="ungapped_extension_options"/>
+                <expand macro="gapped_extension_options"/>
+                <expand macro="output_options">
+                    <expand macro="segalign_output_options"/>
+                </expand>
+                <expand macro="system_options"/>
+                <section name="diagonal_partition_options" expanded="false" title="Diagonal Partition Options">
+                    <param argument="--diagonal_partition" type="boolean" value="false" label="Enable diagonal partition optimization"/>
+                    <param argument="--max_segments" type="integer" value="20000" label="Max segments"/>
+                </section>
+            </when>
+            <when value="segalign_repeat_masker">
+                <param name="seq_file" type="data" format="fasta" label="Sequence file in FASTA format"/>
+                <expand macro="sequence_options">
+                    <param argument="--neighbor_proportion" type="float" value="0.2" label="Proportion of neighbouring intervals to align the query interval to"/>
+                </expand>
+                <expand macro="scoring_options"/>
+                <expand macro="seeding_options"/>
+                <expand macro="ungapped_extension_options"/>
+                <expand macro="output_options">
+                    <param argument="--M" type="integer" value="1" max="255" label="report any position that is covered by at least this many alignments; the maximum allowed depth is 255"/>
+                </expand>
+                <expand macro="system_options"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="segalign_output" format="tabular" label="SegAlign on ${on_string}">
+            <change_format>
+                <when input="mode.output_options.format.format_selector" value="bam" format="bam"/>
+                <when input="mode.output_options.format.format_selector" value="maf" format="maf"/>
+                <when input="mode.output_options.format.format_selector" value="differences" format="interval"/>
+            </change_format>
+            <filter>mode['mode_selector'] == 'segalign' and mode['diagonal_partition_options']['diagonal_partition'] is False</filter>
+        </data>
+        <data name="segalign_diagonal_partition_output" format="tgz" from_work_dir="data_package.tgz" label="SegAlign Diagonal Partition on ${on_string}">
+            <filter>mode['mode_selector'] == 'segalign' and mode['diagonal_partition_options']['diagonal_partition'] is True</filter>
+        </data>
+        <data name="segalign_repeat_masker_output" format="tabular" label="SegAlign Repeat Masker on ${on_string}">
+            <filter>mode['mode_selector'] == 'segalign_repeat_masker'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1" expect_test_failure="true">
+            <param name="mode_selector" value="segalign"/>
+            <param name="target" value="hg38.chr20.chunk.fa.gz" ftype="fasta"/>
+            <param name="query" value="mm39.chr2.chunk.fa.gz" ftype="fasta"/>
+            <output name="segalign_output" decompress="true" file="segalign-output.maf.gz" ftype="maf"/>
+        </test>
+        <test expect_num_outputs="1" expect_test_failure="true">
+            <param name="mode_selector" value="segalign_repeat_masker"/>
+            <param name="seq_file" value="hg38.chr20.chunk.fa.gz" ftype="fasta"/>
+            <output name="segalign_repeat_masker_output" decompress="true" file="segalign-repeat-masker-output.tab.gz" ftype="tabular"/>
+        </test>
+        <test expect_num_outputs="1" expect_test_failure="true">
+            <param name="mode_selector" value="segalign"/>
+            <param name="target" value="hg38.chr20.chunk.fa.gz" ftype="fasta"/>
+            <param name="query" value="mm39.chr2.chunk.fa.gz" ftype="fasta"/>
+            <param name="diagonal_partition" value="true"/>
+            <output name="segalign_diagonal_partition_output" ftype="tgz">
+                <assert_contents>
+                    <has_archive_member path="galaxy/commands.json"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+    SegAlign is a scalable, GPU-accelerated system for computing pairwise WGA. SegAlign is based on the standard seed-filter-extend heuristic, in which the filtering stage dominates the runtime (e.g. 98% for human-mouse WGA), and is accelerated using GPU(s).
+
+    https://github.com/gsneha26
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/segalign_output_options.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,93 @@
+<macros>
+    <xml name="segalign_output_options">
+        <conditional name="format">
+            <param name="format_selector" type="select" display="radio" label="Specify the output format">
+                <option value="bam">BAM --format=sam)</option>
+                <option value="general_def">General default (--format=general)</option>
+                <option value="general_full">Customized general (--format=general[:fields])</option>
+                <option value="maf" selected="true">MAF (--format=maf)</option>
+                <option value="blastn">blastn (--format=BLASTN)</option>
+                <option value="differences">Differences (--format=differences)</option>
+            </param>
+            <when value="bam">
+                <param name="bam_options" type="select" display="radio" argument="--format=sam, --format=softsam" label="Select a BAM flavor to output" help="Lastz actually outputs SAM data but Galaxy converts it into BAM to save space. For alignments that don't reach the end of a query, ‑‑format=sam uses 'hard clipping', while ‑‑format=softsam uses 'soft clipping'. See the section on 'clipped alignment' in the SAM specification for an explanation of what this means. The options ‑‑format=sam- and ‑‑format=softsam- suppress the SAM header lines. This makes them suitable for concatenating output from multiple runs. If you need to specify readgroup information: use AddOrEplaceReadGroups from Picard package">
+                    <option value="sam">BAM</option>
+                    <option value="softsam">soft-clipped BAM</option>
+                    <option value="sam-">BAM without header</option>
+                    <option value="softsam-">soft-clipped BAM without header</option>
+                </param>
+            </when>
+            <when value="general_def">
+                <!-- Do nothing -->
+            </when>
+            <when value="general_full">
+                <param name="fields" type="select" display="checkboxes" multiple="true" label="Select which fields to include" argument="--format=general-[:fields]">
+                    <option value="score" selected="true">score: Score of the alignment block</option>
+                    <option value="name1" selected="true">name1: Name of the target sequence</option>
+                    <option value="number1">number1: Number of the target sequence within the target file</option>
+                    <option value="strand1" selected="true">strand1: Target sequence strand </option>
+                    <option value="size1" selected="true">size1: Size of the entire target sequence</option>
+                    <option value="start1">start1: Starting position of the alignment block in the target, origin-one</option>
+                    <option value="zstart1" selected="true">zstart1: Starting position of the alignment block in the target, origin-zero</option>
+                    <option value="end1" selected="true">end1: Ending position of the alignment block in the target</option>
+                    <option value="length1">length1: Length of the alignment block in the target (excluding gaps)</option>
+                    <option value="text1">text1: Aligned characters in the target, including gap characters</option>
+                    <option value="qalign1">qalign1: The target quality sequence (if there is one) correpsonding to aligned characters</option>
+                    <option value="nucs1">nucs1: The entire target sequence</option>
+                    <option value="name2" selected="true">name2: Name of the query sequence</option>
+                    <option value="number2">number2: Number of the query sequence within the query file</option>
+                    <option value="strand2" selected="true">strand2: Query sequence strand</option>
+                    <option value="size2" selected="true">size2: Size of the entire query sequence</option>
+                    <option value="start2">start2: Starting position of the alignment block in the query, origin-one</option>
+                    <option value="zstart2" selected="true">zstart2: Starting position of the alignment block in the query, origin-one</option>
+                    <option value="end2" selected="true">end2: Ending position of the alignment block in the query</option>
+                    <option value="length2">length2: Length of the alignment block in the query (excluding gaps)</option>
+                    <option value="text2">text2: Aligned characters in the query, including gap characters</option>
+                    <option value="qalign2">qalign2: The query quality sequence (if there is one) correpsonding to aligned characters</option>
+                    <option value="nucs2">nucs2: The entire query sequence</option>
+                    <option value="nmatch">nmatch: Match count</option>
+                    <option value="nmismatch">nmismatch: Mismatch count</option>
+                    <option value="ncolumn">ncolumn: Number of columns in the block. This includes matches, mismatches (substitutions), and gaps</option>
+                    <option value="npair">npair: Number of aligned bases in the block that are matches or mismatches (substitutions)</option>
+                    <option value="ngap">ngap: Gap count, the number of gaps in the block, counting each run of gapped columns as a single gap</option>
+                    <option value="cgap">cgap: Gap column count, the number of gaps in the block, counting each gapped column as a separate gap</option>
+                    <option value="diff">diff: Differences between what would be written for text1 and text2</option>
+                    <option value="cigar">cigar: A CIGAR-like representation of the alignment’s path</option>
+                    <option value="cigarx">cigarx: Same as cigar, but uses a newer syntax that distinguishes matches from substitutions</option>
+                    <option value="identity" selected="true">identity: Fraction of aligned bases in the block that are matches </option>
+                    <option value="idfrac">idfrac: Fraction of aligned bases in the block that are matches </option>
+                    <option value="id%" selected="true">id% Fraction of aligned bases in the block that are matches (as %)</option>
+                    <option value="blastid%">blastid%: Fraction of the alignment block that is matches, as would be reported by NCBI BLAST</option>
+                    <option value="continuity">continuity: Rate of non-gaps (non-indels) in the alignment block</option>
+                    <option value="confrac">confrac: Rate of non-gaps (non-indels) in the alignment block (as fraction)</option>
+                    <option value="con%">con%: Rate of non-gaps (non-indels) in the alignment block (as %)</option>
+                    <option value="coverage" selected="true">coverage: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block</option>
+                    <option value="covfrac">covfrac: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block (as fraction)</option>
+                    <option value="cov%" selected="true">cov%: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block (as %)</option>
+                    <option value="diagonal">diagonal: The diagonal of the start of the alignment block in the DP matrix, expressed as an identifying number start1-start2</option>
+                    <option value="shingle">shingle: A measurement of the shingle overlap between the target and the query</option>
+                    <option value="number">number: The alignment number, counted as alignments are written to output (1-base)</option>
+                    <option value="znumber">znumber: The alignment number, counted as alignments are written to output (0-base)</option>
+                    <sanitizer invalid_char="">
+                        <valid initial="string.letters,string.digits">
+                            <add value="%"/>
+                        </valid>
+                    </sanitizer>
+                </param>
+            </when>
+            <when value="maf">
+                <param name="maf_type" type="select" display="radio" argument="--format=maf" label="Select MAF flavor" help="MAF is a multiple alignment format developed at UCSC">
+                    <option value="maf">MAF</option>
+                    <option value="maf+">MAF with additional stats</option>
+                    <option value="maf-" selected="true">MAF without header and comments</option>
+                </param>
+            </when>
+            <when value="blastn">
+                <!-- Do nothing -->
+            </when>
+            <when value="differences">
+                <!-- Do nothing -->
+            </when>
+        </conditional>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sequence_options.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,12 @@
+<macros>
+    <xml name="sequence_options">
+        <section name="sequence_options" expanded="false" title="Sequence Options">
+            <param name="strand_selector" type="select" label="Strand to search">
+                <option value="plus">plus</option>
+                <option value="minus">minus</option>
+                <option value="both" selected="true">both</option>
+            </param>
+            <yield/>
+        </section>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/system_options.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,11 @@
+<macros>
+    <xml name="system_options">
+        <section name="system_options" title="System Options">
+            <param argument="--wga_chunk_size" type="integer" value="250000" label="Chunk sizes for GPU calls for Xdrop - ⚠ change only if you are a developer"/>
+            <param argument="--lastz_interval_size" type="integer" value="10000000" label="LASTZ interval for Ydrop - ⚠ change only if you are a developer"/>
+            <param argument="--seq_block_size" type="integer" value="500000000" label="LASTZ interval for Ydrop - ⚠ change only if you are a developer"/>
+            <param argument="--num_gpu" type="integer" value="-1" label="Specify number of GPUs to use - -1 if all the GPUs should be used"/>
+            <param argument="--debug" type="boolean" value="false" label="Print debug messages"/>
+        </section>
+    </xml>
+</macros>
Binary file test-data/hg38.chr20.chunk.fa.gz has changed
Binary file test-data/mm39.chr2.chunk.fa.gz has changed
Binary file test-data/segalign-output.maf.gz has changed
Binary file test-data/segalign-repeat-masker-output.tab.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ungapped_extension_options.xml	Wed Apr 17 18:06:54 2024 +0000
@@ -0,0 +1,9 @@
+<macros>
+    <xml name="ungapped_extension_options">
+        <section name="ungapped_extension_options" expanded="false" title="Ungapped Extension Options">
+            <param argument="--xdrop" type="integer" value="910" label="X-drop value for gap-free extension"/>
+            <param argument="--hspthresh" type="integer" value="3000" label="Segment score threshold for high scoring pairs"/>
+            <param argument="--noentropy" type="boolean" checked="false" label="Don't adjust low score segment pair scores using entropy factor after filtering stage"/>
+        </section>
+    </xml>
+</macros>