Mercurial > repos > richard-burhans > segalign
changeset 0:5c72425b7f1b draft
planemo upload for repository https://github.com/richard-burhans/galaxytools/tree/main/tools/segalign commit 98a4dd44360447aa96d92143384d78e116d7581b
author | richard-burhans |
---|---|
date | Wed, 17 Apr 2024 18:06:54 +0000 |
parents | |
children | d3f11ae6790c |
files | diagonal_partition.py gapped_extension_options.xml lastz-cmd.ini macros.xml output_options.xml package_output.py run_segalign_diagonal_partition scoring_options.xml seeding_options.xml segalign.xml segalign_output_options.xml sequence_options.xml system_options.xml test-data/hg38.chr20.chunk.fa.gz test-data/mm39.chr2.chunk.fa.gz test-data/segalign-output.maf.gz test-data/segalign-repeat-masker-output.tab.gz ungapped_extension_options.xml |
diffstat | 18 files changed, 1162 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/diagonal_partition.py Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 + +""" +Diagonal partitioning for segment files output by SegAlign. + +Usage: +diagonal_partition <max-segments> <lastz-command> +""" + +import os +import sys + + +def chunks(lst, n): + """Yield successive n-sized chunks from list.""" + for i in range(0, len(lst), n): + yield lst[i: i + n] + + +if __name__ == "__main__": + + DELETE_AFTER_CHUNKING = True + + # input_params = "10000 sad sadsa sad --segments=tmp10.block5.r1239937044.plus.segments dsa sa --strand=plus --output=out.maf sadads 2> logging.err" + # sys.argv = [sys.argv[0]] + input_params.split(' ') + chunk_size = int(sys.argv[1]) # first parameter contains chunk size + params = sys.argv[2:] + + # Parsing command output from SegAlign + segment_key = "--segments=" + segment_index = None + input_file = None + + for index, value in enumerate(params): + if value[: len(segment_key)] == segment_key: + segment_index = index + input_file = value[len(segment_key):] + break + if segment_index is None: + print(f"Error: could not segment key {segment_key} in parameters {params}") + exit(0) + + if not os.path.isfile(input_file): + print(f"Error: File {input_file} does not exist") + exit(0) + + if ( + chunk_size == 0 or sum(1 for _ in open(input_file)) <= chunk_size + ): # no need to sort if number of lines <= chunk_size + print(" ".join(params), flush=True) + exit(0) + + # Find rest of relevant parameters + output_key = "--output=" + output_index = None + output_alignment_file = None + output_alignment_file_base = None + output_format = None + + strand_key = "--strand=" + strand_index = None + for index, value in enumerate(params): + if value[: len(output_key)] == output_key: + output_index = index + output_alignment_file = value[len(output_key):] + output_alignment_file_base, output_format = output_alignment_file.rsplit( + ".", 1 + ) + if value[: len(strand_key)] == strand_key: + strand_index = index + if segment_index is None: + print(f"Error: could not output key {output_key} in parameters {params}") + exit(0) + if strand_index is None: + print(f"Error: could not output key {strand_key} in parameters {params}") + exit(0) + + err_index = -1 # error file is at very end + err_name_base = params[-1].split(".err", 1)[0] + + data = {} # dict of list of tuple (x, y, str) + + direction = None + if "plus" in params[strand_index]: + direction = "f" + elif "minus" in params[strand_index]: + direction = "r" + else: + print( + f"Error: could not figure out direction from strand value {params[strand_index]}" + ) + exit(0) + + for line in open(input_file, "r"): + if line == "": + continue + ( + seq1_name, + seq1_start, + seq1_end, + seq2_name, + seq2_start, + seq2_end, + _dir, + score, + ) = line.split() + # data.append((int(seq1_start), int(seq2_start), line)) + half_dist = int((int(seq1_end) - int(seq1_start)) // 2) + assert int(seq1_end) > int(seq1_start) + assert int(seq2_end) > int(seq2_start) + seq1_mid = int(seq1_start) + half_dist + seq2_mid = int(seq2_start) + half_dist + data.setdefault((seq1_name, seq2_name), []).append((seq1_mid, seq2_mid, line)) + + # If there are chromosome pairs with segment count <= chunk_size + # then no need to sort and split these pairs into separate files. + # It is better to keep these pairs in a single segment file. + skip_pairs = [] # pairs that have count <= chunk_size. + # these will not be sorted + if len(data.keys()) > 1: + for pair in data.keys(): + if len(data[pair]) <= chunk_size: + skip_pairs.append(pair) + + # sorting for forward segments + if direction == "r": + for pair in data.keys(): + if pair not in skip_pairs: + data[pair] = sorted( + data[pair], key=lambda coord: (coord[1] - coord[0], coord[0]) + ) + # sorting for reverse segments + elif direction == "f": + for pair in data.keys(): + if pair not in skip_pairs: + data[pair] = sorted( + data[pair], key=lambda coord: (coord[1] + coord[0], coord[0]) + ) + else: + print(f"INVALID DIRECTION VALUE: {direction}") + exit(0) + + # Writing file in chunks + ctr = 0 + for pair in data.keys() - skip_pairs: + for chunk in chunks(list(zip(*data[pair]))[2], chunk_size): + ctr += 1 + name_addition = f".split{ctr}" + fname = input_file.split(".segments", 1)[0] + name_addition + ".segments" + + with open(fname, "w") as f: + f.writelines(chunk) + # update segment file in command + params[segment_index] = segment_key + fname + # update output file in command + params[output_index] = ( + output_key + + output_alignment_file_base + + name_addition + + "." + + output_format + ) + # update error file in command + params[-1] = err_name_base + name_addition + ".err" + print(" ".join(params), flush=True) + + # writing unsorted skipped pairs + if len(skip_pairs) > 0: + skip_pairs_with_len = sorted( + [(len(data[p]), p) for p in skip_pairs] + ) # list of tuples of (pair length, pair) + aggregated_skip_pairs = [] # list of list of pair names + current_count = 0 + aggregated_skip_pairs.append([]) + for count, pair in skip_pairs_with_len: + if current_count + count <= chunk_size: + current_count += count + aggregated_skip_pairs[-1].append(pair) + else: + aggregated_skip_pairs.append([]) + current_count = count + aggregated_skip_pairs[-1].append(pair) + + for aggregate in aggregated_skip_pairs: + ctr += 1 + name_addition = f".split{ctr}" + fname = input_file.split(".segments", 1)[0] + name_addition + ".segments" + with open(fname, "w") as f: + for pair in sorted(aggregate, key=lambda p: (p[1], p[0])): + chunk = list(zip(*data[pair]))[2] + f.writelines(chunk) + # update segment file in command + params[segment_index] = segment_key + fname + # update output file in command + params[output_index] = ( + output_key + + output_alignment_file_base + + name_addition + + "." + + output_format + ) + # update error file in command + params[-1] = err_name_base + name_addition + ".err" + print(" ".join(params), flush=True) + + if DELETE_AFTER_CHUNKING: + os.remove(input_file)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gapped_extension_options.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,10 @@ +<macros> + <xml name="gapped_extension_options"> + <section name="gapped_extension_options" expanded="false" title="Gapped Extension Options"> + <param argument="--nogapped" type="boolean" checked="false" label="Don't perform gapped extension stage"/> + <param argument="--ydrop" type="integer" value="9430" label="Y-drop value for gapped extension"/> + <param argument="--gappedthresh" type="integer" value="" optional="true" label="Score threshold for gapped alignments"/> + <param argument="--notrivial" type="boolean" checked="false" label="Don't output a trivial self-alignment block if the target and query sequences are identical"/> + </section> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lastz-cmd.ini Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,94 @@ +[arguments] +flag_args = + allgappedbounds + anyornone + gapped + gfextend + markend + nocensus + noentropy + nofilter + nogapped + nogfextend + nomirror + norecoverseeds + notransition + notrivial + notwins + noytrim + recoverseeds + self + version + yasra75 + yasra85 + yasra85short + yasra90 + yasra95 + yasra95short + yasra98 + +str_args = + action:query + action:target + ambiguous + ball + chores + filter + format + gap + hspthresh + include + match + maxwordcount + mismatch + nochain + output + outputmasking + outputmasking+ + outputmasking+:soft + outputmasking:soft + querydepth + queryhsplimit + rdotplot + readgroup + scores + seed + segments + show + strand + targetcapsule + twins + writecapsule + writesegments + +int_args = + allocate:query + allocate:target + allocate:traceback + exact + gappedthresh + inner + masking + queryhspbest + step + word + xdrop + ydrop + +bool_str_args= + census + census16 + census32 + chain + entropy + help + infer + inferonly + infscores + tableonly + +bool_int_args= + progress + progress+masking + transition +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,22 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">segalign-full</requirement> + <requirement type="package" version="0.18">bashlex</requirement> + <yield/> + </requirements> + </xml> + <token name="@TOOL_VERSION@">0.1.2.1</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">21.05</token> + <xml name="edam_ontology"> + <edam_operations> + <edam_operation>operation_0491</edam_operation> + </edam_operations> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1109/SC41405.2020.00043</citation> + </citations> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/output_options.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,8 @@ +<macros> + <xml name="output_options"> + <section name="output_options" expanded="false" title="Output Options"> + <yield/> + <param argument="--markend" type="boolean" value="false" label="Write a marker line just before completion"/> + </section> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/package_output.py Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,286 @@ +#!/usr/bin/env python + +import argparse +import configparser +import json +import os +import sys +import tarfile +import typing + +import bashlex + + +class PackageFile: + def __init__( + self, + pathname: str = "data_package.tgz", + top_dir: str = "galaxy", + data_dir: str = "files", + config_file: str = "commands.json", + ) -> None: + self.pathname: str = os.path.realpath(pathname) + self.data_root: str = os.path.join(top_dir, data_dir) + self.config_path: str = os.path.join(top_dir, config_file) + self.config_file: str = config_file + self.tarfile: typing.Optional[tarfile.TarFile] = None + self.name_cache: typing.Dict[typing.Any, typing.Any] = {} + self.working_dir: str = os.path.realpath(os.getcwd()) + + def _initialize(self) -> None: + if self.tarfile is None: + self.tarfile = tarfile.open( + name=self.pathname, + mode="w:gz", + format=tarfile.GNU_FORMAT, + compresslevel=1, + ) + + def add_config(self, pathname: str) -> None: + if self.tarfile is None: + self._initialize() + + source_path = os.path.realpath(pathname) + + if self.tarfile is not None: + self.tarfile.add(source_path, arcname=self.config_path, recursive=False) + + def add_file(self, pathname: str, arcname: typing.Optional[str] = None) -> None: + if self.tarfile is None: + self._initialize() + + source_path = os.path.realpath(pathname) + + dest_path = None + + if arcname is None: + dest_path = os.path.join(self.data_root, os.path.basename(source_path)) + else: + arc_path = os.path.realpath(arcname) + rel_path = os.path.relpath(arc_path, self.working_dir) + if not (os.path.isabs(rel_path) or rel_path.startswith("../")): + dest_path = os.path.join(self.data_root, rel_path) + else: + sys.exit("path fail") + + if dest_path is not None: + if self.tarfile is not None: + if dest_path not in self.name_cache: + try: + self.tarfile.add( + source_path, arcname=dest_path, recursive=False + ) + except FileNotFoundError: + sys.exit(f"missing source file {source_path}") + + self.name_cache[dest_path] = 1 + # print(f"added: {dest_path}", flush=True) + + def close(self) -> None: + if self.tarfile is not None: + self.tarfile.close() + self.tarfile = None + + +class bashCommandLineFile: + def __init__( + self, + pathname: str, + config: configparser.ConfigParser, + package_file: PackageFile, + ) -> None: + self.pathname: str = pathname + self.config = config + self.package_file = package_file + self.executable: typing.Optional[str] = None + self._parse_lines() + + def _parse_lines(self) -> None: + with open("commands.json", "w") as ofh: + with open(self.pathname) as f: + line: str + for line in f: + line = line.rstrip("\n") + command_dict = self._parse_line(line) + # we may want to re-write args here + new_args_list = [] + + args_list = command_dict.get("args", []) + for arg in args_list: + if arg.startswith("--target="): + pathname = arg[9:] + new_args_list.append(arg) + if "[" in pathname: + elems = pathname.split("[") + sequence_file = elems.pop(0) + self.package_file.add_file(sequence_file, sequence_file) + for elem in elems: + if elem.endswith("]"): + elem = elem[:-1] + if elem.startswith("subset="): + subset_file = elem[7:] + self.package_file.add_file(subset_file) + + elif arg.startswith("--query="): + pathname = arg[8:] + new_args_list.append(arg) + if "[" in pathname: + elems = pathname.split("[") + sequence_file = elems.pop(0) + self.package_file.add_file(sequence_file, sequence_file) + for elem in elems: + if elem.endswith("]"): + elem = elem[:-1] + if elem.startswith("subset="): + subset_file = elem[7:] + self.package_file.add_file(subset_file) + elif arg.startswith("--segments="): + pathname = arg[11:] + new_args_list.append(arg) + self.package_file.add_file(pathname) + elif arg.startswith("--scores="): + pathname = arg[9:] + new_args_list.append("--scores=data/scores.txt") + self.package_file.add_file(pathname, "data/scores.txt") + else: + new_args_list.append(arg) + + command_dict["args"] = new_args_list + print(json.dumps(command_dict), file=ofh) + + self.package_file.add_config("commands.json") + + def _parse_line(self, line: str) -> typing.Dict[str, typing.Any]: + # resolve shell redirects + trees: typing.List[typing.Any] = bashlex.parse(line, strictmode=False) # type: ignore[attr-defined] + positions: typing.List[typing.Tuple[int, int]] = [] + + for tree in trees: + visitor = nodevisitor(positions) + visitor.visit(tree) + + # do replacements from the end so the indicies will be correct + positions.reverse() + + processed = list(line) + for start, end in positions: + processed[start:end] = "" + + processed_line: str = "".join(processed) + + command_dict = self._parse_processed_line(processed_line) + command_dict["stdin"] = visitor.stdin + command_dict["stdout"] = visitor.stdout + command_dict["stderr"] = visitor.stderr + + return command_dict + + def _parse_processed_line(self, line: str) -> typing.Dict[str, typing.Any]: + argv: typing.List[str] = list(bashlex.split(line)) # type: ignore[attr-defined] + self.executable = argv.pop(0) + + parser: argparse.ArgumentParser = argparse.ArgumentParser(add_help=False) + if "arguments" in self.config: + arguments_section = self.config["arguments"] + + arg: str + if "flag_args" in arguments_section: + for arg in arguments_section["flag_args"].split(): + parser.add_argument(f"--{arg}", action="store_true") + + if "str_args" in arguments_section: + for arg in arguments_section["str_args"].split(): + parser.add_argument(f"--{arg}", type=str) + + if "bool_str_args" in arguments_section: + for arg in arguments_section["bool_str_args"].split(): + parser.add_argument( + f"--{arg}", nargs="?", const=True, default=False + ) + + if "int_args" in arguments_section: + for arg in arguments_section["int_args"].split(): + parser.add_argument(f"--{arg}", type=int) + + if "bool_int_args" in arguments_section: + for arg in arguments_section["bool_int_args"].split(): + parser.add_argument( + f"--{arg}", nargs="?", const=True, default=False + ) + + namespace, rest = parser.parse_known_intermixed_args(argv) + vars_dict = vars(namespace) + + command_dict: typing.Dict[str, typing.Any] = { + "executable": self.executable, + "args": [], + } + + for var in vars_dict.keys(): + value = vars_dict[var] + if value is not None: + if isinstance(value, bool): + if value is True: + command_dict["args"].append(f"--{var}") + else: + command_dict["args"].append(f"--{var}={value}") + + if len(rest) >= 0: + value = rest.pop(0) + command_dict["args"].append(f"--target={value}") + + if len(rest) >= 0: + value = rest.pop(0) + command_dict["args"].append(f"--query={value}") + + return command_dict + + +class nodevisitor(bashlex.ast.nodevisitor): # type: ignore[name-defined,misc] + def __init__(self, positions: typing.List[typing.Tuple[int, int]]) -> None: + self.positions = positions + self.stdin = None + self.stdout = None + self.stderr = None + + def visitredirect( + self, + n: bashlex.ast.node, # type: ignore[name-defined] + n_input: int, + n_type: str, + output: typing.Any, + heredoc: typing.Any, + ) -> None: + if isinstance(n_input, int) and 0 <= n_input <= 2: + if isinstance(output, bashlex.ast.node) and output.kind == "word": # type: ignore[attr-defined] + self.positions.append(n.pos) + if n_input == 0: + self.stdin = output.word + elif n_input == 1: + self.stdout = output.word + elif n_input == 2: + self.stderr = output.word + else: + sys.exit(f"oops 1: {type(n_input)}") + else: + sys.exit(f"oops 2: {type(n_input)}") + + def visitheredoc(self, n: bashlex.ast.node, value: typing.Any) -> None: # type: ignore[name-defined] + pass + + +def main() -> None: + our_dirname: str = os.path.dirname(os.path.realpath(__file__)) + lastz_command_config_file: str = os.path.join(our_dirname, "lastz-cmd.ini") + + config: configparser.ConfigParser = configparser.ConfigParser() + config.read(lastz_command_config_file) + + package_file = PackageFile() + lastz_command_file = "lastz_commands.txt" + bashCommandLineFile(lastz_command_file, config, package_file) + package_file.close() + + +if __name__ == "__main__": + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/run_segalign_diagonal_partition Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,105 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail +#set -o xtrace + +## +## parse arguments +## + +SEGALIGN_ARGS=() + +MAX_SEGMENT_SIZE="" +OUTPUT_FILENAME="" +LASTZ_COMMAND_FILE="lastz_commands.txt" +HELP=0 + +while [[ $# -gt 0 ]]; do + case $1 in + --help) + HELP=1 + shift + ;; + --max_segments) + MAX_SEGMENT_SIZE="$2" + shift 2 + ;; + --output) + OUTPUT_FILENAME=$(readlink -f "$2") + shift 2 + ;; + --tool_directory) + TOOL_DIRECTORY="$2" + shift 2 + ;; + *) + SEGALIGN_ARGS+=("$1") + shift + esac +done + +set -- "${SEGALIGN_ARGS[@]}" + +## +## check arguments +## + +if [[ $# == 0 || "$HELP" == "1" ]]; then + segalign --help + exit 0 +fi + +if [[ $# -lt 2 ]]; then + echo "run_segalign_diagonal_partition: missing target and query sequences" 1>&2 + exit 1 +fi + +ref_path=$(readlink -f "$1") +test -e "$ref_path" || { + echo "run_segalign_diagonal_partition: target file \"$ref_path\" does not exist" 1>&2 + exit 1 +} +query_path=$(readlink -f "$2") +test -e "$query_path" || { + echo "run_segalign_diagonal_partition: query file \"$query_path\" does not exist" 1>&2 + exit 1 +} +shift 2 + +DATA_FOLDER="data" +mkdir -p "$DATA_FOLDER" || { + echo "run_segalign_diagonal_partition: cannont create data directory \"$DATA_FOLDER\"" 1>&2 + exit 1 +} + +cd $DATA_FOLDER/.. +echo "" +echo "Converting fasta files to 2bit format" 1>&2 + +## +## convert target and query to 2bit +## +faToTwoBit "$ref_path" "$DATA_FOLDER/ref.2bit" || { + echo "run_segalign_diagonal_partition: cannot convert \"$ref_path\" to 2bit" 1>&2 + exit 1 +} +faToTwoBit "$query_path" "$DATA_FOLDER/query.2bit" || { + echo "run_segalign_diagonal_partition: cannont convert \"$ref_path\" to 2bit" 1>&2 + exit 1 +} + + + +time { + while IFS= read -r line; do + "$TOOL_DIRECTORY/diagonal_partition.py" $MAX_SEGMENT_SIZE $line >> $LASTZ_COMMAND_FILE + # segalign sort writes out the partitioned segment files to the working + # directory and prints the modified lastz commands. + done < <(stdbuf -oL segalign $ref_path $query_path "${DATA_FOLDER}/" "$@" ) # segalign begins running in this line, +} 1>&2 # and every newline written to stdout, get assigned to $line which + # gets sent to diagonal_partition for diagonal partitioning + +exit 0 +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scoring_options.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,25 @@ +<macros> + <xml name="scoring_options"> + <section name="scoring_options" expanded="false" title="Scoring Options"> + <param argument="--scoring" type="data" value="false" optional="true" format="txt" label="Scoring file in LASTZ format"/> + <param name="ambiguous_selector" type="select" label="Ambiguous Nucleotides"> + <option value="x" selected="true">None</option> + <option value="n">N</option> + <option value="iupac">IUPAC</option> + </param> + <conditional name="ambiguous_params"> + <param name="set_ambiguous_params_selector" type="select" label="Set Ambiguous Nucleotide Scoring Parameters"> + <option value="false" selected="true">No</option> + <option value="true">Yes</option> + </param> + <when value="false"> + <!-- Do nothing --> + </when> + <when value="true"> + <param name="ambiguous_reward" type="integer" value="0" label="Ambiguous Nucleotide Reward"/> + <param name="ambiguous_penalty" type="integer" value="0" label="Ambiguous Nucleotide Penalty"/> + </when> + </conditional> + </section> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seeding_options.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,34 @@ +<macros> + <xml name="seeding_options"> + <section name="seeding_options" expanded="false" title="Seeding Options"> + <conditional name="seed"> + <param name="seed_selector" type="select" label="Seed patern"> + <option value="12of19" selected="true">12of19 (1110100110010101111)</option> + <option value="14of22">14of22 (1110101100110010101111)</option> + <option value="custom">custom</option> + </param> + <when value="12of19"> + <!-- Do nothing --> + </when> + <when value="14of22"> + <!-- Do nothing --> + </when> + <when value="custom"> + <param name="custom_seed" type="text" value="" optional="false" label="Custom seed pattern"> + <validator type="empty_field"/> + <validator type="regex" message="Arbitrary pattern of 1s, 0s, and Ts">^[01T]+$</validator> + <sanitizer invalid_char=""> + <valid initial="none"> + <add value="0"/> + <add value="1"/> + <add value="T"/> + </valid> + </sanitizer> + </param> + </when> + </conditional> + <param argument="--step" type="integer" value="1" label="Offset between the starting positions of successive target words considered for generating seed table"/> + <param argument="--notransition" type="boolean" checked="false" label="Don't allow one transition in a seed hit"/> + </section> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segalign.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,246 @@ +<tool id="segalign" name="SegAlign" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>A Scalable GPU System for Pairwise Whole Genome Alignments based on LASTZ's seed-filter-extend paradigm</description> + <macros> + <import>macros.xml</import> + <import>sequence_options.xml</import> + <import>scoring_options.xml</import> + <import>seeding_options.xml</import> + <import>ungapped_extension_options.xml</import> + <import>gapped_extension_options.xml</import> + <import>output_options.xml</import> + <import>segalign_output_options.xml</import> + <import>system_options.xml</import> + </macros> + <expand macro="edam_ontology"/> + <expand macro="requirements"/> + <command detect_errors="exit_code"><![CDATA[ +## +## https://www.gnu.org/software/coreutils/manual/html_node/nproc-invocation.html +## +## If the OMP_NUM_THREADS or OMP_THREAD_LIMIT environment variables +## are set, then they will determine the minimum and maximum returned +## value respectively. +## +## This is how you tame nproc(1) +## +OMP_THREAD_LIMIT=\${GALAXY_SLOTS:-2} + +## Mode ---------------------------------------------------------------- + +#if str($mode.mode_selector) == "segalign" + #if str($mode.diagonal_partition_options.diagonal_partition) == "true" + #set $segalign_mode = "segalign_diagonal_partition" + '$__tool_directory__/run_segalign_diagonal_partition' + --tool_directory '$__tool_directory__' + --max_segments '$mode.diagonal_partition_options.max_segments' + #else + #set $segalign_mode = "segalign" + run_segalign + #end if + '$mode.target' + '$mode.query' +#else if str($mode.mode_selector) == "segalign_repeat_masker" + #set $segalign_mode = "segalign_repeat_masker" + run_segalign_repeat_masker + '$mode.seq_file' +#end if + +## Sequence Options ---------------------------------------------------- + + --strand '$mode.sequence_options.strand_selector' +#if $segalign_mode == "segalign_repeat_masker" + --neighbor_proportion '$mode.sequence_options.neighbor_proportion' +#end if + +## Scoring Options ----------------------------------------------------- + +#set $scoring_pathname = str($mode.scoring_options.scoring) +#if $scoring_pathname != "None": + --scoring '$scoring_pathname' +#end if +#if str($mode.scoring_options.ambiguous_selector) != "x" + #if str($mode.scoring_options.set_ambiguous_params_selector) == "true" + #set $argument_value = ','.join($mode.scoring_options.ambiguous_selector, $mode.scoring_options.ambiguous_reward, $mode.scoring_options.ambiguous_penalty) + --ambiguous '$argument_value' + #else + --ambiguous '$ambiguous_selector' + #end if +#end if + +## Seeding Options ----------------------------------------------------- + +#if str($mode.seeding_options.seed.seed_selector) == "custom" + --seed '$mode.seeding_options.seed.custom_seed' +#else + --seed '$mode.seeding_options.seed.seed_selector' +#end if + --step '$mode.seeding_options.step' +#if str($mode.seeding_options.notransition) == "true" + --notransition +#end if + +## Ungapped Extension Options ------------------------------------------ + + --xdrop '$mode.ungapped_extension_options.xdrop' + --hspthresh '$mode.ungapped_extension_options.hspthresh' +#if str($mode.ungapped_extension_options.noentropy) == "true" + --noentropy +#end if + +## Gapped Extension Options -------------------------------------------- + +#if $segalign_mode != "segalign_repeat_masker" + #if str($mode.gapped_extension_options.nogapped) == "true" + --nogapped + #end if + --ydrop '$mode.gapped_extension_options.ydrop' + #if str($mode.gapped_extension_options.gappedthresh) != "" + --gappedthresh '$mode.gapped_extension_options.gappedthresh' + #end if + #if str($mode.gapped_extension_options.notrivial) == "true" + --notrivial + #end if +#end if + +## Output Options ----------------------------------------------------- + +#if $segalign_mode != "segalign_repeat_masker" + #if str($mode.output_options.format.format_selector) == "bam" + --format '$mode.output_options.format.bam_options' + #else if str($mode.output_options.format.format_selector) == "general_def" + --format general- + #else if str($mode.output_options.format.format_selector) == "general_full" + --format 'general-:${mode.output_options.format.fields}' + #else if str($mode.output_options.format.format_selector) == "maf" + --format '$mode.output_options.format.maf_type' + #else if str($mode.output_options.format.format_selector) == "blastn" + --format=BLASTN- + #else if str($mode.output_options.format.format_selector) == "differences" + --format=differences + #end if + ## todo :: rplot, bam + ## --action:target=multiple + ## $output_format.rplot + ## .if str( $output_format.out.format ) == "bam": + ## | samtools sort -@\${GALAXY_SLOTS:-2} -T "\${TMPDIR:-.}" -O bam -o '${output}' + ## .else: + ## > '${output}' + ## .end if + ## .if $output_format.rplot: + ## && + ## Rscript $r_plot > /dev/null 2>&1 + ## .end if + #if $segalign_mode == "segalign" + --output '$segalign_output' + #end if +#else if $segalign_mode == "segalign_repeat_masker" + --M '$mode.output_options.M' + --output '$segalign_repeat_masker_output' +#end if +#if str($mode.output_options.markend) == "true" + --markend +#end if + +## System Options ----------------------------------------------------- + + --wga_chunk_size '$mode.system_options.wga_chunk_size' + --lastz_interval_size '$mode.system_options.lastz_interval_size' + --seq_block_size '$mode.system_options.seq_block_size' + --num_gpu '$mode.system_options.num_gpu' +#if str($mode.system_options.debug) == "true" + --debug +#end if + +## ------------------------------------------------------------------- + +#if $segalign_mode == "segalign_diagonal_partition" + && + '$__tool_directory__/package_output.py' +#end if + + ]]></command> + <inputs> + <conditional name="mode"> + <param name="mode_selector" type="select" label="Choose the mode"> + <option value="segalign" selected="true">SegAlign</option> + <option value="segalign_repeat_masker">SegAlign repeat masker</option> + </param> + <when value="segalign"> + <param name="target" type="data" format="fasta" label="Target sequence file in FASTA format"/> + <param name="query" type="data" format="fasta" label="Query sequence file in FASTA format"/> + <expand macro="sequence_options"/> + <expand macro="scoring_options"/> + <expand macro="seeding_options"/> + <expand macro="ungapped_extension_options"/> + <expand macro="gapped_extension_options"/> + <expand macro="output_options"> + <expand macro="segalign_output_options"/> + </expand> + <expand macro="system_options"/> + <section name="diagonal_partition_options" expanded="false" title="Diagonal Partition Options"> + <param argument="--diagonal_partition" type="boolean" value="false" label="Enable diagonal partition optimization"/> + <param argument="--max_segments" type="integer" value="20000" label="Max segments"/> + </section> + </when> + <when value="segalign_repeat_masker"> + <param name="seq_file" type="data" format="fasta" label="Sequence file in FASTA format"/> + <expand macro="sequence_options"> + <param argument="--neighbor_proportion" type="float" value="0.2" label="Proportion of neighbouring intervals to align the query interval to"/> + </expand> + <expand macro="scoring_options"/> + <expand macro="seeding_options"/> + <expand macro="ungapped_extension_options"/> + <expand macro="output_options"> + <param argument="--M" type="integer" value="1" max="255" label="report any position that is covered by at least this many alignments; the maximum allowed depth is 255"/> + </expand> + <expand macro="system_options"/> + </when> + </conditional> + </inputs> + <outputs> + <data name="segalign_output" format="tabular" label="SegAlign on ${on_string}"> + <change_format> + <when input="mode.output_options.format.format_selector" value="bam" format="bam"/> + <when input="mode.output_options.format.format_selector" value="maf" format="maf"/> + <when input="mode.output_options.format.format_selector" value="differences" format="interval"/> + </change_format> + <filter>mode['mode_selector'] == 'segalign' and mode['diagonal_partition_options']['diagonal_partition'] is False</filter> + </data> + <data name="segalign_diagonal_partition_output" format="tgz" from_work_dir="data_package.tgz" label="SegAlign Diagonal Partition on ${on_string}"> + <filter>mode['mode_selector'] == 'segalign' and mode['diagonal_partition_options']['diagonal_partition'] is True</filter> + </data> + <data name="segalign_repeat_masker_output" format="tabular" label="SegAlign Repeat Masker on ${on_string}"> + <filter>mode['mode_selector'] == 'segalign_repeat_masker'</filter> + </data> + </outputs> + <tests> + <test expect_num_outputs="1" expect_test_failure="true"> + <param name="mode_selector" value="segalign"/> + <param name="target" value="hg38.chr20.chunk.fa.gz" ftype="fasta"/> + <param name="query" value="mm39.chr2.chunk.fa.gz" ftype="fasta"/> + <output name="segalign_output" decompress="true" file="segalign-output.maf.gz" ftype="maf"/> + </test> + <test expect_num_outputs="1" expect_test_failure="true"> + <param name="mode_selector" value="segalign_repeat_masker"/> + <param name="seq_file" value="hg38.chr20.chunk.fa.gz" ftype="fasta"/> + <output name="segalign_repeat_masker_output" decompress="true" file="segalign-repeat-masker-output.tab.gz" ftype="tabular"/> + </test> + <test expect_num_outputs="1" expect_test_failure="true"> + <param name="mode_selector" value="segalign"/> + <param name="target" value="hg38.chr20.chunk.fa.gz" ftype="fasta"/> + <param name="query" value="mm39.chr2.chunk.fa.gz" ftype="fasta"/> + <param name="diagonal_partition" value="true"/> + <output name="segalign_diagonal_partition_output" ftype="tgz"> + <assert_contents> + <has_archive_member path="galaxy/commands.json"/> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ + SegAlign is a scalable, GPU-accelerated system for computing pairwise WGA. SegAlign is based on the standard seed-filter-extend heuristic, in which the filtering stage dominates the runtime (e.g. 98% for human-mouse WGA), and is accelerated using GPU(s). + + https://github.com/gsneha26 + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segalign_output_options.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,93 @@ +<macros> + <xml name="segalign_output_options"> + <conditional name="format"> + <param name="format_selector" type="select" display="radio" label="Specify the output format"> + <option value="bam">BAM --format=sam)</option> + <option value="general_def">General default (--format=general)</option> + <option value="general_full">Customized general (--format=general[:fields])</option> + <option value="maf" selected="true">MAF (--format=maf)</option> + <option value="blastn">blastn (--format=BLASTN)</option> + <option value="differences">Differences (--format=differences)</option> + </param> + <when value="bam"> + <param name="bam_options" type="select" display="radio" argument="--format=sam, --format=softsam" label="Select a BAM flavor to output" help="Lastz actually outputs SAM data but Galaxy converts it into BAM to save space. For alignments that don't reach the end of a query, ‑‑format=sam uses 'hard clipping', while ‑‑format=softsam uses 'soft clipping'. See the section on 'clipped alignment' in the SAM specification for an explanation of what this means. The options ‑‑format=sam- and ‑‑format=softsam- suppress the SAM header lines. This makes them suitable for concatenating output from multiple runs. If you need to specify readgroup information: use AddOrEplaceReadGroups from Picard package"> + <option value="sam">BAM</option> + <option value="softsam">soft-clipped BAM</option> + <option value="sam-">BAM without header</option> + <option value="softsam-">soft-clipped BAM without header</option> + </param> + </when> + <when value="general_def"> + <!-- Do nothing --> + </when> + <when value="general_full"> + <param name="fields" type="select" display="checkboxes" multiple="true" label="Select which fields to include" argument="--format=general-[:fields]"> + <option value="score" selected="true">score: Score of the alignment block</option> + <option value="name1" selected="true">name1: Name of the target sequence</option> + <option value="number1">number1: Number of the target sequence within the target file</option> + <option value="strand1" selected="true">strand1: Target sequence strand </option> + <option value="size1" selected="true">size1: Size of the entire target sequence</option> + <option value="start1">start1: Starting position of the alignment block in the target, origin-one</option> + <option value="zstart1" selected="true">zstart1: Starting position of the alignment block in the target, origin-zero</option> + <option value="end1" selected="true">end1: Ending position of the alignment block in the target</option> + <option value="length1">length1: Length of the alignment block in the target (excluding gaps)</option> + <option value="text1">text1: Aligned characters in the target, including gap characters</option> + <option value="qalign1">qalign1: The target quality sequence (if there is one) correpsonding to aligned characters</option> + <option value="nucs1">nucs1: The entire target sequence</option> + <option value="name2" selected="true">name2: Name of the query sequence</option> + <option value="number2">number2: Number of the query sequence within the query file</option> + <option value="strand2" selected="true">strand2: Query sequence strand</option> + <option value="size2" selected="true">size2: Size of the entire query sequence</option> + <option value="start2">start2: Starting position of the alignment block in the query, origin-one</option> + <option value="zstart2" selected="true">zstart2: Starting position of the alignment block in the query, origin-one</option> + <option value="end2" selected="true">end2: Ending position of the alignment block in the query</option> + <option value="length2">length2: Length of the alignment block in the query (excluding gaps)</option> + <option value="text2">text2: Aligned characters in the query, including gap characters</option> + <option value="qalign2">qalign2: The query quality sequence (if there is one) correpsonding to aligned characters</option> + <option value="nucs2">nucs2: The entire query sequence</option> + <option value="nmatch">nmatch: Match count</option> + <option value="nmismatch">nmismatch: Mismatch count</option> + <option value="ncolumn">ncolumn: Number of columns in the block. This includes matches, mismatches (substitutions), and gaps</option> + <option value="npair">npair: Number of aligned bases in the block that are matches or mismatches (substitutions)</option> + <option value="ngap">ngap: Gap count, the number of gaps in the block, counting each run of gapped columns as a single gap</option> + <option value="cgap">cgap: Gap column count, the number of gaps in the block, counting each gapped column as a separate gap</option> + <option value="diff">diff: Differences between what would be written for text1 and text2</option> + <option value="cigar">cigar: A CIGAR-like representation of the alignment’s path</option> + <option value="cigarx">cigarx: Same as cigar, but uses a newer syntax that distinguishes matches from substitutions</option> + <option value="identity" selected="true">identity: Fraction of aligned bases in the block that are matches </option> + <option value="idfrac">idfrac: Fraction of aligned bases in the block that are matches </option> + <option value="id%" selected="true">id% Fraction of aligned bases in the block that are matches (as %)</option> + <option value="blastid%">blastid%: Fraction of the alignment block that is matches, as would be reported by NCBI BLAST</option> + <option value="continuity">continuity: Rate of non-gaps (non-indels) in the alignment block</option> + <option value="confrac">confrac: Rate of non-gaps (non-indels) in the alignment block (as fraction)</option> + <option value="con%">con%: Rate of non-gaps (non-indels) in the alignment block (as %)</option> + <option value="coverage" selected="true">coverage: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block</option> + <option value="covfrac">covfrac: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block (as fraction)</option> + <option value="cov%" selected="true">cov%: Fraction of the entire input sequence (target or query, whichever is shorter) that is covered by the alignment block (as %)</option> + <option value="diagonal">diagonal: The diagonal of the start of the alignment block in the DP matrix, expressed as an identifying number start1-start2</option> + <option value="shingle">shingle: A measurement of the shingle overlap between the target and the query</option> + <option value="number">number: The alignment number, counted as alignments are written to output (1-base)</option> + <option value="znumber">znumber: The alignment number, counted as alignments are written to output (0-base)</option> + <sanitizer invalid_char=""> + <valid initial="string.letters,string.digits"> + <add value="%"/> + </valid> + </sanitizer> + </param> + </when> + <when value="maf"> + <param name="maf_type" type="select" display="radio" argument="--format=maf" label="Select MAF flavor" help="MAF is a multiple alignment format developed at UCSC"> + <option value="maf">MAF</option> + <option value="maf+">MAF with additional stats</option> + <option value="maf-" selected="true">MAF without header and comments</option> + </param> + </when> + <when value="blastn"> + <!-- Do nothing --> + </when> + <when value="differences"> + <!-- Do nothing --> + </when> + </conditional> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequence_options.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,12 @@ +<macros> + <xml name="sequence_options"> + <section name="sequence_options" expanded="false" title="Sequence Options"> + <param name="strand_selector" type="select" label="Strand to search"> + <option value="plus">plus</option> + <option value="minus">minus</option> + <option value="both" selected="true">both</option> + </param> + <yield/> + </section> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/system_options.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,11 @@ +<macros> + <xml name="system_options"> + <section name="system_options" title="System Options"> + <param argument="--wga_chunk_size" type="integer" value="250000" label="Chunk sizes for GPU calls for Xdrop - ⚠ change only if you are a developer"/> + <param argument="--lastz_interval_size" type="integer" value="10000000" label="LASTZ interval for Ydrop - ⚠ change only if you are a developer"/> + <param argument="--seq_block_size" type="integer" value="500000000" label="LASTZ interval for Ydrop - ⚠ change only if you are a developer"/> + <param argument="--num_gpu" type="integer" value="-1" label="Specify number of GPUs to use - -1 if all the GPUs should be used"/> + <param argument="--debug" type="boolean" value="false" label="Print debug messages"/> + </section> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ungapped_extension_options.xml Wed Apr 17 18:06:54 2024 +0000 @@ -0,0 +1,9 @@ +<macros> + <xml name="ungapped_extension_options"> + <section name="ungapped_extension_options" expanded="false" title="Ungapped Extension Options"> + <param argument="--xdrop" type="integer" value="910" label="X-drop value for gap-free extension"/> + <param argument="--hspthresh" type="integer" value="3000" label="Segment score threshold for high scoring pairs"/> + <param argument="--noentropy" type="boolean" checked="false" label="Don't adjust low score segment pair scores using entropy factor after filtering stage"/> + </section> + </xml> +</macros>