changeset 0:2af9137ba067 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mash_sketch_builder/ commit c6efcbece52dec310253537b35419839746fff7f"
author iuc
date Wed, 26 Feb 2020 17:06:21 -0500
parents
children b6016642539d
files data_manager/mash_sketch_builder.py data_manager/mash_sketch_builder.xml data_manager_conf.xml test-data/mash_sketch_data_manager.json test-data/test_assembly.fasta tool-data/mash_sketches.loc.sample tool_data_table_conf.xml.sample
diffstat 6 files changed, 199 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/mash_sketch_builder.py	Wed Feb 26 17:06:21 2020 -0500
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+import argparse
+import errno
+import json
+import os
+import subprocess
+import uuid
+
+
+# Name of the Galaxy data table that receives the new entry; the same name is
+# declared in data_manager_conf.xml and tool_data_table_conf.xml.sample.
+DATA_TABLE_NAME = "mash_sketches"
+
+
+def mash_sketch(mash_sketch_args, sketch_name, target_directory, data_table_name=DATA_TABLE_NAME):
+    """Sketch a FASTA file with ``mash sketch`` and build its data table entry.
+
+    A new sub-directory named after a random UUID is created inside
+    ``target_directory`` and ``<uuid>/sketch`` is handed to mash as the
+    ``-o`` output prefix.
+
+    Args:
+        mash_sketch_args: Mapping with the keys ``kmer_size``, ``sketch_size``,
+            ``probability_threshold``, ``threads``, ``fasta`` and
+            ``individual_sequences``.
+        sketch_name: Value stored in the ``name`` column of the entry.
+        target_directory: Existing directory that receives the UUID
+            sub-directory; it is also used as mash's working directory.
+        data_table_name: Data table the entry is filed under.
+
+    Returns:
+        ``{'data_tables': {data_table_name: [entry]}}`` where ``entry`` uses
+        the UUID for both ``value`` and ``path`` and ``sketch_name`` for
+        ``name``.
+
+    Raises:
+        OSError: If the UUID sub-directory cannot be created.
+        subprocess.CalledProcessError: If ``mash sketch`` exits non-zero.
+    """
+    entry_id = str(uuid.uuid4())
+    sketch_dir = os.path.join(target_directory, entry_id)
+    os.mkdir(sketch_dir)
+
+    # Options first, then the positional FASTA input; "-i" is only appended
+    # (after the input) when per-sequence sketching was requested.
+    command = [
+        'mash', 'sketch',
+        '-k', str(mash_sketch_args["kmer_size"]),
+        '-s', str(mash_sketch_args["sketch_size"]),
+        '-w', str(mash_sketch_args["probability_threshold"]),
+        '-o', os.path.join(sketch_dir, "sketch"),
+        '-p', str(mash_sketch_args["threads"]),
+        str(mash_sketch_args["fasta"]),
+    ]
+    if mash_sketch_args["individual_sequences"]:
+        command.append("-i")
+
+    subprocess.check_call(command, cwd=target_directory)
+
+    row = {
+        "value": entry_id,
+        "name": sketch_name,
+        "path": entry_id,
+    }
+    return {'data_tables': {data_table_name: [row]}}
+
+
+def main():
+    """Command-line entry point of the Mash sketch data manager.
+
+    Reads the JSON file given as the first positional argument, takes the
+    ``extra_files_path`` of its first ``output_data`` item as the target
+    directory (creating it when missing), runs :func:`mash_sketch` there and
+    overwrites the same JSON file with the resulting data table entry.
+
+    Raises:
+        OSError: If the target directory cannot be created for a reason
+            other than it already existing as a directory.
+        subprocess.CalledProcessError: Propagated from :func:`mash_sketch`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('data_manager_json')
+    # Defaults mirror the values pre-filled in the tool XML form (k-mer size
+    # 21, at most 32; sketch size 1000) so a bare invocation stays in range.
+    parser.add_argument('--kmer-size', dest='kmer_size', type=int, default=21, help='k-mer size (number of nucleotides per hashed string)')
+    parser.add_argument('--sketch-size', dest='sketch_size', type=int, default=1000, help='Sketch size (maximum number of non-redundant min-hashes per sketch)')
+    parser.add_argument('--probability-threshold', dest='probability_threshold', type=float, default=0.01, help='Probability threshold for warning about low k-mer size')
+    parser.add_argument('--individual-sequences', dest='individual_sequences', action='store_true', default=False, help='Sketch individual sequences (for multi-fasta files)')
+    parser.add_argument('--fasta', dest='fasta', help='Fasta file to sketch')
+    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
+    parser.add_argument('--sketch-name', dest='sketch_name', help='Name for sketch')
+    args = parser.parse_args()
+
+    # Close the handle deterministically instead of relying on garbage collection.
+    with open(args.data_manager_json) as handle:
+        data_manager_input = json.load(handle)
+
+    target_directory = data_manager_input['output_data'][0]['extra_files_path']
+
+    try:
+        os.mkdir(target_directory)
+    except OSError as exc:
+        # An already existing directory is fine; anything else is a real error.
+        if exc.errno != errno.EEXIST or not os.path.isdir(target_directory):
+            raise
+
+    mash_sketch_args = {
+        "kmer_size": args.kmer_size,
+        "sketch_size": args.sketch_size,
+        "probability_threshold": args.probability_threshold,
+        "fasta": args.fasta,
+        "individual_sequences": args.individual_sequences,
+        "threads": args.threads,
+    }
+
+    data_manager_output = mash_sketch(
+        mash_sketch_args,
+        args.sketch_name,
+        target_directory,
+    )
+
+    # The input JSON file doubles as the output file and is replaced wholesale.
+    with open(args.data_manager_json, 'w') as handle:
+        handle.write(json.dumps(data_manager_output, sort_keys=True))
+
+# Run the data manager only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/mash_sketch_builder.xml	Wed Feb 26 17:06:21 2020 -0500
@@ -0,0 +1,74 @@
+<?xml version="1.0"?>
+<!-- Galaxy data manager tool: runs mash_sketch_builder.py on a FASTA file and
+     writes the resulting data table entry to the out_file JSON dataset. -->
+<tool id="mash_sketch_builder" name="Mash Sketch" tool_type="manage_data" version="@TOOL_VERSION@+galaxy0" profile="18.09">
+    <description>builder</description>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">mash</requirement>
+        <requirement type="package" version="3.7">python</requirement>
+    </requirements>
+    <!-- @TOOL_VERSION@ is used in the tool version attribute and in the mash
+         requirement above. -->
+    <macros>
+        <token name="@TOOL_VERSION@">2.1</token>
+    </macros>
+    <version_command>mash --version</version_command>
+    <!-- The thread count is taken from GALAXY_SLOTS (falling back to 1). The
+         FASTA path is either the path field of the selected all_fasta data
+         table entry or the selected history dataset. The boolean parameter
+         individual_sequences expands to its truevalue flag or to nothing. -->
+    <command detect_errors="exit_code">
+    <![CDATA[
+        python '$__tool_directory__/mash_sketch_builder.py'
+          '${out_file}'
+          --threads \${GALAXY_SLOTS:-1}
+          #if str( $input_sequence_source.input_sequence_source_selector ) == "tool_data_table":
+            --fasta '${input_sequence_source.input_sequence.fields.path}'
+          #elif str( $input_sequence_source.input_sequence_source_selector ) == 'history':
+            --fasta '${input_sequence_source.input_sequence}'
+          #end if
+          --sketch-name '${sketch_name}'
+          --sketch-size '${sketch_size}'
+          --kmer-size '${kmer_size}'
+          --probability-threshold '${probability_threshold}'
+          ${individual_sequences}
+    ]]>
+    </command>
+    <inputs>
+        <!-- Source of the FASTA input: an all_fasta data table entry or a
+             history dataset (the default selection). -->
+        <conditional name="input_sequence_source">
+            <param name="input_sequence_source_selector" type="select"
+                   label="Select a sequence from your history or use one from a tool data table?">
+                <option value="tool_data_table">Sequence from tool data table</option>
+                <option selected="True" value="history">Sequence from history</option>
+            </param>
+            <when value="tool_data_table">
+                <param name="input_sequence" type="select" label="Source FASTA Sequence">
+                    <options from_data_table="all_fasta"/>
+                </param>
+            </when>
+            <when value="history">
+                <param name="input_sequence" type="data" format="fasta" label="Input sequence" help=""/>
+            </when>
+        </conditional>
+        <!-- sketch_name becomes the name column of the new data table entry. -->
+        <param type="text" name="sketch_name" label="Sketch name" help="Human-readable description of the sketch"/>
+        <param type="integer" name="sketch_size" value="1000" min="10" max="1000000" label="Sketch size" help="Each sketch will have at most this many non-redundant min-hashes."/>
+        <param type="integer" name="kmer_size" value="21" min="1" max="32" label="K-mer size" help="Hashes will be based on strings of this many nucleotides."/>
+        <param type="float" name="probability_threshold" value="0.01" min="0." max="1." label="Probability threshold for warning about low k-mer size." />
+        <param type="boolean" name="individual_sequences" truevalue="--individual-sequences" falsevalue="" label="Sketch individual sequences" help="e.g. for multi-fastas of single-chromosome genomes or pair-wise gene comparisons."/>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <tests>
+        <!-- NOTE(review): the output is compared with compare="sim_size";
+             presumably because the script names each entry with a random
+             UUID, so the content cannot match the fixture exactly. Confirm. -->
+        <test>
+            <param name="input_sequence_source_selector" value="history"/>
+            <param name="input_sequence" value="test_assembly.fasta"/>
+            <param name="sketch_name" value="Test Sketch" />
+            <output name="out_file" value="mash_sketch_data_manager.json" compare="sim_size" />
+        </test>
+    </tests>
+    <help><![CDATA[
+
+**What it does**
+
+  Create a sketch file, which is a reduced representation of a sequence or set
+  of sequences (based on min-hashes) that can be used for fast distance
+  estimations. For output, one sketch file will be generated, but it can have
+  multiple sketches within it, divided by sequences or files.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1186/s13059-016-0997-x</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Wed Feb 26 17:06:21 2020 -0500
@@ -0,0 +1,18 @@
+<!-- Registers the mash_sketch_builder tool as a data manager that fills the
+     mash_sketches data table. -->
+<data_managers>
+    <data_manager tool_file="data_manager/mash_sketch_builder.xml" id="mash_sketch_builder" version="2.1+galaxy0">
+        <data_table name="mash_sketches">
+            <output>
+                <column name="value"/>
+                <column name="name"/>
+                <!-- The path value written by the script is the name of the
+                     UUID directory created next to the out_file dataset. That
+                     directory is moved under
+                     GALAXY_DATA_MANAGER_DATA_PATH/mash_sketches/ and the
+                     column value is then rewritten to the absolute path of
+                     the sketch file inside it. -->
+                <column name="path" output_ref="out_file">
+                    <move type="directory">
+                        <source>${path}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mash_sketches/${path}</target>
+                    </move>
+                    <!-- NOTE(review): the script passes the prefix "sketch" to
+                         mash; the ".msh" suffix here assumes mash appends it
+                         to the output prefix. Confirm against the mash docs. -->
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mash_sketches/${path}/sketch.msh</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mash_sketch_data_manager.json	Wed Feb 26 17:06:21 2020 -0500
@@ -0,0 +1,1 @@
+{"data_tables": {"mash_sketches": [{"name": "sketch", "path": "sketch", "value": "sketch"}]}}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_assembly.fasta	Wed Feb 26 17:06:21 2020 -0500
@@ -0,0 +1,3 @@
+>test
+GCATGTCGATCTGTGTGCTAGTCGTAGTCGATCGATCTGATCGATCTGTCAGTCAGTAGT
+CTCAGCGATGCATTATTATATTATATTATCGATCGATGCTGATCGATTATATTCGATCTG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed Feb 26 17:06:21 2020 -0500
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- Locations of Mash sketches in the required format -->
+    <!-- The columns are the same three keys the data manager script writes
+         for each entry: value, name and path. -->
+    <table name="mash_sketches" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/mash_sketches.loc" />
+    </table>
+</tables>