changeset 4:36bdf8b439ed draft

Uploaded
author greg
date Sun, 03 Jan 2021 16:13:22 +0000
parents 6116deacb2c7
children 77eb4c46ee88
files macros.xml vsnp_determine_ref_from_data.py vsnp_determine_ref_from_data.xml
diffstat 3 files changed, 126 insertions(+), 303 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Sun Jan 03 16:13:22 2021 +0000
@@ -0,0 +1,24 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<macros>
+    <token name="@WRAPPER_VERSION@">1.0</token>
+    <token name="@PROFILE@">19.09</token>
+    <xml name="param_reference_source">
+        <param name="reference_source" type="select" label="Choose the source for the reference genome">
+            <option value="cached" selected="true">locally cached</option>
+            <option value="history">from history</option>
+        </param>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">
+                @misc{None,
+                journal = {None},
+                author = {Stuber T},
+                title = {Manuscript in preparation},
+                year = {None},
+                url = {https://github.com/USDA-VS/vSNP},}
+            </citation>
+        </citations>
+    </xml>
+</macros>
+
--- a/vsnp_determine_ref_from_data.py	Mon Nov 23 21:42:34 2020 +0000
+++ b/vsnp_determine_ref_from_data.py	Sun Jan 03 16:13:22 2021 +0000
@@ -2,30 +2,21 @@
 
 import argparse
 import gzip
-import multiprocessing
 import os
-import queue
+from collections import OrderedDict
+
 import yaml
 from Bio.SeqIO.QualityIO import FastqGeneralIterator
-from collections import OrderedDict
 
-INPUT_READS_DIR = 'input_reads'
 OUTPUT_DBKEY_DIR = 'output_dbkey'
 OUTPUT_METRICS_DIR = 'output_metrics'
 
 
-def get_base_file_name(file_path):
+def get_sample_name(file_path):
     base_file_name = os.path.basename(file_path)
     if base_file_name.find(".") > 0:
         # Eliminate the extension.
         return os.path.splitext(base_file_name)[0]
-    elif base_file_name.find("_") > 0:
-        # The dot extension was likely changed to
-        # the " character.
-        items = base_file_name.split("_")
-        no_ext = "_".join(items[0:-2])
-        if len(no_ext) > 0:
-            return no_ext
     return base_file_name
 
 
@@ -91,18 +82,6 @@
     return group, dbkey
 
 
-def get_group_and_dbkey_for_collection(task_queue, finished_queue, dnaprints_dict, timeout):
-    while True:
-        try:
-            tup = task_queue.get(block=True, timeout=timeout)
-        except queue.Empty:
-            break
-        fastq_file, count_list, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum = tup
-        group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum)
-        finished_queue.put((fastq_file, count_list, group, dbkey))
-        task_queue.task_done()
-
-
 def get_oligo_dict():
     oligo_dict = {}
     oligo_dict["01_ab1"] = "AATTGTCGGATAGCCTGGCGATAACGACGC"
@@ -138,7 +117,7 @@
 def get_seq_counts(value, fastq_list, gzipped):
     count = 0
     for fastq_file in fastq_list:
-        if gzipped == "true":
+        if gzipped:
             with gzip.open(fastq_file, 'rt') as fh:
                 for title, seq, qual in FastqGeneralIterator(fh):
                     count += seq.count(value)
@@ -166,17 +145,6 @@
     return count_summary, count_list, brucella_sum, bovis_sum, para_sum
 
 
-def get_species_counts_for_collection(task_queue, finished_queue, gzipped, timeout):
-    while True:
-        try:
-            fastq_file = task_queue.get(block=True, timeout=timeout)
-        except queue.Empty:
-            break
-        count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts([fastq_file], gzipped)
-        finished_queue.put((fastq_file, count_summary, count_list, brucella_sum, bovis_sum, para_sum))
-        task_queue.task_done()
-
-
 def get_species_strings(count_summary):
     binary_dictionary = {}
     for k, v in count_summary.items():
@@ -197,56 +165,20 @@
     return brucella_string, bovis_string, para_string
 
 
-def get_species_strings_for_collection(task_queue, finished_queue, timeout):
-    while True:
-        try:
-            tup = task_queue.get(block=True, timeout=timeout)
-        except queue.Empty:
-            break
-        fastq_file, count_summary, count_list, brucella_sum, bovis_sum, para_sum = tup
-        brucella_string, bovis_string, para_string = get_species_strings(count_summary)
-        finished_queue.put((fastq_file, count_list, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum))
-        task_queue.task_done()
-
-
-def output_dbkey(file_name, dbkey, output_file=None):
+def output_dbkey(file_name, dbkey, output_file):
     # Output the dbkey.
-    if output_file is None:
-        # We're producing a dataset collection.
-        output_file = os.path.join(OUTPUT_DBKEY_DIR, "%s.txt" % file_name)
     with open(output_file, "w") as fh:
         fh.write("%s" % dbkey)
 
 
-def output_files(fastq_file, count_list, group, dbkey, dbkey_file=None, metrics_file=None):
-    base_file_name = get_base_file_name(fastq_file)
-    if dbkey_file is not None:
-        # We're dealing with a single read or
-        # a set of paired reads.  If the latter,
-        # the following will hopefully produce a
-        # good sample string.
-        if base_file_name.find("_") > 0:
-            base_file_name = base_file_name.split("_")[0]
+def output_files(fastq_file, count_list, group, dbkey, dbkey_file, metrics_file):
+    base_file_name = get_sample_name(fastq_file)
     output_dbkey(base_file_name, dbkey, dbkey_file)
     output_metrics(base_file_name, count_list, group, dbkey, metrics_file)
 
 
-def output_files_for_collection(task_queue, timeout):
-    while True:
-        try:
-            tup = task_queue.get(block=True, timeout=timeout)
-        except queue.Empty:
-            break
-        fastq_file, count_list, group, dbkey = tup
-        output_files(fastq_file, count_list, group, dbkey)
-        task_queue.task_done()
-
-
-def output_metrics(file_name, count_list, group, dbkey, output_file=None):
+def output_metrics(file_name, count_list, group, dbkey, output_file):
     # Output the metrics.
-    if output_file is None:
-        # We're producing a dataset collection.
-        output_file = os.path.join(OUTPUT_METRICS_DIR, "%s.txt" % file_name)
     with open(output_file, "w") as fh:
         fh.write("Sample: %s\n" % file_name)
         fh.write("Brucella counts: ")
@@ -262,42 +194,21 @@
         fh.write("\ndbkey: %s\n" % dbkey)
 
 
-def set_num_cpus(num_files, processes):
-    num_cpus = int(multiprocessing.cpu_count())
-    if num_files < num_cpus and num_files < processes:
-        return num_files
-    if num_cpus < processes:
-        half_cpus = int(num_cpus / 2)
-        if num_files < half_cpus:
-            return num_files
-        return half_cpus
-    return processes
-
-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
 
     parser.add_argument('--dnaprint_fields', action='append', dest='dnaprint_fields', nargs=2, help="List of dnaprints data table value, name and path fields")
-    parser.add_argument('--read1', action='store', dest='read1', required=False, default=None, help='Required: single read')
+    parser.add_argument('--read1', action='store', dest='read1', required=True, help='Required: single read')
     parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
-    parser.add_argument('--gzipped', action='store', dest='gzipped', help='Input files are gzipped')
-    parser.add_argument('--output_dbkey', action='store', dest='output_dbkey', required=False, default=None, help='Output reference file')
-    parser.add_argument('--output_metrics', action='store', dest='output_metrics', required=False, default=None, help='Output metrics file')
-    parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting')
+    parser.add_argument('--gzipped', action='store_true', dest='gzipped', help='Input files are gzipped')
+    parser.add_argument('--output_dbkey', action='store', dest='output_dbkey', help='Output reference file')
+    parser.add_argument('--output_metrics', action='store', dest='output_metrics', help='Output metrics file')
 
     args = parser.parse_args()
 
-    collection = False
-    fastq_list = []
-    if args.read1 is not None:
-        fastq_list.append(args.read1)
-        if args.read2 is not None:
-            fastq_list.append(args.read2)
-    else:
-        collection = True
-        for file_name in sorted(os.listdir(INPUT_READS_DIR)):
-            file_path = os.path.abspath(os.path.join(INPUT_READS_DIR, file_name))
-            fastq_list.append(file_path)
+    fastq_list = [args.read1]
+    if args.read2 is not None:
+        fastq_list.append(args.read2)
 
     # The value of dnaprint_fields is a list of lists, where each list is
     # the [value, name, path] components of the vsnp_dnaprints data table.
@@ -306,62 +217,9 @@
     # table to ensure a proper mapping for discovering the dbkey.
     dnaprints_dict = get_dnaprints_dict(args.dnaprint_fields)
 
-    if collection:
-        # Here fastq_list consists of any number of
-        # reads, so each file will be processed and
-        # dataset collections will be produced as outputs.
-        multiprocessing.set_start_method('spawn')
-        queue1 = multiprocessing.JoinableQueue()
-        queue2 = multiprocessing.JoinableQueue()
-        num_files = len(fastq_list)
-        cpus = set_num_cpus(num_files, args.processes)
-        # Set a timeout for get()s in the queue.
-        timeout = 0.05
-
-        for fastq_file in fastq_list:
-            queue1.put(fastq_file)
-
-        # Complete the get_species_counts task.
-        processes = [multiprocessing.Process(target=get_species_counts_for_collection, args=(queue1, queue2, args.gzipped, timeout, )) for _ in range(cpus)]
-        for p in processes:
-            p.start()
-        for p in processes:
-            p.join()
-        queue1.join()
-
-        # Complete the get_species_strings task.
-        processes = [multiprocessing.Process(target=get_species_strings_for_collection, args=(queue2, queue1, timeout, )) for _ in range(cpus)]
-        for p in processes:
-            p.start()
-        for p in processes:
-            p.join()
-        queue2.join()
-
-        # Complete the get_group_and_dbkey task.
-        processes = [multiprocessing.Process(target=get_group_and_dbkey_for_collection, args=(queue1, queue2, dnaprints_dict, timeout, )) for _ in range(cpus)]
-        for p in processes:
-            p.start()
-        for p in processes:
-            p.join()
-        queue1.join()
-
-        # Complete the output_files task.
-        processes = [multiprocessing.Process(target=output_files_for_collection, args=(queue2, timeout, )) for _ in range(cpus)]
-        for p in processes:
-            p.start()
-        for p in processes:
-            p.join()
-        queue2.join()
-
-        if queue1.empty() and queue2.empty():
-            queue1.close()
-            queue1.join_thread()
-            queue2.close()
-            queue2.join_thread()
-    else:
-        # Here fastq_list consists of either a single read
-        # or a set of paired reads, producing single outputs.
-        count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts(fastq_list, args.gzipped)
-        brucella_string, bovis_string, para_string = get_species_strings(count_summary)
-        group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum)
-        output_files(args.read1, count_list, group, dbkey, dbkey_file=args.output_dbkey, metrics_file=args.output_metrics)
+    # Here fastq_list consists of either a single read
+    # or a set of paired reads, producing single outputs.
+    count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts(fastq_list, args.gzipped)
+    brucella_string, bovis_string, para_string = get_species_strings(count_summary)
+    group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum)
+    output_files(args.read1, count_list, group, dbkey, dbkey_file=args.output_dbkey, metrics_file=args.output_metrics)
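
The counting logic retained above streams each FASTQ once and tallies oligo hits per file. Below is a minimal, self-contained sketch of that idea; the oligo shown is the "01_ab1" entry from get_oligo_dict in this script, while the ">1" cutoff used to turn a count into a DNA-print bit is an illustrative assumption (the actual thresholds live in the unchanged body of get_species_strings).

    # Sketch of the gzip-aware oligo counting in get_seq_counts above.
    # The ">1" cutoff for deriving a DNA-print bit is an assumption for
    # illustration; the real thresholds are in get_species_strings.
    import gzip

    from Bio.SeqIO.QualityIO import FastqGeneralIterator


    def count_oligo(fastq_file, oligo, gzipped):
        # Stream the FASTQ once and count occurrences of the oligo in
        # each read sequence, transparently handling gzipped input.
        count = 0
        open_func = gzip.open if gzipped else open
        with open_func(fastq_file, "rt") as fh:
            for title, seq, qual in FastqGeneralIterator(fh):
                count += seq.count(oligo)
        return count


    # Hypothetical usage: the file name is made up; the oligo is the
    # "01_ab1" value from get_oligo_dict.
    count = count_oligo("sample_R1.fastq.gz", "AATTGTCGGATAGCCTGGCGATAACGACGC", gzipped=True)
    bit = 1 if count > 1 else 0
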
--- a/vsnp_determine_ref_from_data.xml	Mon Nov 23 21:42:34 2020 +0000
+++ b/vsnp_determine_ref_from_data.xml	Sun Jan 03 16:13:22 2021 +0000
@@ -1,181 +1,132 @@
-<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
+<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="@WRAPPER_VERSION@.1" profile="@PROFILE@">
     <description>from input data</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
     <requirements>
         <requirement type="package" version="1.76">biopython</requirement>
         <requirement type="package" version="5.3">pyyaml</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-#import os
 #import re
-#set gzipped = 'false'
 #set input_type = $input_type_cond.input_type
-#set input_reads_dir = 'input_reads'
-#set output_dbkey_dir = 'output_dbkey'
-#set output_metrics_dir = 'output_metrics'
-mkdir -p $input_reads_dir &&
-mkdir -p $output_dbkey_dir &&
-mkdir -p $output_metrics_dir &&
-#if str($input_type) == "single":
-    #set read_type_cond = $input_type_cond.read_type_cond
-    #set read1 = $read_type_cond.read1
+
+#if $input_type in ["single", "pair"]:
+    #set read1 = $input_type_cond.read1
     #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
-    #if str($read_type_cond.read_type) == "single":
-        ln -s '${read1}' '${read1_identifier}' &&
-        #if $read1.is_of_type('fastqsanger.gz'):
-            #set gzipped = 'true'
-        #end if
+    ln -s '${read1}' '${read1_identifier}' &&
+    #if $input_type == "pair":
+        #set read2 = $input_type_cond.read2
+        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
+        ln -s '${read2}' '${read2_identifier}' &&
     #else:
-        #set read2 = $read_type_cond.read2
-        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
-        ln -s '${read1}' '${read1_identifier}' &&
-        ln -s '${read2}' '${read2_identifier}' &&
-        #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
-            #set gzipped = 'true'
-        #end if
+        #set read2 = None 
     #end if
 #else:
-    #set collection_type = $input_type_cond.collection_type_cond.collection_type
-    #for $i in $input_type_cond.collection_type_cond.reads_collection:
-        #if $i.is_of_type('fastqsanger.gz'):
-            #set gzipped = 'true'
-        #end if
-        #set filename = $i.file_name
-        #if str($collection_type) == 'single_reads':
-            #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-        #else:
-            ## Galaxy builds lists of pairs as nested lists with elements
-            ## named forward and reverse.  When flattened, these lists
-            ## will work as inputs to the Parse parameter value expression
-            ## tool in workflows.  However, the output list created by the
-            ## expression tool will not function correctly with the bwa_mem
-            ## mapper.  Naming the identifier as follows is a solution.
-            #set identifier = re.sub('[^\s\w\-]', '_', str($i.name))
-        #end if
-        ln -s '$filename' '$input_reads_dir/$identifier' &&
-    #end for
+    #set read1 = $input_type_cond.reads_collection['forward']
+    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
+    ln -s '${read1}' '${read1_identifier}' &&
+    #set read2 = $input_type_cond.reads_collection['reverse']
+    #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
+    ln -s '${read2}' '${read2_identifier}' &&
 #end if
+
 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
-#if str($input_type) == "single":
-    #if str($read_type_cond.read_type) == "single":
-        --read1 '${read1_identifier}'
-    #else:
-        --read1 '${read1_identifier}'
-        --read2 '${read2_identifier}'
+    --read1 '${read1_identifier}'
+    #if $read2 is not None:
+      --read2 '${read2_identifier}'
     #end if
     --output_dbkey '$output_dbkey'
     --output_metrics '$output_metrics'
+#if $read1.is_of_type('fastqsanger.gz'):
+    --gzipped
 #end if
---gzipped $gzipped
---processes $processes
-#if str($in_test_mode) == "false":
-    #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
-    #for $i in $dnaprint_fields:
-        --dnaprint_fields '${i[0]}' '${i[2]}'
-    #end for
-#else:
-    --in_test_mode '$in_test_mode'
-#end if
+#set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
+#for $i in $dnaprint_fields:
+    --dnaprint_fields '${i[0]}' '${i[2]}'
+#end for
 ]]></command>
     <inputs>
         <conditional name="input_type_cond">
             <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
                 <option value="single" selected="true">Single files</option>
-                <option value="collection">Collection of files</option>
+                <option value="paired">Paired reads</option>
+                <option value="pair">Paired reads in separate data sets</option>
             </param>
             <when value="single">
-                <conditional name="read_type_cond">
-                    <param name="read_type" type="select" label="Choose the read type">
-                        <option value="paired" selected="true">Paired</option>
-                        <option value="single">Single</option>
-                    </param>
-                    <when value="paired">
-                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-                        <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
-                    </when>
-                    <when value="single">
-                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-                    </when>
-                </conditional>
+                <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
             </when>
-            <when value="collection">
-                <conditional name="collection_type_cond">
-                    <param name="collection_type" type="select" label="Collection of single reads or paired reads?">
-                        <option value="single_reads" selected="true">Single reads</option>
-                        <option value="paired_reads">Paired reads</option>
-                    </param>
-                    <when value="single_reads">
-                        <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
-                    </when>
-                    <when value="paired_reads">
-                        <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
-                    </when>
-                </conditional>
+            <when value="paired">
+                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
+            </when>
+            <when value="pair">
+                <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
             </when>
         </conditional>
-        <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
-        <!-- Functional testing -->
-        <param name="in_test_mode" type="hidden" value="false"/>
     </inputs>
     <outputs>
-        <data name="output_dbkey" format="txt"  label="${tool.name} (dbkey) on ${on_string}">
-            <filter>input_type_cond['input_type'] == 'single'</filter>
-        </data>
-        <data name="output_metrics" format="txt"  label="${tool.name} (metrics) on ${on_string}">
-            <filter>input_type_cond['input_type'] == 'single'</filter>
-        </data>
-        <collection name="output_dbkey_collection" type="list" label="${tool.name} (dbkey) on ${on_string}">
-            <discover_datasets pattern="__name__" directory="output_dbkey" format="txt"/>
-            <filter>input_type_cond['input_type'] == 'collection'</filter>
-        </collection>
-        <collection name="output_metrics_collection" type="list" label="${tool.name} (metrics) on ${on_string}">
-            <discover_datasets pattern="__name__" directory="output_metrics" format="txt"/>
-            <filter>input_type_cond['input_type'] == 'collection'</filter>
-        </collection>
+        <data name="output_dbkey" format="txt" label="${tool.name} on ${on_string} (dbkey)"/>
+        <data name="output_metrics" format="txt" label="${tool.name} on ${on_string} (metrics)"/>
     </outputs>
     <tests>
-        <test>
-            <param name="in_test_mode" value="true"/>
-            <param name="read_type" value="single"/>
+        <!-- 1 single read -->
+        <test expect_num_outputs="2">
+            <param name="input_type" value="single"/>
             <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/>
             <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
             <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
         </test>
-        <test>
-            <param name="in_test_mode" value="true"/>
-            <param name="input_type" value="collection"/>
-            <param name="collection_type" value="paired_reads"/>
+        <!-- 1 set of paired reads -->
+        <test expect_num_outputs="2">
+            <param name="input_type" value="pair"/>
+            <param name="read1" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
+            <param name="read2" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
+            <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
+            <output name="output_metrics" file="paired_metrics.txt" ftype="txt"/>
+        </test>
+        <!-- A collection of paired reads -->
+        <test expect_num_outputs="2">
+            <param name="input_type" value="paired"/>
             <param name="reads_collection">
                 <collection type="paired">
-                    <element name="forward" value="forward.fastq.gz" ftype="fastqsanger.gz"/>
-                    <element name="reverse" value="reverse.fastq.gz" ftype="fastqsanger.gz"/>
+                    <element name="forward" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
+                    <element name="reverse" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
                 </collection>
             </param>
-            <output_collection name="output_dbkey_collection" type="list">
-                <element name="forward.txt" file="forward_dbkey.txt" ftype="txt"/>
-                <element name="reverse.txt" file="reverse_dbkey.txt" ftype="txt"/>
-            </output_collection>
-            <output_collection name="output_metrics_collection" type="list">
-                <element name="forward.txt" file="forward_metrics.txt" ftype="txt"/>
-                <element name="reverse.txt" file="reverse_metrics.txt" ftype="txt"/>
-            </output_collection>
+            <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
+            <output name="output_metrics" file="paired_collection_metrics.txt" ftype="txt"/>
         </test>
     </tests>
     <help>
 **What it does**
 
-Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
-best reference genome for aligning the reads.  This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
-perform this task.  While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
-the complete string a "DNA print".  All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
-manager** tool are then inspected to find a match for the compiled "DNA print" string.  These files are each associated
-with a Galaxy "dbkey" (i.e., genome build), so when a metach is found, the associated "dbkey" is passed to a mapper (e.g.,
-**Map with BWA-MEM**) to align the reads to the associated reference.
+Accepts a single fastqsanger read, a set of paired reads, or a collection of paired reads (bacterial samples) and
+inspects the data to discover the best reference genome for aligning the reads.
+
+The information needed to discover the best reference is maintained by the USDA in this repository_.  References are currently
+limited to TB complex, paraTB, and Brucella, but information for additional references will be added.  The information for each
+reference is a string consisting of zeros and ones, compiled by USDA researchers, which we call a "DNA print".  These strings
+are maintained in yaml files for use in Galaxy, and are installed via the **vSNP DNAprints data manager** tool.
+
+.. _repository:  https://github.com/USDA-VS/vSNP_reference_options
 
-The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
-used to compile the "DNA print" string.
+This tool creates an in-memory dictionary of these DNA print strings for matching with a string generated by inspecting the
+input sample data.  During inspection, this tool accrues sequence counts for supported species, ultimately generating a string
+consisting of zeros and ones based on the counts (i.e., a DNA print).  This string is then compared to the strings contained
+in the in-memory dictionary of DNA prints to find a match.
+
+The strings in the in-memory dictionary are each associated with a Galaxy "dbkey" (i.e., genome build), so when a match is found,
+the associated "dbkey" is passed to a mapper (e.g., **Map with BWA-MEM**), typically within a workflow via an expression tool,
+to align the reads to the associated reference.
+
+This tool produces 2 text files: a "dbkey" file that contains the dbkey string and a "metrics" file that reports the sequence
+counts from which the "DNA print" string was compiled.
 
 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
-and discovering the primary species is critical.  DNA print matchig is currently supported for the following genomes.
+and discovering the primary species is critical.  DNA print matching is currently supported for the following genomes.
 
  * Mycobacterium bovis AF2122/97
  * Brucella abortus bv. 1 str. 9-941
@@ -197,17 +148,7 @@
 **Required Options**
 
- * **Choose the category of the files to be analyzed** - select "Single files" or "Collection of files", then select the appropriate history items (single or paired fastqsanger reads or a collection of fastqsanger reads) based on the selected option.
+ * **Choose the category of the files to be analyzed** - select "Single files", "Paired reads", or "Paired reads in separate data sets", then select the appropriate history items (single or paired fastqsanger reads or a paired collection) based on the selected option.
- * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
     </help>
-    <citations>
-        <citation type="bibtex">
-            @misc{None,
-            journal = {None},
-            author = {1. Stuber T},
-            title = {Manuscript in preparation},
-            year = {None},
-            url = {https://github.com/USDA-VS/vSNP},}
-        </citation>
-    </citations>
+    <expand macro="citations"/>
 </tool>
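
For readers following the dbkey discovery described in the help text, here is a minimal sketch of the lookup step under stated assumptions: the yaml layout (a flat list of known print strings per dbkey) is illustrative, and the real mapping is whatever the **vSNP DNAprints data manager** installs and get_dnaprints_dict parses from the vsnp_dnaprints data table rows ([value, name, path]) passed via --dnaprint_fields.

    # Sketch of matching a compiled DNA print to a dbkey.  The yaml layout
    # assumed here (a list of known print strings per dbkey) is for
    # illustration only; see get_dnaprints_dict in the script above for
    # the actual parsing.
    import yaml


    def load_dnaprints(dnaprint_fields):
        # dnaprint_fields: (dbkey, path) pairs, as passed via --dnaprint_fields.
        dnaprints = {}
        for dbkey, path in dnaprint_fields:
            with open(path) as fh:
                dnaprints[dbkey] = yaml.safe_load(fh)
        return dnaprints


    def find_dbkey(compiled_print, dnaprints):
        # Return the first dbkey whose known prints include the compiled string.
        for dbkey, prints in dnaprints.items():
            if compiled_print in prints:
                return dbkey
        return None
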