Mercurial > repos > greg > vsnp_get_snps

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Sun Jan 03 16:06:33 2021 +0000
@@ -0,0 +1,24 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<macros>
+    <token name="@WRAPPER_VERSION@">1.0</token>
+    <token name="@PROFILE@">19.09</token>
+    <xml name="param_reference_source">
+        <param name="reference_source" type="select" label="Choose the source for the reference genome">
+            <option value="cached" selected="true">locally cached</option>
+            <option value="history">from history</option>
+        </param>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">
+                @misc{None,
+                journal = {None},
+                author = {1. Stuber T},
+                title = {Manuscript in preparation},
+                year = {None},
+                url = {https://github.com/USDA-VS/vSNP},}
+            </citation>
+        </citations>
+    </xml>
+</macros>
+
--- a/tool-data/vsnp_excel.loc.sample	Sat Nov 14 09:16:04 2020 +0000
+++ b/tool-data/vsnp_excel.loc.sample	Sun Jan 03 16:06:33 2021 +0000
@@ -1,4 +1,4 @@
 ## vSNP Excel files
 #Value	Name	Path	Description
 #AF2122	Mbovis_define_filter.xlsx	vsnp/AF2122/Mbovis_define_filter.xlsx	Excel file for Mycobacterium bovis AF2122/97
-#NC_006932	Bab1_define_filter.xlsx	/vsnp/NC_006932/Bab1_define_filter.xlsx	Excel file for Brucella abortus bv. 1 str. 9-941
+#NC_006932	Bab1_define_filter.xlsx	vsnp/NC_006932/Bab1_define_filter.xlsx	Excel file for Brucella abortus bv. 1 str. 9-941
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Sun Jan 03 16:06:33 2021 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Location of excel files for vsnp_get_snps tool -->
+    <table name="vsnp_excel" comment_char="#">
+        <columns>value, name, path, description</columns>
+        <file path="${__HERE__}/test-data/vsnp_excel.loc" />
+    </table>
+</tables>
--- a/vsnp_get_snps.py	Sat Nov 14 09:16:04 2020 +0000
+++ b/vsnp_get_snps.py	Sun Jan 03 16:06:33 2021 +0000
@@ -1,24 +1,20 @@
 #!/usr/bin/env python

-# Collect quality parsimonious SNPs from vcf files and output alignment files in fasta format.
+# Collect quality parsimonious SNPs from vcf files
+# and output alignment files in fasta format.

 import argparse
 import multiprocessing
 import os
-import pandas
 import queue
 import shutil
 import sys
 import time
-import vcf
 from collections import OrderedDict
 from datetime import datetime

-ALL_VCFS_DIR = 'all_vcf'
-INPUT_VCF_DIR = 'input_vcf_dir'
-OUTPUT_JSON_AVG_MQ_DIR = 'output_json_avg_mq_dir'
-OUTPUT_JSON_SNPS_DIR = 'output_json_snps_dir'
-OUTPUT_SNPS_DIR = 'output_snps_dir'
+import pandas
+import vcf


 def get_time_stamp():
@@ -40,74 +36,87 @@
 def setup_all_vcfs(vcf_files, vcf_dirs):
     # Create the all_vcfs directory and link
     # all input vcf files into it for processing.
-    os.makedirs(ALL_VCFS_DIR)
-    vcf_dirs.append(ALL_VCFS_DIR)
+    all_vcfs_dir = 'all_vcf'
+    os.makedirs(all_vcfs_dir)
+    vcf_dirs.append(all_vcfs_dir)
     for vcf_file in vcf_files:
         file_name_base = os.path.basename(vcf_file)
-        dst_file = os.path.join(ALL_VCFS_DIR, file_name_base)
+        dst_file = os.path.join(all_vcfs_dir, file_name_base)
         os.symlink(vcf_file, dst_file)
     return vcf_dirs


 class SnpFinder:

-    def __init__(self, num_files, reference, excel_grouper_file,
-                 all_isolates, ac, mq_val, n_threshold, qual_threshold, output_summary):
+    def __init__(self, num_files, dbkey, input_excel, all_isolates, ac, min_mq, quality_score_n_threshold, min_quality_score, input_vcf_dir, output_json_avg_mq_dir, output_json_snps_dir, output_snps_dir, output_summary):
+        # Allele count
         self.ac = ac
+        # Create a group that will contain all isolates.
         self.all_isolates = all_isolates
+        # Evolving positions dictionary.
         self.all_positions = None
-        # Filter based on the contents of an Excel file.
-        self.excel_grouper_file = excel_grouper_file
-        # Use Genbank file
+        # Isolate groups.
         self.groups = []
-        # This will be populated from the columns
-        # in the Excel filter file if it is used.
-        self.mq_val = mq_val
-        self.n_threshold = n_threshold
-        self.qual_threshold = qual_threshold
-        self.reference = reference
+        # Excel file for grouping.
+        self.input_excel = input_excel
+        # Directory of input zero coverage vcf files.
+        self.input_vcf_dir = input_vcf_dir
+        # Minimum map quality value.
+        self.min_mq = min_mq
+        # Minimum quality score value.
+        self.min_quality_score = min_quality_score
+        # Number of input zero coverage vcf files.
+        self.num_files = num_files
+        # Output directory for json average mq files.
+        self.output_json_avg_mq_dir = output_json_avg_mq_dir
+        # Output directory for json snps files.
+        self.output_json_snps_dir = output_json_snps_dir
+        # Output directory for snps files.
+        self.output_snps_dir = output_snps_dir
+        # Quality score N threshold value.
+        self.quality_score_n_threshold = quality_score_n_threshold
+        self.dbkey = dbkey
         self.start_time = get_time_stamp()
         self.summary_str = ""
         self.timer_start = datetime.now()
-        self.num_files = num_files
         self.initiate_summary(output_summary)

     def append_to_summary(self, html_str):
+        # Append a string to the html summary output file.
         self.summary_str = "%s%s" % (self.summary_str, html_str)

     def bin_input_files(self, filename, samples_groups_dict, defining_snps, inverted_defining_snps, found_positions, found_positions_mix):
+        # Categorize input files into closely related
+        # isolate groups based on discovered SNPs, and
+        # return a group dictionary.
         sample_groups_list = []
-        table_name = self.get_base_file_name(filename)
-        try:
-            defining_snp = False
-            # Absolute positions in set union of two lists.
-            for abs_position in list(defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())):
-                group = defining_snps[abs_position]
+        table_name = self.get_sample_name(filename)
+        defining_snp = False
+        # Absolute positions in set union of two lists.
+        for abs_position in list(defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())):
+            group = defining_snps[abs_position]
+            sample_groups_list.append(group)
+            self.check_add_group(group)
+            if len(list(defining_snps.keys() & found_positions_mix.keys())) > 0:
+                table_name = self.get_sample_name(filename)
+                table_name = '%s<font color="red">[[MIXED]]</font>' % table_name
+            self.copy_file(filename, group)
+            defining_snp = True
+        if not set(inverted_defining_snps.keys()).intersection(found_positions.keys() | found_positions_mix.keys()):
+            for abs_position in list(inverted_defining_snps.keys()):
+                group = inverted_defining_snps[abs_position]
                 sample_groups_list.append(group)
                 self.check_add_group(group)
-                if len(list(defining_snps.keys() & found_positions_mix.keys())) > 0:
-                    table_name = self.get_base_file_name(filename)
-                    table_name = '%s<font color="red">[[MIXED]]</font>' % table_name
                 self.copy_file(filename, group)
                 defining_snp = True
-            if not set(inverted_defining_snps.keys()).intersection(found_positions.keys() | found_positions_mix.keys()):
-                for abs_position in list(inverted_defining_snps.keys()):
-                    group = inverted_defining_snps[abs_position]
-                    sample_groups_list.append(group)
-                    self.check_add_group(group)
-                    self.copy_file(filename, group)
-                    defining_snp = True
-            if defining_snp:
-                samples_groups_dict[table_name] = sorted(sample_groups_list)
-            else:
-                samples_groups_dict[table_name] = ['<font color="red">No defining SNP</font>']
-        except TypeError as e:
-            msg = "<br/>Error processing file %s to generate samples_groups_dict: %s<br/>" % (filename, str(e))
-            self.append_to_summary(msg)
-            samples_groups_dict[table_name] = [msg]
+        if defining_snp:
+            samples_groups_dict[table_name] = sorted(sample_groups_list)
+        else:
+            samples_groups_dict[table_name] = ['<font color="red">No defining SNP</font>']
         return samples_groups_dict

     def check_add_group(self, group):
+        # Add a group if it is npt already in the list.
         if group not in self.groups:
             self.groups.append(group)

@@ -117,12 +126,12 @@
         shutil.copy(filename, dir)

     def decide_snps(self, filename):
-        positions_dict = self.all_positions
         # Find the SNPs in a vcf file to produce a pandas data
         # frame and a dictionary containing sample map qualities.
+        positions_dict = self.all_positions
         sample_map_qualities = {}
         # Eliminate the path.
-        file_name_base = self.get_base_file_name(filename)
+        file_name_base = self.get_sample_name(filename)
         vcf_reader = vcf.Reader(open(filename, 'r'))
         sample_dict = {}
         for record in vcf_reader:
@@ -132,26 +141,20 @@
                 if alt == "None":
                     sample_dict.update({record_position: "-"})
                 else:
-                    # Not sure this is the best place to capture MQM average
-                    # may be faster after parsimony SNPs are decided, but
-                    # then it will require opening the files again.
                     # On rare occassions MQM gets called "NaN", thus passing
                     # a string when a number is expected when calculating average.
                     mq_val = self.get_mq_val(record.INFO, filename)
                     if str(mq_val).lower() not in ["nan"]:
                         sample_map_qualities.update({record_position: mq_val})
-                    # Add parameters here to change what each vcf represents.
-                    # SNP is represented in table, now how will the vcf represent
-                    # the called position alt != "None", which means a deletion
-                    # as alt is not record.FILTER, or rather passed.
-                    len_alt = len(alt)
-                    if len_alt == 1:
+                    if len(alt) == 1:
                         qual_val = self.val_as_int(record.QUAL)
                         ac = record.INFO['AC'][0]
                         ref = str(record.REF[0])
-                        if ac == 2 and qual_val > self.n_threshold:
+                        if ac == 2 and qual_val > self.quality_score_n_threshold:
+                            # Add the SNP to a group.
                             sample_dict.update({record_position: alt})
-                        elif ac == 1 and qual_val > self.n_threshold:
+                        elif ac == 1 and qual_val > self.quality_score_n_threshold:
+                            # The position is ambiguous.
                             alt_ref = "%s%s" % (alt, ref)
                             if alt_ref == "AG":
                                 sample_dict.update({record_position: "R"})
@@ -181,28 +184,27 @@
                                 sample_dict.update({record_position: "N"})
                             # Poor calls
                         elif qual_val <= 50:
+                            # Call the reference allele.
                             # Do not coerce record.REF[0] to a string!
                             sample_dict.update({record_position: record.REF[0]})
-                        elif qual_val <= self.n_threshold:
+                        elif qual_val <= self.quality_score_n_threshold:
                             sample_dict.update({record_position: "N"})
                         else:
                             # Insurance -- Will still report on a possible
-                            # SNP even if missed with above statement
+                            # SNP even if missed with above statements.
                             # Do not coerce record.REF[0] to a string!
                             sample_dict.update({record_position: record.REF[0]})
         # Merge dictionaries and order
         merge_dict = {}
-        # abs_pos:REF
         merge_dict.update(positions_dict)
-        # abs_pos:ALT replacing all_positions, because keys must be unique
         merge_dict.update(sample_dict)
         sample_df = pandas.DataFrame(merge_dict, index=[file_name_base])
         return sample_df, file_name_base, sample_map_qualities

     def df_to_fasta(self, parsimonious_df, group):
-        # Generate SNP alignment file from the parsimonious_df
-        # data frame.
-        snps_file = os.path.join(OUTPUT_SNPS_DIR, "%s.fasta" % group)
+        # Generate SNP alignment file from
+        # the parsimonious_df data frame.
+        snps_file = os.path.join(self.output_snps_dir, "%s.fasta" % group)
         test_duplicates = []
         has_sequence_data = False
         for index, row in parsimonious_df.iterrows():
@@ -225,39 +227,30 @@
         # Find SNP positions in a vcf file.
         found_positions = {}
         found_positions_mix = {}
-        try:
-            vcf_reader = vcf.Reader(open(filename, 'r'))
-            try:
-                for record in vcf_reader:
-                    qual_val = self.val_as_int(record.QUAL)
-                    chrom = record.CHROM
-                    position = record.POS
-                    absolute_position = "%s:%s" % (str(chrom), str(position))
-                    alt = str(record.ALT[0])
-                    if alt != "None":
-                        mq_val = self.get_mq_val(record.INFO, filename)
-                        ac = record.INFO['AC'][0]
-                        len_ref = len(record.REF)
-                        if ac == self.ac and len_ref == 1 and qual_val > self.qual_threshold and mq_val > self.mq_val:
-                            found_positions.update({absolute_position: record.REF})
-                        if ac == 1 and len_ref == 1 and qual_val > self.qual_threshold and mq_val > self.mq_val:
-                            found_positions_mix.update({absolute_position: record.REF})
-                return found_positions, found_positions_mix
-            except (ZeroDivisionError, ValueError, UnboundLocalError, TypeError) as e:
-                self.append_to_summar("<br/>Error parsing record in file %s: %s<br/>" % (filename, str(e)))
-                return {'': ''}, {'': ''}
-        except (SyntaxError, AttributeError) as e:
-            self.append_to_summary("<br/>Error attempting to read file %s: %s<br/>" % (filename, str(e)))
-            return {'': ''}, {'': ''}
+        vcf_reader = vcf.Reader(open(filename, 'r'))
+        for record in vcf_reader:
+            qual_val = self.val_as_int(record.QUAL)
+            chrom = record.CHROM
+            position = record.POS
+            absolute_position = "%s:%s" % (str(chrom), str(position))
+            alt = str(record.ALT[0])
+            if alt != "None":
+                mq_val = self.get_mq_val(record.INFO, filename)
+                ac = record.INFO['AC'][0]
+                if ac == self.ac and len(record.REF) == 1 and qual_val > self.min_quality_score and mq_val > self.min_mq:
+                    found_positions.update({absolute_position: record.REF})
+                if ac == 1 and len(record.REF) == 1 and qual_val > self.min_quality_score and mq_val > self.min_mq:
+                    found_positions_mix.update({absolute_position: record.REF})
+        return found_positions, found_positions_mix

     def gather_and_filter(self, prefilter_df, mq_averages, group_dir):
         # Group a data frame of SNPs.
-        if self.excel_grouper_file is None:
+        if self.input_excel is None:
             filtered_all_df = prefilter_df
             sheet_names = None
         else:
             # Filter positions to be removed from all.
-            xl = pandas.ExcelFile(self.excel_grouper_file)
+            xl = pandas.ExcelFile(self.input_excel)
             sheet_names = xl.sheet_names
             # Use the first column to filter "all" postions.
             exclusion_list_all = self.get_position_list(sheet_names, 0)
@@ -265,13 +258,15 @@
             exclusion_list = exclusion_list_all + exclusion_list_group
             # Filters for all applied.
             filtered_all_df = prefilter_df.drop(columns=exclusion_list, errors='ignore')
-        json_snps_file = os.path.join(OUTPUT_JSON_SNPS_DIR, "%s.json" % group_dir)
+        json_snps_file = os.path.join(self.output_json_snps_dir, "%s.json" % group_dir)
         parsimonious_df = self.get_parsimonious_df(filtered_all_df)
         samples_number, columns = parsimonious_df.shape
         if samples_number >= 4:
+            # Sufficient samples have been found
+            # to build a phylogenetic tree.
             has_sequence_data = self.df_to_fasta(parsimonious_df, group_dir)
             if has_sequence_data:
-                json_avg_mq_file = os.path.join(OUTPUT_JSON_AVG_MQ_DIR, "%s.json" % group_dir)
+                json_avg_mq_file = os.path.join(self.output_json_avg_mq_dir, "%s.json" % group_dir)
                 mq_averages.to_json(json_avg_mq_file, orient='split')
                 parsimonious_df.to_json(json_snps_file, orient='split')
             else:
@@ -285,18 +280,13 @@
                 msg = "%s for group: %s" % (msg, group_dir)
             self.append_to_summary("%s<br/>\n" % msg)

-    def get_base_file_name(self, file_path):
+    def get_sample_name(self, file_path):
+        # Return the sample part of a file name.
         base_file_name = os.path.basename(file_path)
         if base_file_name.find(".") > 0:
             # Eliminate the extension.
             return os.path.splitext(base_file_name)[0]
-        elif base_file_name.find("_") > 0:
-            # The dot extension was likely changed to
-            # the " character.
-            items = base_file_name.split("_")
-            return "_".join(items[0:-1])
-        else:
-            return base_file_name
+        return base_file_name

     def get_mq_val(self, record_info, filename):
         # Get the MQ (gatk) or MQM (freebayes) value
@@ -333,7 +323,7 @@
         # Get a list of positions defined by an excel file.
         exclusion_list = []
         try:
-            filter_to_all = pandas.read_excel(self.excel_grouper_file, header=1, usecols=[group])
+            filter_to_all = pandas.read_excel(self.input_excel, header=1, usecols=[group])
             for value in filter_to_all.values:
                 value = str(value[0])
                 if "-" not in value.split(":")[-1]:
@@ -348,8 +338,7 @@
                         exclusion_list.append(chrom + ":" + str(position))
             return exclusion_list
         except ValueError:
-            exclusion_list = []
-            return exclusion_list
+            return []

     def get_snps(self, task_queue, timeout):
         while True:
@@ -357,19 +346,16 @@
                 group_dir = task_queue.get(block=True, timeout=timeout)
             except queue.Empty:
                 break
-            # Parse all vcf files to accumulate SNPs into a
-            # data frame.
+            # Parse all vcf files to accumulate
+            # the SNPs into a data frame.
             positions_dict = {}
             group_files = []
             for file_name in os.listdir(os.path.abspath(group_dir)):
                 file_path = os.path.abspath(os.path.join(group_dir, file_name))
                 group_files.append(file_path)
             for file_name in group_files:
-                try:
-                    found_positions, found_positions_mix = self.find_initial_positions(file_name)
-                    positions_dict.update(found_positions)
-                except Exception as e:
-                    self.append_to_summary("Error updating the positions_dict dictionary when processing file %s:\n%s\n" % (file_name, str(e)))
+                found_positions, found_positions_mix = self.find_initial_positions(file_name)
+                positions_dict.update(found_positions)
             # Order before adding to file to match
             # with ordering of individual samples.
             # all_positions is abs_pos:REF
@@ -394,10 +380,10 @@

     def group_vcfs(self, vcf_files):
         # Parse an excel file to produce a
-        # grouping dictionary for filtering SNPs.
-        xl = pandas.ExcelFile(self.excel_grouper_file)
+        # grouping dictionary for SNPs.
+        xl = pandas.ExcelFile(self.input_excel)
         sheet_names = xl.sheet_names
-        ws = pandas.read_excel(self.excel_grouper_file, sheet_name=sheet_names[0])
+        ws = pandas.read_excel(self.input_excel, sheet_name=sheet_names[0])
         defining_snps = ws.iloc[0]
         defsnp_iterator = iter(defining_snps.iteritems())
         next(defsnp_iterator)
@@ -429,9 +415,9 @@
         self.append_to_summary('<html>\n')
         self.append_to_summary('<head></head>\n')
         self.append_to_summary('<body style=\"font-size:12px;">')
-        self.append_to_summary("<b>Time started:</b> %s<br/>" % str(get_time_stamp()))
+        self.append_to_summary("<b>Time started:</b> %s<br/>" % get_time_stamp())
         self.append_to_summary("<b>Number of VCF inputs:</b> %d<br/>" % self.num_files)
-        self.append_to_summary("<b>Reference:</b> %s<br/>" % str(self.reference))
+        self.append_to_summary("<b>Reference:</b> %s<br/>" % self.dbkey)
         self.append_to_summary("<b>All isolates:</b> %s<br/>" % str(self.all_isolates))

     def return_val(self, val, index=0):
@@ -450,26 +436,30 @@


 if __name__ == '__main__':
+
     parser = argparse.ArgumentParser()

-    parser.add_argument('--all_isolates', action='store', dest='all_isolates', required=False, default="No", help='Create table with all isolates'),
-    parser.add_argument('--excel_grouper_file', action='store', dest='excel_grouper_file', required=False, default=None, help='Optional Excel filter file'),
+    parser.add_argument('--ac', action='store', dest='ac', type=int, help='Allele count value'),
+    parser.add_argument('--all_isolates', action='store_true', dest='all_isolates', required=False, default=False, help='Create table with all isolates'),
+    parser.add_argument('--input_excel', action='store', dest='input_excel', required=False, default=None, help='Optional Excel filter file'),
+    parser.add_argument('--input_vcf_dir', action='store', dest='input_vcf_dir', help='Input vcf directory'),
+    parser.add_argument('--min_mq', action='store', dest='min_mq', type=int, help='Minimum map quality value'),
+    parser.add_argument('--min_quality_score', action='store', dest='min_quality_score', type=int, help='Minimum quality score value'),
+    parser.add_argument('--output_json_avg_mq_dir', action='store', dest='output_json_avg_mq_dir', help='Output json average mq directory'),
+    parser.add_argument('--output_json_snps_dir', action='store', dest='output_json_snps_dir', help='Output json snps directory'),
+    parser.add_argument('--output_snps_dir', action='store', dest='output_snps_dir', help='Output snps directory'),
     parser.add_argument('--output_summary', action='store', dest='output_summary', help='Output summary html file'),
-    parser.add_argument('--reference', action='store', dest='reference', help='Reference file'),
-    parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting')
+    parser.add_argument('--processes', action='store', dest='processes', type=int, help='Configured processes for job'),
+    parser.add_argument('--quality_score_n_threshold', action='store', dest='quality_score_n_threshold', type=int, help='Minimum quality score N value for alleles'),
+    parser.add_argument('--dbkey', action='store', dest='dbkey', help='Galaxy genome build dbkey'),

     args = parser.parse_args()

-    # Initializations - TODO: should these be passed in as command line args?
-    ac = 2
-    mq_val = 56
-    n_threshold = 50
-    qual_threshold = 150
-
-    # Build the list of sample vcf files for the current run.
+    # Build the list of all input zero coverage vcf
+    # files, both the samples and the "database".
     vcf_files = []
-    for file_name in os.listdir(INPUT_VCF_DIR):
-        file_path = os.path.abspath(os.path.join(INPUT_VCF_DIR, file_name))
+    for file_name in os.listdir(args.input_vcf_dir):
+        file_path = os.path.abspath(os.path.join(args.input_vcf_dir, file_name))
         vcf_files.append(file_path)

     multiprocessing.set_start_method('spawn')
@@ -480,21 +470,23 @@
     timeout = 0.05

     # Initialize the snp_finder object.
-    snp_finder = SnpFinder(num_files, args.reference, args.excel_grouper_file, args.all_isolates,
-                           ac, mq_val, n_threshold, qual_threshold, args.output_summary)
+    snp_finder = SnpFinder(num_files, args.dbkey, args.input_excel, args.all_isolates, args.ac, args.min_mq, args.quality_score_n_threshold, args.min_quality_score, args.input_vcf_dir, args.output_json_avg_mq_dir, args.output_json_snps_dir, args.output_snps_dir, args.output_summary)

-    # Initialize the set of directories containiing vcf files for analysis.
+    # Define and make the set of directories into which the input_zc_vcf
+    # files will be placed.  Selected input values (e.g., the use of
+    # an Excel file for grouping and filtering, creating a group with
+    # all isolates) are used to define the directories.
     vcf_dirs = []
-    if args.excel_grouper_file is None:
+    if args.input_excel is None:
         vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs)
     else:
-        if args.all_isolates.lower() == "yes":
+        if args.all_isolates:
             vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs)
         # Parse the Excel file to detemine groups for filtering.
         snp_finder.group_vcfs(vcf_files)
         # Append the list of group directories created by
         # the above call to the set of directories containing
-        # vcf files for analysis
+        # vcf files for analysis.
         group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups]
         vcf_dirs.extend(group_dirs)
--- a/vsnp_get_snps.xml	Sat Nov 14 09:16:04 2020 +0000
+++ b/vsnp_get_snps.xml	Sun Jan 03 16:06:33 2021 +0000
@@ -1,131 +1,150 @@
-<tool id="vsnp_get_snps" name="vSNP: get SNPs" version="1.0.0">
+<tool id="vsnp_get_snps" name="vSNP: get SNPs" version="@WRAPPER_VERSION@.0" profile="@PROFILE@">
     <description></description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
     <requirements>
         <requirement type="package" version="0.25.3">pandas</requirement>
         <requirement type="package" version="0.6.8">pyvcf</requirement>
         <requirement type="package" version="1.2.0">xlrd</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-#import os
+#import re
+
 #set input_vcf_dir = 'input_vcf_dir'
-#set input_zc_vcf_type = $input_zc_vcf_type_cond.input_zc_vcf_type
 #set output_json_avg_mq_dir = 'output_json_avg_mq_dir'
 #set output_json_snps_dir = 'output_json_snps_dir'
 #set output_snps_dir = 'output_snps_dir'
+
 mkdir -p $input_vcf_dir &&
 mkdir -p $output_json_avg_mq_dir &&
 mkdir -p $output_json_snps_dir &&
 mkdir -p $output_snps_dir &&
-#set reference = '?'
+
+#set dbkey = '?'
 #for $i in $input_vcf_collection:
-    #set reference = $i.metadata.dbkey
-    #set filename = $i.file_name
-    #set name = $i.name
-    ln -s '$filename' '$input_vcf_dir/$name' &&
+    #if str($dbkey) == '?':
+        #set dbkey = $i.metadata.dbkey
+    #else if str($dbkey) != $i.metadata.dbkey:
+        >&2 echo "The dbkeys associated with the zero coverage VCF files with SNPs found in closely related isolate groups are not unique" &&
+exit 1
+    #end if
+    #set vcf_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
+    ln -s '${i}' '$input_vcf_dir/${vcf_identifier}' &&
 #end for
-#if str($input_zc_vcf_type) == "single":
-    #set input_zc_vcf = $input_zc_vcf_type_cond.input_zc_vcf
-    #set file_name_base = $os.path.basename($input_zc_vcf.file_name)
-    ln -s '$input_zc_vcf' '$input_vcf_dir/$file_name_base' &&
+#if str($dbkey) == '?':
+    >&2 echo "The dbkey must be set for the zero coverage VCF files with SNPs found in closely related isolate groups" && exit 1
+#end if
+#if str($input_zc_vcf_type_cond.input_zc_vcf_type) == "single":
+    #set zc_vcf_identifier = re.sub('[^\s\w\-]', '_', str($input_zc_vcf.element_identifier))
+    ln -s '${input_zc_vcf}' '$input_vcf_dir/${zc_vcf_identifier}' &&
 #else
     #for $i in $input_zc_vcf_type_cond.input_zc_vcf_collection:
-        #set filename = $i.file_name
-        #set name = $i.name
-        ln -s '$filename' '$input_vcf_dir/$name' &&
+        #set zc_vcf_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
+        ln -s '${i}' '$input_vcf_dir/${zc_vcf_identifier}' &&
     #end for
 #end if
-#if str($excel_grouper_cond.excel_grouper) == "yes":
-    #set excel_file = 'No genome specified for input VCF (database) file(s)'
-    #set excel_grouper_source = $excel_grouper_cond.excel_grouper_source_cond.excel_grouper_source
-    #if str($excel_grouper_source) == "cached":
+#if str($input_excel_cond.input_excel_param) == 'yes':
+    #if str($input_excel_cond.excel_source_cond.excel_source) == 'cached':
+        #set excel_file = 'No genome specified for input VCF (database) file(s)'
         #set excel_fields = $__app__.tool_data_tables['vsnp_excel'].get_fields()
+	## The value of excel_fields is a nested list that looks like this.
+        ## [['AF2122', 'Mbovis_define_filter.xlsx', '~/tool-data/vsnp/AF2122/excel/Mbovis_define_filter.xlsx', 'Excel file for AF2122'],...]
         #for $i in $excel_fields:
-            #if str($i[0]) == $reference:
+            #if str($i[0]) == $dbkey:
                 #set excel_file = $i[2]
                 #break
             #end if
         #end for
     #else:
-        #set excel_file = $excel_grouper_cond.excel_grouper_source_cond.excel_grouper_file
+        #set excel_file = $input_excel_cond.excel_source_cond.input_excel
     #end if
 #end if
 python '$__tool_directory__/vsnp_get_snps.py'
---processes $processes
---reference '$reference'
-#if str($excel_grouper_cond.excel_grouper) == "yes":
-    --excel_grouper_file '$excel_file'
+--ac $ac
+#if str($input_excel_cond.input_excel_param) == 'yes':
+    --input_excel '$excel_file'
 #end if
-#if str($all_isolates) == "Yes":
-    --all_isolates '$all_isolates'
-#end if
+$all_isolates
+--input_vcf_dir '$input_vcf_dir'
+--min_mq $min_mq
+--min_quality_score $min_quality_score
+--output_json_avg_mq_dir '$output_json_avg_mq_dir'
+--output_json_snps_dir '$output_json_snps_dir'
+--output_snps_dir '$output_snps_dir'
 --output_summary '$output_summary'
+--processes \${GALAXY_SLOTS:-8}
+--quality_score_n_threshold $quality_score_n_threshold
+--dbkey '$dbkey'
 ]]></command>
     <inputs>
         <conditional name="input_zc_vcf_type_cond">
             <param name="input_zc_vcf_type" type="select" label="Choose the category of the files to be analyzed">
-                <option value="single" selected="true">A single zero coverage VCF file</option>
-                <option value="collection">A collection of zero coverage VCF files</option>
+                <option value="collection" selected="true">A collection of zero coverage VCF files</option>
+                <option value="single">A single zero coverage VCF file</option>
             </param>
             <when value="single">
-                <param name="input_zc_vcf" type="data" format="vcf" label="Zero coverage VCF file">
-                    <validator type="unspecified_build"/>
-                </param>
+                <param name="input_zc_vcf" type="data" format="vcf" label="Zero coverage VCF file"/>
             </when>
             <when value="collection">
-                <param name="input_zc_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of zero coverage VCF files">
-                    <validator type="unspecified_build"/>
-                </param>
+                <param name="input_zc_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of zero coverage VCF files"/>
             </when>
         </conditional>
-        <param name="input_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of VCF files against which to analyze the zero coverages VCF file(s)">
-            <validator type="unspecified_build"/>
-        </param>
-        <conditional name="excel_grouper_cond">
-            <param name="excel_grouper" type="select" label="Use Excel file for grouping and filtering?">
+        <param name="input_vcf_collection" format="vcf" type="data_collection" collection_type="list" label="Collection of zero coverage VCF files with SNPs found in closely related isolate groups"/>
+        <param name="ac" type="integer" min="0" value="2" label="Allele count threshold" help="At least 1 position must have this value for a SNP to be added to a group"/>
+        <param name="min_mq" type="integer" min="0" value="56" label="Map quality threshold" help="At least 1 position must have a higher MQ value for a SNP to be added to a group"/>
+        <param name="min_quality_score" type="integer" min="0" value="150" label="Quality score threshold" help="At least 1 position must have a higher quality score for a SNP to be added to a group"/>
+        <param name="quality_score_n_threshold" type="integer" min="0" value="150" label="Minimum quality score N value for alleles" help="Alleles are marked as N for quality scores between this value and the minimum quality score value above"/>
+        <conditional name="input_excel_cond">
+            <param name="input_excel_param" type="select" label="Use Excel file for grouping and filtering?">
                 <option value="yes" selected="true">Yes</option>
                 <option value="no">No</option>
             </param>
             <when value="yes">
-                <conditional name="excel_grouper_source_cond">
-                    <param name="excel_grouper_source" type="select" label="Choose the source for the Excel file">
+                <conditional name="excel_source_cond">
+                    <param name="excel_source" type="select" label="Choose the source for the Excel file">
                         <option value="cached">locally cached</option>
                         <option value="history">from history</option>
                     </param>
                     <when value="cached">
-                        <param name="excel_grouper_file" type="select" label="Excel file" help="Selection will be overridden if it does not match the dbkeys associated with the collection of VCF files being analyzed">
-                            <options from_data_table="vsnp_excel"/>
-                            <validator type="no_options" message="No built-in Excel grouping and filtering datasets are available"/>
+                        <param name="input_excel" type="select" label="Excel file">
+                            <options from_data_table="vsnp_excel">
+                                <filter type="data_meta" column="0" key="dbkey" ref="input_vcf_collection"/>
+                                <validator type="no_options" message="No built-in Excel grouping and filtering datasets are available"/>
+                            </options>
                         </param>
                     </when>
                     <when value="history">
-                        <param name="excel_grouper_file" type="data" format="xlsx" label="Excel file">
-                            <validator type="no_options" message="The current history does not include an xlsx dataset that can be used for grouping and filtering"/>
-                        </param>
+                        <param name="input_excel" type="data" format="xlsx" label="Excel file"/>
                     </when>
                 </conditional>
             </when>
             <when value="no"/>
         </conditional>
-        <param name="all_isolates" type="select" display="radio" label="Create table with all isolates?">
-            <option value="No" selected="true">No</option>
-            <option value="Yes">Yes</option>
-        </param>
-        <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
+        <param argument="all_isolates" type="boolean" truevalue="--all_isolates" falsevalue="" checked="false" label="Create a group containing all isolates?"/>
     </inputs>
     <outputs>
-        <collection name="snps" type="list" label="${tool.name} (SNPs) on ${on_string}">
-            <discover_datasets pattern="__name__" directory="output_snps_dir" format="fasta"/>
+        <collection name="snps" type="list" label="${tool.name} on ${on_string} (SNPs)">
+            <discover_datasets pattern="__name_and_ext__" directory="output_snps_dir"/>
         </collection>
-        <collection name="json_avg_mq" type="list" label="${tool.name} (average MQ) on ${on_string}">
-            <discover_datasets pattern="__name__" directory="output_json_avg_mq_dir" format="json"/>
+        <collection name="json_avg_mq" type="list" label="${tool.name} on ${on_string} (average mq)">
+            <discover_datasets pattern="__name_and_ext__" directory="output_json_avg_mq_dir"/>
         </collection>
-        <collection name="json_snps" type="list" label="${tool.name} (SNPs as json) on ${on_string}">
-            <discover_datasets pattern="__name__" directory="output_json_snps_dir" format="json"/>
+        <collection name="json_snps" type="list" label="${tool.name} on ${on_string} (SNPs as json)">
+            <discover_datasets pattern="__name_and_ext__" directory="output_json_snps_dir"/>
         </collection>
-        <data name="output_summary" format="html" label="${tool.name} (summary) on ${on_string}"/>
+        <data name="output_summary" format="html" label="${tool.name} on ${on_string} (summary)"/>
     </outputs>
     <tests>
-        <test>
+        <!--
+            Unfortunately the test files cannot be gzipped since Galaxy changes the file names
+            to be something like 00-0121_WI_Cervid_99-A_vcf_gz, and the VCF Reader requires
+            gzipped files to have a .gz extension.  The exception is
+            UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
+        -->
+        <!-- A single vcf input, no excel file, all_isolates is False -->
+        <test expect_num_outputs="4">
+            <param name="input_zc_vcf_type" value="single"/>
             <param name="input_zc_vcf" value="input_zc_vcf.vcf" ftype="vcf" dbkey="89"/>
             <param name="input_vcf_collection">
                 <collection type="list">
@@ -133,48 +152,147 @@
                     <element name="SRR1792272_zc.vcf" value="SRR1792272_zc.vcf" dbkey="89"/>
                 </collection>
             </param>
-            <param name="excel_grouper" value="no"/>
-            <output_collection name="snps" type="list">
-                <element name="all_vcf.fasta" file="all_vcf.fasta" ftype="fasta" compare="contains"/>
+            <param name="input_excel_param" value="no"/>
+            <output_collection name="snps" type="list" count="1">
+                <element name="all_vcf" file="all_vcf.fasta" ftype="fasta" compare="contains"/>
             </output_collection>
-            <output_collection name="json_avg_mq" type="list">
-                <element name="all_vcf.json" file="json_avg_mq_all_vcf.json" ftype="json" compare="contains"/>
+            <output_collection name="json_avg_mq" type="list" count="1">
+                <element name="all_vcf" file="json_avg_mq_all_vcf.json" ftype="json" compare="contains"/>
             </output_collection>
-            <output_collection name="json_snps" type="list">
-                <element name="all_vcf.json" file="json_all_vcf.json" ftype="json" compare="contains"/>
+            <output_collection name="json_snps" type="list" count="1">
+                <element name="all_vcf" file="json_all_vcf.json" ftype="json" compare="contains"/>
             </output_collection>
             <output name="output_summary" file="output_summary.html" ftype="html" compare="contains"/>
         </test>
+        <!-- An input collection, no excel file, all_isolates is False -->
+        <test expect_num_outputs="4">
+            <param name="input_zc_vcf_type" value="collection"/>
+            <param name="input_zc_vcf_collection">
+                <collection type="list">
+                    <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/>
+                    <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/>
+                </collection>
+            </param>
+            <param name="input_vcf_collection">
+                <collection type="list">
+                    <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/>
+                    <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/>
+                    <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/>
+                </collection>
+            </param>
+            <param name="input_excel_param" value="no"/>
+            <output_collection name="snps" type="list" count="1">
+                <element name="all_vcf" file="all_vcf2.fasta" ftype="fasta" compare="contains"/>
+            </output_collection>
+            <output_collection name="json_avg_mq" type="list" count="1">
+                <element name="all_vcf" file="json_avg_mq_all_vcf2.json" ftype="json" compare="contains"/>
+            </output_collection>
+            <output_collection name="json_snps" type="list" count="1">
+                <element name="all_vcf" file="json_all_vcf2.json" ftype="json" compare="contains"/>
+            </output_collection>
+            <output name="output_summary" file="output_summary2.html" ftype="html" compare="contains"/>
+        </test>
+        <!-- An input collection, an excel file, all_isolates is False -->
+        <test expect_num_outputs="4">
+            <param name="input_zc_vcf_type" value="collection"/>
+            <param name="input_zc_vcf_collection">
+                <collection type="list">
+                    <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/>
+                    <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/>
+                </collection>
+            </param>
+            <param name="input_vcf_collection">
+                <collection type="list">
+                    <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/>
+                    <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/>
+                    <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/>
+                </collection>
+            </param>
+            <param name="input_excel_param" value="yes"/>
+            <param name="input_excel" value="89"/>
+            <output_collection name="snps" type="list" count="1">
+                <element name="Mbovis-17" file="Mbovis-17_snps.fasta" ftype="fasta"/>
+            </output_collection>
+            <output_collection name="json_avg_mq" type="list" count="1">
+                <element name="Mbovis-17" file="Mbovis-17_avg_mq_json.json" ftype="json" compare="contains"/>
+            </output_collection>
+            <output_collection name="json_snps" type="list" count="1">
+                <element name="Mbovis-17" file="Mbovis-17_snps_json.json" ftype="json" compare="contains"/>
+            </output_collection>
+            <output name="output_summary" file="output_summary3.html" ftype="html" compare="contains"/>
+        </test>
+        <!-- An input collection, an excel file, all_isolates is True -->
+        <test expect_num_outputs="4">
+            <param name="input_zc_vcf_type" value="collection"/>
+            <param name="input_zc_vcf_collection">
+                <collection type="list">
+                    <element name="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" value="BCG_Pasteur_Unknown_FR_SRR8886989.vcf" dbkey="89"/>
+                    <element name="BCG_Tokyo_Unknown_JP_DRR029468.vcf" value="BCG_Tokyo_Unknown_JP_DRR029468.vcf" dbkey="89"/>
+                </collection>
+            </param>
+            <param name="input_vcf_collection">
+                <collection type="list">
+                    <element name="01_1787_FL_Zoo_Jaguar.vcf" value="01_1787_FL_Zoo_Jaguar.vcf" dbkey="89"/>
+                    <element name="02_5877_MEX_TX_Fed.vcf" value="02_5877_MEX_TX_Fed.vcf" dbkey="89"/>
+                    <element name="02_0585_COA_TX_Fed.vcf" value="02_0585_COA_TX_Fed.vcf" dbkey="89"/>
+                </collection>
+            </param>
+            <param name="input_excel_param" value="yes"/>
+            <param name="input_excel" value="89"/>
+            <param name="all_isolates" value="--all_isolates"/>
+            <output_collection name="snps" type="list" count="2">
+                <element name="Mbovis-17" file="Mbovis-17_snps.fasta" ftype="fasta"/>
+                <element name="all_vcf" file="all_vcf3.fasta" ftype="fasta"/>
+            </output_collection>
+            <output_collection name="json_avg_mq" type="list" count="2">
+                <element name="Mbovis-17" file="Mbovis-17_avg_mq_json.json" ftype="json" compare="contains"/>
+                <element name="all_vcf" file="Mbovis-17_avg_mq_json.json" ftype="json" compare="contains"/>
+            </output_collection>
+            <output_collection name="json_snps" type="list" count="2">
+                <element name="Mbovis-17" file="Mbovis-17_snps_json.json" ftype="json" compare="contains"/>
+                <element name="all_vcf" file="Mbovis-17_snps_json.json" ftype="json" compare="contains"/>
+            </output_collection>
+            <output name="output_summary" file="output_summary4.html" ftype="html" compare="contains"/>
+        </test>
     </tests>
     <help>
 **What it does**

-Accepts a zero-coverage VCF file (or a collection of them) produced by the **vSNP: add zero coverage** tool
-along with a collection of VCF files that have been aligned with the same reference. The inputs are analyzed
-to discover quality parsimonious SNPs in the zero-coverage VCF file(s).  An Excel spreadsheet containing
-specified SNPs can optiomally be used to filter desired SNP positions by group.  Users can choose whether to
-select a locally cached Excel spreadsheet or one from their current history.
+Accepts a zero coverage VCF file produced by the **vSNP: add zero coverage** tool (or a collection of them) along with a collection
+of zero coverage VCF files that have been aligned with the same reference and contain SNPs called between closely related isolate groups.
+The tool produces fasta files containing SNP alignments, json files containing the SNP positions and additional json files containing
+the average map quality values.
+
+The SNP alignments produced by this tool are used to create phylogenetic trees, so larger input collections result in more populated
+phylogenetic trees.  Both of the json outputs are used by the **vSNP: build tables** tool to produce annotated SNP tables in the form
+of Excel spreadsheets.
+
+An Excel spreadsheet containing specified SNPs can optiomally be used to filter desired SNP positions by group.  Users can choose a
+locally cached Excel spreadsheet or one from their current history.
+
+A SNP is added to a group if it has at least one position with a specified allele count value, a quality score greater than a specified
+value, and a map quality greater than a specified value.
+
+If the allele count equals the specified value (2) and the quality score for a SNP position is greater than the minimum quality score
+value (150), the alternate allele is called.
+
+However, if the allele count is 1, the position is called ambiguous.  Deletions are called when the alternate allele is a gap.  If the
+quality score is less than or equal to the minimum quality score N value for alleles (150), the allele is marked "N".

 **Required Options**

- * **Choose the category of the files to be analyzed** - select single file or a collection of files, then select the appropriate history item (single VCF item or dataset collection of VCF elements) based on the selected option.
- * **Collection of VCF files against which to analyze the zero coverages VCF file(s)**  - select a dataset collection from the current history that is associated with the same reference as the selected zero-coverage VCF file(s).
+ * **Zero coverage VCF file(s)** - Select a single or collection of zero coverage VCF files, typically produced by the **vSNP: add zero coverage** tool, from the current history.
+ * **Collection of zero coverage VCF files with SNPs found in closely related isolate groups** - Select a dataset collection of zero coverage vcf files from the current history.

 **Additional Options**

+ * **Allele count threshold** - At least 1 position must have an allele count greater than this value for a SNP to be added to a group (2 is optimal).
+ * **Map quality threshold** - At least 1 position must have a higher MQ value for a SNP to be added to a group (56 is optimal).
+ * **Quality score threshold** -At least 1 position must have a higher quality score for a SNP to be added to a group (150 is optimal).
+ * **Minimum quality score N value for alleles** - If none of the avove 3 requirements is met and the quality score is less than or equal to the minimum quality score N value for alleles, the allele is marked "N" (150 is optimal).
  * **Use Excel file for grouping and filtering?** - select Yes to filter desired SNP positions by group.  A cached Excel spreadsheet provides the most widely used SNP positions for grouping, but a custom spreadhseet can be selected from the current history.
- * **Job Resource Parameters** - an administrator for the Galaxy instance must configure this tool to display this option, so it may not be available.  If it is, you can choose the number of processors to use for tool execution.
- * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
+ * **Create a group containing all isolates?** - select Yes to output an additional group containing of all isolates.
 </help>
-    <citations>
-        <citation type="bibtex">
-            @misc{None,
-            journal = {None},
-            author = {1. Stuber T},
-            title = {Manuscript in preparation},
-            year = {None},
-            url = {https://github.com/USDA-VS/vSNP},}
-        </citation>
-    </citations>
+    <expand macro="citations"/>
 </tool>