changeset 0:6f870ed59b6e draft

Uploaded
author nml
date Mon, 06 Feb 2017 12:31:04 -0500
parents
children 599a4dc309aa
files srst2.pl srst2.py srst2.xml tool_data_table_conf.xml.sample tool_dependencies.xml vfdb_files.loc
diffstat 6 files changed, 2368 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srst2.pl	Mon Feb 06 12:31:04 2017 -0500
@@ -0,0 +1,378 @@
+#!/usr/bin/env perl
+
+
+use strict;
+use warnings;
+use Cwd;
+use File::Copy;
+
+#The first 4 arguments should be in this format:
+#  /path/to/srst2.py bam_output scores_output pileup_output ...
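+#They are followed by a job-type letter (m/g/b), the job-specific outputs,
+#the original read file name, and finally the arguments passed straight
+#through to srst2.py.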
+
+my $binary = $ARGV[0];
+shift;
+
+my ($bam_results, $scores, $pileup, $job_type, $txt_results, $genes_results, $fullgenes_results, $name, $databases);
+my ($allele_results,$allele_type);
+
+
+$bam_results = $ARGV[0];
+shift(@ARGV);
+$scores = $ARGV[0];
+shift(@ARGV);
+$pileup = $ARGV[0];
+shift(@ARGV);
+
+#Now that the first 4 arguments have been shifted off, read the rest depending on the job type.
+#There are three job types; a single letter tells us which one was selected.
+$job_type = $ARGV[0];
+shift(@ARGV);
+
+#If m, mlst only: we get the mlst table output, the consensus-allele output, and the allele type (all/new)
+if($job_type eq "m")
+{
+   $txt_results = $ARGV[0];
+   shift(@ARGV);
+   $allele_results = $ARGV[0];
+   shift(@ARGV);
+   $allele_type = $ARGV[0];
+   shift(@ARGV);
+}
+#If g, genedb only: we have two outputs, genes and fullgenes. We also get the database name(s), comma-separated.
+elsif($job_type eq "g")
+{
+   $genes_results = $ARGV[0];
+   shift(@ARGV);
+   $fullgenes_results = $ARGV[0];
+   shift(@ARGV);
+   $databases = $ARGV[0];
+   shift (@ARGV);
+}
+#If b, both mlst and genedb: We will have three output files and the database name.
+else
+{
+   $txt_results = $ARGV[0];
+   shift(@ARGV);
+   $genes_results = $ARGV[0];
+   shift(@ARGV);
+   $fullgenes_results = $ARGV[0];
+   shift(@ARGV);
+   $databases = $ARGV[0];
+   shift(@ARGV);
+}
+
+#After we get the output files/database name, now we get the name of the reads
+#This allows SRST2 to give meaningful output instead of just printing 'dataset_xxx' as the sample name
+my $filename = $ARGV[0];
+shift(@ARGV);
+
+#This index offset is used to determine where the 'genedb' option is located.
+#With single-end input, the genedb will be 3 positions into the arguments:
+#  --input_se sample.fastq --genedb database.fasta
+my $index_offset = 3;
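+#With paired-end input the offset is incremented below, because --genedb then
+#sits one position further along:
+#  --input_pe sample_1.fastq sample_2.fastq --genedb database.fasta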
+
+#change the file extensions of the input reads so srst2 can use them
+
+#Usually the file name looks like this: sample_S1_L001_R1_001.fastq
+#If we use this file name, it confuses srst2 a lot. So we just extract
+#the sample name to use as input file name.
+my @file_name = split /_/, $filename;
+$name = "temp_file_name";
+
+my ($for_read, $rev_read, $sing_read, $database);
+if ($ARGV[0] eq "--input_pe")
+{
+        #Increment the index offset since we have paired-end inputs
+        $index_offset++;
+
+	$for_read = $name."_1.dat";
+	$rev_read = $name."_2.dat";
+
+	symlink($ARGV[1], $for_read);
+	symlink($ARGV[2], $rev_read);
+
+	$ARGV[1] = $for_read;
+	$ARGV[2] = $rev_read;
+}
+else
+{
+        $sing_read = $name.".dat";
+
+        symlink($ARGV[1], $sing_read);
+
+        $ARGV[1] = $sing_read;
+
+}
+#If we are running a job to include genedb, use the database name for input file name
+if ($job_type eq 'g' || $job_type eq 'b')
+{
+  my @db_names = split /,/, $databases;
+  my $num_db = @db_names;
+  my %names_hash = ();
+  # loop through dbs to replace spaces with _ and check for duplicates
+  for (my $i = 0; $i < $num_db; $i++){
+    $db_names[$i]=~s/ /_/g;
+    if( exists($names_hash{$db_names[$i]}) ) {
+      print STDERR "Error: more than one database with the same name\n";
+      exit(1);
+    }else{
+      $names_hash{$db_names[$i]}=$db_names[$i];
+    }
+  }
+
+
+  foreach my $db_name (@db_names){
+    (my $base = $db_name) =~ s/\.[^.]+$//;
+    $database = $base.".dat";
+
+    symlink($ARGV[$index_offset], $database);
+    $ARGV[$index_offset] = $database;
+
+    $index_offset++;
+  }
+}
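+#(Each database is symlinked under its own base name so srst2's per-database
+#output files carry the real database name rather than a Galaxy dataset_xxx
+#name; the .dat extension matches Galaxy's dataset files.)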
+
+for (my $i = 0; $i < @ARGV; $i++){
+  if (index($ARGV[$i], "maxins") != -1){
+    my ($maxins, $minins);
+    my @b2args = split(' ', $ARGV[$i]);
+    for (my $j = 0; $j < @b2args; $j++){
+      if (index($b2args[$j], "maxins") != -1){
+        $maxins = $b2args[$j+1];
+      }
+      if (index($b2args[$j], "minins") != -1){
+        $minins = $b2args[$j+1];
+      }
+    }
+    if ($maxins - $minins < 0){
+      print STDERR "--minins cannot be greater than --maxins\n";
+      exit(1);
+    }
+  }
+}
+
+
+
+my $command = "python $binary @ARGV";
+
+my $exit_code = system($command);
+
+my $cur_dir = getcwd();
+# make arrays for using multiple custom databases (creates multiple output files - need to be concatenated)
+my (@genefiles, @bamfiles, @pileupfiles, @fullgenefiles, @scoresfiles);
+
+# go through files in the output directory to move/concatenate them as required.
+
+foreach my $file (<$cur_dir/*>)
+{
+    print $file, "\n";
+    #Will cause problems if any files have 'mlst' in them
+  if ($file =~ /mlst/)
+  {
+    move($file, $txt_results);
+  }
+  elsif ($file =~ /\.bam$/)
+  {
+    push @bamfiles, $file;
+  }
+  elsif ($file =~ /\.scores$/)
+  {
+    push @scoresfiles, $file;
+  }
+  elsif ($file =~ /\.pileup$/)
+  {
+    push @pileupfiles, $file;
+  }
+  elsif ($file =~ /__fullgenes__/)
+  {
+    push @fullgenefiles, $file;
+  }
+  elsif ($file =~ /__genes__/)
+  {
+    push @genefiles, $file;
+  }
+  elsif ($file =~ /all_consensus_alleles.fasta$/ && defined($allele_type) && $allele_type eq 'all') {
+    move($file,$allele_results);
+  }
+  elsif ($file =~ /new_consensus_alleles.fasta$/ && defined($allele_type) && $allele_type eq 'new'){
+    move($file,$allele_results);
+  }
+}
+
+
+my ($cmd, $temp_head, $temp_full, $cat_header, $final_bam, @headers );
+
+# create new concatenated bam file with all bam files.
+if (@bamfiles > 1){
+  my $counter = 0;
+  $cat_header = "cat_header";
+  while ($counter < @bamfiles) {
+    $headers[$counter] = "bam_header".$counter;
+    # make a header file for each bam results file
+    my $cmd = "samtools view -H $bamfiles[$counter] > $headers[$counter]";
+    system($cmd);
+    if ($counter >= 1){
+      # only keep the @HD and @PG from the first file because the final concatenated file can only have one of each (location doesn't matter)
+      $temp_head="cut_head".$counter;
+      # cut off first row and last row of each file (the @HD and @PG)
+      $cmd = "tail -n +2 $headers[$counter] | head -n -1 > $temp_head";
+      system($cmd);
+      unlink $headers[$counter];
+      # replace the old header with the new cut header in the array
+      $headers[$counter] = $temp_head;
+    }
+    $counter++;
+  }
+  # combine all header files
+  $cmd = "cat ".join(" ",@headers)." > $cat_header";
+  system($cmd);
+
+  $final_bam = "final_bam";
+  # concatenate all the bam files (must include the concatenated header file created above)
+  $cmd = "samtools cat -h $cat_header -o $final_bam ".join(" ",@bamfiles)." ";
+  system($cmd);
+
+  # sort the bam file so it can be indexed
+  $cmd = "samtools sort $final_bam 'last_bam'";
+  system($cmd);
+
+  # move bam file to where Galaxy expects it.
+  $cmd = "mv 'last_bam.bam' $bam_results";
+  system($cmd);
+} else {
+  # only one bam file, don't need to concatenate
+  move($bamfiles[0], $bam_results);
+}
+
+# concatenate all pileup files
+if (@pileupfiles > 1){
+  $cmd = "cat ".join(" ",@pileupfiles)." > $pileup";
+  system($cmd);
+} else {
+  move($pileupfiles[0], $pileup);
+}
+
+# perform find-replace to restore original user-specified file names
+foreach my $gene (@genefiles){
+  my $data = read_file($gene);
+  $data =~ s/temp_file_name/$file_name[0]/g;
+  write_file($gene, $data);
+}
+
+foreach my $gene (@fullgenefiles){
+  my $data = read_file($gene);
+  $data =~ s/temp_file_name/$file_name[0]/g;
+  write_file($gene, $data);
+}
+
+# concatenate gene files with a space separating each file
+if (@genefiles > 1){
+  my $join = join(" <(echo) ", @genefiles);
+  my @args = ("bash", "-c", "cat $join > $genes_results");
+  system(@args);
+} else {
+  # only one gene results file
+  move($genefiles[0], $genes_results);
+}
+
+# concatenate full gene results files but only keep header in first file.
+if (@fullgenefiles >1){
+  for (my $i= 1; $i < @fullgenefiles; $i++){
+    # go through all files but the first one to remove headers
+    # create a temp file to save the file after header is removed
+    $temp_full = "temp_full".$i;
+    $cmd = "tail -n +2 $fullgenefiles[$i] > $temp_full";
+    system($cmd);
+    unlink $fullgenefiles[$i];
+    $fullgenefiles[$i] = $temp_full;
+  }
+  $cmd = "cat ". join(" ",@fullgenefiles)." > $fullgenes_results";
+  system($cmd);
+} else{
+  # only one full gene results file
+  move($fullgenefiles[0], $fullgenes_results);
+}
+
+# concatenate scores files but only keep header in first file.
+if (@scoresfiles >1){
+  for (my $i= 1; $i < @scoresfiles; $i++){
+    # go through all files but the first one to remove headers
+    # create a temp file to save the file after header is removed
+    $temp_full = "temp_full".$i;
+    $cmd = "tail -n +2 $scoresfiles[$i] > $temp_full";
+    system($cmd);
+    unlink $scoresfiles[$i];
+    $scoresfiles[$i] = $temp_full;
+  }
+  $cmd = "cat ". join(" ",@scoresfiles)." > $scores";
+  system($cmd);
+} else{
+  # only one scores file
+  move($scoresfiles[0], $scores);
+}
+
+# cleanup srst2 output and temp files
+foreach my $file (@fullgenefiles){
+  unlink $file;
+}
+foreach my $file (@genefiles){
+  unlink $file;
+}
+foreach my $file (@pileupfiles){
+  unlink $file;
+}
+foreach my $file (@bamfiles){
+  unlink $file;
+}
+foreach my $file (@headers){
+  unlink $file;
+}
+foreach my $file (@scoresfiles){
+  unlink $file;
+}
+unlink $temp_head;
+unlink $temp_full;
+unlink $cat_header;
+unlink $final_bam;
+
+#get rid of symlinks
+if ($for_read)
+{
+	unlink $for_read;
+}
+if ($rev_read)
+{
+	unlink $rev_read;
+}
+if ($sing_read)
+{
+	unlink $sing_read;
+}
+if ($database)
+{
+	unlink $database;
+}
+$exit_code = $exit_code >> 8;
+exit $exit_code;
+
+sub read_file {
+    my ($filename) = @_;
+
+    open my $in, '<:encoding(UTF-8)', $filename or die "Could not open '$filename' for reading $!";
+    local $/ = undef;
+    my $all = <$in>;
+    close $in;
+
+    return $all;
+}
+
+sub write_file {
+    my ($filename, $content) = @_;
+
+    open my $out, '>:encoding(UTF-8)', $filename or die "Could not open '$filename' for writing $!";
+    print $out $content;
+    close $out;
+
+    return;
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srst2.py	Mon Feb 06 12:31:04 2017 -0500
@@ -0,0 +1,1548 @@
+#!/usr/bin/env python
+
+# SRST2 - Short Read Sequence Typer (v2)
+# Python Version 2.7.5
+#
+# Authors - Michael Inouye (minouye@unimelb.edu.au), Harriet Dashnow (h.dashnow@gmail.com),
+#	Kathryn Holt (kholt@unimelb.edu.au), Bernie Pope (bjpope@unimelb.edu.au)
+#
+# see LICENSE.txt for the license
+#
+# Dependencies:
+#	bowtie2	   http://bowtie-bio.sourceforge.net/bowtie2/index.shtml version 2.1.0
+#	SAMtools   http://samtools.sourceforge.net Version: 0.1.18 (Version: 0.1.19 DOES NOT WORK - loss of edge coverage)
+#	SciPy		http://www.scipy.org/install.html
+#
+# Git repository: https://github.com/katholt/srst2/
+# README: https://github.com/katholt/srst2/blob/master/README.md
+# Questions or feature requests: https://github.com/katholt/srst2/issues
+# Manuscript: http://biorxiv.org/content/early/2014/06/26/006627
+
+
+from argparse import (ArgumentParser, FileType)
+import logging
+from subprocess import call, check_output, CalledProcessError, STDOUT
+import os, sys, re, collections, operator
+from scipy.stats import binom_test, linregress
+from math import log
+from itertools import groupby
+from operator import itemgetter
+from collections import OrderedDict
+try:
+	from version import srst2_version
+except ImportError:
+	srst2_version = "version unknown"
+
+edge_a = edge_z = 2
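+# number of bases at each end of an allele over which edge depth is averaged
+# (used with --min_edge_depth to flag alleles with poor coverage at their ends)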
+
+
+def parse_args():
+	"Parse the input arguments, use '-h' for help."
+
+	parser = ArgumentParser(description='SRST2 - Short Read Sequence Typer (v2)')
+
+	# version number of srst2, print and then exit
+	parser.add_argument('--version', action='version', version='%(prog)s ' + srst2_version)
+
+	# Read inputs
+	parser.add_argument(
+		'--input_se', nargs='+',type=str, required=False,
+		help='Single end read file(s) for analysing (may be gzipped)')
+	parser.add_argument(
+		'--input_pe', nargs='+', type=str, required=False,
+		help='Paired end read files for analysing (may be gzipped)')
+	parser.add_argument(
+		'--forward', type=str, required=False, default="_1",
+			help='Designator for forward reads (only used if NOT in MiSeq format sample_S1_L001_R1_001.fastq.gz; otherwise default is _1, i.e. expect forward reads as sample_1.fastq.gz)')
+	parser.add_argument(
+		'--reverse', type=str, required=False, default="_2",
+			help='Designator for reverse reads (only used if NOT in MiSeq format sample_S1_L001_R2_001.fastq.gz; otherwise default is _2, i.e. expect reverse reads as sample_2.fastq.gz)')
+	parser.add_argument('--read_type', type=str, choices=['q', 'qseq', 'f'], default='q',
+		help='Read file type (for bowtie2; default is q=fastq; other options: qseq=solexa, f=fasta).')
+
+	# MLST parameters
+	parser.add_argument('--mlst_db', type=str, required=False, nargs=1, help='Fasta file of MLST alleles (optional)')
+	parser.add_argument('--mlst_delimiter', type=str, required=False,
+		help='Character(s) separating gene name from allele number in MLST database (default "-", as in arcc-1)', default="-")
+	parser.add_argument('--mlst_definitions', type=str, required=False,
+		help='ST definitions for MLST scheme (required if mlst_db supplied and you want to calculate STs)')
+	parser.add_argument('--mlst_max_mismatch', type=str, required=False, default = "10",
+		help='Maximum number of mismatches per read for MLST allele calling (default 10)')
+
+	# Gene database parameters
+	parser.add_argument('--gene_db', type=str, required=False, nargs='+', help='Fasta file/s for gene databases (optional)')
+	parser.add_argument('--no_gene_details', action="store_false", required=False, help='Switch OFF verbose reporting of gene typing')
+	parser.add_argument('--gene_max_mismatch', type=str, required=False, default = "10",
+		help='Maximum number of mismatches per read for gene detection and allele calling (default 10)')
+
+	# Cutoffs for scoring/heuristics
+	parser.add_argument('--min_coverage', type=float, required=False, help='Minimum %%coverage cutoff for gene reporting (default 90)',default=90)
+	parser.add_argument('--max_divergence', type=float, required=False, help='Maximum %%divergence cutoff for gene reporting (default 10)',default=10)
+	parser.add_argument('--min_depth', type=float, required=False, help='Minimum mean depth to flag as dubious allele call (default 5)',default=5)
+	parser.add_argument('--min_edge_depth', type=float, required=False, help='Minimum edge depth to flag as dubious allele call (default 2)',default=2)
+	parser.add_argument('--prob_err', type=float, default=0.01, help='Probability of sequencing error (default 0.01)')
+
+	# Mapping parameters for bowtie2
+	parser.add_argument('--stop_after', type=str, required=False, help='Stop mapping after this number of reads have been mapped (otherwise map all)')
+	parser.add_argument('--other', type=str, help='Other arguments to pass to bowtie2.', required=False)
+
+	# Samtools parameters
+	parser.add_argument('--mapq', type=int, default=1, help='Samtools -q parameter (default 1)')
+	parser.add_argument('--baseq', type=int, default=20, help='Samtools -Q parameter (default 20)')
+
+	# Reporting options
+	parser.add_argument('--output', type=str, required=True, help='Prefix for srst2 output files')
+	parser.add_argument('--log', action="store_true", required=False, help='Switch ON logging to file (otherwise log to stdout)')
+	parser.add_argument('--save_scores', action="store_true", required=False, help='Switch ON verbose reporting of all scores')
+	parser.add_argument('--report_new_consensus', action="store_true", required=False, help='If a matching allele is not found, report the consensus allele. Note, only SNP differences are considered, not indels.')
+	parser.add_argument('--report_all_consensus', action="store_true", required=False, help='Report the consensus allele for the most likely allele. Note, only SNP differences are considered, not indels.')
+
+	# Run options
+	parser.add_argument('--use_existing_pileup', action="store_true", required=False,
+		help='Use existing pileups if available, otherwise they will be generated') # to facilitate testing of rescoring from pileups
+	parser.add_argument('--use_existing_scores', action="store_true", required=False,
+		help='Use existing scores files if available, otherwise they will be generated') # to facilitate testing of reporting from scores
+	parser.add_argument('--keep_interim_alignment', action="store_true", required=False, default=False,
+		help='Keep interim files (sam & unsorted bam), otherwise they will be deleted after sorted bam is created') # to facilitate testing of sam processing
+#	parser.add_argument('--keep_final_alignment', action="store_true", required=False, default=False,
+#		help='Keep interim files (sam & unsorted bam), otherwise they will be deleted after sorted bam is created') # to facilitate testing of sam processing
+
+	# Compile previous output files
+	parser.add_argument('--prev_output', nargs='+', type=str, required=False,
+		help='SRST2 results files to compile (any new results from this run will also be incorporated)')
+
+	return parser.parse_args()
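+# A typical invocation might look like the following (illustrative only;
+# all file names are placeholders):
+#   srst2.py --input_pe sample_1.fastq.gz sample_2.fastq.gz \
+#            --output mysample --mlst_db mlst_alleles.fasta \
+#            --mlst_definitions st_profiles.txt --gene_db resistance_genes.fasta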
+
+
+# Exception to raise if the command we try to run fails for some reason
+class CommandError(Exception):
+	pass
+
+def run_command(command, **kwargs):
+	'Execute a shell command and check the exit status and any O/S exceptions'
+	command_str = ' '.join(command)
+	logging.info('Running: {}'.format(command_str))
+	try:
+		exit_status = call(command, **kwargs)
+	except OSError as e:
+		message = "Command '{}' failed due to O/S error: {}".format(command_str, str(e))
+		raise CommandError({"message": message})
+	if exit_status != 0:
+		message = "Command '{}' failed with non-zero exit status: {}".format(command_str, exit_status)
+		raise CommandError({"message": message})
+
+
+def bowtie_index(fasta_files):
+	'Build a bowtie2 index from the given input fasta(s)'
+
+	# check that both bowtie and samtools have the right versions
+	check_command_version(['bowtie2', '--version'],
+				'bowtie2-align version 2.1.0',
+				'bowtie',
+				'2.1.0')
+
+	for fasta in fasta_files:
+		built_index = fasta + '.1.bt2'
+		if os.path.exists(built_index):
+			logging.info('Index for {} is already built...'.format(fasta))
+		else:
+			logging.info('Building bowtie2 index for {}...'.format(fasta))
+			run_command(['bowtie2-build', fasta, fasta])
+
+
+def modify_bowtie_sam(raw_bowtie_sam,max_mismatch):
+	# fix sam flags for comprehensive pileup
+	with open(raw_bowtie_sam) as sam, open(raw_bowtie_sam + '.mod', 'w') as sam_mod:
+		for line in sam:
+			if not line.startswith('@'):
+				fields = line.split('\t')
+				flag = int(fields[1])
+				flag = (flag - 256) if (flag & 256) else flag
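+				# (0x100 marks a secondary alignment in SAM; since bowtie2 is run
+				# with -a to report all alignments, clearing the bit lets samtools
+				# count every placement of the read in the pileup)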
+				m = re.search("NM:i:(\d+)\s",line)
+				if m != None:
+					num_mismatch = m.group(1)
+					if int(num_mismatch) <= int(max_mismatch):
+						sam_mod.write('\t'.join([fields[0], str(flag)] + fields[2:]))
+				else:
+					logging.info('Excluding read from SAM file due to missing NM (num mismatches) field: ' + fields[0])
+					num_mismatch = 0
+			else:
+				sam_mod.write(line)
+	return(raw_bowtie_sam,raw_bowtie_sam + '.mod')
+
+
+def parse_fai(fai_file,db_type,delimiter):
+	'''Get sequence lengths for reference alleles (important for scoring).
+	Also get gene names, which are required if no MLST definitions are provided.'''
+	size = {}
+	gene_clusters = [] # for gene DBs, this is cluster ID
+	allele_symbols = []
+	gene_cluster_symbols = {} # key = cluster ID, value = gene symbol (for gene DBs)
+	unique_allele_symbols = True
+	unique_gene_symbols = True
+	delimiter_check = [] # list of names that may violate the MLST delimiter supplied
+	with open(fai_file) as fai:
+		for line in fai:
+			fields = line.split('\t')
+			name = fields[0] # full allele name
+			size[name] = int(fields[1]) # store length
+			if db_type!="mlst":
+				allele_info = name.split()[0].split("__")
+				if len(allele_info) > 2:
+					gene_cluster = allele_info[0] # ID number for the cluster
+					cluster_symbol = allele_info[1] # gene name for the cluster
+					name = allele_info[2] # specific allele name
+					if gene_cluster in gene_cluster_symbols:
+						if gene_cluster_symbols[gene_cluster] != cluster_symbol:
+							unique_gene_symbols = False # already seen this cluster symbol
+							logging.info( "Non-unique:" +  gene_cluster + ", " + cluster_symbol)
+					else:
+						gene_cluster_symbols[gene_cluster] = cluster_symbol
+				else:
+					# treat as unclustered database, use whole header
+					gene_cluster = cluster_symbol = name
+			else:
+				gene_cluster = name.split(delimiter)[0] # accept gene clusters raw for mlst
+				# check if the delimiter makes sense
+				parts = name.split(delimiter)
+				if len(parts) != 2:
+					delimiter_check.append(name)
+				else:
+					try:
+						x = int(parts[1])
+					except:
+						delimiter_check.append(name)
+
+			# check if we have seen this allele name before
+			if name in allele_symbols:
+				unique_allele_symbols = False # already seen this allele name
+			allele_symbols.append(name)
+
+			# record gene (cluster):
+			if gene_cluster not in gene_clusters:
+				gene_clusters.append(gene_cluster)
+
+	if len(delimiter_check) > 0:
+		print "Warning! MLST delimiter is " + delimiter + " but these genes may violate the pattern and cause problems:"
+		print ",".join(delimiter_check)
+
+	return size, gene_clusters, unique_gene_symbols, unique_allele_symbols, gene_cluster_symbols
+
+
+def read_pileup_data(pileup_file, size, prob_err, consensus_file = ""):
+	with open(pileup_file) as pileup:
+		prob_success = 1 - prob_err	# Set by user, default is prob_err = 0.01
+		hash_alignment = {}
+		hash_max_depth = {}
+		hash_edge_depth = {}
+		max_depth = 1
+		avg_depth_allele = {}
+		next_to_del_depth_allele = {}
+		coverage_allele = {}
+		mismatch_allele = {}
+		indel_allele = {}
+		missing_allele = {}
+		size_allele = {}
+
+		# Split all lines in the pileup by whitespace
+		pileup_split = ( x.split() for x in pileup )
+		# Group the split lines based on the first field (allele)
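+		# (grouping by allele only works because samtools mpileup emits lines
+		# sorted by reference sequence, so all rows for one allele are contiguous)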
+		for allele, lines in groupby(pileup_split, itemgetter(0)):
+
+			# Reset variables for new allele
+			allele_line = 1 # Keep track of line for this allele
+			exp_nuc_num = 0 # Expected position in ref allele
+			allele_size = size[allele]
+			total_depth = 0
+			depth_a = depth_z = 0
+			position_depths = [0] * allele_size # store depths in case required for penalties; then we don't need to track total_missing_bases
+			hash_alignment[allele] = []
+			total_missing_bases = 0
+			total_mismatch = 0
+			ins_poscount = 0
+			del_poscount = 0
+			next_to_del_depth = 99999
+			consensus_seq = ""
+
+			for fields in lines:
+				# Parse this line and store details required for scoring
+				nuc_num = int(fields[1]) # Actual position in ref allele
+				exp_nuc_num += 1
+				allele_line += 1
+				nuc = fields[2]
+				nuc_depth = int(fields[3])
+				position_depths[nuc_num-1] = nuc_depth
+				if len(fields) <= 5:
+					aligned_bases = ''
+				else:
+					aligned_bases = fields[4]
+
+				# Missing bases (pileup skips basepairs)
+				if nuc_num > exp_nuc_num:
+					total_missing_bases += abs(exp_nuc_num - nuc_num)
+				exp_nuc_num = nuc_num
+				if nuc_depth == 0:
+					total_missing_bases += 1
+
+				# Calculate depths for this position
+				if nuc_num <= edge_a:
+					depth_a += nuc_depth
+				if abs(nuc_num - allele_size) < edge_z:
+					depth_z += nuc_depth
+				if nuc_depth > max_depth:
+					hash_max_depth[allele] = nuc_depth
+					max_depth = nuc_depth
+
+				total_depth = total_depth + nuc_depth
+
+				# Parse aligned bases list for this position in the pileup
+				num_match = 0
+				ins_readcount = 0
+				del_readcount = 0
+				nuc_counts = {}
+
+				i = 0
+				while i < len(aligned_bases):
+
+					if aligned_bases[i] == "^":
+						# Signifies start of a read, next char is mapping quality (skip it)
+						i += 2
+						continue
+
+					if aligned_bases[i] == "+":
+						i += int(aligned_bases[i+1]) + 2 # skip to next read
+						ins_readcount += 1
+						continue
+
+					if aligned_bases[i] == "-":
+						i += int(aligned_bases[i+1]) + 2 # skip to next read
+						continue
+
+					if aligned_bases[i] == "*":
+						i += 1 # skip to next read
+						del_readcount += 1
+						continue
+
+					if aligned_bases[i] == "." or aligned_bases[i] == ",":
+						num_match += 1
+						i += 1
+						continue
+
+					elif aligned_bases[i].upper() in "ATCG":
+						this_nuc = aligned_bases[i].upper()
+						if this_nuc not in nuc_counts:
+							nuc_counts[this_nuc] = 0
+						nuc_counts[this_nuc] += 1
+
+					i += 1
+
+				# Save the most common nucleotide at this position
+				consensus_nuc = nuc # by default use reference nucleotide
+				max_freq = num_match # Number of bases matching the reference
+				for nucleotide in nuc_counts:
+					if nuc_counts[nucleotide] > max_freq:
+						consensus_nuc = nucleotide
+						max_freq = nuc_counts[nucleotide]
+				consensus_seq += (consensus_nuc)
+
+				# Calculate details of this position for scoring and reporting
+
+				# mismatches and indels
+				num_mismatch = nuc_depth - num_match
+				if num_mismatch > num_match:
+					total_mismatch += 1 # record as mismatch (could be a snp or deletion)
+				if del_readcount > num_match:
+					del_poscount += 1
+				if ins_readcount > nuc_depth / 2:
+					ins_poscount += 1
+
+				# Hash for later processing
+				hash_alignment[allele].append((num_match, num_mismatch, prob_success)) # snp or deletion
+				if ins_readcount > 0:
+					hash_alignment[allele].append((nuc_depth - ins_readcount, ins_readcount, prob_success)) # penalize for any insertion calls at this position
+
+			# Determine the consensus sequence if required
+			if consensus_file != "":
+				if consensus_file.split(".")[-2] == "new_consensus_alleles":
+					consensus_type = "variant"
+				elif consensus_file.split(".")[-2] == "all_consensus_alleles":
+					consensus_type = "consensus"
+				with open(consensus_file, "a") as consensus_outfile:
+					consensus_outfile.write(">{0}.{1} {2}\n".format(allele, consensus_type, pileup_file.split(".")[1].split("__")[1]))
+					outstring = consensus_seq + "\n"
+					consensus_outfile.write(outstring)
+
+			# Finished reading pileup for this allele
+
+			# Check for missing bases at the end of the allele
+			if nuc_num < allele_size:
+				total_missing_bases += abs(allele_size - nuc_num)
+				# determine penalty based on coverage of last 2 bases
+				penalty = float(position_depths[nuc_num-1] + position_depths[nuc_num-2])/2
+				m = min(position_depths[nuc_num-1],position_depths[nuc_num-2])
+				hash_alignment[allele].append((0, penalty, prob_success))
+				if next_to_del_depth > m:
+					next_to_del_depth = m # keep track of lowest near-del depth for reporting
+
+			# Calculate allele summary stats and save
+			avg_depth = round(total_depth / float(allele_line),3)
+			avg_a = depth_a / float(edge_a)   # Avg depth at 5' end, num basepairs determined by edge_a
+			avg_z = depth_z / float(edge_z)	# 3'
+			hash_max_depth[allele] = max_depth
+			hash_edge_depth[allele] = (avg_a, avg_z)
+			min_penalty = max(5, int(avg_depth))
+			coverage_allele[allele] = 100*(allele_size - total_missing_bases - del_poscount)/float(allele_size) # includes in-read deletions
+			mismatch_allele[allele] = total_mismatch - del_poscount # snps only
+			indel_allele[allele] = del_poscount + ins_poscount # insertions or deletions
+			missing_allele[allele] = total_missing_bases # truncated bases
+			size_allele[allele] = allele_size
+
+			# Penalize truncations or large deletions (i.e. positions not covered in pileup)
+			j = 0
+			while j < (len(position_depths)-2):
+				# (note: end-of-seq truncations are dealt with above)
+				if position_depths[j]==0 and position_depths[j+1]!=0:
+					penalty = float(position_depths[j+1]+position_depths[j+2])/2 # mean of next 2 bases
+					hash_alignment[allele].append((0, penalty, prob_success))
+					m = min(position_depths[nuc_num-1],position_depths[nuc_num-2])
+					if next_to_del_depth > m:
+						next_to_del_depth = m # keep track of lowest near-del depth for reporting
+				j += 1
+
+			# Store depth info for reporting
+			avg_depth_allele[allele] = avg_depth
+			if next_to_del_depth == 99999:
+				next_to_del_depth = "NA"
+			next_to_del_depth_allele[allele] = next_to_del_depth
+
+	return hash_alignment, hash_max_depth, hash_edge_depth, avg_depth_allele, coverage_allele, mismatch_allele, indel_allele, missing_allele, size_allele, next_to_del_depth_allele
+
+
+def score_alleles(args, mapping_files_pre, hash_alignment, hash_max_depth, hash_edge_depth,
+		avg_depth_allele, coverage_allele, mismatch_allele, indel_allele, missing_allele,
+		size_allele, next_to_del_depth_allele, run_type):
+
+	if args.save_scores:
+		scores_output = file(mapping_files_pre + '.scores', 'w')
+		scores_output.write("Allele\tScore\tAvg_depth\tEdge1_depth\tEdge2_depth\tPercent_coverage\tSize\tMismatches\tIndels\tTruncated_bases\tDepthNeighbouringTruncation\tmaxMAF\tLeastConfident_Rate\tLeastConfident_Mismatches\tLeastConfident_Depth\tLeastConfident_Pvalue\n")
+
+	scores = {} # key = allele, value = score
+	mix_rates = {} # key = allele, value = highest minor allele frequency, 0 -> 0.5
+
+	for allele in hash_alignment:
+		if (run_type == "mlst") or (coverage_allele[allele] > args.min_coverage):
+			pvals = []
+			min_pval = 1.0
+			min_pval_data = (999,999) # (mismatch, depth) for position with lowest p-value
+			mix_rate = 0 # highest minor allele frequency 0 -> 0.5
+			for nuc_info in hash_alignment[allele]:
+				if nuc_info is not None:
+					match, mismatch, prob_success = nuc_info
+					if match > 0 or mismatch > 0:
+						if mismatch == 0:
+							p_value = 1.0
+						else:
+							p_value = binom_test([match, mismatch], None, prob_success)
+						# Weight pvalue by (depth/max_depth)
+						max_depth = hash_max_depth[allele]
+						weight = (match + mismatch) / float(max_depth)
+						p_value *= weight
+						if p_value < min_pval:
+							min_pval = p_value
+							min_pval_data = (mismatch,match + mismatch)
+						if p_value > 0:
+							p_value = -log(p_value, 10)
+						else:
+							p_value = 1000
+						pvals.append(p_value)
+						mismatch_prop = float(match)/float(match+mismatch)
+						if min(mismatch_prop, 1-mismatch_prop) > mix_rate:
+							mix_rate = min(mismatch_prop, 1-mismatch_prop)
+			# Fit linear model to observed Pval distribution vs expected Pval distribution (QQ plot)
+			pvals.sort(reverse=True)
+			len_obs_pvals = len(pvals)
+			exp_pvals = range(1, len_obs_pvals + 1)
+			exp_pvals2 = [-log(float(ep) / (len_obs_pvals + 1), 10) for ep in exp_pvals]
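+			# exp_pvals2 holds the -log10 of the uniform order statistics i/(n+1),
+			# i.e. the p-values expected under the null; the regression slope below
+			# measures how far the observed distribution departs from it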
+
+			# Slope is score
+			slope, _intercept, _r_value, _p_value, _std_err = linregress(exp_pvals2, pvals)
+
+			# Store all scores for later processing
+			scores[allele] = slope
+			mix_rates[allele] = mix_rate
+
+			# print scores for each allele, if requested
+			if args.save_scores:
+				if allele in hash_edge_depth:
+					start_depth, end_depth = hash_edge_depth[allele]
+					edge_depth_str = str(start_depth) + '\t' + str(end_depth)
+				else:
+					edge_depth_str = "NA\tNA"
+				this_depth = avg_depth_allele.get(allele, "NA")
+				this_coverage = coverage_allele.get(allele, "NA")
+				this_mismatch = mismatch_allele.get(allele, "NA")
+				this_indel = indel_allele.get(allele, "NA")
+				this_missing = missing_allele.get(allele, "NA")
+				this_size = size_allele.get(allele, "NA")
+				this_next_to_del_depth = next_to_del_depth_allele.get(allele, "NA")
+				scores_output.write('\t'.join([allele, str(slope), str(this_depth), edge_depth_str,
+						str(this_coverage), str(this_size), str(this_mismatch), str(this_indel), str(this_missing), str(this_next_to_del_depth), str(mix_rate), str(float(min_pval_data[0])/min_pval_data[1]),str(min_pval_data[0]),str(min_pval_data[1]),str(min_pval)]) + '\n')
+
+	if args.save_scores:
+		scores_output.close()
+
+	return(scores,mix_rates)
+
+# Check that an acceptable version of a command is installed
+# Exits the program if it can't be found.
+# - command_list is the command to run to determine the version.
+# - version_identifier is the unique string we look for in the stdout of the program.
+# - command_name is the name of the command to show in error messages.
+# - required_version is the version number to show in error messages.
+def check_command_version(command_list, version_identifier, command_name, required_version):
+	try:
+		command_stdout = check_output(command_list, stderr=STDOUT)
+	except OSError as e:
+		logging.error("Failed command: {}".format(' '.join(command_list)))
+		logging.error(str(e))
+		logging.error("Could not determine the version of {}.".format(command_name))
+		logging.error("Do you have {} installed in your PATH?".format(command_name))
+		exit(-1)
+	except CalledProcessError as e:
+		# some programs such as samtools return a non-zero exit status
+		# when you ask for the version (sigh). We ignore it here.
+		command_stdout = e.output
+
+	if version_identifier not in command_stdout:
+		logging.error("Incorrect version of {} installed.".format(command_name))
+		logging.error("{} version {} is required by SRST2.".format(command_name, required_version))
+		exit(-1)
+
+
+def run_bowtie(mapping_files_pre,sample_name,fastqs,args,db_name,db_full_path):
+
+	print "Starting mapping with bowtie2"
+	
+	# check that both bowtie and samtools have the right versions
+	check_command_version(['bowtie2', '--version'],
+				'bowtie2-align version 2.1.0',
+				'bowtie',
+				'2.1.0')
+
+	check_command_version(['samtools'],
+				'Version: 0.1.18',
+				'samtools',
+				'0.1.18')
+
+	command = ['bowtie2']
+
+	if len(fastqs)==1:
+		# single end
+		command += ['-U', fastqs[0]]
+	elif len(fastqs)==2:
+		# paired end
+		command += ['-1', fastqs[0], '-2', fastqs[1]]
+
+	sam = mapping_files_pre + ".sam"
+	logging.info('Output prefix set to: ' + mapping_files_pre)
+
+	command += ['-S', sam,
+				'-' + args.read_type,	# add a dash to the front of the option
+				'--very-sensitive-local',
+				'--no-unal',
+				'-a',					 # Search for and report all alignments
+				'-x', db_full_path			   # The index to be aligned to
+			   ]
+
+	if args.stop_after:
+		try:
+			command += ['-u',str(int(args.stop_after))]
+		except ValueError:
+			print "WARNING. You asked to stop after mapping '" + args.stop_after + "' reads. I don't understand this, and will map all reads. Please speficy an integer with --stop_after or leave this as default to map 1 million reads."
+
+	if args.other:
+		command += args.other.split()
+
+	logging.info('Aligning reads to index {} using bowtie2...'.format(db_full_path))
+
+	run_command(command)
+
+	return(sam)
+
+def get_pileup(args,mapping_files_pre,raw_bowtie_sam,bowtie_sam_mod,fasta,pileup_file):
+	# Analyse output with SAMtools
+	logging.info('Processing Bowtie2 output with SAMtools...')
+	logging.info('Generate and sort BAM file...')
+	out_file_bam = mapping_files_pre + ".unsorted.bam"
+	run_command(['samtools', 'view', '-b', '-o', out_file_bam,
+				 '-q', str(args.mapq), '-S', bowtie_sam_mod])
+	out_file_bam_sorted = mapping_files_pre + ".sorted"
+	run_command(['samtools', 'sort', out_file_bam, out_file_bam_sorted])
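+	# (samtools 0.1.18's sort takes an output *prefix* rather than a file name;
+	# the file actually written is out_file_bam_sorted + '.bam', which is what
+	# mpileup reads below)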
+
+	# Delete interim files (sam, modified sam, unsorted bam) unless otherwise specified.
+	# Note users may also want to delete final sorted bam and pileup on completion to save space.
+	if not args.keep_interim_alignment:
+		logging.info('Deleting sam and bam files that are no longer needed...')
+		del_filenames = [raw_bowtie_sam, bowtie_sam_mod, out_file_bam]
+		for f in del_filenames:
+			logging.info('Deleting ' + f)
+			os.remove(f)
+
+	logging.info('Generate pileup...')
+	with open(pileup_file, 'w') as sam_pileup:
+		run_command(['samtools', 'mpileup', '-L', '1000', '-f', fasta,
+					 '-Q', str(args.baseq), '-q', str(args.mapq), out_file_bam_sorted + '.bam'],
+					 stdout=sam_pileup)
+
+def calculate_ST(allele_scores, ST_db, gene_names, sample_name, mlst_delimiter, avg_depth_allele, mix_rates):
+	allele_numbers = [] # clean allele calls for determining ST. order is taken from gene names, as in ST definitions file
+	alleles_with_flags = [] # flagged alleles for printing (* if mismatches, ? if depth issues)
+	mismatch_flags = [] # allele/diffs
+	uncertainty_flags = [] #allele/uncertainty
+#	st_flags = [] # (* if mismatches, ? if depth issues)
+	depths = [] # depths for each typed locus
+	mafs = [] # minor allele frequencies for each typed locus
+
+	# get allele numbers & info
+	for gene in gene_names:
+		if gene in allele_scores:
+			(allele,diffs,depth_problem,divergence) = allele_scores[gene]
+			allele_number = allele.split(mlst_delimiter)[-1]
+			depths.append(avg_depth_allele[allele])
+			mix_rate = mix_rates[allele]
+			mafs.append(mix_rate)
+		else:
+			allele_number = "-"
+			diffs = ""
+			depth_problem = ""
+			mix_rate = ""
+		allele_numbers.append(allele_number)
+
+		allele_with_flags = allele_number
+		if diffs != "":
+			if diffs != "trun":
+				allele_with_flags+="*" # trun indicates only that a truncated form had lower score, which isn't a mismatch
+			mismatch_flags.append(allele+"/"+diffs)
+		if depth_problem != "":
+			allele_with_flags+="?"
+			uncertainty_flags.append(allele+"/"+depth_problem)
+		alleles_with_flags.append(allele_with_flags)
+
+	# calculate ST (no flags)
+	if ST_db:
+		allele_string = " ".join(allele_numbers) # for determining ST
+		try:
+			clean_st = ST_db[allele_string]
+		except KeyError:
+			print "This combination of alleles was not found in the sequence type database:",
+			print sample_name,
+			for gene in allele_scores:
+				(allele,diffs,depth_problems,divergence) = allele_scores[gene]
+				print allele,
+			print
+			clean_st = "NF"
+	else:
+		clean_st = "ND"
+
+	# add flags for reporting
+	st = clean_st
+	if len(mismatch_flags) > 0:
+		if mismatch_flags!=["trun"]:
+			st += "*" # trun indicates only that a truncated form had lower score, which isn't a mismatch
+	else:
+		mismatch_flags = ['0'] # record no mismatches
+	if len(uncertainty_flags) > 0:
+		st += "?"
+	else:
+		uncertainty_flags = ['-']
+
+	# mean depth across loci
+	if len(depths) > 0:
+		mean_depth = float(sum(depths))/len(depths)
+	else:
+		mean_depth = 0
+
+	# maximum maf across locus
+	if len(mafs) > 0:
+		max_maf = max(mafs)
+	else:
+		max_maf = 0
+
+	return (st,clean_st,alleles_with_flags,mismatch_flags,uncertainty_flags,mean_depth,max_maf)
+
+def parse_ST_database(ST_filename,gene_names_from_fai):
+	# Read ST definitions
+	ST_db = {} # key = allele string, value = ST
+	gene_names = []
+	num_gene_cols_expected = len(gene_names_from_fai)
+	print "Attempting to read " + str(num_gene_cols_expected) + " loci from ST database " + ST_filename
+	with open(ST_filename) as f:
+		count = 0
+		for line in f:
+			count += 1
+			line_split = line.rstrip().split("\t")
+			if count == 1: # Header
+				gene_names = line_split[1:min(num_gene_cols_expected+1,len(line_split))]
+				for g in gene_names_from_fai:
+					if g not in gene_names:
+						print "Warning: gene " + g + " in database file isn't among the columns in the ST definitions: " + ",".join(gene_names)
+						print " Any sequences with this gene identifer from the database will not be included in typing."
+						if len(line_split) == num_gene_cols_expected+1:
+							gene_names.pop() # we read too many columns
+							num_gene_cols_expected -= 1
+				for g in gene_names:
+					if g not in gene_names_from_fai:
+						print "Warning: gene " + g + " in ST definitions file isn't among those in the database " + ",".join(gene_names_from_fai)
+						print " This will result in all STs being called as unknown (but allele calls will be accurate for other loci)."
+			else:
+				ST = line_split[0]
+				if ST not in ST_db.values():
+					ST_string = " ".join(line_split[1:num_gene_cols_expected+1])
+					ST_db[ST_string] = ST
+				else:
+					print "Warning: this ST is not unique in the ST definitions file: " + ST
+		print "Read ST database " + ST_filename + " successfully"
+		return (ST_db, gene_names)
+
+def get_allele_name_from_db(allele,unique_allele_symbols,unique_cluster_symbols,run_type,args):
+
+	if run_type != "mlst":
+		# header format: >[cluster]___[gene]___[allele]___[uniqueID] [info]
+		allele_parts = allele.split()
+		allele_detail = allele_parts.pop(0)
+		allele_info = allele_detail.split("__")
+
+		if len(allele_info)>2:
+			cluster_id = allele_info[0] # ID number for the cluster
+			gene_name = allele_info[1] # gene name/symbol for the cluster
+			allele_name = allele_info[2] # specific allele name
+			seqid = allele_info[3] # unique identifier for this seq
+		else:
+			cluster_id = gene_name = allele_name = seqid = allele_detail
+
+		if not unique_allele_symbols:
+			allele_name += "_" + seqid
+
+	else:
+		gene_name = allele.split(args.mlst_delimiter)
+		allele_name = allele
+		seqid = None
+		cluster_id = None
+
+	return gene_name, allele_name, cluster_id, seqid
+
+def create_allele_pileup(allele_name, all_pileup_file):
+	outpileup = allele_name + "." + all_pileup_file
+	with open(outpileup, 'w') as allele_pileup:
+		with open(all_pileup_file) as all_pileup:
+			for line in all_pileup:
+				if line.split()[0] == allele_name:
+					allele_pileup.write(line)
+	return outpileup
+
+def parse_scores(run_type,args,scores, hash_edge_depth,
+					avg_depth_allele, coverage_allele, mismatch_allele, indel_allele,
+					missing_allele, size_allele, next_to_del_depth_allele,
+					unique_cluster_symbols,unique_allele_symbols, pileup_file):
+
+	# sort into hash for each gene locus
+	scores_by_gene = collections.defaultdict(dict) # key1 = gene, key2 = allele, value = score
+
+	if run_type=="mlst":
+		for allele in scores:
+			if coverage_allele[allele] > args.min_coverage:
+				allele_info = allele.split(args.mlst_delimiter)
+				scores_by_gene[allele_info[0]][allele] = scores[allele]
+	else:
+		for allele in scores:
+			if coverage_allele[allele] > args.min_coverage:
+				gene_name = get_allele_name_from_db(allele,unique_allele_symbols,unique_cluster_symbols,run_type,args)[2] # cluster ID
+				scores_by_gene[gene_name][allele] = scores[allele]
+
+	# determine best allele for each gene locus/cluster
+	results = {} # key = gene, value = (allele,diffs,depth)
+
+	for gene in scores_by_gene:
+
+		gene_hash = scores_by_gene[gene]
+		scores_sorted = sorted(gene_hash.iteritems(),key=operator.itemgetter(1)) # sort by score
+		(top_allele,top_score) = scores_sorted[0]
+
+		# check if depth is adequate for confident call
+		adequate_depth = False
+		depth_problem = ""
+		if hash_edge_depth[top_allele][0] > args.min_edge_depth and hash_edge_depth[top_allele][1] > args.min_edge_depth:
+			if next_to_del_depth_allele[top_allele] != "NA":
+				if float(next_to_del_depth_allele[top_allele]) > args.min_edge_depth:
+					if avg_depth_allele[top_allele] > args.min_depth:
+						adequate_depth = True
+					else:
+						depth_problem="depth"+str(avg_depth_allele[top_allele])
+				else:
+					depth_problem = "del"+str(next_to_del_depth_allele[top_allele])
+			elif avg_depth_allele[top_allele] > args.min_depth:
+				adequate_depth = True
+			else:
+				depth_problem="depth"+str(avg_depth_allele[top_allele])
+		else:
+			depth_problem = "edge"+str(min(hash_edge_depth[top_allele][0],hash_edge_depth[top_allele][1]))
+
+		# check if there are confident differences against this allele
+		differences = ""
+		if mismatch_allele[top_allele] > 0:
+			differences += str(mismatch_allele[top_allele])+"snp"
+		if indel_allele[top_allele] > 0:
+			differences += str(indel_allele[top_allele])+"indel"
+		if missing_allele[top_allele] > 0:
+			differences += str(missing_allele[top_allele])+"holes"
+
+		divergence = float(mismatch_allele[top_allele]) / float( size_allele[top_allele] - missing_allele[top_allele] )
+
+		# check for truncated
+		if differences != "" or not adequate_depth:
+			# if there are SNPs or not enough depth to trust the result, no need to screen next best match
+			results[gene] = (top_allele, differences, depth_problem, divergence)
+		else:
+			# looks good but this could be a truncated version of the real allele; check for longer versions
+			truncation_override = False
+			if len(scores_sorted) > 1:
+				(next_best_allele,next_best_score) = scores_sorted[1]
+				if size_allele[next_best_allele] > size_allele[top_allele]:
+					# next best is longer, top allele could be a truncation?
+					if (mismatch_allele[next_best_allele] + indel_allele[next_best_allele] + missing_allele[next_best_allele]) == 0:
+						# next best also has no mismatches
+						if (next_best_score - top_score)/top_score < 0.1:
+							# next best has score within 10% of this one
+							truncation_override = True
+			if truncation_override:
+				results[gene] = (next_best_allele, "trun", "", divergence) # no diffs but report this call is based on truncation test
+			else:
+				results[gene] = (top_allele, "", "",divergence) # no caveats to report
+
+		# Check if there are any potential new alleles
+		#depth_problem = results[gene][2]
+		#divergence = results[gene][3]
+		if depth_problem == "" and divergence > 0:
+			new_allele = True
+			# Get the consensus for this new allele and write it to file
+			if args.report_new_consensus or args.report_all_consensus:
+				new_alleles_filename = args.output + ".new_consensus_alleles.fasta"
+				allele_pileup_file = create_allele_pileup(top_allele, pileup_file) # XXX Creates a new pileup file for that allele. Not currently cleaned up
+				read_pileup_data(allele_pileup_file, size_allele, args.prob_err, consensus_file = new_alleles_filename)
+		if args.report_all_consensus:
+			new_alleles_filename = args.output + ".all_consensus_alleles.fasta"
+			allele_pileup_file = create_allele_pileup(top_allele, pileup_file)
+			read_pileup_data(allele_pileup_file, size_allele, args.prob_err, consensus_file = new_alleles_filename)
+
+	return results # (allele, diffs, depth_problem, divergence)
+
+
+def get_readFile_components(full_file_path):
+	(file_path,file_name) = os.path.split(full_file_path)
+	m1 = re.match("(.*).gz",file_name)
+	ext = ""
+	if m1 != None:
+		# gzipped
+		ext = ".gz"
+		file_name = m1.groups()[0]
+	(file_name_before_ext,ext2) = os.path.splitext(file_name)
+	full_ext = ext2+ext
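+	# e.g. "/path/sample_1.fastq.gz" -> ("/path", "sample_1", ".fastq.gz")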
+	return(file_path,file_name_before_ext,full_ext)
+
+def read_file_sets(args):
+
+	fileSets = {} # key = id, value = list of files for that sample
+	num_single_readsets = 0
+	num_paired_readsets = 0
+
+	if args.input_se:
+		# single end
+		for fastq in args.input_se:
+			(file_path,file_name_before_ext,full_ext) = get_readFile_components(fastq)
+			m=re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
+			if m==None:
+				fileSets[file_name_before_ext] = [fastq]
+			else:
+				fileSets[m.groups()[0]] = [fastq] # Illumina names
+			num_single_readsets += 1
+
+	elif args.input_pe:
+		# paired end
+		forward_reads = {} # key = sample, value = full path to file
+		reverse_reads = {} # key = sample, value = full path to file
+		num_paired_readsets = 0
+		num_single_readsets = 0
+		for fastq in args.input_pe:
+			(file_path,file_name_before_ext,full_ext) = get_readFile_components(fastq)
+			# try to match to MiSeq format:
+			m=re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
+			if m==None:
+				# not default Illumina file naming format, expect simple/ENA format
+				m=re.match("(.*)("+args.forward+")$",file_name_before_ext)
+				if m!=None:
+					# store as forward read
+					(baseName,read) = m.groups()
+					forward_reads[baseName] = fastq
+				else:
+					m=re.match("(.*)("+args.reverse+")$",file_name_before_ext)
+					if m!=None:
+					# store as reverse read
+						(baseName,read) = m.groups()
+						reverse_reads[baseName] = fastq
+					else:
+						logging.info("Could not determine forward/reverse read status for input file " + fastq)
+			else:
+				# matches default Illumina file naming format, e.g. m.groups() = ('samplename', '_S1', '_L001', '_R1', '_001')
+				baseName, read  = m.groups()[0], m.groups()[3]
+				if read == "_R1":
+					forward_reads[baseName] = fastq
+				elif read == "_R2":
+					reverse_reads[baseName] = fastq
+				else:
+					logging.info( "Could not determine forward/reverse read status for input file " + fastq )
+					logging.info( "  this file appears to match the MiSeq file naming convention (samplename_S1_L001_[R1]_001), but we were expecting [R1] or [R2] to designate read as forward or reverse?" )
+					fileSets[file_name_before_ext] = [fastq]
+					num_single_readsets += 1
+		# store in pairs
+		for sample in forward_reads:
+			if sample in reverse_reads:
+				fileSets[sample] = [forward_reads[sample],reverse_reads[sample]] # store pair
+				num_paired_readsets += 1
+			else:
+				fileSets[sample] = [forward_reads[sample]] # no reverse found
+				num_single_readsets += 1
+				logging.info('Warning, could not find pair for read:' + forward_reads[sample])
+		for sample in reverse_reads:
+			if sample not in fileSets:
+				fileSets[sample] = [reverse_reads[sample]] # no forward found
+				num_single_readsets += 1
+				logging.info('Warning, could not find pair for read:' + reverse_reads[sample])
+
+	if num_paired_readsets > 0:
+		logging.info('Total paired readsets found:' + str(num_paired_readsets))
+	if num_single_readsets > 0:
+		logging.info('Total single reads found:' + str(num_single_readsets))
+
+	return fileSets
+
+def read_results_from_file(infile):
+
+	if os.stat(infile).st_size == 0:
+		logging.info("WARNING: Results file provided is empty: " + infile)
+		return False, False, False
+
+	results_info = infile.split("__")
+	if len(results_info) > 1:
+
+		if re.search("compiledResults",infile)!=None:
+			dbtype = "compiled"
+			dbname = results_info[0] # output identifier
+		else:
+			dbtype = results_info[1] # mlst or genes
+			dbname = results_info[2] # database
+
+		logging.info("Processing " + dbtype + " results from file " + infile)
+
+		if dbtype == "genes":
+			results = collections.defaultdict(dict) # key1 = sample, key2 = gene, value = allele
+			with open(infile) as f:
+				header = []
+				for line in f:
+					line_split = line.rstrip().split("\t")
+					if len(header) == 0:
+						header = line_split
+					else:
+						sample = line_split[0]
+						for i in range(1,len(line_split)):
+							gene = header[i] # cluster_id
+							results[sample][gene] = line_split[i]
+
+		elif dbtype == "mlst":
+			results = {} # key = sample, value = MLST string
+			with open(infile) as f:
+				header = 0
+				for line in f:
+					if header > 0:
+						results[line.split("\t")[0]] = line.rstrip()
+						if "maxMAF" not in header:
+							results[line.split("\t")[0]] += "\tNC" # empty column for maxMAF
+					else:
+						header = line.rstrip()
+						results[line.split("\t")[0]] = line.rstrip() # store header line too (index "Sample")
+						if "maxMAF" not in header:
+							results[line.split("\t")[0]] += "\tmaxMAF" # add column for maxMAF
+
+		elif dbtype == "compiled":
+			results = collections.defaultdict(dict) # key1 = sample, key2 = gene, value = allele
+			with open(infile) as f:
+				header = []
+				mlst_cols = 0 # INDEX of the last mlst column
+				n_cols = 0
+				for line in f:
+					line_split = line.rstrip().split("\t")
+					if len(header) == 0:
+						header = line_split
+						n_cols = len(header)
+						if n_cols > 1:
+							if header[1] == "ST":
+								# there is mlst data reported
+								mlst_cols = 2 # first locus column
+								while header[mlst_cols] != "depth":
+									mlst_cols += 1
+								results["Sample"]["mlst"] = "\t".join(line_split[0:(mlst_cols+1)])
+								results["Sample"]["mlst"] += "\tmaxMAF" # add to mlst header even if not encountered in this file, as it may be in others
+								if header[mlst_cols+1] == "maxMAF":
+									mlst_cols += 1 # record maxMAF column within MLST data, if present
+							else:
+								# no mlst data reported
+								dbtype = "genes"
+								logging.info("No MLST data in compiled results file " + infile)
+						else:
+							# no mlst data reported
+							dbtype = "genes"
+							logging.info("No MLST data in compiled results file " + infile)
+
+					else:
+						sample = line_split[0]
+						if mlst_cols > 0:
+							results[sample]["mlst"] = "\t".join(line_split[0:(mlst_cols+1)])
+							if "maxMAF" not in header:
+								results[sample]["mlst"] += "\t" # add to mlst section even if not encountered in this file, as it may be in others
+						if n_cols > mlst_cols:
+							# read genes component
+							for i in range(mlst_cols+1,n_cols):
+								# note i=1 if mlst_cols==0, ie we are reading all
+								gene = header[i]
+								if len(line_split) > i:
+									results[sample][gene] = line_split[i]
+								else:
+									results[sample][gene] = "-"
+		else:
+			results = False
+			dbtype = False
+			dbname = False
+			logging.info("Couldn't decide what to do with file results file provided: " + infile)
+
+	else:
+		results = False
+		dbtype = False
+		dbname = False
+		logging.info("Couldn't decide what to do with file results file provided: " + infile)
+
+	return results, dbtype, dbname
+
+def read_scores_file(scores_file):
+	hash_edge_depth = {}
+	avg_depth_allele = {}
+	coverage_allele = {}
+	mismatch_allele = {}
+	indel_allele = {}
+	missing_allele = {}
+	size_allele = {}
+	next_to_del_depth_allele = {}
+	mix_rates = {}
+	scores = {}
+
+	f = file(scores_file,"r")
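+	# (column order must match the header line written by score_alleles() above)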
+
+	for line in f:
+		line_split = line.rstrip().split("\t")
+		allele = line_split[0]
+		if allele != "Allele": # skip header row
+			scores[allele] = float(line_split[1])
+			mix_rates[allele] = float(line_split[11])
+			avg_depth_allele[allele] = float(line_split[2])
+			hash_edge_depth[allele] = (float(line_split[3]),float(line_split[4]))
+			coverage_allele[allele] = float(line_split[5])
+			size_allele[allele] = int(line_split[6])
+			mismatch_allele[allele] = int(line_split[7])
+			indel_allele[allele] = int(line_split[8])
+			missing_allele[allele] = int(line_split[9])
+			next_to_del_depth_allele[allele] = line_split[10]
+
+	return hash_edge_depth, avg_depth_allele, coverage_allele, mismatch_allele, indel_allele, \
+			missing_allele, size_allele, next_to_del_depth_allele, scores, mix_rates
+
+def run_srst2(args, fileSets, dbs, run_type):
+
+	db_reports = [] # list of db-specific output files to return
+	db_results_list = [] # list of results hashes, one per db
+
+	for fasta in dbs:
+		db_reports, db_results_list = process_fasta_db(args, fileSets, run_type, db_reports, db_results_list, fasta)
+
+	return db_reports, db_results_list
+
+def process_fasta_db(args, fileSets, run_type, db_reports, db_results_list, fasta):
+
+	check_command_version(['samtools'],
+				'Version: 0.1.18',
+				'samtools',
+				'0.1.18')
+
+	logging.info('Processing database ' + fasta)
+
+	db_path, db_name = os.path.split(fasta) # database
+	(db_name,db_ext) = os.path.splitext(db_name)
+	db_results = "__".join([args.output,run_type,db_name,"results.txt"])
+	db_report = file(db_results,"w")
+	db_reports.append(db_results)
+
+	# Get sequence lengths and gene names
+	#  lengths are needed for MLST heuristic to distinguish alleles from their truncated forms
+	#  gene names read from here are needed for non-MLST dbs
+	fai_file = fasta + '.fai'
+	if not os.path.exists(fai_file):
+		run_command(['samtools', 'faidx', fasta])
+	size, gene_names, unique_gene_symbols, unique_allele_symbols, cluster_symbols = \
+		parse_fai(fai_file,run_type,args.mlst_delimiter)
+
+	# Prepare for MLST reporting
+	ST_db = False
+	if run_type == "mlst":
+		results = {} # key = sample, value = ST string for printing
+		if args.mlst_definitions:
+			# store MLST profiles, replace gene names (we want the order as they appear in this file)
+			ST_db, gene_names = parse_ST_database(args.mlst_definitions,gene_names)
+		db_report.write("\t".join(["Sample","ST"]+gene_names+["mismatches","uncertainty","depth","maxMAF"]) + "\n")
+		results["Sample"] = "\t".join(["Sample","ST"]+gene_names+["mismatches","uncertainty","depth","maxMAF"])
+
+	else:
+		# store final results for later tabulation
+		results = collections.defaultdict(dict) #key1 = sample, key2 = gene, value = allele
+
+	gene_list = [] # start with empty gene list; will add genes from each genedb test
+
+	# determine maximum mismatches per read to use for pileup
+	if run_type == "mlst":
+		max_mismatch = args.mlst_max_mismatch
+	else:
+		max_mismatch = args.gene_max_mismatch
+
+	# Align and score each read set against this DB
+	for sample_name in fileSets:
+		logging.info('Processing sample ' + sample_name)
+		fastq_inputs = fileSets[sample_name] # reads
+
+		try:
+			# try mapping and scoring this fileset against the current database
+			# update the gene_list list and results dict with data from this strain
+			# __mlst__ will be printed during this routine if this is a mlst run
+			# __fullgenes__ will be printed during this routine if requested and this is a gene_db run
+			gene_list, results = \
+				map_fileSet_to_db(args,sample_name,fastq_inputs,db_name,fasta,size,gene_names,\
+				unique_gene_symbols, unique_allele_symbols,run_type,ST_db,results,gene_list,db_report,cluster_symbols,max_mismatch)
+		# if we get an error from one of the commands we called
+		# log the error message and continue onto the next fasta db
+		except CommandError as e:
+			logging.error(e.message)
+			# record results as unknown, so we know that we did attempt to analyse this readset
+			if run_type == "mlst":
+				st_result_string = "\t".join([sample_name,"-"] + ["-"] * (len(gene_names)+4)) # one placeholder per remaining report column
+				db_report.write(st_result_string + "\n")
+				logging.info(" " + st_result_string)
+				results[sample_name] = st_result_string
+			else:
+				logging.info(" failed gene detection")
+				results[sample_name]["failed"] = True # so we know that we tried this strain
+
+	if run_type != "mlst":
+		# tabulate results across samples for this gene db (i.e. __genes__ file)
+		logging.info('Tabulating results for database {} ...'.format(fasta))
+		gene_list.sort()
+		db_report.write("\t".join(["Sample"]+gene_list)+"\n") # report header row
+		for sample_name in fileSets:
+			db_report.write(sample_name)
+			if sample_name in results:
+				# print results
+				if "failed" not in results[sample_name]:
+					for cluster_id in gene_list:
+						if cluster_id in results[sample_name]:
+							db_report.write("\t"+results[sample_name][cluster_id]) # print full allele name
+						else:
+							db_report.write("\t-") # no hits for this gene cluster
+				else:
+					# no data on this, as the sample failed mapping
+					for cluster_id in gene_list:
+						db_report.write("\t?")
+						results[sample_name][cluster_id] = "?" # record as unknown
+			else:
+				# no data on this because genes were not found (but no mapping errors)
+				for cluster_id in gene_list:
+					db_report.write("\t-")
+					results[sample_name][cluster_id] = "-" # record as absent
+			db_report.write("\n")
+
+	# Finished with this database
+	logging.info('Finished processing for database {} ...'.format(fasta))
+	db_report.close()
+	db_results_list.append(results)
+
+	return db_reports, db_results_list
+
+def map_fileSet_to_db(args,sample_name,fastq_inputs,db_name,fasta,size,gene_names,\
+	unique_gene_symbols, unique_allele_symbols,run_type,ST_db,results,gene_list,db_report,cluster_symbols,max_mismatch):
+
+	mapping_files_pre = args.output + '__' + sample_name + '.' + db_name
+	pileup_file = mapping_files_pre + '.pileup'
+	scores_file = mapping_files_pre + '.scores'
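+	# e.g. with --output "out", sample "strainA" and db "ARGannot" (illustrative
+	# names), these become out__strainA.ARGannot.pileup and out__strainA.ARGannot.scores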
+
+	# Get or read scores
+
+	if args.use_existing_scores and os.path.exists(scores_file):
+
+		logging.info(' Using existing scores in ' + scores_file)
+
+		# read in scores and info from existing scores file
+		hash_edge_depth, avg_depth_allele, coverage_allele, \
+				mismatch_allele, indel_allele, missing_allele, size_allele, \
+				next_to_del_depth_allele, scores, mix_rates = read_scores_file(scores_file)
+
+	else:
+
+		# Get or read pileup
+
+		if args.use_existing_pileup and os.path.exists(pileup_file):
+			logging.info(' Using existing pileup in ' + pileup_file)
+
+		else:
+
+			# run bowtie against this db
+			bowtie_sam = run_bowtie(mapping_files_pre,sample_name,fastq_inputs,args,db_name,fasta)
+
+			# Modify Bowtie's SAM formatted output so that we get secondary
+			# alignments in downstream pileup
+			(raw_bowtie_sam,bowtie_sam_mod) = modify_bowtie_sam(bowtie_sam,max_mismatch)
+
+			# generate pileup from sam (via sorted bam)
+			get_pileup(args,mapping_files_pre,raw_bowtie_sam,bowtie_sam_mod,fasta,pileup_file)
+
+		# Get scores
+
+		# Process the pileup and extract info for scoring and reporting on each allele
+		logging.info(' Processing SAMtools pileup...')
+		hash_alignment, hash_max_depth, hash_edge_depth, avg_depth_allele, coverage_allele, \
+				mismatch_allele, indel_allele, missing_allele, size_allele, next_to_del_depth_allele= \
+				read_pileup_data(pileup_file, size, args.prob_err)
+
+		# Generate scores for all alleles (prints these and associated info if verbose)
+		#   result = dict, with key=allele, value=score
+		logging.info(' Scoring alleles...')
+		scores, mix_rates = score_alleles(args, mapping_files_pre, hash_alignment, hash_max_depth, hash_edge_depth, \
+				avg_depth_allele, coverage_allele, mismatch_allele, indel_allele, missing_allele, \
+				size_allele, next_to_del_depth_allele, run_type)
+
+	# GET BEST SCORE for each gene/cluster
+	#  result = dict, with key = gene, value = (allele,diffs,depth_problem)
+	#			for MLST DBs, key = gene = locus, allele = gene-number
+	#           for gene DBs, key = gene = cluster ID, allele = cluster__gene__allele__id
+	#  for gene DBs, only those alleles passing the coverage cutoff are returned
+
+	allele_scores = parse_scores(run_type, args, scores, \
+			hash_edge_depth, avg_depth_allele, coverage_allele, mismatch_allele,  \
+			indel_allele, missing_allele, size_allele, next_to_del_depth_allele,
+			unique_gene_symbols, unique_allele_symbols, pileup_file)
+
+	# REPORT/RECORD RESULTS
+
+	# Report MLST results to __mlst__ file
+	if run_type == "mlst" and len(allele_scores) > 0:
+
+		# Calculate ST and get info for reporting
+		(st,clean_st,alleles_with_flags,mismatch_flags,uncertainty_flags,mean_depth,max_maf) = \
+				calculate_ST(allele_scores, ST_db, gene_names, sample_name, args.mlst_delimiter, avg_depth_allele, mix_rates)
+
+		# Print to MLST report, log and save the result
+		st_result_string = "\t".join([sample_name,st]+alleles_with_flags+[";".join(mismatch_flags),";".join(uncertainty_flags),str(mean_depth),str(max_maf)])
+		db_report.write( st_result_string + "\n")
+		logging.info(" " + st_result_string)
+		results[sample_name] = st_result_string
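+		# the row holds: sample, ST, one column per locus (alleles with flags),
+		# then mismatch flags, uncertainty flags, mean depth and max MAF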
+
+		# Make sure scores are printed if there was uncertainty in the call
+		scores_output_file = mapping_files_pre + '.scores'
+		if uncertainty_flags != ["-"] and not args.save_scores and not os.path.exists(scores_output_file):
+			# print full score set
+			logging.info("Printing all MLST scores to " + scores_output_file)
+			scores_output = file(scores_output_file, 'w')
+			scores_output.write("Allele\tScore\tAvg_depth\tEdge1_depth\tEdge2_depth\tPercent_coverage\tSize\tMismatches\tIndels\tTruncated_bases\tDepthNeighbouringTruncation\tmaxMAF\n")
+			for allele in scores.keys():
+				score = scores[allele]
+				scores_output.write('\t'.join([allele, str(score), str(avg_depth_allele[allele]), \
+					str(hash_edge_depth[allele][0]), str(hash_edge_depth[allele][1]), \
+					str(coverage_allele[allele]), str(size_allele[allele]), str(mismatch_allele[allele]), \
+					str(indel_allele[allele]), str(missing_allele[allele]), str(next_to_del_depth_allele[allele]), str(round(mix_rates[allele],3))]) + '\n')
+			scores_output.close()
+
+	# Record gene results for later processing and optionally print detailed gene results to __fullgenes__ file
+	elif run_type == "genes" and len(allele_scores) > 0:
+		if args.no_gene_details:
+			full_results = "__".join([args.output,"full"+run_type,db_name,"results.txt"])
+			logging.info("Printing verbose gene detection results to " + full_results)
+			f = file(full_results,"w")
+			f.write("\t".join(["Sample","DB","gene","allele","coverage","depth","diffs","uncertainty","divergence","length", "maxMAF","clusterid","seqid","annotation"])+"\n")
+		for gene in allele_scores:
+			(allele,diffs,depth_problem,divergence) = allele_scores[gene] # gene = top scoring alleles for each cluster
+			gene_name, allele_name, cluster_id, seqid = \
+				get_allele_name_from_db(allele,unique_allele_symbols,unique_gene_symbols,run_type,args)
+
+			# store for gene result table only if divergence passes minimum threshold:
+			if divergence*100 <= float(args.max_divergence):
+				column_header = cluster_symbols[cluster_id]
+				results[sample_name][column_header] = allele_name
+				if diffs != "":
+					results[sample_name][column_header] += "*"
+				if depth_problem != "":
+					results[sample_name][column_header] += "?"
+				if column_header not in gene_list:
+					gene_list.append(column_header)
+
+			# write details to full genes report
+			if args.no_gene_details:
+
+				# get annotation info
+				header_string = os.popen(" ".join(["grep",allele,fasta]))
+				try:
+					header = header_string.read().rstrip().split()
+					header.pop(0) # remove allele name
+					if len(header) > 0:
+						annotation = " ".join(header) # put back the spaces
+					else:
+						annotation = ""
+
+				except:
+					annotation = ""
+
+				f.write("\t".join([sample_name,db_name,gene_name,allele_name,str(round(coverage_allele[allele],3)),str(avg_depth_allele[allele]),diffs,depth_problem,str(round(divergence*100,3)),str(size_allele[allele]),str(round(mix_rates[allele],3)),cluster_id,seqid,annotation])+"\n")
+
+		# log the gene detection result
+		logging.info(" " + str(len(allele_scores)) + " genes identified in " + sample_name)
+
+	# Finished with this read set
+	logging.info(' Finished processing for read set {} ...'.format(sample_name))
+
+	return gene_list, results
+
+def compile_results(args,mlst_results,db_results,compiled_output_file):
+
+	o = file(compiled_output_file,"w")
+
+	# get list of all samples and genes present in these datasets
+	sample_list = [] # each entry is a sample present in at least one db
+	gene_list = []
+	mlst_cols = 0
+	mlst_header_string = ""
+	blank_mlst_section = ""
+
+	mlst_results_master = {} # compilation of all MLST results
+	db_results_master = collections.defaultdict(dict) # compilation of all gene results
+	st_counts = {} # key = ST, value = count
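+	# compiled layout: one row per sample, with the MLST columns first (when
+	# present), followed by one column per gene cluster seen in any gene DB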
+
+	if len(mlst_results) > 0:
+
+		for mlst_result in mlst_results:
+
+			# check length of the mlst string
+			if "Sample" in mlst_result:
+				test_string = mlst_result["Sample"]
+				if mlst_cols == 0:
+					mlst_header_string = test_string
+			else:
+				test_string = mlst_result[mlst_result.keys()[0]] # no header line; use the first entry
+			test_string_split = test_string.split("\t")
+			this_mlst_cols = len(test_string_split)
+
+			if (mlst_cols == 0) or (mlst_cols == this_mlst_cols):
+				mlst_cols = this_mlst_cols
+				blank_mlst_section = "\t" * (mlst_cols-1) # blank MLST string in case some samples missing
+				# use this data
+				for sample in mlst_result:
+					mlst_results_master[sample] = mlst_result[sample]
+					if sample not in sample_list:
+						sample_list.append(sample)
+			else:
+				# column counts disagree; don't process this data further
+				logging.info("Problem reconciling MLST data from two files: the first MLST results encountered had " + str(mlst_cols) + " columns, but this one has " + str(this_mlst_cols) + " columns")
+				if args.mlst_db:
+					logging.info("Compiled report will contain only the MLST data from this run, not previous outputs")
+				else:
+					logging.info("Compiled report will contain only the data from the first MLST result set provided")
+
+	if len(db_results) > 0:
+		for results in db_results:
+			for sample in results:
+				if sample not in sample_list:
+					sample_list.append(sample)
+				for gene in results[sample]:
+					if gene != "failed":
+						db_results_master[sample][gene] = results[sample][gene]
+						if gene not in gene_list:
+							gene_list.append(gene)
+
+	if "Sample" in sample_list:
+		sample_list.remove("Sample")
+	sample_list.sort()
+	gene_list.sort()
+
+	# print header
+	header_elements = []
+	if len(mlst_results) > 0:
+		header_elements.append(mlst_header_string)
+	else:
+		header_elements.append("Sample")
+	if len(gene_list) > 0:
+		header_elements += gene_list
+	o.write("\t".join(header_elements)+"\n")
+
+	# print results for all samples
+	for sample in sample_list:
+
+		sample_info = [] # first entry is mlst string OR sample name, rest are genes
+
+		# print mlst if provided, otherwise just print sample name
+		if len(mlst_results_master) > 0:
+			if sample in mlst_results_master:
+				sample_info.append(mlst_results_master[sample])
+				this_st = mlst_results_master[sample].split("\t")[1]
+			else:
+				sample_info.append(sample+blank_mlst_section)
+				this_st = "unknown"
+			# record the MLST result
+			if this_st in st_counts:
+				st_counts[this_st] += 1
+			else:
+				st_counts[this_st] = 1
+		else:
+			sample_info.append(sample)
+
+		# get gene info if provided
+		if sample in db_results_master:
+			for gene in gene_list:
+				if gene in db_results_master[sample]:
+					sample_info.append(db_results_master[sample][gene])
+				else:
+					sample_info.append("-")
+		else:
+			for gene in gene_list:
+				sample_info.append("?") # record no gene data on this strain
+
+		o.write("\t".join(sample_info)+"\n")
+
+	o.close()
+
+	logging.info("Compiled data on " + str(len(sample_list)) + " samples printed to: " + compiled_output_file)
+
+	# log ST counts
+	if len(mlst_results_master) > 0:
+		logging.info("Detected " + str(len(st_counts.keys())) + " STs: ")
+		sts = st_counts.keys()
+		sts.sort()
+		for st in sts:
+			logging.info("ST" + st + "\t" + str(st_counts[st]))
+
+	return True
+
+
+def main():
+	args = parse_args()
+	if args.log is True:
+		logfile = args.output + ".log"
+	else:
+		logfile = None
+	logging.basicConfig(
+		filename=logfile,
+		level=logging.DEBUG,
+		filemode='w',
+		format='%(asctime)s %(message)s',
+		datefmt='%m/%d/%Y %H:%M:%S')
+	logging.info('program started')
+	logging.info('command line: {0}'.format(' '.join(sys.argv)))
+
+	# Delete the consensus file if it already exists (so later functions can append to it)
+	if args.report_new_consensus or args.report_all_consensus:
+		new_alleles_filename = args.output + ".consensus_alleles.fasta"
+		if os.path.exists(new_alleles_filename):
+			os.remove(new_alleles_filename)
+
+	# vars to store results
+	mlst_results_hashes = [] # one dict (sample -> MLST result string) per MLST output file created/read
+	gene_result_hashes = [] # one dict (sample -> gene -> result) per gene typing output file created/read
+
+	# parse list of file sets to analyse
+	fileSets = read_file_sets(args) # get list of files to process
+
+	# run MLST scoring
+	if fileSets and args.mlst_db:
+
+		if not args.mlst_definitions:
+
+			# print warning to screen to alert user, may want to stop and restart
+			print "Warning, MLST allele sequences were provided without ST definitions:"
+			print " allele sequences: " + str(args.mlst_db)
+			print " these will be mapped and scored, but STs cannot be calculated"
+
+			# log
+			logging.info("Warning, MLST allele sequences were provided without ST definitions:")
+			logging.info(" allele sequences: " + str(args.mlst_db))
+			logging.info(" these will be mapped and scored, but STs cannot be calculated")
+
+		bowtie_index(args.mlst_db) # index the MLST database
+
+		# score file sets against MLST database
+		mlst_report, mlst_results = run_srst2(args,fileSets,args.mlst_db,"mlst")
+
+		logging.info('MLST output printed to ' + mlst_report[0])
+
+		#mlst_reports_files += mlst_report
+		mlst_results_hashes += mlst_results
+
+	# run gene detection
+	if fileSets and args.gene_db:
+
+		bowtie_index(args.gene_db) # index the gene databases
+
+		db_reports, db_results = run_srst2(args,fileSets,args.gene_db,"genes")
+
+		for outfile in db_reports:
+			logging.info('Gene detection output printed to ' + outfile)
+
+		gene_result_hashes += db_results
+
+	# process prior results files
+	if args.prev_output:
+
+		unique_results_files = list(OrderedDict.fromkeys(args.prev_output))
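+		# OrderedDict.fromkeys de-duplicates the list while preserving input order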
+
+		for results_file in unique_results_files:
+
+			results, dbtype, dbname = read_results_from_file(results_file)
+
+			if dbtype == "mlst":
+				mlst_results_hashes.append(results)
+
+			elif dbtype == "genes":
+				gene_result_hashes.append(results)
+
+			elif dbtype == "compiled":
+				# store mlst in its own db
+				mlst_results = {}
+				for sample in results:
+					if "mlst" in results[sample]:
+						mlst_results[sample] = results[sample]["mlst"]
+						del results[sample]["mlst"]
+				mlst_results_hashes.append(mlst_results)
+				gene_result_hashes.append(results)
+
+	# compile results if multiple databases or datasets provided
+	if ( (len(gene_result_hashes) + len(mlst_results_hashes)) > 1 ):
+		compiled_output_file = args.output + "__compiledResults.txt"
+		compile_results(args,mlst_results_hashes,gene_result_hashes,compiled_output_file)
+
+	elif args.prev_output:
+		logging.info('One previous output file was provided, but there is no other data to compile with.')
+
+	logging.info('SRST2 has finished.')
+
+
+if __name__ == '__main__':
+	main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srst2.xml	Mon Feb 06 12:31:04 2017 -0500
@@ -0,0 +1,378 @@
+<tool id="srst2" name="SRST2" version="0.3.6">
+    <description>Short Read Sequence Typing for Bacterial Pathogens</description>
+    <requirements>
+        <requirement type="package" version="0.1.18">samtools</requirement>
+        <requirement type="package" version="2.1.0">bowtie2</requirement>
+        <requirement type="package" version="0.1.4.6">srst2</requirement>
+        <requirement type="package" version="08-07-2014">vfdb</requirement>
+    </requirements>
+    <stdio>
+      <exit_code range="1:" level="fatal" description="Unknown error has occurred"/>
+    </stdio>
+    <command interpreter="perl">
+        srst2.pl \$BASE/srst2.py $bam_results $scores $pileup
+
+        #if $mlst_or_genedb.job_type == "mlst_only"
+            m $txt_results $alleles
+            #if ($mlst_or_genedb.allele_choice.allele_report=="all")
+                all
+            #else if ($mlst_or_genedb.allele_choice.allele_report=="new")
+                new
+            #end if
+        #else if $mlst_or_genedb.job_type == "custom_only"
+            g $genes_results $fullgenes_results
+    #*
+    To allow multiple custom databases, collect the database names into an array,
+    join them into one comma-separated value, and pass that to the Perl wrapper,
+    which splits it back apart.
+    *#
+            #set $dbs = ','.join([$database.gene_db.name for $database in ( $mlst_or_genedb.databases )])
+            "$dbs"
+        #else if $mlst_or_genedb.job_type == "vfdb_only"
+            g $genes_results $fullgenes_results $mlst_or_genedb.vfdb_in.fields.name
+        #else if $mlst_or_genedb.job_type == "mlst_custom"
+            b $txt_results $genes_results $fullgenes_results
+            #set $dbs = ','.join([$database.gene_db.name for $database in ( $mlst_or_genedb.databases )])
+            "$dbs"
+        #else if $mlst_or_genedb.job_type == "mlst_vfdb"
+            b $txt_results $genes_results $fullgenes_results $mlst_or_genedb.vfdb_in.fields.name
+        #end if
+
+        #if $single_or_paired.type == "single"
+            "$single_or_paired.input_se.element_identifier"
+            --input_se "$single_or_paired.input_se"
+        #elif $single_or_paired.type == "paired"
+            "$single_or_paired.forward_pe.name"
+            --input_pe "$single_or_paired.forward_pe" "$single_or_paired.reverse_pe"
+        #else
+            "$single_or_paired.fastq_collection.forward.name"
+            --input_pe "$single_or_paired.fastq_collection.forward" "$single_or_paired.fastq_collection.reverse"
+        #end if
+
+        #if ($mlst_or_genedb.job_type=="mlst_only")
+            --mlst_db $mlst_or_genedb.mlst_db
+            --mlst_definitions $mlst_or_genedb.mlst_defs
+            --mlst_delimiter "'$mlst_or_genedb.mlst_delim'"
+            --mlst_max_mismatch $mlst_or_genedb.mlst_max_mismatch
+            --report_all_consensus
+        #else if ($mlst_or_genedb.job_type=="mlst_vfdb")
+            --mlst_db $mlst_or_genedb.mlst_db
+            --mlst_definitions $mlst_or_genedb.mlst_defs
+            --mlst_delimiter "'$mlst_or_genedb.mlst_delim'"
+            --mlst_max_mismatch $mlst_or_genedb.mlst_max_mismatch
+            --gene_max_mismatch $mlst_or_genedb.gene_max_mismatch
+            --gene_db \$VF_PATH/${mlst_or_genedb.vfdb_in.fields.path}
+        #else if ($mlst_or_genedb.job_type=="mlst_custom")
+            --gene_db
+            #for $i, $database in enumerate( $mlst_or_genedb.databases )
+                $database.gene_db
+            #end for
+            --mlst_db $mlst_or_genedb.mlst_db
+            --mlst_delimiter "'$mlst_or_genedb.mlst_delim'"
+            --mlst_max_mismatch $mlst_or_genedb.mlst_max_mismatch
+            --gene_max_mismatch $mlst_or_genedb.gene_max_mismatch
+            --mlst_definitions $mlst_or_genedb.mlst_defs
+        #else if ($mlst_or_genedb.job_type=="vfdb_only")
+            --gene_db \$VF_PATH/${mlst_or_genedb.vfdb_in.fields.path}
+            --gene_max_mismatch $mlst_or_genedb.gene_max_mismatch
+        #else if ($mlst_or_genedb.job_type=="custom_only")
+            --gene_db
+            #for $i, $database in enumerate( $mlst_or_genedb.databases )
+                $database.gene_db
+            #end for
+            --gene_max_mismatch $mlst_or_genedb.gene_max_mismatch
+        #end if
+
+        --read_type q
+
+        --save_scores
+
+        #if $options.select == "advanced"
+            #if $options.min_coverage
+                --min_coverage $options.min_coverage
+            #end if
+            #if $options.max_divergence
+                --max_divergence $options.max_divergence
+            #end if
+            #if $options.min_depth
+                --min_depth $options.min_depth
+            #end if
+            #if $options.min_edge_depth
+                --min_edge_depth $options.min_edge_depth
+            #end if
+            #if $options.prob_err
+                --prob_err $options.prob_err
+            #end if
+            #if $options.stop_after
+                --stop_after $options.stop_after
+            #end if
+                --other "'-p \${GALAXY_SLOTS:-1}
+            #if $options.maxins
+                --maxins $options.maxins
+                --minins $options.minins
+            #end if
+                '"
+            #if $options.mapq
+                --mapq $options.mapq
+            #end if
+            #if $options.baseq
+                --baseq $options.baseq
+            #end if
+        #else
+            --other "'-p \${GALAXY_SLOTS:-1}'"
+        #end if
+
+        --output out
+    </command>
+    <inputs>
+        <conditional name="single_or_paired">
+            <param name="type" type="select" label="Read type">
+                <option value="single">Single-end</option>
+                <option value="paired">Paired-end</option>
+                <option value="collection">Collection Paired-end</option>
+            </param>
+            <when value="single">
+                <param name="input_se" type="data" format="fastqsanger" label="Single end read file(s)"/>
+            </when>
+            <when value="paired">
+                <param name="forward_pe" type="data" format="fastqsanger" label="Forward paired-end read file"/>
+                <param name="reverse_pe" type="data" format="fastqsanger" label="Reverse paired-end read file"/>
+            </when>
+            <when value="collection">
+                <param name="fastq_collection" type="data_collection" label="Paired-end reads collection" optional="false" format="fastqsanger" collection_type="paired" />
+            </when>
+        </conditional>
+
+        <conditional name="mlst_or_genedb">
+            <param name="job_type" type="select" label="Job type">
+                <option value="mlst_only">MLST only</option>
+                <option value="mlst_vfdb">MLST and VFDB</option>
+                <option value="mlst_custom">MLST and custom database</option>
+                <option value="vfdb_only">VFDB only</option>
+                <option value="custom_only">Custom database only</option>
+            </param>
+            <when value="mlst_only">
+                <param name="mlst_defs" type="data" format="tabular" label="ST definitions for MLST scheme"/>
+                <param name="mlst_db" type="data" format="fasta" label="Fasta file of MLST alleles"/>
+                <param name="mlst_max_mismatch" type="integer" label="Maximum number of mismatches per read for MLST allele calling" value="" help="SRST2's default value is 10; however, our testing indicates that it should be set to 250 to prevent erroneous allele calls."/>
+                <conditional name="allele_choice">
+                    <param name="allele_report" type="select" label="Reported Alleles" >
+                        <option value="all">All</option>
+                        <option value="new">Only New</option>
+                    </param>
+                    <when value="all"/>
+                    <when value="new"/>
+                </conditional>
+                <param name="mlst_delim" type="text" label="Character(s) separating gene name from allele number in MLST database" value="" help="Typically _ or -" optional="false" >
+                    <validator type="expression" message="Must enter a delimiter.">len(value) >= 1</validator>
+                </param>
+            </when>
+            <when value="mlst_vfdb">
+                <param name="mlst_defs" type="data" format="tabular"  label="ST definitions for MLST scheme"/>
+                <param name="mlst_db" type="data" format="fasta" label="Fasta file of MLST alleles"/>
+                <param name="vfdb_in" type="select" label="Choose a VFDB strain">
+                <options from_data_table="vfdb_fasta_files" />
+                </param>
+                <param name="mlst_max_mismatch" type="integer" label="Maximum number of mismatches per read for MLST allele calling" value="" help="SRST2's default value is 10; however, our testing indicates that it should be set to 250 to prevent erroneous allele calls."/>
+                <param name="gene_max_mismatch" type="integer" label="Maximum number of mismatches per read for gene allele calling" value="" help="SRST2's default value is 10; however, our testing indicates that it should be set to 250 to prevent erroneous allele calls."/>
+                <param name="mlst_delim" type="text" label="Character(s) separating gene name from allele number in MLST database" value="" help="Typically _ or -" optional="false" >
+                    <validator type="expression" message="Must enter a delimiter.">len(value) >= 1</validator>
+                </param>
+            </when>
+            <when value="mlst_custom">
+                <param name="mlst_defs" type="data" format="tabular" label="ST definitions for MLST scheme"/>
+                <param name="mlst_db" type="data" format="fasta" label="Fasta file of MLST alleles"/>
+                <repeat name="databases" title="Databases" min="1">
+                    <param name="gene_db" type="data" format="fasta" label="Fasta file for gene database" />
+                </repeat>
+                <param name="mlst_max_mismatch" type="integer" label="Maximum number of mismatches per read for MLST allele calling" value="" help="SRST2's default value is 10; however, our testing indicates that it should be set to 250 to prevent erroneous allele calls."/>
+                <param name="gene_max_mismatch" type="integer" label="Maximum number of mismatches per read for gene allele calling" value="" help="SRST2's default value is 10; however, our testing indicates that it should be set to 250 to prevent erroneous allele calls."/>
+                <param name="mlst_delim" type="text" label="Character(s) separating gene name from allele number in MLST database" value="" help="Typically _ or -" optional="false" >
+                    <validator type="expression" message="Must enter a delimiter.">len(value) >= 1</validator>
+                </param>
+            </when>
+            <when value="vfdb_only">
+                <param name="vfdb_in" type="select" label="Choose a VFDB strain">
+                    <options from_data_table="vfdb_fasta_files" >
+                        <filter type="sort_by" column="2" />
+                        <validator type="no_options" message="No strains are available" />
+                    </options>
+                </param>
+                <param name="gene_max_mismatch" type="integer" label="Maximum number of mismatches per read for gene allele calling" value="" help="SRST2's default value is 10; however, our testing indicates that it should be set to 250 to prevent erroneous allele calls."/>
+            </when>
+            <when value="custom_only">
+                <param name="gene_max_mismatch" type="integer" label="Maximum number of mismatches per read for gene allele calling" value="" help="SRST2's default value is 10; however, our testing indicates that it should be set to 250 to prevent erroneous allele calls."/>
+                <repeat name="databases" title="Databases" min="1">
+                    <param name="gene_db" type="data" format="fasta" label="Fasta file for gene database" />
+                </repeat>
+            </when>
+        </conditional>
+        <conditional name="options">
+            <param name="select" type="select" label="Options Type">
+                <option value="basic">Basic</option>
+                <option value="advanced">Advanced</option>
+            </param>
+            <when value="advanced">
+                <param name="min_coverage" type="integer" label="Minimum %coverage cutoff for gene reporting" value="90"/>
+                <param name="max_divergence" type="integer" label="Maximum %divergence cutoff for gene reporting" value="10"/>
+                <param name="min_depth" type="integer" label="Minimum mean depth to flag as dubious allele call" value="5"/>
+                <param name="min_edge_depth" type="integer" label="Minimum edge depth to flag as dubious allele call" value="2"/>
+                <param name="prob_err" type="float" label="Probability of sequencing error" value="0.01"/>
+                <param name="stop_after" type="integer" label="Stop mapping after this number of reads have been mapped (otherwise map all)" optional="true"/>
+                <param name="mapq" type="integer" label="Samtools -q parameter" value="1"/>
+                <param name="baseq" type="integer" label="Samtools -Q parameter" value="20"/>
+                <param name="minins" type="integer" label="Bowtie 2 -I parameter. The minimum fragment length for valid paired-end alignments." value="0" >
+                     <validator type="in_range" message="Must be less than -X parameter." min="0"/>
+                </param>
+                <param name="maxins" type="integer" label="Bowtie 2 -X parameter. The maximum fragment length for valid paired-end alignments." value="1000" >
+                     <validator type="in_range" message="Must be greater than -I parameter." min="0"/>
+                </param>
+
+            </when>
+            <when value="basic"/>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <data format="bam" name="bam_results" label="Bam Results"/>
+        <data format="tabular" name="scores" label="Scores"/>
+        <data format="tabular" name="pileup" label="Pileup"/>
+        <data format="fasta" name="alleles" label="Alleles">
+            <filter>mlst_or_genedb['job_type']=="mlst_only"</filter>
+        </data>
+        <data format="tabular" name="txt_results" label="Text Results" >
+            <filter>mlst_or_genedb['job_type']!="vfdb_only"</filter>
+            <filter>mlst_or_genedb['job_type']!="custom_only"</filter>
+        </data>
+        <data format="tabular" name="genes_results" label="Genes Results" >
+            <filter>mlst_or_genedb['job_type']!="mlst_only"</filter>
+        </data>
+        <data format="tabular" name="fullgenes_results" label="Full Genes Results" >
+            <filter>mlst_or_genedb['job_type']!= "mlst_only"</filter>
+        </data>
+    </outputs>
+
+    <tests>
+      <test>
+        <output/>
+      </test>
+    </tests>
+
+
+    <help>
+What it does
+============
+
+Short Read Sequence Typing for Bacterial Pathogens
+
+This program is designed to take Illumina sequence data, an MLST database and/or a database of gene sequences (e.g. resistance genes, virulence genes, etc.) and report the presence of STs and/or reference genes. The tool includes a database of virulence factors extracted from http://www.mgc.ac.cn/VFs/.
+
+For more information about SRST2 and for instructions on how to format custom databases, visit https://github.com/katholt/srst2
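+
+As a quick orientation for custom databases: the SRST2 README describes gene database fasta headers of the form shown below, which is how SRST2 groups alleles into clusters (the second line is an illustrative example, not a real entry)::
+
+    >[clusterUniqueIdentifier]__[clusterSymbol]__[alleleSymbol]__[alleleUniqueIdentifier] [annotation]
+    >344__blaOXA__blaOXA-181__1 blaOXA-181; class D beta-lactamase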
+
+
+Usage
+=====
+
+Basic Options
+-------------
+
+**Read Type**
+   - Single-end: Single-end read file(s) to analyse (--input_se)
+   - Paired-end: Paired-end read file(s) to analyse (--input_pe)
+
+**Job Type**
+    - MLST only: Reports Sequence Types
+    - MLST and VFDB: Reports Sequence Types; the user can choose one of the built-in Virulence Factor Database (VFDB) strains
+    - MLST and custom database: Reports Sequence Types; the user can upload their own custom database
+    - VFDB only: The user can choose one of the built-in Virulence Factor Database (VFDB) strains
+    - Custom database only: The user can upload their own custom database
+
+**ST definitions for MLST scheme:**
+    - Required if you want to calculate STs (--mlst_definitions)
+
+**Fasta file of MLST alleles:**
+    - Required if you want to calculate STs (--mlst_db)
+
+**Fasta file for gene database:**
+    - Required if you want details of the sequences. The user must provide their own database (--gene_db)
+
+**VFDB strain:**
+    - Required if you want details of the sequences. The user may choose one of the listed strains (--gene_db)
+
+**Read file type:**
+    - fastq
+    - solexa
+    - fasta
+
+**Character(s) separating gene name from allele number in MLST database:**
+    - Required for all MLST job types
+    - Typically either _ or -
+    - The output from getMLST will identify the delimiter.
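+    - For example, with the delimiter "_", an allele named "adk_1" is read as locus "adk", allele number "1"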
+
+**Maximum number of mismatches per read for MLST allele calling:**
+    - Required for all MLST job types
+    - For MLST schemes with insertions this number should be set to a high value (recommended: 250)
+
+**Maximum number of mismatches per read for gene allele calling:**
+    - Required for all VFDB or custom database job types
+    - For genes with insertions this number should be set to a high value (recommended: 250).
+
+**Option Type:**
+    - Basic: Includes only the options listed above
+    - Advanced: Includes the options listed below
+
+-------------------------------
+
+Advanced Options
+----------------
+
+**Minimum %coverage cutoff for gene reporting:**
+    - Default is 90 (--min_coverage)
+
+**Maximum %divergence cutoff for gene reporting:**
+    - Default is 10 (--max_divergence)
+
+**Minimum mean depth to flag as dubious allele call:**
+    - Default is 5 (--min_depth)
+
+**Minimum edge depth to flag as dubious allele call:**
+    - Default is 2 (--min_edge_depth)
+
+**Probability of sequencing error:**
+    - Default is 0.01 (--prob_err)
+
+**Stop mapping after this number of reads have been mapped (otherwise map all):**
+    - Default maps all (--stop_after)
+
+**Other arguments to pass to bowtie2:**
+    --other
+
+**Samtools -q parameter:**
+    - Default is 1 (--mapq)
+
+**Samtools -Q parameter:**
+    - Default is 20 (--baseq)
+
+**Bowtie2 -I/--minins:**
+    - The minimum fragment length for valid paired-end alignments. E.g. if -I 60 is specified and a paired-end alignment consists of two 20-bp alignments in the appropriate orientation with a 20-bp gap between them, that alignment is considered valid (as long as -X is also satisfied). A 19-bp gap would not be valid in that case. If trimming options -3 or -5 are also used, the -I constraint is applied with respect to the untrimmed mates.
+    - The larger the difference between -I and -X, the slower Bowtie 2 will run. This is because larger differences between -I and -X require that Bowtie 2 scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), Bowtie 2 is very efficient.
+    - Default: 0 (essentially imposing no minimum)
+
+**Bowtie2 -X/--maxins:**
+    - The maximum fragment length for valid paired-end alignments. E.g. if -X 100 is specified and a paired-end alignment consists of two 20-bp alignments in the proper orientation with a 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). A 61-bp gap would not be valid in that case. If trimming options -3 or -5 are also used, the -X constraint is applied with respect to the untrimmed mates, not the trimmed mates.
+    - The larger the difference between -I and -X, the slower Bowtie 2 will run. This is because larger differences between -I and -X require that Bowtie 2 scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), Bowtie 2 is very efficient.
+    - Default: 1000 in this tool (Bowtie 2's own default is 500).
+
+**Acknowledgments**
+    - Original author: Mariam Iskander
+    - Jen Cabral
+    - Philip Mabon
+    - Mark Iskander
+
+    </help>
+    <citations>
+      <citation type="doi">10.1128/AAC.01310-13</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Mon Feb 06 12:31:04 2017 -0500
@@ -0,0 +1,5 @@
+   <!-- Location of VFDB files -->
+   <table name="vfdb_fasta_files" comment_char="#">
+      <columns>value,dbkey,name,path</columns>
+      <file path="tool-data/vfdb_files.loc" />
+   </table>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Feb 06 12:31:04 2017 -0500
@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="vfdb" version="08-07-2014">
+        <install version="1.0">
+            <actions_group>
+                <actions>
+                    <action sha256sum="eae59df6f7aad698da5e43322c58a28c25dfe91dd656cd2ee12ea330db6aa6bb" type="download_by_url">https://github.com/mariamiskander/VirulenceFactorsDB/archive/master.zip</action>
+                    <action type="make_directory">$INSTALL_DIR/vfdb</action>
+                    <action type="move_directory_files">
+                        <source_directory>vfdb</source_directory>
+                        <destination_directory>$INSTALL_DIR/vfdb</destination_directory>
+                    </action>
+                    <action type="set_environment">
+                        <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR</environment_variable>
+                        <environment_variable action="set_to" name="VF_PATH">$INSTALL_DIR</environment_variable>
+                    </action>
+                    <action type="shell_command">ls -lrt</action>
+                    <action type="shell_command">echo $INSTALL_DIR</action>
+                    <action type="shell_command">echo $PATH</action>
+                </actions>
+            </actions_group>
+        </install>
+    </package>
+    
+    <package name="srst2" version="0.1.4.6">
+    	<repository changeset_revision="79a05e500381" name="package_srst2_0_1_4_6" owner="nml" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+	<package name="samtools" version="0.1.18">
+		<repository changeset_revision="1409782220c9" name="package_samtools_0_1_18" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+	</package>
+	<package name="bowtie2" version="2.1.0">
+		<repository changeset_revision="017a00c265f1" name="package_bowtie2_2_1_0" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" />
+	</package>
+</tool_dependency>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vfdb_files.loc	Mon Feb 06 12:31:04 2017 -0500
@@ -0,0 +1,25 @@
+bacillus	bacillus	Bacillus	/vfdb/Bacillus_VF_clustered.fasta
+bartonella	bartonella	Bartonella	/vfdb/Bartonella_VF_clustered.fasta
+bordetella	bordetella	Bordetella	/vfdb/Bordetella_VF_clustered.fasta
+brucella	brucella	Brucella	/vfdb/Brucella_VF_clustered.fasta
+burkholderia	burkholderia	Burkholderia	/vfdb/Burkholderia_VF_clustered.fasta
+campylobacter	campylobacter	Campylobacter	/vfdb/Campylobacter_VF_clustered.fasta
+chlamydia	chlamydia	Chlamydia	/vfdb/Chlamydia_VF_clustered.fasta
+clostridium	clostridium	Clostridium	/vfdb/Clostridium_VF_clustered.fasta
+corynebacterium	corynebacterium	Corynebacterium	/vfdb/Corynebacterium_VF_clustered.fasta
+enterococcus	enterococcus	Enterococcus	/vfdb/Enterococcus_VF_clustered.fasta
+escherichia	escherichia	Escherichia	/vfdb/Escherichia_VF_clustered.fasta
+haemophilus	haemophilus	Haemophilus	/vfdb/Haemophilus_VF_clustered.fasta
+helicobacter	helicobacter	Helicobacter	/vfdb/Helicobacter_VF_clustered.fasta
+legionella	legionella	Legionella	/vfdb/Legionella_VF_clustered.fasta
+listeria	listeria	Listeria	/vfdb/Listeria_VF_clustered.fasta
+mycobacterium	mycobacterium	Mycobacterium	/vfdb/Mycobacterium_VF_clustered.fasta
+mycoplasma	mycoplasma	Mycoplasma	/vfdb/Mycoplasma_VF_clustered.fasta
+neisseria	neisseria	Neisseria	/vfdb/Neisseria_VF_clustered.fasta
+pseudomonas	pseudomonas	Pseudomonas	/vfdb/Pseudomonas_VF_clustered.fasta
+salmonella	salmonella	Salmonella	/vfdb/Salmonella_VF_clustered.fasta
+shigella	shigella	Shigella	/vfdb/Shigella_VF_clustered.fasta
+staphylococcus	staphylococcus	Staphylococcus	/vfdb/Staphylococcus_VF_clustered.fasta
+streptococcus	streptococcus	Streptococcus	/vfdb/Streptococcus_VF_clustered.fasta
+vibrio	vibrio	Vibrio	/vfdb/Vibrio_VF_clustered.fasta
+yersinia	yersinia	Yersinia	/vfdb/Yersinia_VF_clustered.fasta