changeset 0:50df2d629d51 draft

Uploaded
author hepcat72
date Fri, 26 Aug 2016 16:30:56 -0400
parents
children e3f91eee8c75
files barcode_splitter_multi/._barcode_splitter.py barcode_splitter_multi/._barcode_splitter.xml barcode_splitter_multi/._barcode_splitter_galaxy_wrapper.sh barcode_splitter_multi/barcode_splitter.py barcode_splitter_multi/barcode_splitter.xml barcode_splitter_multi/barcode_splitter_galaxy_wrapper.sh
diffstat 6 files changed, 1062 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file barcode_splitter_multi/._barcode_splitter.py has changed
Binary file barcode_splitter_multi/._barcode_splitter.xml has changed
Binary file barcode_splitter_multi/._barcode_splitter_galaxy_wrapper.sh has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/barcode_splitter_multi/barcode_splitter.py	Fri Aug 26 16:30:56 2016 -0400
@@ -0,0 +1,870 @@
+#!/usr/bin/env python
+""" Split one or more fastq files based on barcode sequence.
+"""
+from __future__ import division
+import gzip
+import argparse
+import os
+import re
+import sys
+from collections import defaultdict
+import subprocess
+
+__version__      = "0.12"
+__author__       = "Lance Parsons & Robert Leach"
+__author_email__ = "lparsons@princeton.edu,rleach@princeton.edu"
+__copyright__    = "Copyright 2011, Lance Parsons & Robert leach"
+__license__      = ("BSD 2-Clause License "
+                    "http://www.opensource.org/licenses/BSD-2-Clause")
+
+UNMATCHED    = 'unmatched'
+MATCHED      = 'matched'
+MULTIMATCHED = 'multimatched'
+ANY          = 'any'       #Used in the N-dimensional dictionary to help
+                           #diagnose which barcode set is matching/not matching
+
+def main (argv=None):
+    if argv is None:
+        argv = sys.argv
+    
+    parser = argparse.ArgumentParser(description=globals()['__doc__'])
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s version ' + globals()['__version__'])
+
+    required_group = parser.add_argument_group("Barcodes")
+    required_group.add_argument('--bcfile', metavar='FILE',
+                                help='REQUIRED: Tab delimited file: '
+                                '"Sample_ID <tab> Barcode_Sequence"')
+    required_group.add_argument ('--idxread', metavar='READNUM', type=int,
+                                 nargs='+', help="REQUIRED: Indicate in "
+                                 "which read file(s) to search for the "
+                                 "corresponding column of barcode sequences, "
+                                 "e.g. if the first column of barcodes is in "
+                                 "the second sequence read file and the "
+                                 "second column's barcodes are in the third "
+                                 "sequence read file, you'd supply "
+                                 "`--idxread 2 3`")
+    required_group.add_argument('--mismatches', default=0, type=int,
+                                help='Number of mismatches allowed in '
+                                'barcode matching')
+    required_group.add_argument('--barcodes_at_end', action='store_true',
+                                default=False, help='Barcodes are at the end '
+                                'of the index read (default is at the '
+                                'beginning)')
+
+    #pairedend_group = parser.add_argument_group("Paired-End")
+    #pairedend_group.add_argument ('--split-paired-ends', action='store_true',
+    #                              default=False, help='In addition to '
+    #                              'splitting based on barcodes, also split '
+    #                              'paired ends.')
+
+    output_group = parser.add_argument_group("Output Options")
+    output_group.add_argument('--prefix', default='',
+                              help='Prefix for output files')
+    output_group.add_argument('--suffix', default=None,
+                              help='Suffix for output files (default based '
+                              'on --format)')
+    output_group.add_argument('--galaxy', action='store_true', default=False,
+                              help='Produce "Galaxy safe" filenames by '
+                              'removing underscores (default: %(default)s)')
+    output_group.add_argument('-v', '--verbose', action='store_true',
+                              default=False, help='verbose output')
+    output_group.add_argument('--gzipout', action='store_true', default=False,
+                              help='Output files in compressed gzip format '
+                              '(default is uncompressed)')
+
+    input_group = parser.add_argument_group("Input format")
+    input_group.add_argument('--format', default='fastq',
+                             help='Specify format for sequence files (fasta '
+                             'or fastq)')
+    input_group.add_argument('--gzipin', action='store_true', default=False,
+                             help='Assume input files are in gzip format, '
+                             'despite file extension (default is auto based '
+                             'on input file extension)')
+
+    seqs_group = parser.add_argument_group("Sequence Inputs")
+    seqs_group.add_argument('fastq_files', metavar='FILE', type=str, nargs='+',
+                            help='A series of 1 or more [optionally zipped] '
+                            'fastq files.')
+
+    #Error-check the values provided from the command line
+    try:
+        options = parser.parse_args()
+	if options.fastq_files is None or options.idxread is None:
+	    parser.error('Sequence files and at least one number indicating '
+                         'the indexed file(s) (--idxread) is required')
+        if len(options.fastq_files) < 1:
+            parser.error('Must specify at least one sequence file')
+        if len(options.fastq_files) < len(options.idxread):
+            parser.error('Must specify at least one sequence file')
+        if not options.bcfile:
+            parser.error('Must specify a barcodes file with "--bcfile" option')
+        if ((min(options.idxread) < 1) or
+            (max(options.idxread) > len(options.fastq_files))):
+            parser.error('Invalid index read number ("--idxread"), must be '
+                         'between 1 and %s (the number of supplied sequence '
+                         'files)' % len(options.fastq_files))
+    except SystemExit: # Prevent exit when calling as function
+        return 2
+
+    #Read barcodes files into n-dimensional dictionary
+    (barcode_dict,approx_bc_dict) = read_barcodes(options.bcfile,
+                                                  len(options.idxread),
+                                                  options.mismatches)
+    barcode_sizes = getBarcodeSizes(barcode_dict)
+    #print(barcode_dict)
+    #print(approx_bc_dict)
+
+    #initialize the read counts for each barcode
+    total_read_count = 0
+    matched_counts   = multi_dimensions(len(options.idxread), int)
+    unmatched_counts = multi_dimensions(len(options.idxread), int)
+    #TODO Verbose: print barcode_dict
+
+    #Determine if we should use gzip for input
+    basename, extension = os.path.splitext(options.fastq_files[0])
+    if extension == '.gz':
+        options.gzipin = True
+
+    #Set the suffix (before determination of gzip output mode)
+    if options.suffix is not None:
+        suffix = options.suffix
+    elif options.format is not None:
+        suffix = '.%s' % options.format
+    elif extension != '.gz':
+        suffix = '.%s' % extension
+    else:
+        suffix = 'fastq'
+
+    #Determine if we should use gzip for output
+    if (options.gzipout is True or
+        (options.suffix is not None and options.suffix == '.gz')):
+        options.gzipout = True
+        if options.suffix is None or suffix != '.gz':
+            suffix = '%s.gz' % suffix
+
+    #Determine whether to use the gzip command line tool or not
+    use_gzip_cmd_line = False
+    if options.gzipin is True or options.gzipout is True:
+        use_gzip_cmd_line = exeExists('gzip')
+
+    #Determine if we need to edit the prefix for galaxy mode
+    prefix = options.prefix
+    if options.galaxy:
+        prefix = options.prefix.replace("_","-")
+        
+    #Open input filehandles for each read
+    #print "Opening the input file handles..."
+    inputs = {}
+    for i in xrange(0,len(options.fastq_files)):
+        if options.gzipin:
+            if(use_gzip_cmd_line is True):
+                inputs[i] = run_gzip_in(options.fastq_files[i])
+            else:
+                inputs[i] = gzip.open(options.fastq_files[i], 'rb')
+        else:
+            inputs[i] = open(options.fastq_files[i], 'rb')
+
+    #Open output filehandles for each barcode set and sequence file
+    #print "Opening the output barcode file handles..."
+    outputs = multi_dimensions((len(options.idxread) + 1), file)
+    openOutfiles(outputs, barcode_dict, prefix, suffix, inputs, options.galaxy,
+                 options.gzipout,use_gzip_cmd_line)
+
+    #Open the file handle for the unmatched file
+    #print "Opening the output un/multi-matched file handles..."
+    unmatchedOutputs    = defaultdict(file)
+    multimatchedOutputs = defaultdict(file)
+    for i in xrange(0,len(inputs)):
+        unmf = '%s%s-read-%s%s' %(prefix, UNMATCHED, i+1, suffix)
+        mmf  = '%s%s-read-%s%s' %(prefix, MULTIMATCHED, i+1, suffix)
+	if options.gzipout:
+            if use_gzip_cmd_line is True:
+                unmatchedOutputs[i] = run_gzip_out(unmf)
+                multimatchedOutputs[i] = run_gzip_out(mmf)
+            else:
+                unmatchedOutputs[i] = gzip.open(unmf,'wb')
+                multimatchedOutputs[i] = gzip.open(mmf, 'wb')
+	else:
+	    unmatchedOutputs[i] = open(unmf, 'wb')
+	    multimatchedOutputs[i] = open(mmf, 'wb')
+
+    readidxs     = [i - 1 for i in options.idxread]
+    prim_readidx = readidxs[0]
+
+    #Debug mode reads the 1st 10000 reads of each file when prefix has "debug"
+    debug_mode = False
+    if re.match('debug', prefix):
+	debug_mode = True
+
+    # For each input line in index read, get index sequence
+    for prim_index_read in read_fastq(inputs[prim_readidx]):
+
+        total_read_count += 1
+
+        #print "Processing read number %s\r" % (total_read_count),
+
+        #For debugging any file
+        if debug_mode and total_read_count > 10000:
+            break
+
+        #Determine the ID format using the first record
+        if total_read_count == 1:
+            id_format = determine_id_format(prim_index_read['seq_id'][1:])
+
+	#Get 1 from each file & assert that their IDs match
+	cur_reads = {}
+	for read_index in xrange(0,len(inputs)):
+	    if(read_index is not prim_readidx):
+		cur_reads[read_index] = read_fastq(inputs[read_index]).next()
+                #If the IDs don't match between all the files, issue a warning
+                try:
+                    assert(match_id(prim_index_read['seq_id'],
+                                    cur_reads[read_index]['seq_id'],id_format))
+                except AssertionError:
+                    sys.stderr.write("WARNING: Index ID mismatch: %s does "
+                                     "not match %s\n"
+                                     %(prim_index_read['seq_id'],
+                                       cur_reads[read_index]['seq_id']))
+	    else:
+		cur_reads[read_index] = prim_index_read
+
+        #Get the index sequences from the indexed reads
+        index_seqs = []
+        if options.barcodes_at_end:
+            index_seqs = [cur_reads[readidxs[i]]['seq'][-barcode_sizes[i]:]
+                          for i in xrange(0,len(readidxs))]
+        else:
+            index_seqs = [cur_reads[readidxs[i]]['seq'][0:barcode_sizes[i]]
+                          for i in xrange(0,len(readidxs))]
+
+        barcode_path = getNDDictVal(approx_bc_dict,index_seqs)
+
+        if(barcode_path is None):
+            cur_outputs = unmatchedOutputs
+            unmatched_path = getBarcodeMatchPath(approx_bc_dict,index_seqs)
+	    incrementNDDictInt(unmatched_counts,unmatched_path)
+        elif(MULTIMATCHED in barcode_path):
+            cur_outputs = multimatchedOutputs
+            unmatched_path = getBarcodeMatchPath(approx_bc_dict,barcode_path)
+	    incrementNDDictInt(unmatched_counts,unmatched_path)
+            sys.stderr.write('WARNING: More than one barcode matches %s, '
+                             'moving to %s category\n'
+                             %(prim_index_read['seq_id'], MULTIMATCHED))
+        else:
+            cur_outputs = getNDDictVal(outputs,barcode_path)
+	    incrementNDDictInt(matched_counts,barcode_path)
+
+        #Write each sequence to the matched or unmatched output handle
+        for readnum in xrange(0,len(inputs)):
+            cur_outputs[readnum].write(fastq_string(cur_reads[readnum]))
+
+    #Report the final matched/unmatched counts in a table to STDOUT
+    printCounts(barcode_dict,matched_counts,unmatched_counts,total_read_count,
+                len(options.idxread))
+
+    return 0
+
+def read_barcodes(filename, num_dims, mismatches):
+    '''Read barcodes file into dictionary'''
+
+    #Declare a variable-dimension dictionary
+    #This was previously a static multi-dimensional dictionary, but it would
+    #not have worked...
+    l = lambda:defaultdict(l)
+    real_bc_dict = l()
+    approx_bc_dict = l()
+    approx_bc_any_dict = l()
+
+    linenum = 0
+    filehandle = open(filename, 'rb')
+    for line in filehandle:
+        
+        linenum += 1
+        line = line.strip()
+        cols = []
+
+        #If the line has something on it and isn't commented
+        if line and line[0] != '#':
+            
+            cols = re.split('\t+',line)
+
+            #If we got the expected number of columns
+            if len(cols) == (num_dims + 1):
+                sample_id         = cols.pop(0)
+                barcode_sequences = cols
+            else:
+                sys.stderr.write("Unable to parse line in barcode file: [%s]: "
+                                 "[%s].  Must contain %s columns (the number "
+                                 "of values supplied to --idxread plus 1 for "
+                                 "the sample ID), but found [%s].  Skipping.\n"
+                                 %(filename, line, (num_dims + 1), len(cols)))
+                continue
+
+            #If the sample ID is not none and the number of defined barcodes is
+            #equal to the number of expected barcodes
+            defined_barcodes = [e for e in barcode_sequences if e is not None]
+            if (sample_id is not None) and len(defined_barcodes) == num_dims:
+
+                #Set the sample ID in the N-Dimensional dictionary
+                setNDDictVal(real_bc_dict, defined_barcodes, sample_id)
+
+                #This creates a dictionary containing all possible barcode
+                #combos, with MULTIMATCHED values pre-computed
+                #print ("Calling setNDApproxDictVal with %s and dict: "
+                #       "[%s]") % (defined_barcodes, approx_bc_dict)
+                setNDApproxDictVal(approx_bc_dict,
+                                   defined_barcodes,
+                                   mismatches)
+
+                #Make sure we can get all the barcodes for each dict dimension
+                dim = 0
+                for barcode in defined_barcodes:
+                    dim += 1
+                    real_bc_dict[ANY][str(dim)][barcode] = ""
+
+                    #This creates a dictionary containing all possible barcode
+                    #combos, with MULTIMATCHED values pre-computed
+                    setNDApproxDictVal(approx_bc_any_dict[ANY][str(dim)],
+                                       [barcode],
+                                       mismatches)
+
+            elif ((sample_id is not None) and
+                  (len(defined_barcodes) > num_dims)):
+                raise Exception("More barcode indexes were found in the "
+                                "barcodes file on line %s: '%s' than were "
+                                "expected: [%s] (the number of values "
+                                "supplied to --idxread)."
+                                %(linenum, line, num_dims))
+
+            else:
+                raise Exception("Unable to parse barcode(s) from line %s: "
+                                "'%s'. Expected a sample ID followed by [%s] "
+                                "tab-delimited barcodes"
+                                %(linenum, line, num_dims))
+
+    if len(real_bc_dict) == 0:
+	raise Exception("Unable to parse any barcodes from barcode file [%s]."
+                        %(filename))
+
+    #print(approx_bc_dict)
+
+    reduceNDDictPaths(approx_bc_dict)
+
+    for dim in real_bc_dict[ANY]:
+        reduceNDDictPaths(approx_bc_any_dict[ANY][dim])
+        approx_bc_dict[ANY][dim] = approx_bc_any_dict[ANY][dim]
+
+    return(real_bc_dict,approx_bc_dict)
+
+def openOutfiles(outputs,barcode_dict,prefix,suffix,inputs,galaxy,gzip_mode,
+                 use_gzip_cmd_line):
+    '''Opens all the output files for all barcode matches (unmatched
+    barcode output files not opened)'''
+    if isinstance(barcode_dict,dict):
+	for barcode in getRealBarcodes(barcode_dict.keys()):
+            openOutfiles(outputs[barcode], barcode_dict[barcode], prefix,
+                         suffix, inputs, galaxy, gzip_mode, use_gzip_cmd_line)
+    else:
+        sample_id = barcode_dict
+        if galaxy:
+            #Replace underscore to allow this to work with Galaxy
+            sample_id = barcode_dict.replace("_","-")
+        for i in xrange(0,len(inputs)):
+            fn = '%s%s-read-%s%s' % (prefix, sample_id, i+1, suffix)
+            if gzip_mode:
+                if use_gzip_cmd_line is True:
+                    outputs[i] = run_gzip_out(fn)
+                else:
+                    outputs[i] = gzip.open(fn, 'wb')
+            else:
+                outputs[i] = open(fn, 'wb')
+
+def getBarcodeSizes(barcode_dict):
+    '''Uses the first key in each level of a dictionary to build a list of key
+    sizes (assuming all keys are the same size)'''
+
+    if (isinstance(barcode_dict,dict) and len(barcode_dict.keys()) > 0 and
+        isinstance(barcode_dict[barcode_dict.keys()[0]],dict) and
+        len(barcode_dict[barcode_dict.keys()[0]].keys()) > 0):
+
+	return([len(barcode_dict.keys()[0])] +
+               getBarcodeSizes(barcode_dict[barcode_dict.keys()[0]]))
+
+    elif isinstance(barcode_dict,dict) and len(barcode_dict.keys()) > 0:
+
+	return([len(barcode_dict.keys()[0])])
+
+    else:
+
+        return(None)
+
+def setNDDictVal(in_dict, keys_list, val):
+    '''Sets the value of an N-Dimensional dictionary to the supplied value
+    using the list of keys'''
+    cur_dict = in_dict
+    for cur_key in keys_list[0:-1]:
+	cur_dict = cur_dict[cur_key]
+    cur_dict[keys_list[-1]] = val
+
+def incrementNDDictInt(in_dict, keys_list):
+    '''Sets the value of an N-Dimensional dictionary to the supplied value
+    using the list of keys'''
+    cur_dict = in_dict
+    for cur_key in keys_list[0:-1]:
+	cur_dict = cur_dict[cur_key]
+    cur_dict[keys_list[-1]] += 1
+
+def getNDDictVal(in_dict, keys_list):
+    '''Gets the value of an N-Dimensional dictionary using the list of keys'''
+    cur_dict = in_dict
+    for cur_key in keys_list:
+        if cur_key in cur_dict:
+            cur_dict = cur_dict[cur_key]
+        else:
+            return None
+    return cur_dict
+
+def multi_dimensions(n, type):
+    """Creates an N-dimensional dictionary containing values at the leaves of
+    type 'type'. E.g. A 2-dimensional dictionary of strs would have 2 levels of
+    keys that hold string values (mydict['key1']['key2'] = 'my string value')
+    """
+
+    #n <= 0 is correct. E.g. if called with 2 (i.e. a 2D dictionary - like a 2D
+    #array/matrix - where array[2][1] = 5 or dict['key1']['key2'] = 'str' are
+    #2-dimensional data structures), the the first type returned is a dict (for
+    #holding 'key1') and the second thing returned is a defaultdict for holding
+    #'key2' and the last thing returned is the type of the data held at the
+    #leaves of the dictionary
+    if n <= 0:
+        return type()
+
+    return defaultdict(lambda:multi_dimensions(n-1, type))
+
+def getRealBarcodes(barcodes):
+    '''Return a list of barcodes that exclude generic "matched", "unmatched",
+    and "any" pseudo-barcodes'''
+    return [bc for bc in barcodes if (bc is not UNMATCHED and bc is not MATCHED
+                                      and bc is not ANY and bc is not
+                                      MULTIMATCHED and bc is not None)]
+
+def matchBarcodes(sequence, barcodes, mismatches):
+    '''Find closest match(es) in barcodes to specified sequence with max number
+    of mismatches'''
+    best_distance = mismatches
+    results = []
+    for barcode in barcodes:
+        if mismatches == 0:
+            if (sequence == barcode):
+                results.append(barcode)
+        else:
+            distance = hamming_distance(sequence, barcode)
+            if (distance <= best_distance):
+                best_distance = distance
+                results.append(barcode)
+    return results
+    
+def hamming_distance(s1, s2):
+    assert len(s1) == len(s2)
+    return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))    
+
+''' Supported types of Fastq IDs '''
+ILLUMINA = 'illumina' # CASVA 1.8+, match up to space
+STRIPONE = 'stripone' # Illumina CASAVA 1.7 and lower (/1 /2) and NCBI SRA
+                      #   (/f /r), match all but last character
+OTHER    = 'other'    # Other, match exactly
+
+def determine_id_format(seq_id):
+    '''Determine if the id is new illumina, old illumina (/1 /2 ...), sanger
+    (/f /r), or other'''
+    
+    id_format = None
+    # Illumina CASAVA 1.8+ fastq headers use new format
+    read_id_regex = re.compile(r'(?P<instrument>[a-zA-Z0-9_-]+):'
+                               '(?P<run_number>[0-9]+):'
+                               '(?P<flowcell_id>[a-zA-Z0-9]+):'
+                               '(?P<lane>[0-9]+):(?P<tile>[0-9]+):'
+                               '(?P<x_pos>[0-9]+):'
+                               '(?P<y_pos>[0-9]+) (?P<read>[0-9]+):'
+                               '(?P<is_filtered>[YN]):'
+                               '(?P<control_number>[0-9]+):'
+                               '(?P<index_sequence>[ACGT]+){0,1}')
+    # Old illumina and sanger reads use /1 /2 /3 or /f /r
+    strip_one_endings = ['/1', '/2', '/3', '/f', '/r']
+    
+    if read_id_regex.match(seq_id):
+        id_format = ILLUMINA
+    elif (seq_id[-2:] in strip_one_endings):
+        id_format = STRIPONE
+    else:
+        id_format = OTHER
+    return id_format
+
+def strip_read_from_id(seq_id, id_format=None):
+    new_id = seq_id
+    if not id_format:
+        id_format = determine_id_format(seq_id)
+    elif id_format == STRIPONE:
+        new_id = seq_id[0:-1]
+    elif id_format == ILLUMINA:
+        new_id = seq_id.split(' ')[0]
+    return new_id
+
+def strip_read_from_id_stripone(seq_id):
+    return seq_id[0:-1]
+
+def strip_read_from_id_illumina(seq_id):
+    return seq_id.split(' ')[0]
+
+def match_id(id1, id2, id_format=OTHER):
+    ''' Return true if IDs match using rules for specified format '''
+    if id_format == STRIPONE:
+        if id1[0:-1] == id2[0:-1]:
+            return True
+        else:
+            return False
+    elif id_format == ILLUMINA:
+        if (id1.split(' ')[0] == id2.split(' ')[0]):
+            return True
+        else:
+            return False
+    elif id1 == id2:
+        return True
+    else:
+        return False
+
+def read_fastq(filehandle):
+    ''' Return dictionary with "seq_id", "seq", "qual_id", and "qual" '''
+    record_line = 0
+    read_number = 0
+    fastq_record = dict()
+    for line in filehandle:
+        record_line += 1
+        if record_line == 1:
+            fastq_record['seq_id'] = line.strip()
+        elif record_line == 2:
+            fastq_record['seq'] = line.strip()
+        elif record_line == 3:
+            fastq_record['qual_id'] = line.strip()
+        elif record_line == 4:
+            record_line = 0
+            fastq_record['qual'] = line.strip()
+            read_number += 1
+            yield fastq_record
+
+def fastq_string(record):
+    return "%s\n%s\n%s\n%s\n" % (record['seq_id'], record['seq'],
+                                 record['qual_id'], record['qual'])
+
+def printCounts(barcode_dict,matched_counts,unmatched_counts,total_read_count,
+                numBarcodes):
+    '''Prints the header of the counts table and calls the 2 recursive
+    functions for printing barcode and unmatched barcode count data'''
+
+    #Print the table header
+    print "Sample\t",
+    for b in xrange(1,numBarcodes + 1):
+	print "Barcode%s\t" %(b),
+    print "Count\tPercent"
+
+    if total_read_count == 0:
+        print "ERROR: Total read count is 0.  No reads were parsed."
+        return()
+
+    #Print the matched counts, sorting the lines
+    for line in sorted(getMatchedCountsTable(barcode_dict,matched_counts,
+                                             total_read_count,[])):
+        print(line),
+
+    #Print the unmatched counts (default sorted by string)
+    printUnmatchedCounts(unmatched_counts,total_read_count,[])
+
+def getMatchedCountsTable(barcode_dict,matched_counts,total_read_count,
+                          barcode_path):
+    '''Recursively builds a "barcode path" and prints a line in the barcode
+    counts table for every leaf in the barcode dictionary'''
+
+    ret_list = []
+    if isinstance(barcode_dict,dict):
+	for barcode in sorted(barcode_dict, key=barcode_dict.get):
+	    if barcode is not ANY:
+		if len(barcode_path) > 0:
+		    new_barcode_path = barcode_path[:]
+		else:
+		    new_barcode_path = []
+		new_barcode_path.append(barcode)
+		ret_list += getMatchedCountsTable(barcode_dict[barcode],
+                                                  matched_counts[barcode],
+                                                  total_read_count,
+                                                  new_barcode_path)
+    else:
+	ret_str = barcode_dict + "\t"
+	for bc in barcode_path:
+	    ret_str += bc + "\t"
+        ret_str += "%s\t%.2f%%\n" % (matched_counts,
+                                     (matched_counts/total_read_count)*100)
+        ret_list += [ret_str]
+
+    return(ret_list)
+
+def printUnmatchedCounts(unmatched_counts,total_read_count,barcode_path):
+    '''Recursively builds a "barcode path" (consisting of "MATCHED" and
+    "UNMATCHED" values) and prints a line in the barcode counts table for every
+    leaf in the unmatched counts dictionary'''
+    if isinstance(unmatched_counts,dict):
+	for barcode in sorted(unmatched_counts, key=unmatched_counts.get):
+	    if len(barcode_path) > 0:
+		new_barcode_path = barcode_path[:]
+	    else:
+		new_barcode_path = []
+	    new_barcode_path.append(barcode)
+	    printUnmatchedCounts(unmatched_counts[barcode],total_read_count,
+                                 new_barcode_path)
+    else:
+        label = UNMATCHED
+        if (UNMATCHED not in barcode_path and MULTIMATCHED in barcode_path):
+            label = MULTIMATCHED
+        print "%s\t" %(label),
+	for bc in barcode_path:
+	    print "%s\t" %(bc),
+	print "%s\t%.2f%%" %(unmatched_counts,
+                             (unmatched_counts/total_read_count)*100)
+
+def getBarcodeMatchPath(in_dict, keys_list):
+    '''Generates a path of pseudo-barcodes (consisting of "MATCHED",
+    "UNMATCHED", and "MULTIMATCHED" values) to use in the summary output.  The
+    intent is to give the user useful feedback about which level(s) of barcodes
+    are not matrching instead of listing all sequences as simply "UNMATCHED".
+    It also reduces complexity of the unmatched output by not including
+    specific barcodes, which do not matter in unmatched/multimatched cases.'''
+    cur_dict = in_dict
+    dim = 0
+    path = []
+    for cur_key in keys_list:
+        dim += 1
+        if cur_key is MATCHED or cur_key is MULTIMATCHED:
+            path.append(cur_key)
+        elif cur_key in cur_dict:
+            path.append(MATCHED)
+        else:
+            path.append(UNMATCHED)
+        cur_dict = in_dict[ANY][str(dim)]
+    return path
+
+def setNDApproxDictVal(in_dict, keys_list, mismatches):
+    '''Creates (or builds upon) an N-Dimensional ("ND") dictionary (in_dict)
+    whose keys are mismatch variants (constructed using the keys_list and the
+    number of mismatches provided).  The values at each level (initially:
+    changed by reduceNDDictPaths) is a 2 member list containing the number of
+    mismatches the key has and the dictionary for the next level.  The values
+    at the end of the ND dictionary are (initially: changed by
+    reduceNDDictPaths) a 2 member list containing the number of mismatches and
+    a list of "original key paths".  (Each path is a list of barcodes.)  The
+    keys_list is a series of barcodes representing a row from the barcodes file
+    (i.e. 1 barcode from each dimension, in order).
+    Note: in_dict must be "l()" where "l" = lambda:defaultdict(l).'''
+    setNDApproxDictValHelper(in_dict, keys_list, mismatches, [])
+
+def setNDApproxDictValHelper(in_dict, keys_list, mismatches, real_path):
+    '''A recursive helper of setNDApproxDictVal which tracks the "real barcode
+    path"  to use in the list of real barcode paths at the end of the
+    dictionary.  Note that barcodes may have a 1:many relationship, indicated
+    by a barcode being repeated in earlier columns of the barcode file.
+    Different paths may also intersect given the number of allowed mismatches.
+    '''
+    l = lambda:defaultdict(l)
+    real_path.append(keys_list[0])
+    if len(keys_list) == 1:
+        #print "Doing last key %s" %(keys_list[0])
+        for bcl in getMismatchedBarcodes(keys_list[0],mismatches):
+            #print "Processing mismatched barcode record: %s" % (bcl)
+            nmm = bcl[0]
+            bc  = bcl[1]
+            if (bc in in_dict and type(in_dict[bc][1]) is list
+                and len(in_dict[bc][1]) > 0
+                and not real_path in in_dict[bc][1]):
+
+                #print ("Adding real path %s to the one existing at barcode "
+                #       "%s: %s whose type should be a list: %s"
+                #       ) % (real_path, bc, in_dict[bc], type(in_dict[bc]))
+
+                #If the number of mismatches is the same for this barcode
+                #variant as it is for the real paths that have already been
+                #added to the real paths list, append this real path
+                if nmm == in_dict[bc][0]:
+                    in_dict[bc][1] += [real_path]
+                #Else if the number of mismatches is less for this barcode
+                #variant as it is for the real paths that have already been
+                #added to the real paths list, overwrite the old list
+                elif nmm < in_dict[bc][0]:
+                    in_dict[bc] = [nmm, [real_path]]
+            elif not (bc in in_dict and type(in_dict[bc][1]) is list
+                      and len(in_dict[bc][1]) > 0):
+                #print ("Setting new real path %s to real path list for "
+                #       "barcode %s: %s whose type should be a list: %s"
+                #       ) % (real_path, bc, in_dict[bc], type(in_dict[bc]))
+                in_dict[bc] = [nmm, [real_path]]
+            #Else: skip poorer match
+    else:
+        #print "Doing first key %s" %(keys_list[0])
+        for bcl in getMismatchedBarcodes(keys_list[0],mismatches):
+            nmm = bcl[0]
+            bc  = bcl[1]
+            if bc not in in_dict:
+                in_dict[bc] = [nmm, l()]
+                setNDApproxDictValHelper(in_dict[bc][1],
+                                         keys_list[1:],
+                                         mismatches,
+                                         real_path[:])
+            elif (bc in in_dict and nmm == in_dict[bc][0]):
+                setNDApproxDictValHelper(in_dict[bc][1],
+                                         keys_list[1:],
+                                         mismatches,
+                                         real_path[:])
+            elif (bc in in_dict and nmm < in_dict[bc][0]):
+                #print "Type of in_dict[%s] is: %s Resetting list." % (bc, type(in_dict[bc]))
+                in_dict[bc] = [nmm, l()]
+                #in_dict[bc][0] = nmm
+                #in_dict[bc][1] = l()
+                setNDApproxDictValHelper(in_dict[bc][1],
+                                         keys_list[1:],
+                                         mismatches,
+                                         real_path[:])
+
+def reduceNDDictPaths(in_dict):
+    '''After an approximate barcode dictionary has been fully created by
+    setNDApproxDictVal, this method reduces the lists of barcode paths to
+    individual discrete paths or individual generic MATCHED/MULTIMATCHED paths.
+    '''
+    if isinstance(in_dict,dict) and len(in_dict) > 0:
+        arb_bc = in_dict.iterkeys().next()
+        #print "Checking arbitrary barcode to see what level of the dictionary we're at: %s" % (arb_bc)
+        #print "The keys of this level of the dictionary are: %s" % (in_dict.keys())
+    if (isinstance(in_dict,dict) and len(in_dict) > 0 and
+        isinstance(in_dict[arb_bc][1],dict)):
+        for bc in in_dict:
+            #Reduction (skipping the list...)
+            in_dict[bc] = in_dict[bc][1]
+            reduceNDDictPaths(in_dict[bc])
+    #Else if the value is a list of lists
+    elif (isinstance(in_dict,dict) and len(in_dict) > 0 and
+          isinstance(in_dict[in_dict.keys()[0]][1],list)):
+        #For each last barcode in the dictionary
+        for bc in in_dict:
+            #Reduction, skipping the outer list
+            in_dict[bc] = in_dict[bc][1]
+            #If the value at the leaf of the dict is a list of lists
+            if (len(in_dict[bc]) > 0 and isinstance(in_dict[bc],list) and
+                len(in_dict[bc][0]) > 0 and isinstance(in_dict[bc][0],list)):
+                #If the number of paths is 1
+                if len(in_dict[bc]) == 1:
+                    in_dict[bc] = in_dict[bc][0]
+                else:
+                    inner_len = len(in_dict[bc][0])
+                    ref_path = in_dict[bc][0]
+                    new_path = []
+                    for bci in xrange(0,inner_len):
+                        all_matched = True
+                        for path in in_dict[bc][1:]:
+                            if path[bci] != ref_path[bci]:
+                                all_matched = False
+                                break
+                        if all_matched is True:
+                            new_path.append(MATCHED)
+                        else:
+                            new_path.append(MULTIMATCHED)
+                    in_dict[bc] = new_path
+
+def getMismatchedBarcodes(barcode,mismatches):
+    '''Given a barcode sequence and a number of allowed mismatches, generate a
+    list of barcodes representing all possible matching sequences with the
+    allowed number of mismatches.'''
+    return(getMismatchedBarcodesHelper(barcode,mismatches,[]))
+
+def getMismatchedBarcodesHelper(barcode,mismatches,skips):
+    offbys = [[0, barcode]]
+    if mismatches <= 0:
+        return(offbys)
+    for p in xrange(0,len(barcode)):
+        if p in skips:
+            continue
+        #Keep track of bases that have already been mutated
+        newskips = skips + [p]
+        if barcode[p] != 'N':
+            offbys.append([len(newskips),
+                           barcode[:p] + 'N' + barcode[p+1:]])
+        if barcode[p] != 'A':
+            offbys.append([len(newskips),
+                           barcode[:p] + 'A' + barcode[p+1:]])
+        if barcode[p] != 'T':
+            offbys.append([len(newskips),
+                           barcode[:p] + 'T' + barcode[p+1:]])
+        if barcode[p] != 'G':
+            offbys.append([len(newskips),
+                           barcode[:p] + 'G' + barcode[p+1:]])
+        if barcode[p] != 'C':
+            offbys.append([len(newskips),
+                           barcode[:p] + 'C' + barcode[p+1:]])
+        #Recurse to add next allowed mismatch
+        if mismatches > 1:
+            if barcode[p] != 'N':
+                offbys += getMismatchedBarcodesHelper(barcode[:p] + 'N' +
+                                                      barcode[p+1:],
+                                                      mismatches - 1,
+                                                      newskips)
+            if barcode[p] != 'A':
+                offbys += getMismatchedBarcodesHelper(barcode[:p] + 'A' +
+                                                      barcode[p+1:],
+                                                      mismatches - 1,
+                                                      newskips)
+            if barcode[p] != 'T':
+                offbys += getMismatchedBarcodesHelper(barcode[:p] + 'T' +
+                                                      barcode[p+1:],
+                                                      mismatches - 1,
+                                                      newskips)
+            if barcode[p] != 'G':
+                offbys += getMismatchedBarcodesHelper(barcode[:p] + 'G' +
+                                                      barcode[p+1:],
+                                                      mismatches - 1,
+                                                      newskips)
+            if barcode[p] != 'C':
+                offbys += getMismatchedBarcodesHelper(barcode[:p] + 'C' +
+                                                      barcode[p+1:],
+                                                      mismatches - 1,
+                                                      newskips)
+    return(offbys)
+
+def exeExists(program):
+    '''Determines whether an executable exists (i.e. is in the user\'s path)'''
+    import os
+    def is_exe(fpath):
+        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
+
+    fpath, fname = os.path.split(program)
+    if fpath:
+        if is_exe(program):
+            return True
+    else:
+        for path in os.environ["PATH"].split(os.pathsep):
+            path = path.strip('"')
+            exe_file = os.path.join(path, program)
+            if is_exe(exe_file):
+                return True
+
+    return False
+
+def run_gzip_in(infile):
+    '''Takes a gzipped file and returns an iterator on the unzipped lines of
+    the file using a system call to gzip.'''
+    p = subprocess.Popen(["gzip", "-dc", infile], stdout=subprocess.PIPE,
+                         bufsize=1)
+    return iter(p.stdout.readline, b'')
+
+def run_gzip_out(outfile):
+    p = subprocess.Popen(["gzip"], stdin=subprocess.PIPE,
+                         stdout=open(outfile,'wb'), bufsize=-1)
+    return p.stdin
+
+if __name__ == '__main__':
+    sys.exit(main())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/barcode_splitter_multi/barcode_splitter.xml	Fri Aug 26 16:30:56 2016 -0400
@@ -0,0 +1,133 @@
+<tool id="cshl_princeton_fastx_barcode_splitter" version="0.4" name="Barcode Splitter">
+  <description></description>
+  <command interpreter="bash" detect_errors="aggressive"><![CDATA[
+barcode_splitter_galaxy_wrapper.sh split
+#for $sf in $seqfiles
+    ${sf.input.extension}
+    #break
+#end for
+--bcfile $bcfile --mismatches $mismatches --galaxy $zip $barcodes_at_end
+#for $sf in $seqfiles
+    ${sf.input}
+#end for
+--idxread
+#set $bound = $num_barcode_columns.value + 1
+#for $n in range( 1, $bound )
+    ${n}
+#end for
+> $summary
+]]>
+  </command>
+
+  <inputs>
+    <param format="txt" name="bcfile" type="data" label="Barcode File" help="Tab-delimited text file where the first column is a sample ID and subsequent columns are barcodes." />
+    <param name="num_barcode_columns" type="integer" size="2" value="1" label="Number of barcode columns" help="The number of columns in the barcode file containing barcode sequences.  Note that you must submit at least this many read files." />
+
+
+    <repeat name="seqfiles" title="Read Files" min="1" default="2">
+        <param format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" help="Barcoded reads files must be first. If there are multiple barcode columns in the barcode file, the files must be supplied in the same order as the barcode columns (from left to right)." />
+    </repeat>
+
+    <param name="mismatches" type="integer" size="3" value="0" label="Number of allowed mismatches" />
+
+    <param name="barcodes_at_end" type="boolean" truevalue="--barcodes_at_end" falsevalue="" checked="false"
+        label="Barcodes are at the end of all sequences" help="Default is the beginning of all sequences" />
+
+    <param name="zip" type="boolean" truevalue="--gzip" falsevalue="" checked="false"
+        label="Compress/zip the output" help="This generates reads files with a .gz extension.  Default is based on the file extension of the first input file." />
+
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="summary" label="${tool.name} on ${on_string}: Summary" />
+    <collection name="split_output" type="list" format_source="input" label="${tool.name} on ${on_string}">
+      <discover_datasets pattern="__designation_and_ext__" directory="split" visible="false" label="${designation}"/>
+    </collection>
+  </outputs>
+
+  <tests>
+    <test>
+      <!-- Split a FASTQ file -->
+      <param name="bcfile" value="barcode_splitter1.txt" />
+      <param name="num_barcode_columns" value="1" />
+      <repeat name="seqfiles">
+        <param name="input" value="barcode_splitter1.fastq" ftype="fastqsolexa" />
+      </repeat>
+      <param name="barcodes_at_end" value="" />
+      <param name="mismatches" value="2" />
+      <output name="summary" file="barcode_splitter1.out" />
+      <collection name="output" type="list">
+        <discovered_dataset designation="BC1" ftype="fastqsolexa" file="barcode_splitter1_BC1.out" />
+        <discovered_dataset designation="BC2" ftype="fastqsolexa" file="barcode_splitter1_BC2.out" />
+        <discovered_dataset designation="BC3" ftype="fastqsolexa" file="barcode_splitter1_BC3.out" />
+        <discovered_dataset designation="BC4" ftype="fastqsolexa" file="barcode_splitter1_BC4.out" />
+        <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="barcode_splitter1_unmatched.out" />
+      </collection>
+    </test>
+
+    <test>
+      <!-- Split a FASTQ file, using separate index read -->
+      <param name="bcfile" value="barcode_splitter1.txt" />
+      <param name="num_barcode_columns" value="1" />
+      <repeat name="seqfiles">
+        <param name="input" value="barcode_splitter_index.fastq" ftype="fastqsolexa" />
+      </repeat>
+      <repeat name="seqfiles">
+        <param name="input" value="barcode_splitter1.fastq" ftype="fastqsolexa" />
+      </repeat>
+      <param name="barcodes_at_end" value="" />
+      <param name="mismatches" value="2" />
+      <output name="output" file="barcode_splitter1.out" />
+      <collection name="split_output" type="list">
+        <discovered_dataset designation="BC1" ftype="fastqsolexa" file="barcode_splitter1_BC1.out" />
+        <discovered_dataset designation="BC2" ftype="fastqsolexa" file="barcode_splitter1_BC2.out" />
+        <discovered_dataset designation="BC3" ftype="fastqsolexa" file="barcode_splitter1_BC3.out" />
+        <discovered_dataset designation="BC4" ftype="fastqsolexa" file="barcode_splitter1_BC4.out" />
+        <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="barcode_splitter1_unmatched.out" />
+      </collection>
+    </test>
+  </tests>
+
+    <help><![CDATA[
+**What it does**
+
+This tool splits a FASTQ file into several files, using barcodes as the split criteria.  Barcodes in one file can be used to split multiple sorted files.  Multiple sets of barcodes, each located in a different file, can be used.
+
+--------
+
+**Barcode file Format**
+
+Barcode files are simple text files.
+Each line should contain an identifier (descriptive name for the barcode), and at least 1 barcode, separated by TAB characters. Multiple columns of barcodes are supported (each corresponding to a separate barcoded read file), though there's usually just 1.  An example of the usage of multiple sets of barcodes could be the first set of barcodes can denote user and the second set can be each user's sample barcodes.
+Example::
+
+    #This line is a comment (starts with a 'number' sign)
+    BC1	GATCT	TTGCAT
+    BC2	ATCGT	GCGCAT
+    BC3	GTGAT	AGGTCA
+    BC4	TGTCT	CTTTGG
+
+For each barcode, a new FASTQ file will be created (with the barcodes' identifier as part of the file name).
+Sequences matching the barcodes in a row will be stored in the appropriate file.
+
+The first sequence file submitted must contain sequences with the barcodes in the first column of the barcode file.  The second sequence file must contain sequences with the barcodes in the second column, and so on.  The 'Number of barcode columns' specified must match the number of actual columns in the barcode file.
+
+One (possibly two) additional FASTQ files will be created: the 'unmatched' file (and the 'multimatched' file), where sequences not matching any barcode (or matching more than 1 barcode when mismatches are taken into account) will be stored.
+
+The output of this tool is a summary table displaying the split counts for each barcode identifier and the percentage of the total reads those represent.
+In addition, each fastq file produced will be loaded into the galaxy history as part of a collection list.
+]]>
+  </help>
+
+  <!-- Barcode-Splitter is part of the paired_sequence_utils package, by L.Parsons (lparsons@princeton.edu) and R.Leach (rleach@princeton.edu) -->
+  <citations>
+        <citation type="bibtex">
+      @misc{paired_sequence_utils,
+        title = {{Barcode}-{Splitter}},
+        url = {https://bitbucket.org/hepcat72/paired_sequence_utils},
+        author = "Parsons, Lance and Leach, Robert"
+      }
+    </citation>
+  </citations>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/barcode_splitter_multi/barcode_splitter_galaxy_wrapper.sh	Fri Aug 26 16:30:56 2016 -0400
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+#    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
+#    Copyright (C) 2009  A. Gordon (gordon@cshl.edu)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as
+#   published by the Free Software Foundation, either version 3 of the
+#   License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#    You should have received a copy of the GNU Affero General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#  Modified by Lance Parsons (lparsons@princeton.edu)
+#  2011-03-15  Adapted to allow galaxy to determine filetype
+#  2015-10-21  Updated to make compatible with OSX (BSD sed)
+#  2015-11-13  Removed LIBRARY_NAME, no longer needed
+#  2016-04-28  Output summary as simple tabular output
+
+# This is a shell script wrapper for 'fastx_barcode_splitter.pl'
+#
+# 1. Output files are saved at the dataset's files_path directory.
+
+if [ "$1x" = "x" ]; then
+  echo "Usage: $0 [OUTPUT_PATH] [OUTFILETYPE] [OPTIONS]" >&2
+  exit 1
+fi
+
+OUTPUT_PATH="$1"
+FILETYPE="$2"
+shift 2
+# The rest of the parameters are passed to the split program
+
+if [ "${OUTPUT_PATH}x" = "x" ]; then
+  echo "Usage: $0 [OUTPUT_PATH] [OUTFILETYPE] [OPTIONS]" >&2
+  exit 1
+fi
+
+mkdir -p "$OUTPUT_PATH"
+if [ ! -d "$OUTPUT_PATH" ]; then
+  echo "Error: failed to create output path '$OUTPUT_PATH'" >&2
+  exit 1
+fi
+
+PREFIX="$OUTPUT_PATH/"
+SUFFIX=".$FILETYPE"
+DIRECTORY="$(cd "$(dirname "$0")" && pwd)"
+
+RESULTS=$("$DIRECTORY/barcode_splitter.py" --format "$FILETYPE" --prefix "$PREFIX" --suffix "$SUFFIX" "$@")
+if [ $? != 0 ]; then
+  echo "error"
+fi
+
+echo "$RESULTS"
\ No newline at end of file