# HG changeset patch # User hepcat72 # Date 1472243456 14400 # Node ID 50df2d629d5197f52d4f674bbe4da97df245d632 Uploaded diff -r 000000000000 -r 50df2d629d51 barcode_splitter_multi/._barcode_splitter.py Binary file barcode_splitter_multi/._barcode_splitter.py has changed diff -r 000000000000 -r 50df2d629d51 barcode_splitter_multi/._barcode_splitter.xml Binary file barcode_splitter_multi/._barcode_splitter.xml has changed diff -r 000000000000 -r 50df2d629d51 barcode_splitter_multi/._barcode_splitter_galaxy_wrapper.sh Binary file barcode_splitter_multi/._barcode_splitter_galaxy_wrapper.sh has changed diff -r 000000000000 -r 50df2d629d51 barcode_splitter_multi/barcode_splitter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/barcode_splitter_multi/barcode_splitter.py Fri Aug 26 16:30:56 2016 -0400 @@ -0,0 +1,870 @@ +#!/usr/bin/env python +""" Split one or more fastq files based on barcode sequence. +""" +from __future__ import division +import gzip +import argparse +import os +import re +import sys +from collections import defaultdict +import subprocess + +__version__ = "0.12" +__author__ = "Lance Parsons & Robert Leach" +__author_email__ = "lparsons@princeton.edu,rleach@princeton.edu" +__copyright__ = "Copyright 2011, Lance Parsons & Robert leach" +__license__ = ("BSD 2-Clause License " + "http://www.opensource.org/licenses/BSD-2-Clause") + +UNMATCHED = 'unmatched' +MATCHED = 'matched' +MULTIMATCHED = 'multimatched' +ANY = 'any' #Used in the N-dimensional dictionary to help + #diagnose which barcode set is matching/not matching + +def main (argv=None): + if argv is None: + argv = sys.argv + + parser = argparse.ArgumentParser(description=globals()['__doc__']) + parser.add_argument('--version', action='version', + version='%(prog)s version ' + globals()['__version__']) + + required_group = parser.add_argument_group("Barcodes") + required_group.add_argument('--bcfile', metavar='FILE', + help='REQUIRED: Tab delimited file: ' + '"Sample_ID Barcode_Sequence"') + required_group.add_argument ('--idxread', metavar='READNUM', type=int, + nargs='+', help="REQUIRED: Indicate in " + "which read file(s) to search for the " + "corresponding column of barcode sequences, " + "e.g. if the first column of barcodes is in " + "the second sequence read file and the " + "second column's barcodes are in the third " + "sequence read file, you'd supply " + "`--idxread 2 3`") + required_group.add_argument('--mismatches', default=0, type=int, + help='Number of mismatches allowed in ' + 'barcode matching') + required_group.add_argument('--barcodes_at_end', action='store_true', + default=False, help='Barcodes are at the end ' + 'of the index read (default is at the ' + 'beginning)') + + #pairedend_group = parser.add_argument_group("Paired-End") + #pairedend_group.add_argument ('--split-paired-ends', action='store_true', + # default=False, help='In addition to ' + # 'splitting based on barcodes, also split ' + # 'paired ends.') + + output_group = parser.add_argument_group("Output Options") + output_group.add_argument('--prefix', default='', + help='Prefix for output files') + output_group.add_argument('--suffix', default=None, + help='Suffix for output files (default based ' + 'on --format)') + output_group.add_argument('--galaxy', action='store_true', default=False, + help='Produce "Galaxy safe" filenames by ' + 'removing underscores (default: %(default)s)') + output_group.add_argument('-v', '--verbose', action='store_true', + default=False, help='verbose output') + output_group.add_argument('--gzipout', action='store_true', default=False, + help='Output files in compressed gzip format ' + '(default is uncompressed)') + + input_group = parser.add_argument_group("Input format") + input_group.add_argument('--format', default='fastq', + help='Specify format for sequence files (fasta ' + 'or fastq)') + input_group.add_argument('--gzipin', action='store_true', default=False, + help='Assume input files are in gzip format, ' + 'despite file extension (default is auto based ' + 'on input file extension)') + + seqs_group = parser.add_argument_group("Sequence Inputs") + seqs_group.add_argument('fastq_files', metavar='FILE', type=str, nargs='+', + help='A series of 1 or more [optionally zipped] ' + 'fastq files.') + + #Error-check the values provided from the command line + try: + options = parser.parse_args() + if options.fastq_files is None or options.idxread is None: + parser.error('Sequence files and at least one number indicating ' + 'the indexed file(s) (--idxread) is required') + if len(options.fastq_files) < 1: + parser.error('Must specify at least one sequence file') + if len(options.fastq_files) < len(options.idxread): + parser.error('Must specify at least one sequence file') + if not options.bcfile: + parser.error('Must specify a barcodes file with "--bcfile" option') + if ((min(options.idxread) < 1) or + (max(options.idxread) > len(options.fastq_files))): + parser.error('Invalid index read number ("--idxread"), must be ' + 'between 1 and %s (the number of supplied sequence ' + 'files)' % len(options.fastq_files)) + except SystemExit: # Prevent exit when calling as function + return 2 + + #Read barcodes files into n-dimensional dictionary + (barcode_dict,approx_bc_dict) = read_barcodes(options.bcfile, + len(options.idxread), + options.mismatches) + barcode_sizes = getBarcodeSizes(barcode_dict) + #print(barcode_dict) + #print(approx_bc_dict) + + #initialize the read counts for each barcode + total_read_count = 0 + matched_counts = multi_dimensions(len(options.idxread), int) + unmatched_counts = multi_dimensions(len(options.idxread), int) + #TODO Verbose: print barcode_dict + + #Determine if we should use gzip for input + basename, extension = os.path.splitext(options.fastq_files[0]) + if extension == '.gz': + options.gzipin = True + + #Set the suffix (before determination of gzip output mode) + if options.suffix is not None: + suffix = options.suffix + elif options.format is not None: + suffix = '.%s' % options.format + elif extension != '.gz': + suffix = '.%s' % extension + else: + suffix = 'fastq' + + #Determine if we should use gzip for output + if (options.gzipout is True or + (options.suffix is not None and options.suffix == '.gz')): + options.gzipout = True + if options.suffix is None or suffix != '.gz': + suffix = '%s.gz' % suffix + + #Determine whether to use the gzip command line tool or not + use_gzip_cmd_line = False + if options.gzipin is True or options.gzipout is True: + use_gzip_cmd_line = exeExists('gzip') + + #Determine if we need to edit the prefix for galaxy mode + prefix = options.prefix + if options.galaxy: + prefix = options.prefix.replace("_","-") + + #Open input filehandles for each read + #print "Opening the input file handles..." + inputs = {} + for i in xrange(0,len(options.fastq_files)): + if options.gzipin: + if(use_gzip_cmd_line is True): + inputs[i] = run_gzip_in(options.fastq_files[i]) + else: + inputs[i] = gzip.open(options.fastq_files[i], 'rb') + else: + inputs[i] = open(options.fastq_files[i], 'rb') + + #Open output filehandles for each barcode set and sequence file + #print "Opening the output barcode file handles..." + outputs = multi_dimensions((len(options.idxread) + 1), file) + openOutfiles(outputs, barcode_dict, prefix, suffix, inputs, options.galaxy, + options.gzipout,use_gzip_cmd_line) + + #Open the file handle for the unmatched file + #print "Opening the output un/multi-matched file handles..." + unmatchedOutputs = defaultdict(file) + multimatchedOutputs = defaultdict(file) + for i in xrange(0,len(inputs)): + unmf = '%s%s-read-%s%s' %(prefix, UNMATCHED, i+1, suffix) + mmf = '%s%s-read-%s%s' %(prefix, MULTIMATCHED, i+1, suffix) + if options.gzipout: + if use_gzip_cmd_line is True: + unmatchedOutputs[i] = run_gzip_out(unmf) + multimatchedOutputs[i] = run_gzip_out(mmf) + else: + unmatchedOutputs[i] = gzip.open(unmf,'wb') + multimatchedOutputs[i] = gzip.open(mmf, 'wb') + else: + unmatchedOutputs[i] = open(unmf, 'wb') + multimatchedOutputs[i] = open(mmf, 'wb') + + readidxs = [i - 1 for i in options.idxread] + prim_readidx = readidxs[0] + + #Debug mode reads the 1st 10000 reads of each file when prefix has "debug" + debug_mode = False + if re.match('debug', prefix): + debug_mode = True + + # For each input line in index read, get index sequence + for prim_index_read in read_fastq(inputs[prim_readidx]): + + total_read_count += 1 + + #print "Processing read number %s\r" % (total_read_count), + + #For debugging any file + if debug_mode and total_read_count > 10000: + break + + #Determine the ID format using the first record + if total_read_count == 1: + id_format = determine_id_format(prim_index_read['seq_id'][1:]) + + #Get 1 from each file & assert that their IDs match + cur_reads = {} + for read_index in xrange(0,len(inputs)): + if(read_index is not prim_readidx): + cur_reads[read_index] = read_fastq(inputs[read_index]).next() + #If the IDs don't match between all the files, issue a warning + try: + assert(match_id(prim_index_read['seq_id'], + cur_reads[read_index]['seq_id'],id_format)) + except AssertionError: + sys.stderr.write("WARNING: Index ID mismatch: %s does " + "not match %s\n" + %(prim_index_read['seq_id'], + cur_reads[read_index]['seq_id'])) + else: + cur_reads[read_index] = prim_index_read + + #Get the index sequences from the indexed reads + index_seqs = [] + if options.barcodes_at_end: + index_seqs = [cur_reads[readidxs[i]]['seq'][-barcode_sizes[i]:] + for i in xrange(0,len(readidxs))] + else: + index_seqs = [cur_reads[readidxs[i]]['seq'][0:barcode_sizes[i]] + for i in xrange(0,len(readidxs))] + + barcode_path = getNDDictVal(approx_bc_dict,index_seqs) + + if(barcode_path is None): + cur_outputs = unmatchedOutputs + unmatched_path = getBarcodeMatchPath(approx_bc_dict,index_seqs) + incrementNDDictInt(unmatched_counts,unmatched_path) + elif(MULTIMATCHED in barcode_path): + cur_outputs = multimatchedOutputs + unmatched_path = getBarcodeMatchPath(approx_bc_dict,barcode_path) + incrementNDDictInt(unmatched_counts,unmatched_path) + sys.stderr.write('WARNING: More than one barcode matches %s, ' + 'moving to %s category\n' + %(prim_index_read['seq_id'], MULTIMATCHED)) + else: + cur_outputs = getNDDictVal(outputs,barcode_path) + incrementNDDictInt(matched_counts,barcode_path) + + #Write each sequence to the matched or unmatched output handle + for readnum in xrange(0,len(inputs)): + cur_outputs[readnum].write(fastq_string(cur_reads[readnum])) + + #Report the final matched/unmatched counts in a table to STDOUT + printCounts(barcode_dict,matched_counts,unmatched_counts,total_read_count, + len(options.idxread)) + + return 0 + +def read_barcodes(filename, num_dims, mismatches): + '''Read barcodes file into dictionary''' + + #Declare a variable-dimension dictionary + #This was previously a static multi-dimensional dictionary, but it would + #not have worked... + l = lambda:defaultdict(l) + real_bc_dict = l() + approx_bc_dict = l() + approx_bc_any_dict = l() + + linenum = 0 + filehandle = open(filename, 'rb') + for line in filehandle: + + linenum += 1 + line = line.strip() + cols = [] + + #If the line has something on it and isn't commented + if line and line[0] != '#': + + cols = re.split('\t+',line) + + #If we got the expected number of columns + if len(cols) == (num_dims + 1): + sample_id = cols.pop(0) + barcode_sequences = cols + else: + sys.stderr.write("Unable to parse line in barcode file: [%s]: " + "[%s]. Must contain %s columns (the number " + "of values supplied to --idxread plus 1 for " + "the sample ID), but found [%s]. Skipping.\n" + %(filename, line, (num_dims + 1), len(cols))) + continue + + #If the sample ID is not none and the number of defined barcodes is + #equal to the number of expected barcodes + defined_barcodes = [e for e in barcode_sequences if e is not None] + if (sample_id is not None) and len(defined_barcodes) == num_dims: + + #Set the sample ID in the N-Dimensional dictionary + setNDDictVal(real_bc_dict, defined_barcodes, sample_id) + + #This creates a dictionary containing all possible barcode + #combos, with MULTIMATCHED values pre-computed + #print ("Calling setNDApproxDictVal with %s and dict: " + # "[%s]") % (defined_barcodes, approx_bc_dict) + setNDApproxDictVal(approx_bc_dict, + defined_barcodes, + mismatches) + + #Make sure we can get all the barcodes for each dict dimension + dim = 0 + for barcode in defined_barcodes: + dim += 1 + real_bc_dict[ANY][str(dim)][barcode] = "" + + #This creates a dictionary containing all possible barcode + #combos, with MULTIMATCHED values pre-computed + setNDApproxDictVal(approx_bc_any_dict[ANY][str(dim)], + [barcode], + mismatches) + + elif ((sample_id is not None) and + (len(defined_barcodes) > num_dims)): + raise Exception("More barcode indexes were found in the " + "barcodes file on line %s: '%s' than were " + "expected: [%s] (the number of values " + "supplied to --idxread)." + %(linenum, line, num_dims)) + + else: + raise Exception("Unable to parse barcode(s) from line %s: " + "'%s'. Expected a sample ID followed by [%s] " + "tab-delimited barcodes" + %(linenum, line, num_dims)) + + if len(real_bc_dict) == 0: + raise Exception("Unable to parse any barcodes from barcode file [%s]." + %(filename)) + + #print(approx_bc_dict) + + reduceNDDictPaths(approx_bc_dict) + + for dim in real_bc_dict[ANY]: + reduceNDDictPaths(approx_bc_any_dict[ANY][dim]) + approx_bc_dict[ANY][dim] = approx_bc_any_dict[ANY][dim] + + return(real_bc_dict,approx_bc_dict) + +def openOutfiles(outputs,barcode_dict,prefix,suffix,inputs,galaxy,gzip_mode, + use_gzip_cmd_line): + '''Opens all the output files for all barcode matches (unmatched + barcode output files not opened)''' + if isinstance(barcode_dict,dict): + for barcode in getRealBarcodes(barcode_dict.keys()): + openOutfiles(outputs[barcode], barcode_dict[barcode], prefix, + suffix, inputs, galaxy, gzip_mode, use_gzip_cmd_line) + else: + sample_id = barcode_dict + if galaxy: + #Replace underscore to allow this to work with Galaxy + sample_id = barcode_dict.replace("_","-") + for i in xrange(0,len(inputs)): + fn = '%s%s-read-%s%s' % (prefix, sample_id, i+1, suffix) + if gzip_mode: + if use_gzip_cmd_line is True: + outputs[i] = run_gzip_out(fn) + else: + outputs[i] = gzip.open(fn, 'wb') + else: + outputs[i] = open(fn, 'wb') + +def getBarcodeSizes(barcode_dict): + '''Uses the first key in each level of a dictionary to build a list of key + sizes (assuming all keys are the same size)''' + + if (isinstance(barcode_dict,dict) and len(barcode_dict.keys()) > 0 and + isinstance(barcode_dict[barcode_dict.keys()[0]],dict) and + len(barcode_dict[barcode_dict.keys()[0]].keys()) > 0): + + return([len(barcode_dict.keys()[0])] + + getBarcodeSizes(barcode_dict[barcode_dict.keys()[0]])) + + elif isinstance(barcode_dict,dict) and len(barcode_dict.keys()) > 0: + + return([len(barcode_dict.keys()[0])]) + + else: + + return(None) + +def setNDDictVal(in_dict, keys_list, val): + '''Sets the value of an N-Dimensional dictionary to the supplied value + using the list of keys''' + cur_dict = in_dict + for cur_key in keys_list[0:-1]: + cur_dict = cur_dict[cur_key] + cur_dict[keys_list[-1]] = val + +def incrementNDDictInt(in_dict, keys_list): + '''Sets the value of an N-Dimensional dictionary to the supplied value + using the list of keys''' + cur_dict = in_dict + for cur_key in keys_list[0:-1]: + cur_dict = cur_dict[cur_key] + cur_dict[keys_list[-1]] += 1 + +def getNDDictVal(in_dict, keys_list): + '''Gets the value of an N-Dimensional dictionary using the list of keys''' + cur_dict = in_dict + for cur_key in keys_list: + if cur_key in cur_dict: + cur_dict = cur_dict[cur_key] + else: + return None + return cur_dict + +def multi_dimensions(n, type): + """Creates an N-dimensional dictionary containing values at the leaves of + type 'type'. E.g. A 2-dimensional dictionary of strs would have 2 levels of + keys that hold string values (mydict['key1']['key2'] = 'my string value') + """ + + #n <= 0 is correct. E.g. if called with 2 (i.e. a 2D dictionary - like a 2D + #array/matrix - where array[2][1] = 5 or dict['key1']['key2'] = 'str' are + #2-dimensional data structures), the the first type returned is a dict (for + #holding 'key1') and the second thing returned is a defaultdict for holding + #'key2' and the last thing returned is the type of the data held at the + #leaves of the dictionary + if n <= 0: + return type() + + return defaultdict(lambda:multi_dimensions(n-1, type)) + +def getRealBarcodes(barcodes): + '''Return a list of barcodes that exclude generic "matched", "unmatched", + and "any" pseudo-barcodes''' + return [bc for bc in barcodes if (bc is not UNMATCHED and bc is not MATCHED + and bc is not ANY and bc is not + MULTIMATCHED and bc is not None)] + +def matchBarcodes(sequence, barcodes, mismatches): + '''Find closest match(es) in barcodes to specified sequence with max number + of mismatches''' + best_distance = mismatches + results = [] + for barcode in barcodes: + if mismatches == 0: + if (sequence == barcode): + results.append(barcode) + else: + distance = hamming_distance(sequence, barcode) + if (distance <= best_distance): + best_distance = distance + results.append(barcode) + return results + +def hamming_distance(s1, s2): + assert len(s1) == len(s2) + return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2)) + +''' Supported types of Fastq IDs ''' +ILLUMINA = 'illumina' # CASVA 1.8+, match up to space +STRIPONE = 'stripone' # Illumina CASAVA 1.7 and lower (/1 /2) and NCBI SRA + # (/f /r), match all but last character +OTHER = 'other' # Other, match exactly + +def determine_id_format(seq_id): + '''Determine if the id is new illumina, old illumina (/1 /2 ...), sanger + (/f /r), or other''' + + id_format = None + # Illumina CASAVA 1.8+ fastq headers use new format + read_id_regex = re.compile(r'(?P[a-zA-Z0-9_-]+):' + '(?P[0-9]+):' + '(?P[a-zA-Z0-9]+):' + '(?P[0-9]+):(?P[0-9]+):' + '(?P[0-9]+):' + '(?P[0-9]+) (?P[0-9]+):' + '(?P[YN]):' + '(?P[0-9]+):' + '(?P[ACGT]+){0,1}') + # Old illumina and sanger reads use /1 /2 /3 or /f /r + strip_one_endings = ['/1', '/2', '/3', '/f', '/r'] + + if read_id_regex.match(seq_id): + id_format = ILLUMINA + elif (seq_id[-2:] in strip_one_endings): + id_format = STRIPONE + else: + id_format = OTHER + return id_format + +def strip_read_from_id(seq_id, id_format=None): + new_id = seq_id + if not id_format: + id_format = determine_id_format(seq_id) + elif id_format == STRIPONE: + new_id = seq_id[0:-1] + elif id_format == ILLUMINA: + new_id = seq_id.split(' ')[0] + return new_id + +def strip_read_from_id_stripone(seq_id): + return seq_id[0:-1] + +def strip_read_from_id_illumina(seq_id): + return seq_id.split(' ')[0] + +def match_id(id1, id2, id_format=OTHER): + ''' Return true if IDs match using rules for specified format ''' + if id_format == STRIPONE: + if id1[0:-1] == id2[0:-1]: + return True + else: + return False + elif id_format == ILLUMINA: + if (id1.split(' ')[0] == id2.split(' ')[0]): + return True + else: + return False + elif id1 == id2: + return True + else: + return False + +def read_fastq(filehandle): + ''' Return dictionary with "seq_id", "seq", "qual_id", and "qual" ''' + record_line = 0 + read_number = 0 + fastq_record = dict() + for line in filehandle: + record_line += 1 + if record_line == 1: + fastq_record['seq_id'] = line.strip() + elif record_line == 2: + fastq_record['seq'] = line.strip() + elif record_line == 3: + fastq_record['qual_id'] = line.strip() + elif record_line == 4: + record_line = 0 + fastq_record['qual'] = line.strip() + read_number += 1 + yield fastq_record + +def fastq_string(record): + return "%s\n%s\n%s\n%s\n" % (record['seq_id'], record['seq'], + record['qual_id'], record['qual']) + +def printCounts(barcode_dict,matched_counts,unmatched_counts,total_read_count, + numBarcodes): + '''Prints the header of the counts table and calls the 2 recursive + functions for printing barcode and unmatched barcode count data''' + + #Print the table header + print "Sample\t", + for b in xrange(1,numBarcodes + 1): + print "Barcode%s\t" %(b), + print "Count\tPercent" + + if total_read_count == 0: + print "ERROR: Total read count is 0. No reads were parsed." + return() + + #Print the matched counts, sorting the lines + for line in sorted(getMatchedCountsTable(barcode_dict,matched_counts, + total_read_count,[])): + print(line), + + #Print the unmatched counts (default sorted by string) + printUnmatchedCounts(unmatched_counts,total_read_count,[]) + +def getMatchedCountsTable(barcode_dict,matched_counts,total_read_count, + barcode_path): + '''Recursively builds a "barcode path" and prints a line in the barcode + counts table for every leaf in the barcode dictionary''' + + ret_list = [] + if isinstance(barcode_dict,dict): + for barcode in sorted(barcode_dict, key=barcode_dict.get): + if barcode is not ANY: + if len(barcode_path) > 0: + new_barcode_path = barcode_path[:] + else: + new_barcode_path = [] + new_barcode_path.append(barcode) + ret_list += getMatchedCountsTable(barcode_dict[barcode], + matched_counts[barcode], + total_read_count, + new_barcode_path) + else: + ret_str = barcode_dict + "\t" + for bc in barcode_path: + ret_str += bc + "\t" + ret_str += "%s\t%.2f%%\n" % (matched_counts, + (matched_counts/total_read_count)*100) + ret_list += [ret_str] + + return(ret_list) + +def printUnmatchedCounts(unmatched_counts,total_read_count,barcode_path): + '''Recursively builds a "barcode path" (consisting of "MATCHED" and + "UNMATCHED" values) and prints a line in the barcode counts table for every + leaf in the unmatched counts dictionary''' + if isinstance(unmatched_counts,dict): + for barcode in sorted(unmatched_counts, key=unmatched_counts.get): + if len(barcode_path) > 0: + new_barcode_path = barcode_path[:] + else: + new_barcode_path = [] + new_barcode_path.append(barcode) + printUnmatchedCounts(unmatched_counts[barcode],total_read_count, + new_barcode_path) + else: + label = UNMATCHED + if (UNMATCHED not in barcode_path and MULTIMATCHED in barcode_path): + label = MULTIMATCHED + print "%s\t" %(label), + for bc in barcode_path: + print "%s\t" %(bc), + print "%s\t%.2f%%" %(unmatched_counts, + (unmatched_counts/total_read_count)*100) + +def getBarcodeMatchPath(in_dict, keys_list): + '''Generates a path of pseudo-barcodes (consisting of "MATCHED", + "UNMATCHED", and "MULTIMATCHED" values) to use in the summary output. The + intent is to give the user useful feedback about which level(s) of barcodes + are not matrching instead of listing all sequences as simply "UNMATCHED". + It also reduces complexity of the unmatched output by not including + specific barcodes, which do not matter in unmatched/multimatched cases.''' + cur_dict = in_dict + dim = 0 + path = [] + for cur_key in keys_list: + dim += 1 + if cur_key is MATCHED or cur_key is MULTIMATCHED: + path.append(cur_key) + elif cur_key in cur_dict: + path.append(MATCHED) + else: + path.append(UNMATCHED) + cur_dict = in_dict[ANY][str(dim)] + return path + +def setNDApproxDictVal(in_dict, keys_list, mismatches): + '''Creates (or builds upon) an N-Dimensional ("ND") dictionary (in_dict) + whose keys are mismatch variants (constructed using the keys_list and the + number of mismatches provided). The values at each level (initially: + changed by reduceNDDictPaths) is a 2 member list containing the number of + mismatches the key has and the dictionary for the next level. The values + at the end of the ND dictionary are (initially: changed by + reduceNDDictPaths) a 2 member list containing the number of mismatches and + a list of "original key paths". (Each path is a list of barcodes.) The + keys_list is a series of barcodes representing a row from the barcodes file + (i.e. 1 barcode from each dimension, in order). + Note: in_dict must be "l()" where "l" = lambda:defaultdict(l).''' + setNDApproxDictValHelper(in_dict, keys_list, mismatches, []) + +def setNDApproxDictValHelper(in_dict, keys_list, mismatches, real_path): + '''A recursive helper of setNDApproxDictVal which tracks the "real barcode + path" to use in the list of real barcode paths at the end of the + dictionary. Note that barcodes may have a 1:many relationship, indicated + by a barcode being repeated in earlier columns of the barcode file. + Different paths may also intersect given the number of allowed mismatches. + ''' + l = lambda:defaultdict(l) + real_path.append(keys_list[0]) + if len(keys_list) == 1: + #print "Doing last key %s" %(keys_list[0]) + for bcl in getMismatchedBarcodes(keys_list[0],mismatches): + #print "Processing mismatched barcode record: %s" % (bcl) + nmm = bcl[0] + bc = bcl[1] + if (bc in in_dict and type(in_dict[bc][1]) is list + and len(in_dict[bc][1]) > 0 + and not real_path in in_dict[bc][1]): + + #print ("Adding real path %s to the one existing at barcode " + # "%s: %s whose type should be a list: %s" + # ) % (real_path, bc, in_dict[bc], type(in_dict[bc])) + + #If the number of mismatches is the same for this barcode + #variant as it is for the real paths that have already been + #added to the real paths list, append this real path + if nmm == in_dict[bc][0]: + in_dict[bc][1] += [real_path] + #Else if the number of mismatches is less for this barcode + #variant as it is for the real paths that have already been + #added to the real paths list, overwrite the old list + elif nmm < in_dict[bc][0]: + in_dict[bc] = [nmm, [real_path]] + elif not (bc in in_dict and type(in_dict[bc][1]) is list + and len(in_dict[bc][1]) > 0): + #print ("Setting new real path %s to real path list for " + # "barcode %s: %s whose type should be a list: %s" + # ) % (real_path, bc, in_dict[bc], type(in_dict[bc])) + in_dict[bc] = [nmm, [real_path]] + #Else: skip poorer match + else: + #print "Doing first key %s" %(keys_list[0]) + for bcl in getMismatchedBarcodes(keys_list[0],mismatches): + nmm = bcl[0] + bc = bcl[1] + if bc not in in_dict: + in_dict[bc] = [nmm, l()] + setNDApproxDictValHelper(in_dict[bc][1], + keys_list[1:], + mismatches, + real_path[:]) + elif (bc in in_dict and nmm == in_dict[bc][0]): + setNDApproxDictValHelper(in_dict[bc][1], + keys_list[1:], + mismatches, + real_path[:]) + elif (bc in in_dict and nmm < in_dict[bc][0]): + #print "Type of in_dict[%s] is: %s Resetting list." % (bc, type(in_dict[bc])) + in_dict[bc] = [nmm, l()] + #in_dict[bc][0] = nmm + #in_dict[bc][1] = l() + setNDApproxDictValHelper(in_dict[bc][1], + keys_list[1:], + mismatches, + real_path[:]) + +def reduceNDDictPaths(in_dict): + '''After an approximate barcode dictionary has been fully created by + setNDApproxDictVal, this method reduces the lists of barcode paths to + individual discrete paths or individual generic MATCHED/MULTIMATCHED paths. + ''' + if isinstance(in_dict,dict) and len(in_dict) > 0: + arb_bc = in_dict.iterkeys().next() + #print "Checking arbitrary barcode to see what level of the dictionary we're at: %s" % (arb_bc) + #print "The keys of this level of the dictionary are: %s" % (in_dict.keys()) + if (isinstance(in_dict,dict) and len(in_dict) > 0 and + isinstance(in_dict[arb_bc][1],dict)): + for bc in in_dict: + #Reduction (skipping the list...) + in_dict[bc] = in_dict[bc][1] + reduceNDDictPaths(in_dict[bc]) + #Else if the value is a list of lists + elif (isinstance(in_dict,dict) and len(in_dict) > 0 and + isinstance(in_dict[in_dict.keys()[0]][1],list)): + #For each last barcode in the dictionary + for bc in in_dict: + #Reduction, skipping the outer list + in_dict[bc] = in_dict[bc][1] + #If the value at the leaf of the dict is a list of lists + if (len(in_dict[bc]) > 0 and isinstance(in_dict[bc],list) and + len(in_dict[bc][0]) > 0 and isinstance(in_dict[bc][0],list)): + #If the number of paths is 1 + if len(in_dict[bc]) == 1: + in_dict[bc] = in_dict[bc][0] + else: + inner_len = len(in_dict[bc][0]) + ref_path = in_dict[bc][0] + new_path = [] + for bci in xrange(0,inner_len): + all_matched = True + for path in in_dict[bc][1:]: + if path[bci] != ref_path[bci]: + all_matched = False + break + if all_matched is True: + new_path.append(MATCHED) + else: + new_path.append(MULTIMATCHED) + in_dict[bc] = new_path + +def getMismatchedBarcodes(barcode,mismatches): + '''Given a barcode sequence and a number of allowed mismatches, generate a + list of barcodes representing all possible matching sequences with the + allowed number of mismatches.''' + return(getMismatchedBarcodesHelper(barcode,mismatches,[])) + +def getMismatchedBarcodesHelper(barcode,mismatches,skips): + offbys = [[0, barcode]] + if mismatches <= 0: + return(offbys) + for p in xrange(0,len(barcode)): + if p in skips: + continue + #Keep track of bases that have already been mutated + newskips = skips + [p] + if barcode[p] != 'N': + offbys.append([len(newskips), + barcode[:p] + 'N' + barcode[p+1:]]) + if barcode[p] != 'A': + offbys.append([len(newskips), + barcode[:p] + 'A' + barcode[p+1:]]) + if barcode[p] != 'T': + offbys.append([len(newskips), + barcode[:p] + 'T' + barcode[p+1:]]) + if barcode[p] != 'G': + offbys.append([len(newskips), + barcode[:p] + 'G' + barcode[p+1:]]) + if barcode[p] != 'C': + offbys.append([len(newskips), + barcode[:p] + 'C' + barcode[p+1:]]) + #Recurse to add next allowed mismatch + if mismatches > 1: + if barcode[p] != 'N': + offbys += getMismatchedBarcodesHelper(barcode[:p] + 'N' + + barcode[p+1:], + mismatches - 1, + newskips) + if barcode[p] != 'A': + offbys += getMismatchedBarcodesHelper(barcode[:p] + 'A' + + barcode[p+1:], + mismatches - 1, + newskips) + if barcode[p] != 'T': + offbys += getMismatchedBarcodesHelper(barcode[:p] + 'T' + + barcode[p+1:], + mismatches - 1, + newskips) + if barcode[p] != 'G': + offbys += getMismatchedBarcodesHelper(barcode[:p] + 'G' + + barcode[p+1:], + mismatches - 1, + newskips) + if barcode[p] != 'C': + offbys += getMismatchedBarcodesHelper(barcode[:p] + 'C' + + barcode[p+1:], + mismatches - 1, + newskips) + return(offbys) + +def exeExists(program): + '''Determines whether an executable exists (i.e. is in the user\'s path)''' + import os + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return True + else: + for path in os.environ["PATH"].split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return True + + return False + +def run_gzip_in(infile): + '''Takes a gzipped file and returns an iterator on the unzipped lines of + the file using a system call to gzip.''' + p = subprocess.Popen(["gzip", "-dc", infile], stdout=subprocess.PIPE, + bufsize=1) + return iter(p.stdout.readline, b'') + +def run_gzip_out(outfile): + p = subprocess.Popen(["gzip"], stdin=subprocess.PIPE, + stdout=open(outfile,'wb'), bufsize=-1) + return p.stdin + +if __name__ == '__main__': + sys.exit(main()) diff -r 000000000000 -r 50df2d629d51 barcode_splitter_multi/barcode_splitter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/barcode_splitter_multi/barcode_splitter.xml Fri Aug 26 16:30:56 2016 -0400 @@ -0,0 +1,133 @@ + + + $summary +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @misc{paired_sequence_utils, + title = {{Barcode}-{Splitter}}, + url = {https://bitbucket.org/hepcat72/paired_sequence_utils}, + author = "Parsons, Lance and Leach, Robert" + } + + + + diff -r 000000000000 -r 50df2d629d51 barcode_splitter_multi/barcode_splitter_galaxy_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/barcode_splitter_multi/barcode_splitter_galaxy_wrapper.sh Fri Aug 26 16:30:56 2016 -0400 @@ -0,0 +1,59 @@ +#!/bin/bash + +# FASTX-toolkit - FASTA/FASTQ preprocessing tools. +# Copyright (C) 2009 A. Gordon (gordon@cshl.edu) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +# Modified by Lance Parsons (lparsons@princeton.edu) +# 2011-03-15 Adapted to allow galaxy to determine filetype +# 2015-10-21 Updated to make compatible with OSX (BSD sed) +# 2015-11-13 Removed LIBRARY_NAME, no longer needed +# 2016-04-28 Output summary as simple tabular output + +# This is a shell script wrapper for 'fastx_barcode_splitter.pl' +# +# 1. Output files are saved at the dataset's files_path directory. + +if [ "$1x" = "x" ]; then + echo "Usage: $0 [OUTPUT_PATH] [OUTFILETYPE] [OPTIONS]" >&2 + exit 1 +fi + +OUTPUT_PATH="$1" +FILETYPE="$2" +shift 2 +# The rest of the parameters are passed to the split program + +if [ "${OUTPUT_PATH}x" = "x" ]; then + echo "Usage: $0 [OUTPUT_PATH] [OUTFILETYPE] [OPTIONS]" >&2 + exit 1 +fi + +mkdir -p "$OUTPUT_PATH" +if [ ! -d "$OUTPUT_PATH" ]; then + echo "Error: failed to create output path '$OUTPUT_PATH'" >&2 + exit 1 +fi + +PREFIX="$OUTPUT_PATH/" +SUFFIX=".$FILETYPE" +DIRECTORY="$(cd "$(dirname "$0")" && pwd)" + +RESULTS=$("$DIRECTORY/barcode_splitter.py" --format "$FILETYPE" --prefix "$PREFIX" --suffix "$SUFFIX" "$@") +if [ $? != 0 ]; then + echo "error" +fi + +echo "$RESULTS" \ No newline at end of file