Mercurial > repos > hepcat72 > barcode_splitter_multi
view barcode_splitter_multi/barcode_splitter.py @ 0:50df2d629d51 draft
Uploaded
author | hepcat72 |
---|---|
date | Fri, 26 Aug 2016 16:30:56 -0400 |
parents | |
children | e3f91eee8c75 |
line wrap: on
line source
#!/usr/bin/env python """ Split one or more fastq files based on barcode sequence. """ from __future__ import division import gzip import argparse import os import re import sys from collections import defaultdict import subprocess __version__ = "0.12" __author__ = "Lance Parsons & Robert Leach" __author_email__ = "lparsons@princeton.edu,rleach@princeton.edu" __copyright__ = "Copyright 2011, Lance Parsons & Robert leach" __license__ = ("BSD 2-Clause License " "http://www.opensource.org/licenses/BSD-2-Clause") UNMATCHED = 'unmatched' MATCHED = 'matched' MULTIMATCHED = 'multimatched' ANY = 'any' #Used in the N-dimensional dictionary to help #diagnose which barcode set is matching/not matching def main (argv=None): if argv is None: argv = sys.argv parser = argparse.ArgumentParser(description=globals()['__doc__']) parser.add_argument('--version', action='version', version='%(prog)s version ' + globals()['__version__']) required_group = parser.add_argument_group("Barcodes") required_group.add_argument('--bcfile', metavar='FILE', help='REQUIRED: Tab delimited file: ' '"Sample_ID <tab> Barcode_Sequence"') required_group.add_argument ('--idxread', metavar='READNUM', type=int, nargs='+', help="REQUIRED: Indicate in " "which read file(s) to search for the " "corresponding column of barcode sequences, " "e.g. if the first column of barcodes is in " "the second sequence read file and the " "second column's barcodes are in the third " "sequence read file, you'd supply " "`--idxread 2 3`") required_group.add_argument('--mismatches', default=0, type=int, help='Number of mismatches allowed in ' 'barcode matching') required_group.add_argument('--barcodes_at_end', action='store_true', default=False, help='Barcodes are at the end ' 'of the index read (default is at the ' 'beginning)') #pairedend_group = parser.add_argument_group("Paired-End") #pairedend_group.add_argument ('--split-paired-ends', action='store_true', # default=False, help='In addition to ' # 'splitting based on barcodes, also split ' # 'paired ends.') output_group = parser.add_argument_group("Output Options") output_group.add_argument('--prefix', default='', help='Prefix for output files') output_group.add_argument('--suffix', default=None, help='Suffix for output files (default based ' 'on --format)') output_group.add_argument('--galaxy', action='store_true', default=False, help='Produce "Galaxy safe" filenames by ' 'removing underscores (default: %(default)s)') output_group.add_argument('-v', '--verbose', action='store_true', default=False, help='verbose output') output_group.add_argument('--gzipout', action='store_true', default=False, help='Output files in compressed gzip format ' '(default is uncompressed)') input_group = parser.add_argument_group("Input format") input_group.add_argument('--format', default='fastq', help='Specify format for sequence files (fasta ' 'or fastq)') input_group.add_argument('--gzipin', action='store_true', default=False, help='Assume input files are in gzip format, ' 'despite file extension (default is auto based ' 'on input file extension)') seqs_group = parser.add_argument_group("Sequence Inputs") seqs_group.add_argument('fastq_files', metavar='FILE', type=str, nargs='+', help='A series of 1 or more [optionally zipped] ' 'fastq files.') #Error-check the values provided from the command line try: options = parser.parse_args() if options.fastq_files is None or options.idxread is None: parser.error('Sequence files and at least one number indicating ' 'the indexed file(s) (--idxread) is required') if len(options.fastq_files) < 1: parser.error('Must specify at least one sequence file') if len(options.fastq_files) < len(options.idxread): parser.error('Must specify at least one sequence file') if not options.bcfile: parser.error('Must specify a barcodes file with "--bcfile" option') if ((min(options.idxread) < 1) or (max(options.idxread) > len(options.fastq_files))): parser.error('Invalid index read number ("--idxread"), must be ' 'between 1 and %s (the number of supplied sequence ' 'files)' % len(options.fastq_files)) except SystemExit: # Prevent exit when calling as function return 2 #Read barcodes files into n-dimensional dictionary (barcode_dict,approx_bc_dict) = read_barcodes(options.bcfile, len(options.idxread), options.mismatches) barcode_sizes = getBarcodeSizes(barcode_dict) #print(barcode_dict) #print(approx_bc_dict) #initialize the read counts for each barcode total_read_count = 0 matched_counts = multi_dimensions(len(options.idxread), int) unmatched_counts = multi_dimensions(len(options.idxread), int) #TODO Verbose: print barcode_dict #Determine if we should use gzip for input basename, extension = os.path.splitext(options.fastq_files[0]) if extension == '.gz': options.gzipin = True #Set the suffix (before determination of gzip output mode) if options.suffix is not None: suffix = options.suffix elif options.format is not None: suffix = '.%s' % options.format elif extension != '.gz': suffix = '.%s' % extension else: suffix = 'fastq' #Determine if we should use gzip for output if (options.gzipout is True or (options.suffix is not None and options.suffix == '.gz')): options.gzipout = True if options.suffix is None or suffix != '.gz': suffix = '%s.gz' % suffix #Determine whether to use the gzip command line tool or not use_gzip_cmd_line = False if options.gzipin is True or options.gzipout is True: use_gzip_cmd_line = exeExists('gzip') #Determine if we need to edit the prefix for galaxy mode prefix = options.prefix if options.galaxy: prefix = options.prefix.replace("_","-") #Open input filehandles for each read #print "Opening the input file handles..." inputs = {} for i in xrange(0,len(options.fastq_files)): if options.gzipin: if(use_gzip_cmd_line is True): inputs[i] = run_gzip_in(options.fastq_files[i]) else: inputs[i] = gzip.open(options.fastq_files[i], 'rb') else: inputs[i] = open(options.fastq_files[i], 'rb') #Open output filehandles for each barcode set and sequence file #print "Opening the output barcode file handles..." outputs = multi_dimensions((len(options.idxread) + 1), file) openOutfiles(outputs, barcode_dict, prefix, suffix, inputs, options.galaxy, options.gzipout,use_gzip_cmd_line) #Open the file handle for the unmatched file #print "Opening the output un/multi-matched file handles..." unmatchedOutputs = defaultdict(file) multimatchedOutputs = defaultdict(file) for i in xrange(0,len(inputs)): unmf = '%s%s-read-%s%s' %(prefix, UNMATCHED, i+1, suffix) mmf = '%s%s-read-%s%s' %(prefix, MULTIMATCHED, i+1, suffix) if options.gzipout: if use_gzip_cmd_line is True: unmatchedOutputs[i] = run_gzip_out(unmf) multimatchedOutputs[i] = run_gzip_out(mmf) else: unmatchedOutputs[i] = gzip.open(unmf,'wb') multimatchedOutputs[i] = gzip.open(mmf, 'wb') else: unmatchedOutputs[i] = open(unmf, 'wb') multimatchedOutputs[i] = open(mmf, 'wb') readidxs = [i - 1 for i in options.idxread] prim_readidx = readidxs[0] #Debug mode reads the 1st 10000 reads of each file when prefix has "debug" debug_mode = False if re.match('debug', prefix): debug_mode = True # For each input line in index read, get index sequence for prim_index_read in read_fastq(inputs[prim_readidx]): total_read_count += 1 #print "Processing read number %s\r" % (total_read_count), #For debugging any file if debug_mode and total_read_count > 10000: break #Determine the ID format using the first record if total_read_count == 1: id_format = determine_id_format(prim_index_read['seq_id'][1:]) #Get 1 from each file & assert that their IDs match cur_reads = {} for read_index in xrange(0,len(inputs)): if(read_index is not prim_readidx): cur_reads[read_index] = read_fastq(inputs[read_index]).next() #If the IDs don't match between all the files, issue a warning try: assert(match_id(prim_index_read['seq_id'], cur_reads[read_index]['seq_id'],id_format)) except AssertionError: sys.stderr.write("WARNING: Index ID mismatch: %s does " "not match %s\n" %(prim_index_read['seq_id'], cur_reads[read_index]['seq_id'])) else: cur_reads[read_index] = prim_index_read #Get the index sequences from the indexed reads index_seqs = [] if options.barcodes_at_end: index_seqs = [cur_reads[readidxs[i]]['seq'][-barcode_sizes[i]:] for i in xrange(0,len(readidxs))] else: index_seqs = [cur_reads[readidxs[i]]['seq'][0:barcode_sizes[i]] for i in xrange(0,len(readidxs))] barcode_path = getNDDictVal(approx_bc_dict,index_seqs) if(barcode_path is None): cur_outputs = unmatchedOutputs unmatched_path = getBarcodeMatchPath(approx_bc_dict,index_seqs) incrementNDDictInt(unmatched_counts,unmatched_path) elif(MULTIMATCHED in barcode_path): cur_outputs = multimatchedOutputs unmatched_path = getBarcodeMatchPath(approx_bc_dict,barcode_path) incrementNDDictInt(unmatched_counts,unmatched_path) sys.stderr.write('WARNING: More than one barcode matches %s, ' 'moving to %s category\n' %(prim_index_read['seq_id'], MULTIMATCHED)) else: cur_outputs = getNDDictVal(outputs,barcode_path) incrementNDDictInt(matched_counts,barcode_path) #Write each sequence to the matched or unmatched output handle for readnum in xrange(0,len(inputs)): cur_outputs[readnum].write(fastq_string(cur_reads[readnum])) #Report the final matched/unmatched counts in a table to STDOUT printCounts(barcode_dict,matched_counts,unmatched_counts,total_read_count, len(options.idxread)) return 0 def read_barcodes(filename, num_dims, mismatches): '''Read barcodes file into dictionary''' #Declare a variable-dimension dictionary #This was previously a static multi-dimensional dictionary, but it would #not have worked... l = lambda:defaultdict(l) real_bc_dict = l() approx_bc_dict = l() approx_bc_any_dict = l() linenum = 0 filehandle = open(filename, 'rb') for line in filehandle: linenum += 1 line = line.strip() cols = [] #If the line has something on it and isn't commented if line and line[0] != '#': cols = re.split('\t+',line) #If we got the expected number of columns if len(cols) == (num_dims + 1): sample_id = cols.pop(0) barcode_sequences = cols else: sys.stderr.write("Unable to parse line in barcode file: [%s]: " "[%s]. Must contain %s columns (the number " "of values supplied to --idxread plus 1 for " "the sample ID), but found [%s]. Skipping.\n" %(filename, line, (num_dims + 1), len(cols))) continue #If the sample ID is not none and the number of defined barcodes is #equal to the number of expected barcodes defined_barcodes = [e for e in barcode_sequences if e is not None] if (sample_id is not None) and len(defined_barcodes) == num_dims: #Set the sample ID in the N-Dimensional dictionary setNDDictVal(real_bc_dict, defined_barcodes, sample_id) #This creates a dictionary containing all possible barcode #combos, with MULTIMATCHED values pre-computed #print ("Calling setNDApproxDictVal with %s and dict: " # "[%s]") % (defined_barcodes, approx_bc_dict) setNDApproxDictVal(approx_bc_dict, defined_barcodes, mismatches) #Make sure we can get all the barcodes for each dict dimension dim = 0 for barcode in defined_barcodes: dim += 1 real_bc_dict[ANY][str(dim)][barcode] = "" #This creates a dictionary containing all possible barcode #combos, with MULTIMATCHED values pre-computed setNDApproxDictVal(approx_bc_any_dict[ANY][str(dim)], [barcode], mismatches) elif ((sample_id is not None) and (len(defined_barcodes) > num_dims)): raise Exception("More barcode indexes were found in the " "barcodes file on line %s: '%s' than were " "expected: [%s] (the number of values " "supplied to --idxread)." %(linenum, line, num_dims)) else: raise Exception("Unable to parse barcode(s) from line %s: " "'%s'. Expected a sample ID followed by [%s] " "tab-delimited barcodes" %(linenum, line, num_dims)) if len(real_bc_dict) == 0: raise Exception("Unable to parse any barcodes from barcode file [%s]." %(filename)) #print(approx_bc_dict) reduceNDDictPaths(approx_bc_dict) for dim in real_bc_dict[ANY]: reduceNDDictPaths(approx_bc_any_dict[ANY][dim]) approx_bc_dict[ANY][dim] = approx_bc_any_dict[ANY][dim] return(real_bc_dict,approx_bc_dict) def openOutfiles(outputs,barcode_dict,prefix,suffix,inputs,galaxy,gzip_mode, use_gzip_cmd_line): '''Opens all the output files for all barcode matches (unmatched barcode output files not opened)''' if isinstance(barcode_dict,dict): for barcode in getRealBarcodes(barcode_dict.keys()): openOutfiles(outputs[barcode], barcode_dict[barcode], prefix, suffix, inputs, galaxy, gzip_mode, use_gzip_cmd_line) else: sample_id = barcode_dict if galaxy: #Replace underscore to allow this to work with Galaxy sample_id = barcode_dict.replace("_","-") for i in xrange(0,len(inputs)): fn = '%s%s-read-%s%s' % (prefix, sample_id, i+1, suffix) if gzip_mode: if use_gzip_cmd_line is True: outputs[i] = run_gzip_out(fn) else: outputs[i] = gzip.open(fn, 'wb') else: outputs[i] = open(fn, 'wb') def getBarcodeSizes(barcode_dict): '''Uses the first key in each level of a dictionary to build a list of key sizes (assuming all keys are the same size)''' if (isinstance(barcode_dict,dict) and len(barcode_dict.keys()) > 0 and isinstance(barcode_dict[barcode_dict.keys()[0]],dict) and len(barcode_dict[barcode_dict.keys()[0]].keys()) > 0): return([len(barcode_dict.keys()[0])] + getBarcodeSizes(barcode_dict[barcode_dict.keys()[0]])) elif isinstance(barcode_dict,dict) and len(barcode_dict.keys()) > 0: return([len(barcode_dict.keys()[0])]) else: return(None) def setNDDictVal(in_dict, keys_list, val): '''Sets the value of an N-Dimensional dictionary to the supplied value using the list of keys''' cur_dict = in_dict for cur_key in keys_list[0:-1]: cur_dict = cur_dict[cur_key] cur_dict[keys_list[-1]] = val def incrementNDDictInt(in_dict, keys_list): '''Sets the value of an N-Dimensional dictionary to the supplied value using the list of keys''' cur_dict = in_dict for cur_key in keys_list[0:-1]: cur_dict = cur_dict[cur_key] cur_dict[keys_list[-1]] += 1 def getNDDictVal(in_dict, keys_list): '''Gets the value of an N-Dimensional dictionary using the list of keys''' cur_dict = in_dict for cur_key in keys_list: if cur_key in cur_dict: cur_dict = cur_dict[cur_key] else: return None return cur_dict def multi_dimensions(n, type): """Creates an N-dimensional dictionary containing values at the leaves of type 'type'. E.g. A 2-dimensional dictionary of strs would have 2 levels of keys that hold string values (mydict['key1']['key2'] = 'my string value') """ #n <= 0 is correct. E.g. if called with 2 (i.e. a 2D dictionary - like a 2D #array/matrix - where array[2][1] = 5 or dict['key1']['key2'] = 'str' are #2-dimensional data structures), the the first type returned is a dict (for #holding 'key1') and the second thing returned is a defaultdict for holding #'key2' and the last thing returned is the type of the data held at the #leaves of the dictionary if n <= 0: return type() return defaultdict(lambda:multi_dimensions(n-1, type)) def getRealBarcodes(barcodes): '''Return a list of barcodes that exclude generic "matched", "unmatched", and "any" pseudo-barcodes''' return [bc for bc in barcodes if (bc is not UNMATCHED and bc is not MATCHED and bc is not ANY and bc is not MULTIMATCHED and bc is not None)] def matchBarcodes(sequence, barcodes, mismatches): '''Find closest match(es) in barcodes to specified sequence with max number of mismatches''' best_distance = mismatches results = [] for barcode in barcodes: if mismatches == 0: if (sequence == barcode): results.append(barcode) else: distance = hamming_distance(sequence, barcode) if (distance <= best_distance): best_distance = distance results.append(barcode) return results def hamming_distance(s1, s2): assert len(s1) == len(s2) return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2)) ''' Supported types of Fastq IDs ''' ILLUMINA = 'illumina' # CASVA 1.8+, match up to space STRIPONE = 'stripone' # Illumina CASAVA 1.7 and lower (/1 /2) and NCBI SRA # (/f /r), match all but last character OTHER = 'other' # Other, match exactly def determine_id_format(seq_id): '''Determine if the id is new illumina, old illumina (/1 /2 ...), sanger (/f /r), or other''' id_format = None # Illumina CASAVA 1.8+ fastq headers use new format read_id_regex = re.compile(r'(?P<instrument>[a-zA-Z0-9_-]+):' '(?P<run_number>[0-9]+):' '(?P<flowcell_id>[a-zA-Z0-9]+):' '(?P<lane>[0-9]+):(?P<tile>[0-9]+):' '(?P<x_pos>[0-9]+):' '(?P<y_pos>[0-9]+) (?P<read>[0-9]+):' '(?P<is_filtered>[YN]):' '(?P<control_number>[0-9]+):' '(?P<index_sequence>[ACGT]+){0,1}') # Old illumina and sanger reads use /1 /2 /3 or /f /r strip_one_endings = ['/1', '/2', '/3', '/f', '/r'] if read_id_regex.match(seq_id): id_format = ILLUMINA elif (seq_id[-2:] in strip_one_endings): id_format = STRIPONE else: id_format = OTHER return id_format def strip_read_from_id(seq_id, id_format=None): new_id = seq_id if not id_format: id_format = determine_id_format(seq_id) elif id_format == STRIPONE: new_id = seq_id[0:-1] elif id_format == ILLUMINA: new_id = seq_id.split(' ')[0] return new_id def strip_read_from_id_stripone(seq_id): return seq_id[0:-1] def strip_read_from_id_illumina(seq_id): return seq_id.split(' ')[0] def match_id(id1, id2, id_format=OTHER): ''' Return true if IDs match using rules for specified format ''' if id_format == STRIPONE: if id1[0:-1] == id2[0:-1]: return True else: return False elif id_format == ILLUMINA: if (id1.split(' ')[0] == id2.split(' ')[0]): return True else: return False elif id1 == id2: return True else: return False def read_fastq(filehandle): ''' Return dictionary with "seq_id", "seq", "qual_id", and "qual" ''' record_line = 0 read_number = 0 fastq_record = dict() for line in filehandle: record_line += 1 if record_line == 1: fastq_record['seq_id'] = line.strip() elif record_line == 2: fastq_record['seq'] = line.strip() elif record_line == 3: fastq_record['qual_id'] = line.strip() elif record_line == 4: record_line = 0 fastq_record['qual'] = line.strip() read_number += 1 yield fastq_record def fastq_string(record): return "%s\n%s\n%s\n%s\n" % (record['seq_id'], record['seq'], record['qual_id'], record['qual']) def printCounts(barcode_dict,matched_counts,unmatched_counts,total_read_count, numBarcodes): '''Prints the header of the counts table and calls the 2 recursive functions for printing barcode and unmatched barcode count data''' #Print the table header print "Sample\t", for b in xrange(1,numBarcodes + 1): print "Barcode%s\t" %(b), print "Count\tPercent" if total_read_count == 0: print "ERROR: Total read count is 0. No reads were parsed." return() #Print the matched counts, sorting the lines for line in sorted(getMatchedCountsTable(barcode_dict,matched_counts, total_read_count,[])): print(line), #Print the unmatched counts (default sorted by string) printUnmatchedCounts(unmatched_counts,total_read_count,[]) def getMatchedCountsTable(barcode_dict,matched_counts,total_read_count, barcode_path): '''Recursively builds a "barcode path" and prints a line in the barcode counts table for every leaf in the barcode dictionary''' ret_list = [] if isinstance(barcode_dict,dict): for barcode in sorted(barcode_dict, key=barcode_dict.get): if barcode is not ANY: if len(barcode_path) > 0: new_barcode_path = barcode_path[:] else: new_barcode_path = [] new_barcode_path.append(barcode) ret_list += getMatchedCountsTable(barcode_dict[barcode], matched_counts[barcode], total_read_count, new_barcode_path) else: ret_str = barcode_dict + "\t" for bc in barcode_path: ret_str += bc + "\t" ret_str += "%s\t%.2f%%\n" % (matched_counts, (matched_counts/total_read_count)*100) ret_list += [ret_str] return(ret_list) def printUnmatchedCounts(unmatched_counts,total_read_count,barcode_path): '''Recursively builds a "barcode path" (consisting of "MATCHED" and "UNMATCHED" values) and prints a line in the barcode counts table for every leaf in the unmatched counts dictionary''' if isinstance(unmatched_counts,dict): for barcode in sorted(unmatched_counts, key=unmatched_counts.get): if len(barcode_path) > 0: new_barcode_path = barcode_path[:] else: new_barcode_path = [] new_barcode_path.append(barcode) printUnmatchedCounts(unmatched_counts[barcode],total_read_count, new_barcode_path) else: label = UNMATCHED if (UNMATCHED not in barcode_path and MULTIMATCHED in barcode_path): label = MULTIMATCHED print "%s\t" %(label), for bc in barcode_path: print "%s\t" %(bc), print "%s\t%.2f%%" %(unmatched_counts, (unmatched_counts/total_read_count)*100) def getBarcodeMatchPath(in_dict, keys_list): '''Generates a path of pseudo-barcodes (consisting of "MATCHED", "UNMATCHED", and "MULTIMATCHED" values) to use in the summary output. The intent is to give the user useful feedback about which level(s) of barcodes are not matrching instead of listing all sequences as simply "UNMATCHED". It also reduces complexity of the unmatched output by not including specific barcodes, which do not matter in unmatched/multimatched cases.''' cur_dict = in_dict dim = 0 path = [] for cur_key in keys_list: dim += 1 if cur_key is MATCHED or cur_key is MULTIMATCHED: path.append(cur_key) elif cur_key in cur_dict: path.append(MATCHED) else: path.append(UNMATCHED) cur_dict = in_dict[ANY][str(dim)] return path def setNDApproxDictVal(in_dict, keys_list, mismatches): '''Creates (or builds upon) an N-Dimensional ("ND") dictionary (in_dict) whose keys are mismatch variants (constructed using the keys_list and the number of mismatches provided). The values at each level (initially: changed by reduceNDDictPaths) is a 2 member list containing the number of mismatches the key has and the dictionary for the next level. The values at the end of the ND dictionary are (initially: changed by reduceNDDictPaths) a 2 member list containing the number of mismatches and a list of "original key paths". (Each path is a list of barcodes.) The keys_list is a series of barcodes representing a row from the barcodes file (i.e. 1 barcode from each dimension, in order). Note: in_dict must be "l()" where "l" = lambda:defaultdict(l).''' setNDApproxDictValHelper(in_dict, keys_list, mismatches, []) def setNDApproxDictValHelper(in_dict, keys_list, mismatches, real_path): '''A recursive helper of setNDApproxDictVal which tracks the "real barcode path" to use in the list of real barcode paths at the end of the dictionary. Note that barcodes may have a 1:many relationship, indicated by a barcode being repeated in earlier columns of the barcode file. Different paths may also intersect given the number of allowed mismatches. ''' l = lambda:defaultdict(l) real_path.append(keys_list[0]) if len(keys_list) == 1: #print "Doing last key %s" %(keys_list[0]) for bcl in getMismatchedBarcodes(keys_list[0],mismatches): #print "Processing mismatched barcode record: %s" % (bcl) nmm = bcl[0] bc = bcl[1] if (bc in in_dict and type(in_dict[bc][1]) is list and len(in_dict[bc][1]) > 0 and not real_path in in_dict[bc][1]): #print ("Adding real path %s to the one existing at barcode " # "%s: %s whose type should be a list: %s" # ) % (real_path, bc, in_dict[bc], type(in_dict[bc])) #If the number of mismatches is the same for this barcode #variant as it is for the real paths that have already been #added to the real paths list, append this real path if nmm == in_dict[bc][0]: in_dict[bc][1] += [real_path] #Else if the number of mismatches is less for this barcode #variant as it is for the real paths that have already been #added to the real paths list, overwrite the old list elif nmm < in_dict[bc][0]: in_dict[bc] = [nmm, [real_path]] elif not (bc in in_dict and type(in_dict[bc][1]) is list and len(in_dict[bc][1]) > 0): #print ("Setting new real path %s to real path list for " # "barcode %s: %s whose type should be a list: %s" # ) % (real_path, bc, in_dict[bc], type(in_dict[bc])) in_dict[bc] = [nmm, [real_path]] #Else: skip poorer match else: #print "Doing first key %s" %(keys_list[0]) for bcl in getMismatchedBarcodes(keys_list[0],mismatches): nmm = bcl[0] bc = bcl[1] if bc not in in_dict: in_dict[bc] = [nmm, l()] setNDApproxDictValHelper(in_dict[bc][1], keys_list[1:], mismatches, real_path[:]) elif (bc in in_dict and nmm == in_dict[bc][0]): setNDApproxDictValHelper(in_dict[bc][1], keys_list[1:], mismatches, real_path[:]) elif (bc in in_dict and nmm < in_dict[bc][0]): #print "Type of in_dict[%s] is: %s Resetting list." % (bc, type(in_dict[bc])) in_dict[bc] = [nmm, l()] #in_dict[bc][0] = nmm #in_dict[bc][1] = l() setNDApproxDictValHelper(in_dict[bc][1], keys_list[1:], mismatches, real_path[:]) def reduceNDDictPaths(in_dict): '''After an approximate barcode dictionary has been fully created by setNDApproxDictVal, this method reduces the lists of barcode paths to individual discrete paths or individual generic MATCHED/MULTIMATCHED paths. ''' if isinstance(in_dict,dict) and len(in_dict) > 0: arb_bc = in_dict.iterkeys().next() #print "Checking arbitrary barcode to see what level of the dictionary we're at: %s" % (arb_bc) #print "The keys of this level of the dictionary are: %s" % (in_dict.keys()) if (isinstance(in_dict,dict) and len(in_dict) > 0 and isinstance(in_dict[arb_bc][1],dict)): for bc in in_dict: #Reduction (skipping the list...) in_dict[bc] = in_dict[bc][1] reduceNDDictPaths(in_dict[bc]) #Else if the value is a list of lists elif (isinstance(in_dict,dict) and len(in_dict) > 0 and isinstance(in_dict[in_dict.keys()[0]][1],list)): #For each last barcode in the dictionary for bc in in_dict: #Reduction, skipping the outer list in_dict[bc] = in_dict[bc][1] #If the value at the leaf of the dict is a list of lists if (len(in_dict[bc]) > 0 and isinstance(in_dict[bc],list) and len(in_dict[bc][0]) > 0 and isinstance(in_dict[bc][0],list)): #If the number of paths is 1 if len(in_dict[bc]) == 1: in_dict[bc] = in_dict[bc][0] else: inner_len = len(in_dict[bc][0]) ref_path = in_dict[bc][0] new_path = [] for bci in xrange(0,inner_len): all_matched = True for path in in_dict[bc][1:]: if path[bci] != ref_path[bci]: all_matched = False break if all_matched is True: new_path.append(MATCHED) else: new_path.append(MULTIMATCHED) in_dict[bc] = new_path def getMismatchedBarcodes(barcode,mismatches): '''Given a barcode sequence and a number of allowed mismatches, generate a list of barcodes representing all possible matching sequences with the allowed number of mismatches.''' return(getMismatchedBarcodesHelper(barcode,mismatches,[])) def getMismatchedBarcodesHelper(barcode,mismatches,skips): offbys = [[0, barcode]] if mismatches <= 0: return(offbys) for p in xrange(0,len(barcode)): if p in skips: continue #Keep track of bases that have already been mutated newskips = skips + [p] if barcode[p] != 'N': offbys.append([len(newskips), barcode[:p] + 'N' + barcode[p+1:]]) if barcode[p] != 'A': offbys.append([len(newskips), barcode[:p] + 'A' + barcode[p+1:]]) if barcode[p] != 'T': offbys.append([len(newskips), barcode[:p] + 'T' + barcode[p+1:]]) if barcode[p] != 'G': offbys.append([len(newskips), barcode[:p] + 'G' + barcode[p+1:]]) if barcode[p] != 'C': offbys.append([len(newskips), barcode[:p] + 'C' + barcode[p+1:]]) #Recurse to add next allowed mismatch if mismatches > 1: if barcode[p] != 'N': offbys += getMismatchedBarcodesHelper(barcode[:p] + 'N' + barcode[p+1:], mismatches - 1, newskips) if barcode[p] != 'A': offbys += getMismatchedBarcodesHelper(barcode[:p] + 'A' + barcode[p+1:], mismatches - 1, newskips) if barcode[p] != 'T': offbys += getMismatchedBarcodesHelper(barcode[:p] + 'T' + barcode[p+1:], mismatches - 1, newskips) if barcode[p] != 'G': offbys += getMismatchedBarcodesHelper(barcode[:p] + 'G' + barcode[p+1:], mismatches - 1, newskips) if barcode[p] != 'C': offbys += getMismatchedBarcodesHelper(barcode[:p] + 'C' + barcode[p+1:], mismatches - 1, newskips) return(offbys) def exeExists(program): '''Determines whether an executable exists (i.e. is in the user\'s path)''' import os def is_exe(fpath): return os.path.isfile(fpath) and os.access(fpath, os.X_OK) fpath, fname = os.path.split(program) if fpath: if is_exe(program): return True else: for path in os.environ["PATH"].split(os.pathsep): path = path.strip('"') exe_file = os.path.join(path, program) if is_exe(exe_file): return True return False def run_gzip_in(infile): '''Takes a gzipped file and returns an iterator on the unzipped lines of the file using a system call to gzip.''' p = subprocess.Popen(["gzip", "-dc", infile], stdout=subprocess.PIPE, bufsize=1) return iter(p.stdout.readline, b'') def run_gzip_out(outfile): p = subprocess.Popen(["gzip"], stdin=subprocess.PIPE, stdout=open(outfile,'wb'), bufsize=-1) return p.stdin if __name__ == '__main__': sys.exit(main())