diff change_o/DefineClones.py @ 52:22dddabe3637 draft
Uploaded
| author | davidvanzessen |
| --- | --- |
| date | Tue, 23 May 2017 08:32:58 -0400 |
| parents | c33d93683a09 |
| children | |
--- a/change_o/DefineClones.py	Tue May 16 08:52:48 2017 -0400
+++ b/change_o/DefineClones.py	Tue May 23 08:32:58 2017 -0400
@@ -10,6 +10,7 @@
 import os
 import re
 import sys
+import csv
 import numpy as np
 from argparse import ArgumentParser
 from collections import OrderedDict
@@ -25,56 +26,108 @@
 from presto.Multiprocessing import manageProcesses
 from presto.Sequence import getDNAScoreDict
 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
-from changeo.Distance import getDNADistMatrix, getAADistMatrix, \
-                             hs1f_model, m1n_model, hs5f_model, \
-                             calcDistances, formClusters
+from changeo.Distance import distance_models, calcDistances, formClusters
 from changeo.IO import getDbWriter, readDbFile, countDbFile
 from changeo.Multiprocessing import DbData, DbResult
 
+## Set maximum field size for csv.reader
+csv.field_size_limit(sys.maxsize)
+
 # Defaults
 default_translate = False
 default_distance = 0.0
-default_bygroup_model = 'hs1f'
+default_index_mode = 'gene'
+default_index_action = 'set'
+default_bygroup_model = 'ham'
 default_hclust_model = 'chen2010'
 default_seq_field = 'JUNCTION'
 default_norm = 'len'
 default_sym = 'avg'
 default_linkage = 'single'
-
-# TODO:  should be in Distance, but need to be after function definitions
-# Amino acid Hamming distance
-aa_model = getAADistMatrix(mask_dist=1, gap_dist=0)
-
-# DNA Hamming distance
-ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0)
+choices_bygroup_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f', 'mk_rs1nf', 'mk_rs5nf', 'hs1f_compat', 'm1n_compat')
 
-# TODO:  this function is an abstraction to facilitate later cleanup
-def getModelMatrix(model):
+def indexByIdentity(index, key, rec, fields=None):
     """
-    Simple wrapper to get distance matrix from model name
+    Updates a preclone index with a simple key
+
+    Arguments:
+    index = preclone index from indexJunctions
+    key = index key
+    rec = IgRecord to add to the index
+    fields = additional annotation fields to use to group preclones;
+             if None use only V, J and junction length
+
+    Returns:
+    None. Updates index with new key and records.
+    """
+    index.setdefault(tuple(key), []).append(rec)
+
+
+def indexByUnion(index, key, rec, fields=None):
+    """
+    Updates a preclone index with the union of nested keys
 
     Arguments:
-    model = model name
+    index = preclone index from indexJunctions
+    key = index key
+    rec = IgRecord to add to the index
+    fields = additional annotation fields to use to group preclones;
+             if None use only V, J and junction length
 
-    Return:
-    a pandas.DataFrame containing the character distance matrix
+    Returns:
+    None. Updates index with new key and records.
     """
-    if model == 'aa':
-        return(aa_model)
-    elif model == 'ham':
-        return(ham_model)
-    elif model == 'm1n':
-        return(m1n_model)
-    elif model == 'hs1f':
-        return(hs1f_model)
-    elif model == 'hs5f':
-        return(hs5f_model)
+    # List of values for this/new key
+    val = [rec]
+    f_range = list(range(2, 3 + (len(fields) if fields else 0)))
+
+    # See if field/junction length combination exists in index
+    outer_dict = index
+    for field in f_range:
+        try:
+            outer_dict = outer_dict[key[field]]
+        except (KeyError):
+            outer_dict = None
+            break
+    # If field combination exists, look through Js
+    j_matches = []
+    if outer_dict is not None:
+        for j in outer_dict.keys():
+            if not set(key[1]).isdisjoint(set(j)):
+                key[1] = tuple(set(key[1]).union(set(j)))
+                j_matches += [j]
+    # If J overlap exists, look through Vs for each J
+    for j in j_matches:
+        v_matches = []
+        # Collect V matches for this J
+        for v in outer_dict[j].keys():
+            if not set(key[0]).isdisjoint(set(v)):
+                key[0] = tuple(set(key[0]).union(set(v)))
+                v_matches += [v]
+        # If there are V overlaps for this J, pop them out
+        if v_matches:
+            val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))
+        # If the J dict is now empty, remove it
+        if not outer_dict[j]:
+            outer_dict.pop(j, None)
+
+    # Add value(s) into index nested dictionary
+    # OMG Python pointers are the best!
+    # Add field dictionaries into index
+    outer_dict = index
+    for field in f_range:
+        outer_dict.setdefault(key[field], {})
+        outer_dict = outer_dict[key[field]]
+    # Add J, then V into index
+    if key[1] in outer_dict:
+        outer_dict[key[1]].update({key[0]: val})
     else:
-        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
+        outer_dict[key[1]] = {key[0]: val}
 
 
-def indexJunctions(db_iter, fields=None, mode='gene', action='first'):
+def indexJunctions(db_iter, fields=None, mode=default_index_mode,
+                   action=default_index_action):
     """
     Identifies preclonal groups by V, J and junction length
 
@@ -90,27 +143,47 @@
     Returns:
     a dictionary of {(V, J, junction length):[IgRecords]}
     """
+    # print(fields)
     # Define functions for grouping keys
     if mode == 'allele' and fields is None:
         def _get_key(rec, act):
-            return (rec.getVAllele(act), rec.getJAllele(act),
-                    None if rec.junction is None else len(rec.junction))
+            return [rec.getVAllele(act), rec.getJAllele(act),
+                    None if rec.junction is None else len(rec.junction)]
     elif mode == 'gene' and fields is None:
         def _get_key(rec, act):
-            return (rec.getVGene(act), rec.getJGene(act),
-                    None if rec.junction is None else len(rec.junction))
+            return [rec.getVGene(act), rec.getJGene(act),
+                    None if rec.junction is None else len(rec.junction)]
     elif mode == 'allele' and fields is not None:
         def _get_key(rec, act):
             vdj = [rec.getVAllele(act), rec.getJAllele(act),
                    None if rec.junction is None else len(rec.junction)]
             ann = [rec.toDict().get(k, None) for k in fields]
-            return tuple(chain(vdj, ann))
+            return list(chain(vdj, ann))
     elif mode == 'gene' and fields is not None:
         def _get_key(rec, act):
             vdj = [rec.getVGene(act), rec.getJGene(act),
                    None if rec.junction is None else len(rec.junction)]
             ann = [rec.toDict().get(k, None) for k in fields]
-            return tuple(chain(vdj, ann))
+            return list(chain(vdj, ann))
+
+    # Function to flatten nested dictionary
+    def _flatten_dict(d, parent_key=''):
+        items = []
+        for k, v in d.items():
+            new_key = parent_key + [k] if parent_key else [k]
+            if isinstance(v, dict):
+                items.extend(_flatten_dict(v, new_key).items())
+            else:
+                items.append((new_key, v))
+        flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items}
+        return flat_dict
+
+    if action == 'first':
+        index_func = indexByIdentity
+    elif action == 'set':
+        index_func = indexByUnion
+    else:
+        sys.stderr.write('Unrecognized action: %s.\n' % action)
 
     start_time = time()
     clone_index = {}
@@ -127,35 +200,16 @@
 
         # Assigned passed preclone records to key and failed to index None
         if all([k is not None and k != '' for k in key]):
-            #print key
-            # TODO:  Has much slow. Should have less slow.
-            if action == 'set':
-
-                f_range = list(range(2, 3 + (len(fields) if fields else 0)))
-                vdj_range = list(range(2))
-
-                # Check for any keys that have matching columns and junction length and overlapping genes/alleles
-                to_remove = []
-                if len(clone_index) > (1 if None in clone_index else 0) and key not in clone_index:
-                    key = list(key)
-                    for k in clone_index:
-                        if k is not None and all([key[i] == k[i] for i in f_range]):
-                            if all([not set(key[i]).isdisjoint(set(k[i])) for i in vdj_range]):
-                                for i in vdj_range:  key[i] = tuple(set(key[i]).union(set(k[i])))
-                                to_remove.append(k)
-
-                # Remove original keys, replace with union of all genes/alleles and append values to new key
-                val = [rec]
-                val += list(chain(*(clone_index.pop(k) for k in to_remove)))
-                clone_index[tuple(key)] = clone_index.get(tuple(key),[]) + val
-
-            elif action == 'first':
-                clone_index.setdefault(key, []).append(rec)
+            # Update index dictionary
+            index_func(clone_index, key, rec, fields)
         else:
            clone_index.setdefault(None, []).append(rec)
 
     printProgress(rec_count, step=1000, start_time=start_time, end=True)
 
+    if action == 'set':
+        clone_index = _flatten_dict(clone_index)
+
     return clone_index
 
 
@@ -179,15 +233,20 @@
     a dictionary of lists defining {clone number: [IgRecords clonal group]}
     """
     # Get distance matrix if not provided
-    if dist_mat is None:  dist_mat = getModelMatrix(model)
+    if dist_mat is None:
+        try:
+            dist_mat = distance_models[model]
+        except KeyError:
+            sys.exit('Unrecognized distance model: %s' % args_dict['model'])
 
+    # TODO:  can be cleaned up with abstract model class
     # Determine length of n-mers
-    if model in ['hs1f', 'm1n', 'aa', 'ham']:
+    if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
         nmer_len = 1
-    elif model in ['hs5f']:
+    elif model in ['hh_s5f', 'mk_rs5nf']:
         nmer_len = 5
     else:
-        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
+        sys.exit('Unrecognized distance model: %s.\n' % model)
 
     # Define unique junction mapping
     seq_map = {}
@@ -197,7 +256,7 @@
         if len(seq) == 0:
             return None
 
-        seq = re.sub('[\.-]','N', str(seq))
+        seq = re.sub('[\.-]', 'N', str(seq))
         if model == 'aa':  seq = translate(seq)
 
         seq_map.setdefault(seq, []).append(ig)
@@ -210,7 +269,7 @@
     seqs = list(seq_map.keys())
 
     # Calculate pairwise distance matrix
-    dists = calcDistances(seqs, nmer_len, dist_mat, norm, sym)
+    dists = calcDistances(seqs, nmer_len, dist_mat, sym=sym, norm=norm)
 
     # Perform hierarchical clustering
     clusters = formClusters(dists, linkage, distance)
@@ -449,6 +508,7 @@
 
     # Define result object for iteration and get data records
     records = data.data
+    # print(data.id)
     result = DbResult(data.id, records)
 
     # Check for invalid data (due to failed indexing) and add failed result
@@ -901,7 +961,7 @@
                     database with records failing clonal grouping.
 
                 required fields:
-                    SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION_LENGTH
+                    SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION
 
                 <field>
                     sequence field specified by the --sf parameter
@@ -926,32 +986,36 @@
 
     # Distance cloning method
     parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter,
-                                        help='''Defines clones as having same V assignment,
-                                              J assignment, and junction length with
-                                              specified substitution distance model.''')
+                                           formatter_class=CommonHelpFormatter,
+                                           help='''Defines clones as having same V assignment,
+                                                J assignment, and junction length with
+                                                specified substitution distance model.''',
+                                           description='''Defines clones as having same V assignment,
+                                                J assignment, and junction length with
+                                                specified substitution distance model.''')
     parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
                              help='Additional fields to use for grouping clones (non VDJ)')
     parser_bygroup.add_argument('--mode', action='store', dest='mode',
-                             choices=('allele', 'gene'), default='gene',
+                             choices=('allele', 'gene'), default=default_index_mode,
                              help='''Specifies whether to use the V(D)J allele or gene for
                                   initial grouping.''')
-    parser_bygroup.add_argument('--act', action='store', dest='action', default='set',
-                             choices=('first', 'set'),
+    parser_bygroup.add_argument('--act', action='store', dest='action',
+                             choices=('first', 'set'), default=default_index_action,
                              help='''Specifies how to handle multiple V(D)J assignments
                                   for initial grouping.''')
     parser_bygroup.add_argument('--model', action='store', dest='model',
-                             choices=('aa', 'ham', 'm1n', 'hs1f', 'hs5f'),
+                             choices=choices_bygroup_model,
                              default=default_bygroup_model,
-                             help='''Specifies which substitution model to use for
-                                  calculating distance between sequences. Where m1n is the
-                                  mouse single nucleotide transition/trasversion model
-                                  of Smith et al, 1996; hs1f is the human single
-                                  nucleotide model derived from Yaari et al, 2013; hs5f
-                                  is the human S5F model of Yaari et al, 2013; ham is
-                                  nucleotide Hamming distance; and aa is amino acid
-                                  Hamming distance. The hs5f data should be
-                                  considered experimental.''')
+                             help='''Specifies which substitution model to use for calculating distance
+                                  between sequences. The "ham" model is nucleotide Hamming distance and
+                                  "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
+                                  human specific single nucleotide and 5-mer content models, respectively,
+                                  from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
+                                  mouse specific single nucleotide and 5-mer content models, respectively,
+                                  from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
+                                  deprecated models provided backwards compatibility with the "m1n" and
+                                  "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
+                                  5-mer models should be considered experimental.''')
     parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float,
                              default=default_distance,
                             help='The distance threshold for clonal grouping')
@@ -977,22 +1041,25 @@
     parser_bygroup.set_defaults(group_func=indexJunctions)
     parser_bygroup.set_defaults(clone_func=distanceClones)
 
-
-    # Hierarchical clustering cloning method
-    parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent],
+    # Chen2010
+    parser_chen = subparsers.add_parser('chen2010', parents=[parser_parent],
                                         formatter_class=CommonHelpFormatter,
-                                        help='Defines clones by specified distance metric on CDR3s and \
-                                              cutting of hierarchical clustering tree')
-#    parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
-#                             help='Fields to use for grouping clones (non VDJ)')
-    parser_hclust.add_argument('--method', action='store', dest='method',
-                             choices=('chen2010', 'ademokun2011'), default=default_hclust_model,
-                             help='Specifies which cloning method to use for calculating distance \
-                                   between CDR3s, computing linkage, and cutting clusters')
-    parser_hclust.set_defaults(feed_func=feedQueueClust)
-    parser_hclust.set_defaults(work_func=processQueueClust)
-    parser_hclust.set_defaults(collect_func=collectQueueClust)
-    parser_hclust.set_defaults(cluster_func=hierClust)
+                                        help='''Defines clones by method specified in Chen, 2010.''',
+                                        description='''Defines clones by method specified in Chen, 2010.''')
+    parser_chen.set_defaults(feed_func=feedQueueClust)
+    parser_chen.set_defaults(work_func=processQueueClust)
+    parser_chen.set_defaults(collect_func=collectQueueClust)
+    parser_chen.set_defaults(cluster_func=hierClust)
+
+    # Ademokun2011
+    parser_ade = subparsers.add_parser('ademokun2011', parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter,
+                                        help='''Defines clones by method specified in Ademokun, 2011.''',
+                                        description='''Defines clones by method specified in Ademokun, 2011.''')
+    parser_ade.set_defaults(feed_func=feedQueueClust)
+    parser_ade.set_defaults(work_func=processQueueClust)
+    parser_ade.set_defaults(collect_func=collectQueueClust)
+    parser_ade.set_defaults(cluster_func=hierClust)
 
     return parser
 
@@ -1023,8 +1090,11 @@
                                   'linkage': args_dict['linkage'],
                                   'seq_field': args_dict['seq_field']}
 
-        # TODO:  can be cleaned up with abstract model class
-        args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model'])
+        # Get distance matrix
+        try:
+            args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']]
+        except KeyError:
+            sys.exit('Unrecognized distance model: %s' % args_dict['model'])
 
         del args_dict['fields']
         del args_dict['action']
@@ -1037,16 +1107,17 @@
         del args_dict['seq_field']
 
     # Define clone_args
-    if args.command == 'hclust':
-        dist_funcs = {'chen2010':distChen2010, 'ademokun2011':distAdemokun2011}
-        args_dict['clone_func'] = dist_funcs[args_dict['method']]
-        args_dict['cluster_args'] = {'method':  args_dict['method']}
-        #del args_dict['fields']
-        del args_dict['method']
+    if args.command == 'chen2010':
+        args_dict['clone_func'] = distChen2010
+        args_dict['cluster_args'] = {'method': args.command }
+
+    if args.command == 'ademokun2011':
+        args_dict['clone_func'] = distAdemokun2011
+        args_dict['cluster_args'] = {'method': args.command }
 
     # Call defineClones
     del args_dict['command']
     del args_dict['db_files']
     for f in args.__dict__['db_files']:
         args_dict['db_file'] = f
-        defineClones(**args_dict)
\ No newline at end of file
+        defineClones(**args_dict)
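For orientation, the core API shift in this changeset is from the removed getModelMatrix() wrapper to a direct lookup in changeo.Distance.distance_models, as the hunks above show. Below is a minimal sketch of that lookup, assuming a Change-O release with distance_models (0.3.4 or later) is installed; the chosen model name and the error handling simply mirror the new code and are not themselves part of the changeset.

```python
import sys

# distance_models is the name -> distance-matrix mapping the new code imports
# in place of the old per-model variables (ham_model, hs1f_model, ...).
from changeo.Distance import distance_models

# 'ham' is the new default for --model; any name in choices_bygroup_model
# ('aa', 'hh_s1f', 'hh_s5f', 'mk_rs1nf', 'mk_rs5nf', ...) is looked up the same way.
model = 'ham'
try:
    # Per the old getModelMatrix docstring, the value is a character distance matrix.
    dist_mat = distance_models[model]
except KeyError:
    sys.exit('Unrecognized distance model: %s' % model)
```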