comparison change_o/DefineClones.py @ 52:22dddabe3637 draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Tue, 23 May 2017 08:32:58 -0400 |
| parents | c33d93683a09 |
| children | |
| 51:8fa8836bd605 | 52:22dddabe3637 |
|---|---|
| 8 | 8 |
| 9 # Imports | 9 # Imports |
| 10 import os | 10 import os |
| 11 import re | 11 import re |
| 12 import sys | 12 import sys |
| | 13 import csv |
| 13 import numpy as np | 14 import numpy as np |
| 14 from argparse import ArgumentParser | 15 from argparse import ArgumentParser |
| 15 from collections import OrderedDict | 16 from collections import OrderedDict |
| 16 from itertools import chain | 17 from itertools import chain |
| 17 from textwrap import dedent | 18 from textwrap import dedent |
| 23 from presto.Defaults import default_out_args | 24 from presto.Defaults import default_out_args |
| 24 from presto.IO import getFileType, getOutputHandle, printLog, printProgress | 25 from presto.IO import getFileType, getOutputHandle, printLog, printProgress |
| 25 from presto.Multiprocessing import manageProcesses | 26 from presto.Multiprocessing import manageProcesses |
| 26 from presto.Sequence import getDNAScoreDict | 27 from presto.Sequence import getDNAScoreDict |
| 27 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs | 28 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs |
| 28 from changeo.Distance import getDNADistMatrix, getAADistMatrix, \ | 29 from changeo.Distance import distance_models, calcDistances, formClusters |
| 29 hs1f_model, m1n_model, hs5f_model, \ | |
| 30 calcDistances, formClusters | |
| 31 from changeo.IO import getDbWriter, readDbFile, countDbFile | 30 from changeo.IO import getDbWriter, readDbFile, countDbFile |
| 32 from changeo.Multiprocessing import DbData, DbResult | 31 from changeo.Multiprocessing import DbData, DbResult |
| | 32 |
| | 33 ## Set maximum field size for csv.reader |
| | 34 csv.field_size_limit(sys.maxsize) |
| 33 | 35 |
| 34 # Defaults | 36 # Defaults |
| 35 default_translate = False | 37 default_translate = False |
| 36 default_distance = 0.0 | 38 default_distance = 0.0 |
| 37 default_bygroup_model = 'hs1f' | 39 default_index_mode = 'gene' |
| | 40 default_index_action = 'set' |
| | 41 default_bygroup_model = 'ham' |
| 38 default_hclust_model = 'chen2010' | 42 default_hclust_model = 'chen2010' |
| 39 default_seq_field = 'JUNCTION' | 43 default_seq_field = 'JUNCTION' |
| 40 default_norm = 'len' | 44 default_norm = 'len' |
| 41 default_sym = 'avg' | 45 default_sym = 'avg' |
| 42 default_linkage = 'single' | 46 default_linkage = 'single' |
| 43 | 47 choices_bygroup_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f', 'mk_rs1nf', 'mk_rs5nf', 'hs1f_compat', 'm1n_compat') |
| 44 # TODO: should be in Distance, but need to be after function definitions | 48 |
| 45 # Amino acid Hamming distance | 49 |
| 46 aa_model = getAADistMatrix(mask_dist=1, gap_dist=0) | 50 def indexByIdentity(index, key, rec, fields=None): |
| 47 | 51 """ |
| 48 # DNA Hamming distance | 52 Updates a preclone index with a simple key |
| 49 ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0) | |
| 50 | |
| 51 | |
| 52 # TODO: this function is an abstraction to facilitate later cleanup | |
| 53 def getModelMatrix(model): | |
| 54 """ | |
| 55 Simple wrapper to get distance matrix from model name | |
| 56 | 53 |
| 57 Arguments: | 54 Arguments: |
| 58 model = model name | 55 index = preclone index from indexJunctions |
| 59 | 56 key = index key |
| 60 Return: | 57 rec = IgRecord to add to the index |
| 61 a pandas.DataFrame containing the character distance matrix | 58 fields = additional annotation fields to use to group preclones; |
| 62 """ | 59 if None use only V, J and junction length |
| 63 if model == 'aa': | 60 |
| 64 return(aa_model) | 61 Returns: |
| 65 elif model == 'ham': | 62 None. Updates index with new key and records. |
| 66 return(ham_model) | 63 """ |
| 67 elif model == 'm1n': | 64 index.setdefault(tuple(key), []).append(rec) |
| 68 return(m1n_model) | 65 |
| 69 elif model == 'hs1f': | 66 |
| 70 return(hs1f_model) | 67 def indexByUnion(index, key, rec, fields=None): |
| 71 elif model == 'hs5f': | 68 """ |
| 72 return(hs5f_model) | 69 Updates a preclone index with the union of nested keys |
| | 70 |
| | 71 Arguments: |
| | 72 index = preclone index from indexJunctions |
| | 73 key = index key |
| | 74 rec = IgRecord to add to the index |
| | 75 fields = additional annotation fields to use to group preclones; |
| | 76 if None use only V, J and junction length |
| | 77 |
| | 78 Returns: |
| | 79 None. Updates index with new key and records. |
| | 80 """ |
| | 81 # List of values for this/new key |
| | 82 val = [rec] |
| | 83 f_range = list(range(2, 3 + (len(fields) if fields else 0))) |
| | 84 |
| | 85 # See if field/junction length combination exists in index |
| | 86 outer_dict = index |
| | 87 for field in f_range: |
| | 88 try: |
| | 89 outer_dict = outer_dict[key[field]] |
| | 90 except (KeyError): |
| | 91 outer_dict = None |
| | 92 break |
| | 93 # If field combination exists, look through Js |
| | 94 j_matches = [] |
| | 95 if outer_dict is not None: |
| | 96 for j in outer_dict.keys(): |
| | 97 if not set(key[1]).isdisjoint(set(j)): |
| | 98 key[1] = tuple(set(key[1]).union(set(j))) |
| | 99 j_matches += [j] |
| | 100 # If J overlap exists, look through Vs for each J |
| | 101 for j in j_matches: |
| | 102 v_matches = [] |
| | 103 # Collect V matches for this J |
| | 104 for v in outer_dict[j].keys(): |
| | 105 if not set(key[0]).isdisjoint(set(v)): |
| | 106 key[0] = tuple(set(key[0]).union(set(v))) |
| | 107 v_matches += [v] |
| | 108 # If there are V overlaps for this J, pop them out |
| | 109 if v_matches: |
| | 110 val += list(chain(*(outer_dict[j].pop(v) for v in v_matches))) |
| | 111 # If the J dict is now empty, remove it |
| | 112 if not outer_dict[j]: |
| | 113 outer_dict.pop(j, None) |
| | 114 |
| | 115 # Add value(s) into index nested dictionary |
| | 116 # OMG Python pointers are the best! |
| | 117 # Add field dictionaries into index |
| | 118 outer_dict = index |
| | 119 for field in f_range: |
| | 120 outer_dict.setdefault(key[field], {}) |
| | 121 outer_dict = outer_dict[key[field]] |
| | 122 # Add J, then V into index |
| | 123 if key[1] in outer_dict: |
| | 124 outer_dict[key[1]].update({key[0]: val}) |
| 73 else: | 125 else: |
| 74 sys.stderr.write('Unrecognized distance model: %s.\n' % model) | 126 outer_dict[key[1]] = {key[0]: val} |
| 75 | 127 |
| 76 | 128 |
| 77 def indexJunctions(db_iter, fields=None, mode='gene', action='first'): | 129 def indexJunctions(db_iter, fields=None, mode=default_index_mode, |
| | 130 action=default_index_action): |
| 78 """ | 131 """ |
| 79 Identifies preclonal groups by V, J and junction length | 132 Identifies preclonal groups by V, J and junction length |
| 80 | 133 |
| 81 Arguments: | 134 Arguments: |
| 82 db_iter = an iterator of IgRecords defined by readDbFile | 135 db_iter = an iterator of IgRecords defined by readDbFile |
| 88 one of ('first', 'set') | 141 one of ('first', 'set') |
| 89 | 142 |
| 90 Returns: | 143 Returns: |
| 91 a dictionary of {(V, J, junction length):[IgRecords]} | 144 a dictionary of {(V, J, junction length):[IgRecords]} |
| 92 """ | 145 """ |
| | 146 # print(fields) |
| 93 # Define functions for grouping keys | 147 # Define functions for grouping keys |
| 94 if mode == 'allele' and fields is None: | 148 if mode == 'allele' and fields is None: |
| 95 def _get_key(rec, act): | 149 def _get_key(rec, act): |
| 96 return (rec.getVAllele(act), rec.getJAllele(act), | 150 return [rec.getVAllele(act), rec.getJAllele(act), |
| 97 None if rec.junction is None else len(rec.junction)) | 151 None if rec.junction is None else len(rec.junction)] |
| 98 elif mode == 'gene' and fields is None: | 152 elif mode == 'gene' and fields is None: |
| 99 def _get_key(rec, act): | 153 def _get_key(rec, act): |
| 100 return (rec.getVGene(act), rec.getJGene(act), | 154 return [rec.getVGene(act), rec.getJGene(act), |
| 101 None if rec.junction is None else len(rec.junction)) | 155 None if rec.junction is None else len(rec.junction)] |
| 102 elif mode == 'allele' and fields is not None: | 156 elif mode == 'allele' and fields is not None: |
| 103 def _get_key(rec, act): | 157 def _get_key(rec, act): |
| 104 vdj = [rec.getVAllele(act), rec.getJAllele(act), | 158 vdj = [rec.getVAllele(act), rec.getJAllele(act), |
| 105 None if rec.junction is None else len(rec.junction)] | 159 None if rec.junction is None else len(rec.junction)] |
| 106 ann = [rec.toDict().get(k, None) for k in fields] | 160 ann = [rec.toDict().get(k, None) for k in fields] |
| 107 return tuple(chain(vdj, ann)) | 161 return list(chain(vdj, ann)) |
| 108 elif mode == 'gene' and fields is not None: | 162 elif mode == 'gene' and fields is not None: |
| 109 def _get_key(rec, act): | 163 def _get_key(rec, act): |
| 110 vdj = [rec.getVGene(act), rec.getJGene(act), | 164 vdj = [rec.getVGene(act), rec.getJGene(act), |
| 111 None if rec.junction is None else len(rec.junction)] | 165 None if rec.junction is None else len(rec.junction)] |
| 112 ann = [rec.toDict().get(k, None) for k in fields] | 166 ann = [rec.toDict().get(k, None) for k in fields] |
| 113 return tuple(chain(vdj, ann)) | 167 return list(chain(vdj, ann)) |
| | 168 |
| | 169 # Function to flatten nested dictionary |
| | 170 def _flatten_dict(d, parent_key=''): |
| | 171 items = [] |
| | 172 for k, v in d.items(): |
| | 173 new_key = parent_key + [k] if parent_key else [k] |
| | 174 if isinstance(v, dict): |
| | 175 items.extend(_flatten_dict(v, new_key).items()) |
| | 176 else: |
| | 177 items.append((new_key, v)) |
| | 178 flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items} |
| | 179 return flat_dict |
| | 180 |
| | 181 if action == 'first': |
| | 182 index_func = indexByIdentity |
| | 183 elif action == 'set': |
| | 184 index_func = indexByUnion |
| | 185 else: |
| | 186 sys.stderr.write('Unrecognized action: %s.\n' % action) |
| 114 | 187 |
| 115 start_time = time() | 188 start_time = time() |
| 116 clone_index = {} | 189 clone_index = {} |
| 117 rec_count = 0 | 190 rec_count = 0 |
| 118 for rec in db_iter: | 191 for rec in db_iter: |
| 125 printProgress(rec_count, step=1000, start_time=start_time) | 198 printProgress(rec_count, step=1000, start_time=start_time) |
| 126 rec_count += 1 | 199 rec_count += 1 |
| 127 | 200 |
| 128 # Assigned passed preclone records to key and failed to index None | 201 # Assigned passed preclone records to key and failed to index None |
| 129 if all([k is not None and k != '' for k in key]): | 202 if all([k is not None and k != '' for k in key]): |
| 130 #print key | 203 # Update index dictionary |
| 131 # TODO: Has much slow. Should have less slow. | 204 index_func(clone_index, key, rec, fields) |
| 132 if action == 'set': | |
| 133 | |
| 134 f_range = list(range(2, 3 + (len(fields) if fields else 0))) | |
| 135 vdj_range = list(range(2)) | |
| 136 | |
| 137 # Check for any keys that have matching columns and junction length and overlapping genes/alleles | |
| 138 to_remove = [] | |
| 139 if len(clone_index) > (1 if None in clone_index else 0) and key not in clone_index: | |
| 140 key = list(key) | |
| 141 for k in clone_index: | |
| 142 if k is not None and all([key[i] == k[i] for i in f_range]): | |
| 143 if all([not set(key[i]).isdisjoint(set(k[i])) for i in vdj_range]): | |
| 144 for i in vdj_range: key[i] = tuple(set(key[i]).union(set(k[i]))) | |
| 145 to_remove.append(k) | |
| 146 | |
| 147 # Remove original keys, replace with union of all genes/alleles and append values to new key | |
| 148 val = [rec] | |
| 149 val += list(chain(*(clone_index.pop(k) for k in to_remove))) | |
| 150 clone_index[tuple(key)] = clone_index.get(tuple(key),[]) + val | |
| 151 | |
| 152 elif action == 'first': | |
| 153 clone_index.setdefault(key, []).append(rec) | |
| 154 else: | 205 else: |
| 155 clone_index.setdefault(None, []).append(rec) | 206 clone_index.setdefault(None, []).append(rec) |
| 156 | 207 |
| 157 printProgress(rec_count, step=1000, start_time=start_time, end=True) | 208 printProgress(rec_count, step=1000, start_time=start_time, end=True) |
| | 209 |
| | 210 if action == 'set': |
| | 211 clone_index = _flatten_dict(clone_index) |
| 158 | 212 |
| 159 return clone_index | 213 return clone_index |
| 160 | 214 |
| 161 | 215 |
| 162 def distanceClones(records, model=default_bygroup_model, distance=default_distance, | 216 def distanceClones(records, model=default_bygroup_model, distance=default_distance, |
| 177 | 231 |
| 178 Returns: | 232 Returns: |
| 179 a dictionary of lists defining {clone number: [IgRecords clonal group]} | 233 a dictionary of lists defining {clone number: [IgRecords clonal group]} |
| 180 """ | 234 """ |
| 181 # Get distance matrix if not provided | 235 # Get distance matrix if not provided |
| 182 if dist_mat is None: dist_mat = getModelMatrix(model) | 236 if dist_mat is None: |
| 183 | 237 try: |
| | 238 dist_mat = distance_models[model] |
| | 239 except KeyError: |
| | 240 sys.exit('Unrecognized distance model: %s' % args_dict['model']) |
| | 241 |
| | 242 # TODO: can be cleaned up with abstract model class |
| 184 # Determine length of n-mers | 243 # Determine length of n-mers |
| 185 if model in ['hs1f', 'm1n', 'aa', 'ham']: | 244 if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']: |
| 186 nmer_len = 1 | 245 nmer_len = 1 |
| 187 elif model in ['hs5f']: | 246 elif model in ['hh_s5f', 'mk_rs5nf']: |
| 188 nmer_len = 5 | 247 nmer_len = 5 |
| 189 else: | 248 else: |
| 190 sys.stderr.write('Unrecognized distance model: %s.\n' % model) | 249 sys.exit('Unrecognized distance model: %s.\n' % model) |
| 191 | 250 |
| 192 # Define unique junction mapping | 251 # Define unique junction mapping |
| 193 seq_map = {} | 252 seq_map = {} |
| 194 for ig in records: | 253 for ig in records: |
| 195 seq = ig.getSeqField(seq_field) | 254 seq = ig.getSeqField(seq_field) |
| 196 # Check if sequence length is 0 | 255 # Check if sequence length is 0 |
| 197 if len(seq) == 0: | 256 if len(seq) == 0: |
| 198 return None | 257 return None |
| 199 | 258 |
| 200 seq = re.sub('[\.-]','N', str(seq)) | 259 seq = re.sub('[\.-]', 'N', str(seq)) |
| 201 if model == 'aa': seq = translate(seq) | 260 if model == 'aa': seq = translate(seq) |
| 202 | 261 |
| 203 seq_map.setdefault(seq, []).append(ig) | 262 seq_map.setdefault(seq, []).append(ig) |
| 204 | 263 |
| 205 # Process records | 264 # Process records |
| 208 | 267 |
| 209 # Define sequences | 268 # Define sequences |
| 210 seqs = list(seq_map.keys()) | 269 seqs = list(seq_map.keys()) |
| 211 | 270 |
| 212 # Calculate pairwise distance matrix | 271 # Calculate pairwise distance matrix |
| 213 dists = calcDistances(seqs, nmer_len, dist_mat, norm, sym) | 272 dists = calcDistances(seqs, nmer_len, dist_mat, sym=sym, norm=norm) |
| 214 | 273 |
| 215 # Perform hierarchical clustering | 274 # Perform hierarchical clustering |
| 216 clusters = formClusters(dists, linkage, distance) | 275 clusters = formClusters(dists, linkage, distance) |
| 217 | 276 |
| 218 # Turn clusters into clone dictionary | 277 # Turn clusters into clone dictionary |
| 447 # Exit upon reaching sentinel | 506 # Exit upon reaching sentinel |
| 448 if data is None: break | 507 if data is None: break |
| 449 | 508 |
| 450 # Define result object for iteration and get data records | 509 # Define result object for iteration and get data records |
| 451 records = data.data | 510 records = data.data |
| | 511 # print(data.id) |
| 452 result = DbResult(data.id, records) | 512 result = DbResult(data.id, records) |
| 453 | 513 |
| 454 # Check for invalid data (due to failed indexing) and add failed result | 514 # Check for invalid data (due to failed indexing) and add failed result |
| 455 if not data: | 515 if not data: |
| 456 result_queue.put(result) | 516 result_queue.put(result) |
| 899 database with assigned clonal group numbers. | 959 database with assigned clonal group numbers. |
| 900 clone-fail | 960 clone-fail |
| 901 database with records failing clonal grouping. | 961 database with records failing clonal grouping. |
| 902 | 962 |
| 903 required fields: | 963 required fields: |
| 904 SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION_LENGTH | 964 SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION |
| 905 | 965 |
| 906 <field> | 966 <field> |
| 907 sequence field specified by the --sf parameter | 967 sequence field specified by the --sf parameter |
| 908 | 968 |
| 909 output fields: | 969 output fields: |
| 924 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True, | 984 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True, |
| 925 multiproc=True) | 985 multiproc=True) |
| 926 | 986 |
| 927 # Distance cloning method | 987 # Distance cloning method |
| 928 parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent], | 988 parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent], |
| 929 formatter_class=CommonHelpFormatter, | 989 formatter_class=CommonHelpFormatter, |
| 930 help='''Defines clones as having same V assignment, | 990 help='''Defines clones as having same V assignment, |
| 931 J assignment, and junction length with | 991 J assignment, and junction length with |
| 932 specified substitution distance model.''') | 992 specified substitution distance model.''', |
| | 993 description='''Defines clones as having same V assignment, |
| | 994 J assignment, and junction length with |
| | 995 specified substitution distance model.''') |
| 933 parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None, | 996 parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None, |
| 934 help='Additional fields to use for grouping clones (non VDJ)') | 997 help='Additional fields to use for grouping clones (non VDJ)') |
| 935 parser_bygroup.add_argument('--mode', action='store', dest='mode', | 998 parser_bygroup.add_argument('--mode', action='store', dest='mode', |
| 936 choices=('allele', 'gene'), default='gene', | 999 choices=('allele', 'gene'), default=default_index_mode, |
| 937 help='''Specifies whether to use the V(D)J allele or gene for | 1000 help='''Specifies whether to use the V(D)J allele or gene for |
| 938 initial grouping.''') | 1001 initial grouping.''') |
| 939 parser_bygroup.add_argument('--act', action='store', dest='action', default='set', | 1002 parser_bygroup.add_argument('--act', action='store', dest='action', |
| 940 choices=('first', 'set'), | 1003 choices=('first', 'set'), default=default_index_action, |
| 941 help='''Specifies how to handle multiple V(D)J assignments | 1004 help='''Specifies how to handle multiple V(D)J assignments |
| 942 for initial grouping.''') | 1005 for initial grouping.''') |
| 943 parser_bygroup.add_argument('--model', action='store', dest='model', | 1006 parser_bygroup.add_argument('--model', action='store', dest='model', |
| 944 choices=('aa', 'ham', 'm1n', 'hs1f', 'hs5f'), | 1007 choices=choices_bygroup_model, |
| 945 default=default_bygroup_model, | 1008 default=default_bygroup_model, |
| 946 help='''Specifies which substitution model to use for | 1009 help='''Specifies which substitution model to use for calculating distance |
| 947 calculating distance between sequences. Where m1n is the | 1010 between sequences. The "ham" model is nucleotide Hamming distance and |
| 948 mouse single nucleotide transition/trasversion model | 1011 "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are |
| 949 of Smith et al, 1996; hs1f is the human single | 1012 human specific single nucleotide and 5-mer content models, respectively, |
| 950 nucleotide model derived from Yaari et al, 2013; hs5f | 1013 from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are |
| 951 is the human S5F model of Yaari et al, 2013; ham is | 1014 mouse specific single nucleotide and 5-mer content models, respectively, |
| 952 nucleotide Hamming distance; and aa is amino acid | 1015 from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are |
| 953 Hamming distance. The hs5f data should be | 1016 deprecated models provided backwards compatibility with the "m1n" and |
| 954 considered experimental.''') | 1017 "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both |
| | 1018 5-mer models should be considered experimental.''') |
| 955 parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float, | 1019 parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float, |
| 956 default=default_distance, | 1020 default=default_distance, |
| 957 help='The distance threshold for clonal grouping') | 1021 help='The distance threshold for clonal grouping') |
| 958 parser_bygroup.add_argument('--norm', action='store', dest='norm', | 1022 parser_bygroup.add_argument('--norm', action='store', dest='norm', |
| 959 choices=('len', 'mut', 'none'), default=default_norm, | 1023 choices=('len', 'mut', 'none'), default=default_norm, |
| 975 parser_bygroup.set_defaults(work_func=processQueue) | 1039 parser_bygroup.set_defaults(work_func=processQueue) |
| 976 parser_bygroup.set_defaults(collect_func=collectQueue) | 1040 parser_bygroup.set_defaults(collect_func=collectQueue) |
| 977 parser_bygroup.set_defaults(group_func=indexJunctions) | 1041 parser_bygroup.set_defaults(group_func=indexJunctions) |
| 978 parser_bygroup.set_defaults(clone_func=distanceClones) | 1042 parser_bygroup.set_defaults(clone_func=distanceClones) |
| 979 | 1043 |
| 980 | 1044 # Chen2010 |
| 981 # Hierarchical clustering cloning method | 1045 parser_chen = subparsers.add_parser('chen2010', parents=[parser_parent], |
| 982 parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent], | |
| 983 formatter_class=CommonHelpFormatter, | 1046 formatter_class=CommonHelpFormatter, |
| 984 help='Defines clones by specified distance metric on CDR3s and \ | 1047 help='''Defines clones by method specified in Chen, 2010.''', |
| 985 cutting of hierarchical clustering tree') | 1048 description='''Defines clones by method specified in Chen, 2010.''') |
| 986 # parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None, | 1049 parser_chen.set_defaults(feed_func=feedQueueClust) |
| 987 # help='Fields to use for grouping clones (non VDJ)') | 1050 parser_chen.set_defaults(work_func=processQueueClust) |
| 988 parser_hclust.add_argument('--method', action='store', dest='method', | 1051 parser_chen.set_defaults(collect_func=collectQueueClust) |
| 989 choices=('chen2010', 'ademokun2011'), default=default_hclust_model, | 1052 parser_chen.set_defaults(cluster_func=hierClust) |
| 990 help='Specifies which cloning method to use for calculating distance \ | 1053 |
| 991 between CDR3s, computing linkage, and cutting clusters') | 1054 # Ademokun2011 |
| 992 parser_hclust.set_defaults(feed_func=feedQueueClust) | 1055 parser_ade = subparsers.add_parser('ademokun2011', parents=[parser_parent], |
| 993 parser_hclust.set_defaults(work_func=processQueueClust) | 1056 formatter_class=CommonHelpFormatter, |
| 994 parser_hclust.set_defaults(collect_func=collectQueueClust) | 1057 help='''Defines clones by method specified in Ademokun, 2011.''', |
| 995 parser_hclust.set_defaults(cluster_func=hierClust) | 1058 description='''Defines clones by method specified in Ademokun, 2011.''') |
| | 1059 parser_ade.set_defaults(feed_func=feedQueueClust) |
| | 1060 parser_ade.set_defaults(work_func=processQueueClust) |
| | 1061 parser_ade.set_defaults(collect_func=collectQueueClust) |
| | 1062 parser_ade.set_defaults(cluster_func=hierClust) |
| 996 | 1063 |
| 997 return parser | 1064 return parser |
| 998 | 1065 |
| 999 | 1066 |
| 1000 if __name__ == '__main__': | 1067 if __name__ == '__main__': |
| 1021 'norm': args_dict['norm'], | 1088 'norm': args_dict['norm'], |
| 1022 'sym': args_dict['sym'], | 1089 'sym': args_dict['sym'], |
| 1023 'linkage': args_dict['linkage'], | 1090 'linkage': args_dict['linkage'], |
| 1024 'seq_field': args_dict['seq_field']} | 1091 'seq_field': args_dict['seq_field']} |
| 1025 | 1092 |
| 1026 # TODO: can be cleaned up with abstract model class | 1093 # Get distance matrix |
| 1027 args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model']) | 1094 try: |
| | 1095 args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']] |
| | 1096 except KeyError: |
| | 1097 sys.exit('Unrecognized distance model: %s' % args_dict['model']) |
| 1028 | 1098 |
| 1029 del args_dict['fields'] | 1099 del args_dict['fields'] |
| 1030 del args_dict['action'] | 1100 del args_dict['action'] |
| 1031 del args_dict['mode'] | 1101 del args_dict['mode'] |
| 1032 del args_dict['model'] | 1102 del args_dict['model'] |
| 1035 del args_dict['sym'] | 1105 del args_dict['sym'] |
| 1036 del args_dict['linkage'] | 1106 del args_dict['linkage'] |
| 1037 del args_dict['seq_field'] | 1107 del args_dict['seq_field'] |
| 1038 | 1108 |
| 1039 # Define clone_args | 1109 # Define clone_args |
| 1040 if args.command == 'hclust': | 1110 if args.command == 'chen2010': |
| 1041 dist_funcs = {'chen2010':distChen2010, 'ademokun2011':distAdemokun2011} | 1111 args_dict['clone_func'] = distChen2010 |
| 1042 args_dict['clone_func'] = dist_funcs[args_dict['method']] | 1112 args_dict['cluster_args'] = {'method': args.command } |
| 1043 args_dict['cluster_args'] = {'method': args_dict['method']} | 1113 |
| 1044 #del args_dict['fields'] | 1114 if args.command == 'ademokun2011': |
| 1045 del args_dict['method'] | 1115 args_dict['clone_func'] = distAdemokun2011 |
| | 1116 args_dict['cluster_args'] = {'method': args.command } |
| 1046 | 1117 |
| 1047 # Call defineClones | 1118 # Call defineClones |
| 1048 del args_dict['command'] | 1119 del args_dict['command'] |
| 1049 del args_dict['db_files'] | 1120 del args_dict['db_files'] |
| 1050 for f in args.__dict__['db_files']: | 1121 for f in args.__dict__['db_files']: |
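
The most substantial change in this revision is the preclone indexing behind `--act set`: instead of rescanning every existing key in `clone_index` for each record (the old `# TODO: Has much slow` block), the new `indexByUnion` keeps a nested field/junction-length → J → V dictionary and merges records whose ambiguous V and J calls overlap, with `_flatten_dict` collapsing the nesting afterwards. The sketch below is a minimal, self-contained illustration of that grouping behaviour on toy data; it is not the changeo API: `Record` and `index_by_union_flat` are hypothetical stand-ins, and a flat key merge replaces the nested dictionaries used in the real code.

```python
from collections import namedtuple

# Hypothetical stand-in for changeo's IgRecord; only the fields the toy needs.
Record = namedtuple('Record', ['id', 'v_genes', 'j_genes', 'junction_len'])


def index_by_union_flat(records):
    """Group records whose V and J call sets overlap at the same junction length."""
    index = {}
    for rec in records:
        key = (frozenset(rec.v_genes), frozenset(rec.j_genes), rec.junction_len)
        val = [rec]
        # Existing groups with the same junction length and overlapping V and J sets
        matches = [k for k in index
                   if k[2] == key[2] and (k[0] & key[0]) and (k[1] & key[1])]
        # Union the gene sets and pool the members of every matching group
        for k in matches:
            key = (key[0] | k[0], key[1] | k[1], key[2])
            val.extend(index.pop(k))
        index.setdefault(key, []).extend(val)
    return index


if __name__ == '__main__':
    recs = [Record('A', {'IGHV1-2'}, {'IGHJ4'}, 48),
            Record('B', {'IGHV1-2', 'IGHV1-3'}, {'IGHJ4'}, 48),
            Record('C', {'IGHV3-23'}, {'IGHJ6'}, 48)]
    for key, members in index_by_union_flat(recs).items():
        print(sorted(key[0]), sorted(key[1]), key[2], [r.id for r in members])
    # A and B share IGHV1-2 and IGHJ4, so they form one preclone group; C stays separate.
```

Running the sketch puts records A and B (shared IGHV1-2 and IGHJ4 at the same junction length) into one group while C stays on its own, which mirrors what the `set` action in the diff above does with real IgRecords; the nested dictionaries in `indexByUnion` serve to avoid this sketch's scan over every existing key by descending straight to the matching field/junction-length bucket.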
