Mercurial > repos > davidvanzessen > shm_csr
changeset 78:aff3ba86ef7a draft
Uploaded
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.gitattributes Mon Aug 31 11:20:08 2020 -0400 @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.gitignore Mon Aug 31 11:20:08 2020 -0400 @@ -0,0 +1,4 @@ + +shm_csr\.tar\.gz + +\.vscode/settings\.json
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/DefineClones.py Mon Aug 31 11:20:08 2020 -0400 @@ -0,0 +1,739 @@ +#!/usr/bin/env python3 +""" +Assign Ig sequences into clones +""" + +# Info +__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman' +from changeo import __version__, __date__ + +# Imports +import os +import re +import sys +from argparse import ArgumentParser +from collections import OrderedDict +from itertools import chain +from textwrap import dedent +from time import time +from Bio.Seq import translate + +# Presto and changeo imports +from presto.Defaults import default_out_args +from presto.IO import printLog, printProgress, printCount, printWarning, printError +from presto.Multiprocessing import manageProcesses +from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field +from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs +from changeo.Distance import distance_models, calcDistances, formClusters +from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \ + AIRRWriter, ChangeoWriter +from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue + +# Defaults +default_translate = False +default_distance = 0.0 +default_index_mode = 'gene' +default_index_action = 'set' +default_distance_model = 'ham' +default_norm = 'len' +default_sym = 'avg' +default_linkage = 'single' +default_max_missing=0 +choices_distance_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f', + 'mk_rs1nf', 'mk_rs5nf', + 'hs1f_compat', 'm1n_compat') + + +def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field, + j_field=default_j_field, max_missing=default_max_missing): + """ + Splits a set of sequence into passed and failed groups based on the number + of missing characters in the sequence + + Arguments: + data : changeo.Multiprocessing.DbData object. + seq_field : sequence field to filter on. + v_field : field containing the V call. + j_field : field containing the J call. + max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record. + + Returns: + changeo.Multiprocessing.DbResult : objected containing filtered records. + """ + # Function to validate the sequence string + def _pass(seq): + if len(seq) > 0 and len(re.findall(r'[^ACGT]', seq)) <= max_missing: + return True + else: + return False + + # Define result object for iteration and get data records + result = DbResult(data.id, data.data) + + if not data: + result.data_pass = [] + result.data_fail = data.data + return result + + result.data_pass = [] + result.data_fail = [] + for rec in data.data: + seq = rec.getField(seq_field) + if _pass(seq): result.data_pass.append(rec) + else: result.data_fail.append(rec) + + # Add V(D)J to log + result.log['ID'] = ','.join([str(x) for x in data.id]) + result.log['VCALL'] = ','.join(set([(r.getVAllele(field=v_field) or '') for r in data.data])) + result.log['JCALL'] = ','.join(set([(r.getJAllele(field=j_field) or '') for r in data.data])) + result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in data.data])) + result.log['CLONED'] = len(result.data_pass) + result.log['FILTERED'] = len(result.data_fail) + + return result + + +def indexByIdentity(index, key, rec, group_fields=None): + """ + Updates a preclone index with a simple key + + Arguments: + index : preclone index from groupByGene + key : index key + rec : Receptor to add to the index + group_fields : additional annotation fields to use to group preclones; + if None use only V, J and junction length + + Returns: + None : Updates index with new key and records. + """ + index.setdefault(tuple(key), []).append(rec) + + +def indexByUnion(index, key, rec, group_fields=None): + """ + Updates a preclone index with the union of nested keys + + Arguments: + index : preclone index from groupByGene + key : index key + rec : Receptor to add to the index + group_fields : additional annotation fields to use to group preclones; + if None use only V, J and junction length + + Returns: + None : Updates index with new key and records. + """ + # List of values for this/new key + val = [rec] + f_range = list(range(2, 3 + (len(group_fields) if group_fields else 0))) + + # See if field/junction length combination exists in index + outer_dict = index + for field in f_range: + try: + outer_dict = outer_dict[key[field]] + except KeyError: + outer_dict = None + break + # If field combination exists, look through Js + j_matches = [] + if outer_dict is not None: + for j in outer_dict.keys(): + if not set(key[1]).isdisjoint(set(j)): + key[1] = tuple(set(key[1]).union(set(j))) + j_matches += [j] + # If J overlap exists, look through Vs for each J + for j in j_matches: + v_matches = [] + # Collect V matches for this J + for v in outer_dict[j].keys(): + if not set(key[0]).isdisjoint(set(v)): + key[0] = tuple(set(key[0]).union(set(v))) + v_matches += [v] + # If there are V overlaps for this J, pop them out + if v_matches: + val += list(chain(*(outer_dict[j].pop(v) for v in v_matches))) + # If the J dict is now empty, remove it + if not outer_dict[j]: + outer_dict.pop(j, None) + + # Add value(s) into index nested dictionary + # OMG Python pointers are the best! + # Add field dictionaries into index + outer_dict = index + for field in f_range: + outer_dict.setdefault(key[field], {}) + outer_dict = outer_dict[key[field]] + # Add J, then V into index + if key[1] in outer_dict: + outer_dict[key[1]].update({key[0]: val}) + else: + outer_dict[key[1]] = {key[0]: val} + + +def groupByGene(db_iter, group_fields=None, v_field=default_v_field, j_field=default_j_field, + mode=default_index_mode, action=default_index_action): + """ + Identifies preclonal groups by V, J and junction length + + Arguments: + db_iter : an iterator of Receptor objects defined by ChangeoReader + group_fields : additional annotation fields to use to group preclones; + if None use only V, J and junction length + mode : specificity of alignment call to use for assigning preclones; + one of ('allele', 'gene') + action : how to handle multiple value fields when assigning preclones; + one of ('first', 'set') + + Returns: + dict: dictionary of {(V, J, junction length):[Receptor]} + """ + # print(fields) + # Define functions for grouping keys + if mode == 'allele' and group_fields is None: + def _get_key(rec, act): + return [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field), + None if rec.junction is None else len(rec.junction)] + elif mode == 'gene' and group_fields is None: + def _get_key(rec, act): + return [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field), + None if rec.junction is None else len(rec.junction)] + elif mode == 'allele' and group_fields is not None: + def _get_key(rec, act): + vdj = [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field), + None if rec.junction is None else len(rec.junction)] + ann = [rec.getField(k) for k in group_fields] + return list(chain(vdj, ann)) + elif mode == 'gene' and group_fields is not None: + def _get_key(rec, act): + vdj = [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field), + None if rec.junction is None else len(rec.junction)] + ann = [rec.getField(k) for k in group_fields] + return list(chain(vdj, ann)) + + # Function to flatten nested dictionary + def _flatten_dict(d, parent_key=''): + items = [] + for k, v in d.items(): + new_key = parent_key + [k] if parent_key else [k] + if isinstance(v, dict): + items.extend(_flatten_dict(v, new_key).items()) + else: + items.append((new_key, v)) + flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items} + return flat_dict + + if action == 'first': + index_func = indexByIdentity + elif action == 'set': + index_func = indexByUnion + else: + sys.stderr.write('Unrecognized action: %s.\n' % action) + + start_time = time() + clone_index = {} + rec_count = 0 + for rec in db_iter: + key = _get_key(rec, action) + + # Print progress + printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences') + rec_count += 1 + + # Assigned passed preclone records to key and failed to index None + if all([k is not None and k != '' for k in key]): + # Update index dictionary + index_func(clone_index, key, rec, group_fields) + else: + clone_index.setdefault(None, []).append(rec) + + printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences', end=True) + + if action == 'set': + clone_index = _flatten_dict(clone_index) + + return clone_index + + +def distanceClones(result, seq_field=default_junction_field, model=default_distance_model, + distance=default_distance, dist_mat=None, norm=default_norm, sym=default_sym, + linkage=default_linkage): + """ + Separates a set of Receptor objects into clones + + Arguments: + result : a changeo.Multiprocessing.DbResult object with filtered records to clone + seq_field : sequence field used to calculate distance between records + model : substitution model used to calculate distance + distance : the distance threshold to assign clonal groups + dist_mat : pandas DataFrame of pairwise nucleotide or amino acid distances + norm : normalization method + sym : symmetry method + linkage : type of linkage + + Returns: + changeo.Multiprocessing.DbResult : an updated DbResult object + """ + # Get distance matrix if not provided + if dist_mat is None: + try: + dist_mat = distance_models[model] + except KeyError: + printError('Unrecognized distance model: %s' % args_dict['model']) + + # TODO: can be cleaned up with abstract model class + # Determine length of n-mers + if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']: + nmer_len = 1 + elif model in ['hh_s5f', 'mk_rs5nf']: + nmer_len = 5 + else: + printError('Unrecognized distance model: %s.\n' % model) + + # Define unique junction mapping + seq_map = {} + for rec in result.data_pass: + seq = rec.getField(seq_field) + seq = re.sub('[\.-]', 'N', seq) + if model == 'aa': seq = translate(seq) + seq_map.setdefault(seq, []).append(rec) + + # Define sequences + sequences = list(seq_map.keys()) + + # Zero record case + if not sequences: + result.valid = False + result.log['CLONES'] = 0 + return result + + # Single record case + if len(sequences) == 1: + result.results = {1: result.data_pass} + result.valid = True + result.log['CLONES'] = 1 + return result + + # Calculate pairwise distance matrix + dists = calcDistances(sequences, nmer_len, dist_mat, sym=sym, norm=norm) + + # Perform hierarchical clustering + clusters = formClusters(dists, linkage, distance) + + # Turn clusters into clone dictionary + clone_dict = {} + for i, c in enumerate(clusters): + clone_dict.setdefault(c, []).extend(seq_map[sequences[i]]) + + if clone_dict: + result.results = clone_dict + result.valid = True + result.log['CLONES'] = len(clone_dict) + else: + result.log['CLONES'] = 0 + + return result + + +def collectQueue(alive, result_queue, collect_queue, db_file, fields, + writer=ChangeoWriter, out_file=None, out_args=default_out_args): + """ + Assembles results from a queue of individual sequence results and manages log/file I/O + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + result_queue : a multiprocessing.Queue holding processQueue results + collect_queue : a multiprocessing.Queue to store collector return values + db_file : the input database file name + fields : list of output field names + writer : writer class. + out_file : output file name. Automatically generated from the input file if None. + out_args : common output argument dictionary from parseCommonArgs + + Returns: + None : Adds a dictionary with key value pairs to collect_queue containing + 'log' defining a log object along with the 'pass' and 'fail' output file names. + """ + # Wrapper for opening handles and writers + def _open(x, f, writer=writer, out_file=out_file): + if out_file is not None and x == 'pass': + handle = open(out_file, 'w') + else: + handle = getOutputHandle(db_file, + out_label='clone-%s' % x, + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_args['out_type']) + return handle, writer(handle, fields=f) + + # Open log file + try: + # Count input records + result_count = countDbFile(db_file) + + # Define log handle + if out_args['log_file'] is None: + log_handle = None + else: + log_handle = open(out_args['log_file'], 'w') + except: + #sys.stderr.write('Exception in collector file opening step\n') + alive.value = False + raise + + # Get results from queue and write to files + try: + # Initialize handles, writers and counters + pass_handle, pass_writer = None, None + fail_handle, fail_writer = None, None + rec_count, clone_count, pass_count, fail_count = 0, 0, 0, 0 + start_time = time() + + # Iterator over results queue until sentinel object reached + while alive.value: + # Get result from queue + if result_queue.empty(): continue + else: result = result_queue.get() + # Exit upon reaching sentinel + if result is None: break + + # Print progress for previous iteration and update record count + printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones') + rec_count += len(result.data) + + # Write passed and failed records + if result: + # Writing passing sequences + for clone in result.results.values(): + clone_count += 1 + for i, rec in enumerate(clone, start=1): + pass_count += 1 + rec.setField('clone', str(clone_count)) + result.log['CLONE%i-%i' % (clone_count, i)] = rec.junction + try: + pass_writer.writeReceptor(rec) + except AttributeError: + # Open pass file and define writer object + pass_handle, pass_writer = _open('pass', fields) + pass_writer.writeReceptor(rec) + + # Write failed sequences from passing sets + if result.data_fail: + # Write failed sequences + for i, rec in enumerate(result.data_fail, start=1): + fail_count += 1 + result.log['FAIL%i-%i' % (clone_count, i)] = rec.junction + if out_args['failed']: + try: + fail_writer.writeReceptor(rec) + except AttributeError: + # Open fail file and define writer object + fail_handle, fail_writer = _open('fail', fields) + fail_writer.writeReceptor(rec) + else: + # Write failing records + for i, rec in enumerate(result.data, start=1): + fail_count += 1 + result.log['CLONE0-%i' % (i)] = rec.junction + if out_args['failed']: + try: + fail_writer.writeReceptor(rec) + except AttributeError: + # Open fail file and define writer object + fail_handle, fail_writer = _open('fail', fields) + fail_writer.writeReceptor(rec) + + # Write log + printLog(result.log, handle=log_handle) + else: + sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \ + % os.getpid()) + return None + + # Print total counts + printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones') + + # Update return list + log = OrderedDict() + log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None + log['CLONES'] = clone_count + log['RECORDS'] = rec_count + log['PASS'] = pass_count + log['FAIL'] = fail_count + + # Close file handles and generate return data + collect_dict = {'log': log, 'pass': None, 'fail': None} + if pass_handle is not None: + collect_dict['pass'] = pass_handle.name + pass_handle.close() + if fail_handle is not None: + collect_dict['fail'] = fail_handle.name + fail_handle.close() + if log_handle is not None: + log_handle.close() + collect_queue.put(collect_dict) + except: + alive.value = False + raise + + return None + + +def defineClones(db_file, seq_field=default_junction_field, v_field=default_v_field, + j_field=default_j_field, max_missing=default_max_missing, + group_fields=None, group_func=groupByGene, group_args={}, + clone_func=distanceClones, clone_args={}, + format=default_format, out_file=None, out_args=default_out_args, + nproc=None, queue_size=None): + """ + Define clonally related sequences + + Arguments: + db_file : filename of input database. + seq_field : sequence field used to determine clones. + v_field : field containing the V call. + j_field : field containing the J call. + max_missing : maximum number of non-ACGT characters to allow in the junction sequence. + group_fields : additional annotation fields to use to group preclones; + if None use only V and J. + group_func : the function to use for assigning preclones. + group_args : a dictionary of arguments to pass to group_func. + clone_func : the function to use for determining clones within preclonal groups. + clone_args : a dictionary of arguments to pass to clone_func. + format : input and output format. + out_file : output file name. Automatically generated from the input file if None. + out_args : common output argument dictionary from parseCommonArgs. + nproc : the number of processQueue processes; + if None defaults to the number of CPUs. + queue_size : maximum size of the argument queue; + if None defaults to 2*nproc. + + Returns: + dict: dictionary of output pass and fail files. + """ + # Print parameter info + log = OrderedDict() + log['START'] = 'DefineClones' + log['FILE'] = os.path.basename(db_file) + log['SEQ_FIELD'] = seq_field + log['V_FIELD'] = v_field + log['J_FIELD'] = j_field + log['MAX_MISSING'] = max_missing + log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None + for k in sorted(group_args): + log[k.upper()] = group_args[k] + for k in sorted(clone_args): + if k != 'dist_mat': log[k.upper()] = clone_args[k] + log['NPROC'] = nproc + printLog(log) + + # Define format operators + try: + reader, writer, schema = getFormatOperators(format) + except ValueError: + printError('Invalid format %s.' % format) + + # Translate to Receptor attribute names + seq_field = schema.toReceptor(seq_field) + v_field = schema.toReceptor(v_field) + j_field = schema.toReceptor(j_field) + if group_fields is not None: + group_fields = [schema.toReceptor(f) for f in group_fields] + + # Define feeder function and arguments + group_args['group_fields'] = group_fields + group_args['v_field'] = v_field + group_args['j_field'] = j_field + feed_args = {'db_file': db_file, + 'reader': reader, + 'group_func': group_func, + 'group_args': group_args} + + # Define worker function and arguments + filter_args = {'seq_field': seq_field, + 'v_field': v_field, + 'j_field': j_field, + 'max_missing': max_missing} + clone_args['seq_field'] = seq_field + work_args = {'process_func': clone_func, + 'process_args': clone_args, + 'filter_func': filterMissing, + 'filter_args': filter_args} + + # Define collector function and arguments + out_fields = getDbFields(db_file, add=schema.fromReceptor('clone'), reader=reader) + out_args['out_type'] = schema.out_type + collect_args = {'db_file': db_file, + 'fields': out_fields, + 'writer': writer, + 'out_file': out_file, + 'out_args': out_args} + + # Call process manager + result = manageProcesses(feed_func=feedDbQueue, work_func=processDbQueue, collect_func=collectQueue, + feed_args=feed_args, work_args=work_args, collect_args=collect_args, + nproc=nproc, queue_size=queue_size) + + # Print log + result['log']['END'] = 'DefineClones' + printLog(result['log']) + output = {k: v for k, v in result.items() if k in ('pass', 'fail')} + + return output + + +def getArgParser(): + """ + Defines the ArgumentParser + + Arguments: + None + + Returns: + an ArgumentParser object + """ + # Define input and output fields + fields = dedent( + ''' + output files: + clone-pass + database with assigned clonal group numbers. + clone-fail + database with records failing clonal grouping. + + required fields: + SEQUENCE_ID, V_CALL, J_CALL, JUNCTION + + output fields: + CLONE + ''') + # Define argument parser + parser = ArgumentParser(description=__doc__, epilog=fields, + parents=[getCommonArgParser(format=False, multiproc=True)], + formatter_class=CommonHelpFormatter, add_help=False) + + # Distance cloning method + group = parser.add_argument_group('cloning arguments') + group.add_argument('--sf', action='store', dest='seq_field', default=default_junction_field, + help='Field to be used to calculate distance between records.') + group.add_argument('--vf', action='store', dest='v_field', default=default_v_field, + help='Field containing the germline V segment call.') + group.add_argument('--jf', action='store', dest='j_field', default=default_j_field, + help='Field containing the germline J segment call.') + group.add_argument('--gf', nargs='+', action='store', dest='group_fields', default=None, + help='Additional fields to use for grouping clones aside from V, J and junction length.') + group.add_argument('--mode', action='store', dest='mode', + choices=('allele', 'gene'), default=default_index_mode, + help='''Specifies whether to use the V(D)J allele or gene for + initial grouping.''') + group.add_argument('--act', action='store', dest='action', + choices=('first', 'set'), default=default_index_action, + help='''Specifies how to handle multiple V(D)J assignments for initial grouping. + The "first" action will use only the first gene listed. + The "set" action will use all gene assignments and construct a larger gene + grouping composed of any sequences sharing an assignment or linked to another + sequence by a common assignment (similar to single-linkage).''') + group.add_argument('--model', action='store', dest='model', + choices=choices_distance_model, + default=default_distance_model, + help='''Specifies which substitution model to use for calculating distance + between sequences. The "ham" model is nucleotide Hamming distance and + "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are + human specific single nucleotide and 5-mer content models, respectively, + from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are + mouse specific single nucleotide and 5-mer content models, respectively, + from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are + deprecated models provided backwards compatibility with the "m1n" and + "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both + 5-mer models should be considered experimental.''') + group.add_argument('--dist', action='store', dest='distance', type=float, + default=default_distance, + help='The distance threshold for clonal grouping') + group.add_argument('--norm', action='store', dest='norm', + choices=('len', 'mut', 'none'), default=default_norm, + help='''Specifies how to normalize distances. One of none + (do not normalize), len (normalize by length), + or mut (normalize by number of mutations between sequences).''') + group.add_argument('--sym', action='store', dest='sym', + choices=('avg', 'min'), default=default_sym, + help='''Specifies how to combine asymmetric distances. One of avg + (average of A->B and B->A) or min (minimum of A->B and B->A).''') + group.add_argument('--link', action='store', dest='linkage', + choices=('single', 'average', 'complete'), default=default_linkage, + help='''Type of linkage to use for hierarchical clustering.''') + group.add_argument('--maxmiss', action='store', dest='max_missing', type=int, + default=default_max_missing, + help='''The maximum number of non-ACGT characters (gaps or Ns) to + permit in the junction sequence before excluding the record + from clonal assignment. Note, under single linkage + non-informative positions can create artifactual links + between unrelated sequences. Use with caution.''') + parser.set_defaults(group_func=groupByGene) + parser.set_defaults(clone_func=distanceClones) + + return parser + + +if __name__ == '__main__': + """ + Parses command line arguments and calls main function + """ + # Parse arguments + parser = getArgParser() + checkArgs(parser) + args = parser.parse_args() + args_dict = parseCommonArgs(args) + + # # Set default fields if not specified. + # default_fields = {'seq_field': default_junction_field, + # 'v_field': default_v_field, + # 'j_field': default_j_field} + # + # # Default Change-O fields + # if args_dict['format'] == 'changeo': + # for f in default_fields: + # if args_dict[f] is None: args_dict[f] = default_fields[f] + # else: args_dict[f] = args_dict[f].upper() + # + # # Default AIRR fields + # if args_dict['format'] == 'airr': + # for f in default_fields: + # if args_dict[f] is None: args_dict[f] = ChangeoSchema.toAIRR(default_fields[f]) + # else: args_dict[f] = args_dict[f].lower() + + # Define grouping and cloning function arguments + args_dict['group_args'] = {'action': args_dict['action'], + 'mode':args_dict['mode']} + args_dict['clone_args'] = {'model': args_dict['model'], + 'distance': args_dict['distance'], + 'norm': args_dict['norm'], + 'sym': args_dict['sym'], + 'linkage': args_dict['linkage']} + + # Get distance matrix + try: + args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']] + except KeyError: + printError('Unrecognized distance model: %s' % args_dict['model']) + + # Clean argument dictionary + del args_dict['action'] + del args_dict['mode'] + del args_dict['model'] + del args_dict['distance'] + del args_dict['norm'] + del args_dict['sym'] + del args_dict['linkage'] + + # Clean arguments dictionary + del args_dict['db_files'] + if 'out_files' in args_dict: del args_dict['out_files'] + + # Call main function for each input file + for i, f in enumerate(args.__dict__['db_files']): + args_dict['db_file'] = f + args_dict['out_file'] = args.__dict__['out_files'][i] \ + if args.__dict__['out_files'] else None + defineClones(**args_dict)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/MakeDb.py Mon Aug 31 11:20:08 2020 -0400 @@ -0,0 +1,885 @@ +#!/usr/bin/env python3 +""" +Create tab-delimited database file to store sequence alignment information +""" + +# Info +__author__ = 'Namita Gupta, Jason Anthony Vander Heiden' +from changeo import __version__, __date__ + +# Imports +import os +import re +import csv +from argparse import ArgumentParser +from collections import OrderedDict +from textwrap import dedent +from time import time +from Bio import SeqIO + +# Presto and changeo imports +from presto.Annotation import parseAnnotation +from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile +from changeo.Defaults import default_format, default_out_args +from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs +from changeo.Alignment import RegionDefinition +from changeo.Gene import buildGermline +from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \ + AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader +from changeo.Receptor import ChangeoSchema, AIRRSchema + +# 10X Receptor attributes +cellranger_base = ['cell', 'c_call', 'conscount', 'umicount'] +cellranger_extended = ['cell', 'c_call', 'conscount', 'umicount', + 'v_call_10x', 'd_call_10x', 'j_call_10x', + 'junction_10x', 'junction_10x_aa'] + +def readCellRanger(cellranger_file, fields=cellranger_base): + """ + Load a Cell Ranger annotation table + + Arguments: + cellranger_file (str): path to the annotation file. + fields (list): list of fields to keep. + + Returns: + dict: dict of dicts with contig_id as the primary key. + """ + # Mapping of 10X annotations to Receptor attributes + cellranger_map = {'cell': 'barcode', + 'c_call': 'c_gene', + 'locus': 'chain', + 'conscount': 'reads', + 'umicount': 'umis', + 'v_call_10x': 'v_gene', + 'd_call_10x': 'd_gene', + 'j_call_10x': 'j_gene', + 'junction_10x': 'cdr3_nt', + 'junction_10x_aa': 'cdr3'} + + # Function to parse individual fields + def _parse(x): + return '' if x == 'None' else x + + # Generate annotation dictionary + ann_dict = {} + with open(cellranger_file) as csv_file: + # Detect delimiters + dialect = csv.Sniffer().sniff(csv_file.readline()) + csv_file.seek(0) + # Read in annotation file + csv_reader = csv.DictReader(csv_file, dialect=dialect) + + # Generate annotation dictionary + for row in csv_reader: + ann_dict[row['contig_id']] = {f: _parse(row[cellranger_map[f]]) for f in fields} + + return ann_dict + + +def addGermline(receptor, references, amino_acid=False): + """ + Add full length germline to Receptor object + + Arguments: + receptor (changeo.Receptor.Receptor): Receptor object to modify. + references (dict): dictionary of IMGT-gapped references sequences. + amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline + + Returns: + changeo.Receptor.Receptor: modified Receptor with the germline sequence added. + """ + if amino_acid: + __, germlines, __ = buildGermline(receptor, references, seq_field='sequence_aa_imgt', + amino_acid=True) + germline_seq = None if germlines is None else germlines['full'] + receptor.setField('germline_aa_imgt', germline_seq) + else: + __, germlines, __ = buildGermline(receptor, references, amino_acid=False) + germline_seq = None if germlines is None else germlines['full'] + receptor.setField('germline_imgt', germline_seq) + + return receptor + + +def getIDforIMGT(seq_file): + """ + Create a sequence ID translation using IMGT truncation. + + Arguments: + seq_file : a fasta file of sequences input to IMGT. + + Returns: + dict : a dictionary of with the IMGT truncated ID as the key and the full sequence description as the value. + """ + + # Create a sequence ID translation using IDs truncate up to space or 50 chars + ids = {} + for rec in readSeqFile(seq_file): + if len(rec.description) <= 50: + id_key = rec.description + else: + id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:50]) + ids.update({id_key: rec.description}) + + return ids + + +def getSeqDict(seq_file): + """ + Create a dictionary from a sequence file. + + Arguments: + seq_file : sequence file. + + Returns: + dict : sequence description as keys with Bio.SeqRecords as values. + """ + seq_dict = SeqIO.to_dict(readSeqFile(seq_file), key_function=lambda x: x.description) + + return seq_dict + + +def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None, + amino_acid=False, partial=False, asis_id=True, regions='default', + writer=AIRRWriter, out_file=None, out_args=default_out_args): + """ + Writes parsed records to an output file + + Arguments: + records : a iterator of Receptor objects containing alignment data. + fields : a list of ordered field names to write. + aligner_file : input file name. + total_count : number of records (for progress bar). + id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. + annotations : additional annotation dictionary. + amino_acid : if True do verification on amino acid fields. + partial : if True put incomplete alignments in the pass file. + asis_id : if ID is to be parsed for pRESTO output with default delimiters. + regions (str): name of the IMGT FWR/CDR region definitions to use. + writer : writer class. + out_file : output file name. Automatically generated from the input file if None. + out_args : common output argument dictionary from parseCommonArgs. + + Returns: + None + """ + # Wrapper for opening handles and writers + def _open(x, f, writer=writer, out_file=out_file): + if out_file is not None and x == 'pass': + handle = open(out_file, 'w') + else: + handle = getOutputHandle(aligner_file, + out_label='db-%s' % x, + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_args['out_type']) + return handle, writer(handle, fields=f) + + # Function to convert fasta header annotations to changeo columns + def _changeo(f, header): + h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f] + f.extend(h) + return f + + def _airr(f, header): + h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f] + f.extend(h) + return f + + # Function to verify IMGT-gapped sequence and junction concur + def _imgt_check(rec): + try: + if amino_acid: + rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions) + x, y = rd.positions['junction'] + check = (rec.junction_aa == rec.sequence_aa_imgt[x:y]) + else: + rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions) + x, y = rd.positions['junction'] + check = (rec.junction == rec.sequence_imgt[x:y]) + except (TypeError, AttributeError): + check = False + return check + + # Function to check for valid records strictly + def _strict(rec): + if amino_acid: + valid = [rec.v_call and rec.v_call != 'None', + rec.j_call and rec.j_call != 'None', + rec.functional is not None, + rec.sequence_aa_imgt, + rec.junction_aa, + _imgt_check(rec)] + else: + valid = [rec.v_call and rec.v_call != 'None', + rec.j_call and rec.j_call != 'None', + rec.functional is not None, + rec.sequence_imgt, + rec.junction, + _imgt_check(rec)] + return all(valid) + + # Function to check for valid records loosely + def _gentle(rec): + valid = [rec.v_call and rec.v_call != 'None', + rec.d_call and rec.d_call != 'None', + rec.j_call and rec.j_call != 'None'] + return any(valid) + + # Set writer class and annotation conversion function + if writer == ChangeoWriter: + _annotate = _changeo + elif writer == AIRRWriter: + _annotate = _airr + else: + printError('Invalid output writer.') + + # Additional annotation (e.g. 10X cell calls) + # _append_table = None + # if cellranger_file is not None: + # with open(cellranger_file) as csv_file: + # # Read in annotation file (use Sniffer to discover file delimiters) + # dialect = csv.Sniffer().sniff(csv_file.readline()) + # csv_file.seek(0) + # csv_reader = csv.DictReader(csv_file, dialect = dialect) + # + # # Generate annotation dictionary + # anntab_dict = {entry['contig_id']: {cellranger_map[field]: entry[field] \ + # for field in cellranger_map.keys()} for entry in csv_reader} + # + # fields = _annotate(fields, cellranger_map.values()) + # _append_table = lambda sequence_id: anntab_dict[sequence_id] + + # Set pass criteria + _pass = _gentle if partial else _strict + + # Define log handle + if out_args['log_file'] is None: + log_handle = None + else: + log_handle = open(out_args['log_file'], 'w') + + # Initialize handles, writers and counters + pass_handle, pass_writer = None, None + fail_handle, fail_writer = None, None + pass_count, fail_count = 0, 0 + start_time = time() + + # Validate and write output + printProgress(0, total_count, 0.05, start_time=start_time) + for i, record in enumerate(records, start=1): + # Replace sequence description with full string, if required + if id_dict is not None and record.sequence_id in id_dict: + record.sequence_id = id_dict[record.sequence_id] + + # Parse sequence description into new columns + if not asis_id: + try: + ann_raw = parseAnnotation(record.sequence_id) + record.sequence_id = ann_raw.pop('ID') + + # Convert to Receptor fields + ann_parsed = OrderedDict() + for k, v in ann_raw.items(): + ann_parsed[ChangeoSchema.toReceptor(k)] = v + + # Add annotations to Receptor and update field list + record.setDict(ann_parsed, parse=True) + if i == 1: fields = _annotate(fields, ann_parsed.keys()) + except IndexError: + # Could not parse pRESTO-style annotations so fall back to no parse + asis_id = True + printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.') + + # Add supplemental annotation fields + # if _append_table is not None: + # record.setDict(_append_table(record.sequence_id), parse=True) + if annotations is not None: + record.setDict(annotations[record.sequence_id], parse=True) + if i == 1: fields = _annotate(fields, annotations[record.sequence_id].keys()) + + # Count pass or fail and write to appropriate file + if _pass(record): + pass_count += 1 + # Write row to pass file + try: + pass_writer.writeReceptor(record) + except AttributeError: + # Open pass file and writer + pass_handle, pass_writer = _open('pass', fields) + pass_writer.writeReceptor(record) + else: + fail_count += 1 + # Write row to fail file if specified + if out_args['failed']: + try: + fail_writer.writeReceptor(record) + except AttributeError: + # Open fail file and writer + fail_handle, fail_writer = _open('fail', fields) + fail_writer.writeReceptor(record) + + # Write log + if log_handle is not None: + log = OrderedDict([('ID', record.sequence_id), + ('V_CALL', record.v_call), + ('D_CALL', record.d_call), + ('J_CALL', record.j_call), + ('PRODUCTIVE', record.functional)]) + if not _imgt_check(record) and not amino_acid: + log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.' + printLog(log, log_handle) + + # Print progress + printProgress(i, total_count, 0.05, start_time=start_time) + + # Print console log + log = OrderedDict() + log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None + log['PASS'] = pass_count + log['FAIL'] = fail_count + log['END'] = 'MakeDb' + printLog(log) + + # Close file handles + output = {'pass': None, 'fail': None} + if pass_handle is not None: + output['pass'] = pass_handle.name + pass_handle.close() + if fail_handle is not None: + output['fail'] = fail_handle.name + fail_handle.close() + + return output + + +def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True, + extended=False, format=default_format, out_file=None, out_args=default_out_args): + """ + Main for IMGT aligned sample sequences. + + Arguments: + aligner_file : zipped file or unzipped folder output by IMGT. + seq_file : FASTA file input to IMGT (from which to get seqID). + repo : folder with germline repertoire files. + partial : If True put incomplete alignments in the pass file. + asis_id : if ID is to be parsed for pRESTO output with default delimiters. + extended : if True add alignment score, FWR, CDR and junction fields to output file. + format : output format. one of 'changeo' or 'airr'. + out_file : output file name. Automatically generated from the input file if None. + out_args : common output argument dictionary from parseCommonArgs. + + Returns: + dict : names of the 'pass' and 'fail' output files. + """ + # Print parameter info + log = OrderedDict() + log['START'] = 'MakeDb' + log['COMMAND'] = 'imgt' + log['ALIGNER_FILE'] = aligner_file + log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' + log['ASIS_ID'] = asis_id + log['PARTIAL'] = partial + log['EXTENDED'] = extended + printLog(log) + + start_time = time() + printMessage('Loading files', start_time=start_time, width=20) + + # Extract IMGT files + temp_dir, imgt_files = extractIMGT(aligner_file) + + # Count records in IMGT files + total_count = countDbFile(imgt_files['summary']) + + # Get (parsed) IDs from fasta file submitted to IMGT + id_dict = getIDforIMGT(seq_file) if seq_file else {} + + # Load supplementary annotation table + if cellranger_file is not None: + f = cellranger_extended if extended else cellranger_base + annotations = readCellRanger(cellranger_file, fields=f) + else: + annotations = None + + printMessage('Done', start_time=start_time, end=True, width=20) + + # Define format operators + try: + __, writer, schema = getFormatOperators(format) + except ValueError: + printError('Invalid format %s.' % format) + out_args['out_type'] = schema.out_type + + # Define output fields + fields = list(schema.required) + if extended: + custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema) + fields.extend(custom) + + # Parse IMGT output and write db + with open(imgt_files['summary'], 'r') as summary_handle, \ + open(imgt_files['gapped'], 'r') as gapped_handle, \ + open(imgt_files['ntseq'], 'r') as ntseq_handle, \ + open(imgt_files['junction'], 'r') as junction_handle: + + # Open parser + parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle) + + # Add germline sequence + if repo is None: + germ_iter = parse_iter + else: + references = readGermlines(repo) + # Check for IMGT-gaps in germlines + if all('...' not in x for x in references.values()): + printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') + germ_iter = (addGermline(x, references) for x in parse_iter) + + # Write db + output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, + annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial, + writer=writer, out_file=out_file, out_args=out_args) + + # Cleanup temp directory + temp_dir.cleanup() + + return output + + +def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False, + asis_id=True, asis_calls=False, extended=False, regions='default', + format='changeo', out_file=None, out_args=default_out_args): + """ + Main for IgBLAST aligned sample sequences. + + Arguments: + aligner_file (str): IgBLAST output file to process. + seq_file (str): fasta file input to IgBlast (from which to get sequence). + repo (str): folder with germline repertoire files. + amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False. + partial : If True put incomplete alignments in the pass file. + asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters. + asis_calls (bool): if True do not parse gene calls for allele names. + extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output. + regions (str): name of the IMGT FWR/CDR region definitions to use. + format (str): output format. one of 'changeo' or 'airr'. + out_file (str): output file name. Automatically generated from the input file if None. + out_args (dict): common output argument dictionary from parseCommonArgs. + + Returns: + dict : names of the 'pass' and 'fail' output files. + """ + # Print parameter info + log = OrderedDict() + log['START'] = 'MakeDB' + log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast' + log['ALIGNER_FILE'] = os.path.basename(aligner_file) + log['SEQ_FILE'] = os.path.basename(seq_file) + log['ASIS_ID'] = asis_id + log['ASIS_CALLS'] = asis_calls + log['PARTIAL'] = partial + log['EXTENDED'] = extended + printLog(log) + + # Set amino acid conditions + if amino_acid: + format = '%s-aa' % format + parser = IgBLASTReaderAA + else: + parser = IgBLASTReader + + # Start + start_time = time() + printMessage('Loading files', start_time=start_time, width=20) + + # Count records in sequence file + total_count = countSeqFile(seq_file) + + # Get input sequence dictionary + seq_dict = getSeqDict(seq_file) + + # Create germline repo dictionary + references = readGermlines(repo, asis=asis_calls) + + # Load supplementary annotation table + if cellranger_file is not None: + f = cellranger_extended if extended else cellranger_base + annotations = readCellRanger(cellranger_file, fields=f) + else: + annotations = None + + printMessage('Done', start_time=start_time, end=True, width=20) + + # Check for IMGT-gaps in germlines + if all('...' not in x for x in references.values()): + printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') + + # Define format operators + try: + __, writer, schema = getFormatOperators(format) + except ValueError: + printError('Invalid format %s.' % format) + out_args['out_type'] = schema.out_type + + # Define output fields + fields = list(schema.required) + if extended: + custom = parser.customFields(schema=schema) + fields.extend(custom) + + # Parse and write output + with open(aligner_file, 'r') as f: + parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls) + germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter) + output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, + annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id, + regions=regions, writer=writer, out_file=out_file, out_args=out_args) + + return output + + +def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True, + extended=False, format=default_format, out_file=None, out_args=default_out_args): + """ + Main for iHMMuneAlign aligned sample sequences. + + Arguments: + aligner_file : iHMMune-Align output file to process. + seq_file : fasta file input to iHMMuneAlign (from which to get sequence). + repo : folder with germline repertoire files. + partial : If True put incomplete alignments in the pass file. + asis_id : if ID is to be parsed for pRESTO output with default delimiters. + extended : if True parse alignment scores, FWR and CDR region fields. + format : output format. One of 'changeo' or 'airr'. + out_file : output file name. Automatically generated from the input file if None. + out_args : common output argument dictionary from parseCommonArgs. + + Returns: + dict : names of the 'pass' and 'fail' output files. + """ + # Print parameter info + log = OrderedDict() + log['START'] = 'MakeDB' + log['COMMAND'] = 'ihmm' + log['ALIGNER_FILE'] = os.path.basename(aligner_file) + log['SEQ_FILE'] = os.path.basename(seq_file) + log['ASIS_ID'] = asis_id + log['PARTIAL'] = partial + log['EXTENDED'] = extended + printLog(log) + + start_time = time() + printMessage('Loading files', start_time=start_time, width=20) + + # Count records in sequence file + total_count = countSeqFile(seq_file) + + # Get input sequence dictionary + seq_dict = getSeqDict(seq_file) + + # Create germline repo dictionary + references = readGermlines(repo) + + # Load supplementary annotation table + if cellranger_file is not None: + f = cellranger_extended if extended else cellranger_base + annotations = readCellRanger(cellranger_file, fields=f) + else: + annotations = None + + printMessage('Done', start_time=start_time, end=True, width=20) + + # Check for IMGT-gaps in germlines + if all('...' not in x for x in references.values()): + printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') + + # Define format operators + try: + __, writer, schema = getFormatOperators(format) + except ValueError: + printError('Invalid format %s.' % format) + out_args['out_type'] = schema.out_type + + # Define output fields + fields = list(schema.required) + if extended: + custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema) + fields.extend(custom) + + # Parse and write output + with open(aligner_file, 'r') as f: + parse_iter = IHMMuneReader(f, seq_dict, references) + germ_iter = (addGermline(x, references) for x in parse_iter) + output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, + annotations=annotations, asis_id=asis_id, partial=partial, + writer=writer, out_file=out_file, out_args=out_args) + + return output + + +def getArgParser(): + """ + Defines the ArgumentParser. + + Returns: + argparse.ArgumentParser + """ + fields = dedent( + ''' + output files: + db-pass + database of alignment records with functionality information, + V and J calls, and a junction region. + db-fail + database with records that fail due to no productivity information, + no gene V assignment, no J assignment, or no junction region. + + universal output fields: + sequence_id, sequence, sequence_alignment, germline_alignment, + rev_comp, productive, stop_codon, vj_in_frame, locus, + v_call, d_call, j_call, junction, junction_length, junction_aa, + v_sequence_start, v_sequence_end, v_germline_start, v_germline_end, + d_sequence_start, d_sequence_end, d_germline_start, d_germline_end, + j_sequence_start, j_sequence_end, j_germline_start, j_germline_end, + np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3 + + imgt specific output fields: + n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length, + d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity + + igblast specific output fields: + v_score, v_identity, v_support, v_cigar, + d_score, d_identity, d_support, d_cigar, + j_score, j_identity, j_support, j_cigar + + ihmm specific output fields: + vdj_score + + 10X specific output fields: + cell_id, c_call, consensus_count, umi_count, + v_call_10x, d_call_10x, j_call_10x, + junction_10x, junction_10x_aa + ''') + + # Define ArgumentParser + parser = ArgumentParser(description=__doc__, epilog=fields, + formatter_class=CommonHelpFormatter, add_help=False) + group_help = parser.add_argument_group('help') + group_help.add_argument('--version', action='version', + version='%(prog)s:' + ' %s %s' %(__version__, __date__)) + group_help.add_argument('-h', '--help', action='help', help='show this help message and exit') + subparsers = parser.add_subparsers(title='subcommands', dest='command', + help='Aligner used', metavar='') + # TODO: This is a temporary fix for Python issue 9253 + subparsers.required = True + + # Parent parser + parser_parent = getCommonArgParser(db_in=False) + + # igblastn output parser + parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], + formatter_class=CommonHelpFormatter, add_help=False, + help='Process igblastn output.', + description='Process igblastn output.') + group_igblast = parser_igblast.add_argument_group('aligner parsing arguments') + group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', + required=True, + help='''IgBLAST output files in format 7 with query sequence + (igblastn argument \'-outfmt "7 std qseq sseq btop"\').''') + group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, + help='''List of folders and/or fasta files containing + the same germline set used in the IgBLAST alignment. These + reference sequences must contain IMGT-numbering spacers (gaps) + in the V segment.''') + group_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', + required=True, + help='''List of input FASTA files (with .fasta, .fna or .fa + extension), containing sequences.''') + group_igblast.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', + help='''Table file containing 10X annotations (with .csv or .tsv + extension).''') + group_igblast.add_argument('--asis-id', action='store_true', dest='asis_id', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + group_igblast.add_argument('--asis-calls', action='store_true', dest='asis_calls', + help='''Specify to prevent gene calls from being parsed into standard allele names + in both the IgBLAST output and reference database. Note, this requires + the sequence identifiers in the reference sequence set and the IgBLAST + database to be exact string matches.''') + group_igblast.add_argument('--partial', action='store_true', dest='partial', + help='''If specified, include incomplete V(D)J alignments in + the pass file instead of the fail file. An incomplete alignment + is defined as a record for which a valid IMGT-gapped sequence + cannot be built or that is missing a V gene assignment, + J gene assignment, junction region, or productivity call.''') + group_igblast.add_argument('--extended', action='store_true', dest='extended', + help='''Specify to include additional aligner specific fields in the output. + Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar, + fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''') + group_igblast.add_argument('--regions', action='store', dest='regions', + choices=('default', 'rhesus-igl'), default='default', + help='''IMGT CDR and FWR boundary definition to use.''') + parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False) + + # igblastp output parser + parser_igblast_aa = subparsers.add_parser('igblast-aa', parents=[parser_parent], + formatter_class=CommonHelpFormatter, add_help=False, + help='Process igblastp output.', + description='Process igblastp output.') + group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments') + group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files', + required=True, + help='''IgBLAST output files in format 7 with query sequence + (igblastp argument \'-outfmt "7 std qseq sseq btop"\').''') + group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True, + help='''List of folders and/or fasta files containing + the same germline set used in the IgBLAST alignment. These + reference sequences must contain IMGT-numbering spacers (gaps) + in the V segment.''') + group_igblast_aa.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True, + help='''List of input FASTA files (with .fasta, .fna or .fa + extension), containing sequences.''') + group_igblast_aa.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', + help='''Table file containing 10X annotations (with .csv or .tsv extension).''') + group_igblast_aa.add_argument('--asis-id', action='store_true', dest='asis_id', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + group_igblast_aa.add_argument('--asis-calls', action='store_true', dest='asis_calls', + help='''Specify to prevent gene calls from being parsed into standard allele names + in both the IgBLAST output and reference database. Note, this requires + the sequence identifiers in the reference sequence set and the IgBLAST + database to be exact string matches.''') + group_igblast_aa.add_argument('--extended', action='store_true', dest='extended', + help='''Specify to include additional aligner specific fields in the output. + Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''') + group_igblast_aa.add_argument('--regions', action='store', dest='regions', + choices=('default', 'rhesus-igl'), default='default', + help='''IMGT CDR and FWR boundary definition to use.''') + parser_igblast_aa.set_defaults(func=parseIgBLAST, partial=True, amino_acid=True) + + + # IMGT aligner + parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], + formatter_class=CommonHelpFormatter, add_help=False, + help='''Process IMGT/HighV-Quest output + (does not work with V-QUEST).''', + description='''Process IMGT/HighV-Quest output + (does not work with V-QUEST).''') + group_imgt = parser_imgt.add_argument_group('aligner parsing arguments') + group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files', + help='''Either zipped IMGT output files (.zip or .txz) or a + folder containing unzipped IMGT output files (which must + include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, + and 6_Junction).''') + group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False, + help='''List of FASTA files (with .fasta, .fna or .fa + extension) that were submitted to IMGT/HighV-QUEST. + If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST + will not be corrected.''') + group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False, + help='''List of folders and/or fasta files containing + the germline sequence set used by IMGT/HighV-QUEST. + These reference sequences must contain IMGT-numbering spacers (gaps) + in the V segment. If unspecified, the germline sequence reconstruction + will not be included in the output.''') + group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', + help='''Table file containing 10X annotations (with .csv or .tsv + extension).''') + group_imgt.add_argument('--asis-id', action='store_true', dest='asis_id', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + group_imgt.add_argument('--partial', action='store_true', dest='partial', + help='''If specified, include incomplete V(D)J alignments in + the pass file instead of the fail file. An incomplete alignment + is defined as a record that is missing a V gene assignment, + J gene assignment, junction region, or productivity call.''') + group_imgt.add_argument('--extended', action='store_true', dest='extended', + help='''Specify to include additional aligner specific fields in the output. + Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4, + cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length, + p3d_length, p5j_length and d_frame.''') + parser_imgt.set_defaults(func=parseIMGT) + + # iHMMuneAlign Aligner + parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], + formatter_class=CommonHelpFormatter, add_help=False, + help='Process iHMMune-Align output.', + description='Process iHMMune-Align output.') + group_ihmm = parser_ihmm.add_argument_group('aligner parsing arguments') + group_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_files', + required=True, + help='''iHMMune-Align output file.''') + group_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, + help='''List of folders and/or FASTA files containing + the set of germline sequences used by iHMMune-Align. These + reference sequences must contain IMGT-numbering spacers (gaps) + in the V segment.''') + group_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', + required=True, + help='''List of input FASTA files (with .fasta, .fna or .fa + extension) containing sequences.''') + group_ihmm.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', + help='''Table file containing 10X annotations (with .csv or .tsv + extension).''') + group_ihmm.add_argument('--asis-id', action='store_true', dest='asis_id', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + group_ihmm.add_argument('--partial', action='store_true', dest='partial', + help='''If specified, include incomplete V(D)J alignments in + the pass file instead of the fail file. An incomplete alignment + is defined as a record for which a valid IMGT-gapped sequence + cannot be built or that is missing a V gene assignment, + J gene assignment, junction region, or productivity call.''') + group_ihmm.add_argument('--extended', action='store_true', dest='extended', + help='''Specify to include additional aligner specific fields in the output. + Adds the path score of the iHMMune-Align hidden Markov model as vdj_score; + adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''') + parser_ihmm.set_defaults(func=parseIHMM) + + return parser + + +if __name__ == "__main__": + """ + Parses command line arguments and calls main + """ + parser = getArgParser() + checkArgs(parser) + args = parser.parse_args() + args_dict = parseCommonArgs(args, in_arg='aligner_files') + + # Set no ID parsing if sequence files are not provided + if 'seq_files' in args_dict and not args_dict['seq_files']: + args_dict['asis_id'] = True + + # Delete + if 'aligner_files' in args_dict: del args_dict['aligner_files'] + if 'seq_files' in args_dict: del args_dict['seq_files'] + if 'out_files' in args_dict: del args_dict['out_files'] + if 'command' in args_dict: del args_dict['command'] + if 'func' in args_dict: del args_dict['func'] + + # Call main + for i, f in enumerate(args.__dict__['aligner_files']): + args_dict['aligner_file'] = f + args_dict['seq_file'] = args.__dict__['seq_files'][i] \ + if args.__dict__['seq_files'] else None + args_dict['out_file'] = args.__dict__['out_files'][i] \ + if args.__dict__['out_files'] else None + args_dict['cellranger_file'] = args.__dict__['cellranger_file'][i] \ + if args.__dict__['cellranger_file'] else None + args.func(**args_dict)
--- a/change_o/define_clones.sh Wed Jun 19 04:31:44 2019 -0400 +++ b/change_o/define_clones.sh Mon Aug 31 11:20:08 2020 -0400 @@ -21,7 +21,7 @@ output=${10} output2=${11} - DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link + python3 $dir/DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1 else @@ -29,7 +29,7 @@ output=$4 output2=$5 - DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method + python3 $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1 fi
--- a/change_o/makedb.sh Wed Jun 19 04:31:44 2019 -0400 +++ b/change_o/makedb.sh Mon Aug 31 11:20:08 2020 -0400 @@ -29,7 +29,7 @@ echo "makedb: $PWD/outdir" -MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions +python3 $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions mv $PWD/outdir/output_db-pass.tab $output
--- a/merge_and_filter.r Wed Jun 19 04:31:44 2019 -0400 +++ b/merge_and_filter.r Mon Aug 31 11:20:08 2020 -0400 @@ -53,10 +53,6 @@ hotspots = fix_column_names(hotspots) AAs = fix_column_names(AAs) -if(!("Sequence.number" %in% names(summ))){ - summ["Sequence.number"] = 1:nrow(summ) -} - if(method == "blastn"){ #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] @@ -173,17 +169,10 @@ write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T) -missing.FR1 = result$FR1.IMGT.seq == "" | is.na(result$FR1.IMGT.seq) -missing.CDR1 = result$CDR1.IMGT.seq == "" | is.na(result$CDR1.IMGT.seq) -missing.FR2 = result$FR2.IMGT.seq == "" | is.na(result$FR2.IMGT.seq) -missing.CDR2 = result$CDR2.IMGT.seq == "" | is.na(result$CDR2.IMGT.seq) -missing.FR3 = result$FR3.IMGT.seq == "" | is.na(result$FR3.IMGT.seq) - -print(paste("Number of empty CDR1 sequences:", sum(missing.FR1))) -print(paste("Number of empty FR2 sequences:", sum(missing.CDR1))) -print(paste("Number of empty CDR2 sequences:", sum(missing.FR2))) -print(paste("Number of empty FR3 sequences:", sum(missing.CDR2))) -print(paste("Number of empty FR3 sequences:", sum(missing.FR3))) +print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T))) +print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T))) +print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T))) +print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T))) if(empty.region.filter == "leader"){ result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_column_checker.py Mon Aug 31 11:20:08 2020 -0400 @@ -0,0 +1,27 @@ +import re + +mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") + +with open("7_V-REGION-mutation-and-AA-change-table.txt", 'r') as file_handle: + first = True + fr3_index = -1 + for i, line in enumerate(file_handle): + line_split = line.split("\t") + if first: + fr3_index = line_split.index("FR3-IMGT") + first = False + continue + + if len(line_split) < fr3_index: + continue + + fr3_data = line_split[fr3_index] + if len(fr3_data) > 5: + try: + test = [mutationMatcher.match(x).groups() for x in fr3_data.split("|") if x] + except: + print(line_split[1]) + print("Something went wrong at line {line} with:".format(line=line_split[0])) + #print([x for x in fr3_data.split("|") if not mutationMatcher.match(x)]) + if i % 100000 == 0: + print(i)
--- a/shm_csr.xml Wed Jun 19 04:31:44 2019 -0400 +++ b/shm_csr.xml Mon Aug 31 11:20:08 2020 -0400 @@ -2,13 +2,13 @@ <description></description> <requirements> <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.16.0">numpy</requirement> <requirement type="package" version="1.2.0">xlrd</requirement> <requirement type="package" version="3.0.0">r-ggplot2</requirement> <requirement type="package" version="1.4.3">r-reshape2</requirement> <requirement type="package" version="0.5.0">r-scales</requirement> <requirement type="package" version="3.4_5">r-seqinr</requirement> <requirement type="package" version="1.11.4">r-data.table</requirement> - <!--<requirement type="package" version="0.4.5">changeo</requirement>--> </requirements> <command interpreter="bash"> #if str ( $filter_unique.filter_unique_select ) == "remove":