Mercurial > repos > davidvanzessen > shm_csr

--- a/change_o/DefineClones.py	Mon Aug 31 11:20:08 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,739 +0,0 @@
-#!/usr/bin/env python3
-"""
-Assign Ig sequences into clones
-"""
-
-# Info
-__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
-from changeo import __version__, __date__
-
-# Imports
-import os
-import re
-import sys
-from argparse import ArgumentParser
-from collections import OrderedDict
-from itertools import chain
-from textwrap import dedent
-from time import time
-from Bio.Seq import translate
-
-# Presto and changeo imports
-from presto.Defaults import default_out_args
-from presto.IO import printLog, printProgress, printCount, printWarning, printError
-from presto.Multiprocessing import manageProcesses
-from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field
-from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
-from changeo.Distance import distance_models, calcDistances, formClusters
-from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \
-                       AIRRWriter, ChangeoWriter
-from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue
-
-# Defaults
-default_translate = False
-default_distance = 0.0
-default_index_mode = 'gene'
-default_index_action = 'set'
-default_distance_model = 'ham'
-default_norm = 'len'
-default_sym = 'avg'
-default_linkage = 'single'
-default_max_missing=0
-choices_distance_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f',
-                          'mk_rs1nf', 'mk_rs5nf',
-                          'hs1f_compat', 'm1n_compat')
-
-
-def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field,
-                  j_field=default_j_field, max_missing=default_max_missing):
-    """
-    Splits a set of sequence into passed and failed groups based on the number
-    of missing characters in the sequence
-
-    Arguments:
-        data : changeo.Multiprocessing.DbData object.
-        seq_field : sequence field to filter on.
-        v_field : field containing the V call.
-        j_field : field containing the J call.
-        max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record.
-
-    Returns:
-        changeo.Multiprocessing.DbResult : objected containing filtered records.
-    """
-    # Function to validate the sequence string
-    def _pass(seq):
-        if len(seq) > 0 and len(re.findall(r'[^ACGT]', seq)) <= max_missing:
-            return True
-        else:
-            return False
-
-    # Define result object for iteration and get data records
-    result = DbResult(data.id, data.data)
-
-    if not data:
-        result.data_pass = []
-        result.data_fail = data.data
-        return result
-
-    result.data_pass = []
-    result.data_fail = []
-    for rec in data.data:
-        seq = rec.getField(seq_field)
-        if _pass(seq):  result.data_pass.append(rec)
-        else:  result.data_fail.append(rec)
-
-    # Add V(D)J to log
-    result.log['ID'] = ','.join([str(x) for x in data.id])
-    result.log['VCALL'] = ','.join(set([(r.getVAllele(field=v_field) or '') for r in data.data]))
-    result.log['JCALL'] = ','.join(set([(r.getJAllele(field=j_field) or '') for r in data.data]))
-    result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in data.data]))
-    result.log['CLONED'] = len(result.data_pass)
-    result.log['FILTERED'] = len(result.data_fail)
-
-    return result
-
-
-def indexByIdentity(index, key, rec, group_fields=None):
-    """
-    Updates a preclone index with a simple key
-
-    Arguments:
-      index : preclone index from groupByGene
-      key : index key
-      rec : Receptor to add to the index
-      group_fields : additional annotation fields to use to group preclones;
-                     if None use only V, J and junction length
-
-    Returns:
-      None : Updates index with new key and records.
-    """
-    index.setdefault(tuple(key), []).append(rec)
-
-
-def indexByUnion(index, key, rec, group_fields=None):
-    """
-    Updates a preclone index with the union of nested keys
-
-    Arguments:
-      index : preclone index from groupByGene
-      key : index key
-      rec : Receptor to add to the index
-      group_fields : additional annotation fields to use to group preclones;
-                     if None use only V, J and junction length
-
-    Returns:
-      None : Updates index with new key and records.
-    """
-    # List of values for this/new key
-    val = [rec]
-    f_range = list(range(2, 3 + (len(group_fields) if group_fields else 0)))
-
-    # See if field/junction length combination exists in index
-    outer_dict = index
-    for field in f_range:
-        try:
-            outer_dict = outer_dict[key[field]]
-        except KeyError:
-            outer_dict = None
-            break
-    # If field combination exists, look through Js
-    j_matches = []
-    if outer_dict is not None:
-        for j in outer_dict.keys():
-            if not set(key[1]).isdisjoint(set(j)):
-                key[1] = tuple(set(key[1]).union(set(j)))
-                j_matches += [j]
-    # If J overlap exists, look through Vs for each J
-    for j in j_matches:
-        v_matches = []
-        # Collect V matches for this J
-        for v in outer_dict[j].keys():
-            if not set(key[0]).isdisjoint(set(v)):
-                key[0] = tuple(set(key[0]).union(set(v)))
-                v_matches += [v]
-        # If there are V overlaps for this J, pop them out
-        if v_matches:
-            val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))
-            # If the J dict is now empty, remove it
-            if not outer_dict[j]:
-                outer_dict.pop(j, None)
-
-    # Add value(s) into index nested dictionary
-    # OMG Python pointers are the best!
-    # Add field dictionaries into index
-    outer_dict = index
-    for field in f_range:
-        outer_dict.setdefault(key[field], {})
-        outer_dict = outer_dict[key[field]]
-    # Add J, then V into index
-    if key[1] in outer_dict:
-        outer_dict[key[1]].update({key[0]: val})
-    else:
-        outer_dict[key[1]] = {key[0]: val}
-
-
-def groupByGene(db_iter, group_fields=None, v_field=default_v_field, j_field=default_j_field,
-                mode=default_index_mode, action=default_index_action):
-    """
-    Identifies preclonal groups by V, J and junction length
-
-    Arguments:
-      db_iter : an iterator of Receptor objects defined by ChangeoReader
-      group_fields : additional annotation fields to use to group preclones;
-                     if None use only V, J and junction length
-      mode : specificity of alignment call to use for assigning preclones;
-             one of ('allele', 'gene')
-      action : how to handle multiple value fields when assigning preclones;
-               one of ('first', 'set')
-
-    Returns:
-      dict: dictionary of {(V, J, junction length):[Receptor]}
-    """
-    # print(fields)
-    # Define functions for grouping keys
-    if mode == 'allele' and group_fields is None:
-        def _get_key(rec, act):
-            return [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
-                    None if rec.junction is None else len(rec.junction)]
-    elif mode == 'gene' and group_fields is None:
-        def _get_key(rec, act):
-            return [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
-                    None if rec.junction is None else len(rec.junction)]
-    elif mode == 'allele' and group_fields is not None:
-        def _get_key(rec, act):
-            vdj = [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field),
-                    None if rec.junction is None else len(rec.junction)]
-            ann = [rec.getField(k) for k in group_fields]
-            return list(chain(vdj, ann))
-    elif mode == 'gene' and group_fields is not None:
-        def _get_key(rec, act):
-            vdj = [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field),
-                    None if rec.junction is None else len(rec.junction)]
-            ann = [rec.getField(k) for k in group_fields]
-            return list(chain(vdj, ann))
-
-    # Function to flatten nested dictionary
-    def _flatten_dict(d, parent_key=''):
-        items = []
-        for k, v in d.items():
-            new_key = parent_key + [k] if parent_key else [k]
-            if isinstance(v, dict):
-                items.extend(_flatten_dict(v, new_key).items())
-            else:
-                items.append((new_key, v))
-        flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items}
-        return flat_dict
-
-    if action == 'first':
-        index_func = indexByIdentity
-    elif action == 'set':
-        index_func = indexByUnion
-    else:
-        sys.stderr.write('Unrecognized action: %s.\n' % action)
-
-    start_time = time()
-    clone_index = {}
-    rec_count = 0
-    for rec in db_iter:
-        key = _get_key(rec, action)
-
-        # Print progress
-        printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences')
-        rec_count += 1
-
-        # Assigned passed preclone records to key and failed to index None
-        if all([k is not None and k != '' for k in key]):
-            # Update index dictionary
-            index_func(clone_index, key, rec, group_fields)
-        else:
-            clone_index.setdefault(None, []).append(rec)
-
-    printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences', end=True)
-
-    if action == 'set':
-        clone_index = _flatten_dict(clone_index)
-
-    return clone_index
-
-
-def distanceClones(result, seq_field=default_junction_field, model=default_distance_model,
-                   distance=default_distance, dist_mat=None, norm=default_norm, sym=default_sym,
-                   linkage=default_linkage):
-    """
-    Separates a set of Receptor objects into clones
-
-    Arguments:
-      result : a changeo.Multiprocessing.DbResult object with filtered records to clone
-      seq_field : sequence field used to calculate distance between records
-      model : substitution model used to calculate distance
-      distance : the distance threshold to assign clonal groups
-      dist_mat : pandas DataFrame of pairwise nucleotide or amino acid distances
-      norm : normalization method
-      sym : symmetry method
-      linkage : type of linkage
-
-    Returns:
-      changeo.Multiprocessing.DbResult : an updated DbResult object
-    """
-    # Get distance matrix if not provided
-    if dist_mat is None:
-        try:
-            dist_mat = distance_models[model]
-        except KeyError:
-            printError('Unrecognized distance model: %s' % args_dict['model'])
-
-    # TODO:  can be cleaned up with abstract model class
-    # Determine length of n-mers
-    if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
-        nmer_len = 1
-    elif model in ['hh_s5f', 'mk_rs5nf']:
-        nmer_len = 5
-    else:
-        printError('Unrecognized distance model: %s.\n' % model)
-
-    # Define unique junction mapping
-    seq_map = {}
-    for rec in result.data_pass:
-        seq = rec.getField(seq_field)
-        seq = re.sub('[\.-]', 'N', seq)
-        if model == 'aa':  seq = translate(seq)
-        seq_map.setdefault(seq, []).append(rec)
-
-    # Define sequences
-    sequences = list(seq_map.keys())
-
-    # Zero record case
-    if not sequences:
-        result.valid = False
-        result.log['CLONES'] = 0
-        return result
-
-    # Single record case
-    if len(sequences) == 1:
-        result.results = {1: result.data_pass}
-        result.valid = True
-        result.log['CLONES'] = 1
-        return result
-
-    # Calculate pairwise distance matrix
-    dists = calcDistances(sequences, nmer_len, dist_mat, sym=sym, norm=norm)
-
-    # Perform hierarchical clustering
-    clusters = formClusters(dists, linkage, distance)
-
-    # Turn clusters into clone dictionary
-    clone_dict = {}
-    for i, c in enumerate(clusters):
-        clone_dict.setdefault(c, []).extend(seq_map[sequences[i]])
-
-    if clone_dict:
-        result.results = clone_dict
-        result.valid = True
-        result.log['CLONES'] = len(clone_dict)
-    else:
-        result.log['CLONES'] = 0
-
-    return result
-
-
-def collectQueue(alive, result_queue, collect_queue, db_file, fields,
-                 writer=ChangeoWriter, out_file=None, out_args=default_out_args):
-    """
-    Assembles results from a queue of individual sequence results and manages log/file I/O
-
-    Arguments:
-      alive = a multiprocessing.Value boolean controlling whether processing continues
-              if False exit process
-      result_queue : a multiprocessing.Queue holding processQueue results
-      collect_queue : a multiprocessing.Queue to store collector return values
-      db_file : the input database file name
-      fields : list of output field names
-      writer : writer class.
-      out_file : output file name. Automatically generated from the input file if None.
-      out_args : common output argument dictionary from parseCommonArgs
-
-    Returns:
-       None : Adds a dictionary with key value pairs to collect_queue containing
-            'log' defining a log object along with the 'pass' and 'fail' output file names.
-    """
-    # Wrapper for opening handles and writers
-    def _open(x, f, writer=writer, out_file=out_file):
-        if out_file is not None and x == 'pass':
-            handle = open(out_file, 'w')
-        else:
-            handle = getOutputHandle(db_file,
-                                     out_label='clone-%s' % x,
-                                     out_dir=out_args['out_dir'],
-                                     out_name=out_args['out_name'],
-                                     out_type=out_args['out_type'])
-        return handle, writer(handle, fields=f)
-
-    # Open log file
-    try:
-        # Count input records
-        result_count = countDbFile(db_file)
-
-        # Define log handle
-        if out_args['log_file'] is None:
-            log_handle = None
-        else:
-            log_handle = open(out_args['log_file'], 'w')
-    except:
-        #sys.stderr.write('Exception in collector file opening step\n')
-        alive.value = False
-        raise
-
-    # Get results from queue and write to files
-    try:
-        # Initialize handles, writers and counters
-        pass_handle, pass_writer = None, None
-        fail_handle, fail_writer = None, None
-        rec_count, clone_count, pass_count, fail_count = 0, 0, 0, 0
-        start_time = time()
-
-        # Iterator over results queue until sentinel object reached
-        while alive.value:
-            # Get result from queue
-            if result_queue.empty():  continue
-            else:  result = result_queue.get()
-            # Exit upon reaching sentinel
-            if result is None:  break
-
-            # Print progress for previous iteration and update record count
-            printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')
-            rec_count += len(result.data)
-
-            # Write passed and failed records
-            if result:
-                # Writing passing sequences
-                for clone in result.results.values():
-                    clone_count += 1
-                    for i, rec in enumerate(clone, start=1):
-                        pass_count += 1
-                        rec.setField('clone', str(clone_count))
-                        result.log['CLONE%i-%i' % (clone_count, i)] = rec.junction
-                        try:
-                            pass_writer.writeReceptor(rec)
-                        except AttributeError:
-                            # Open pass file and define writer object
-                            pass_handle, pass_writer = _open('pass', fields)
-                            pass_writer.writeReceptor(rec)
-
-                # Write failed sequences from passing sets
-                if result.data_fail:
-                    # Write failed sequences
-                    for i, rec in enumerate(result.data_fail, start=1):
-                        fail_count += 1
-                        result.log['FAIL%i-%i' % (clone_count, i)] = rec.junction
-                        if out_args['failed']:
-                            try:
-                                fail_writer.writeReceptor(rec)
-                            except AttributeError:
-                                # Open fail file and define writer object
-                                fail_handle, fail_writer = _open('fail', fields)
-                                fail_writer.writeReceptor(rec)
-            else:
-                # Write failing records
-                for i, rec in enumerate(result.data, start=1):
-                    fail_count += 1
-                    result.log['CLONE0-%i' % (i)] = rec.junction
-                    if out_args['failed']:
-                        try:
-                            fail_writer.writeReceptor(rec)
-                        except AttributeError:
-                            # Open fail file and define writer object
-                            fail_handle, fail_writer = _open('fail', fields)
-                            fail_writer.writeReceptor(rec)
-
-            # Write log
-            printLog(result.log, handle=log_handle)
-        else:
-            sys.stderr.write('PID %s>  Error in sibling process detected. Cleaning up.\n' \
-                             % os.getpid())
-            return None
-
-        # Print total counts
-        printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones')
-
-        # Update return list
-        log = OrderedDict()
-        log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
-        log['CLONES'] = clone_count
-        log['RECORDS'] = rec_count
-        log['PASS'] = pass_count
-        log['FAIL'] = fail_count
-
-        # Close file handles and generate return data
-        collect_dict = {'log': log, 'pass': None, 'fail': None}
-        if pass_handle is not None:
-            collect_dict['pass'] = pass_handle.name
-            pass_handle.close()
-        if fail_handle is not None:
-            collect_dict['fail'] = fail_handle.name
-            fail_handle.close()
-        if log_handle is not None:
-            log_handle.close()
-        collect_queue.put(collect_dict)
-    except:
-        alive.value = False
-        raise
-
-    return None
-
-
-def defineClones(db_file, seq_field=default_junction_field, v_field=default_v_field,
-                 j_field=default_j_field, max_missing=default_max_missing,
-                 group_fields=None, group_func=groupByGene, group_args={},
-                 clone_func=distanceClones, clone_args={},
-                 format=default_format, out_file=None, out_args=default_out_args,
-                 nproc=None, queue_size=None):
-    """
-    Define clonally related sequences
-
-    Arguments:
-      db_file : filename of input database.
-      seq_field : sequence field used to determine clones.
-      v_field : field containing the V call.
-      j_field : field containing the J call.
-      max_missing : maximum number of non-ACGT characters to allow in the junction sequence.
-      group_fields : additional annotation fields to use to group preclones;
-                     if None use only V and J.
-      group_func : the function to use for assigning preclones.
-      group_args : a dictionary of arguments to pass to group_func.
-      clone_func : the function to use for determining clones within preclonal groups.
-      clone_args : a dictionary of arguments to pass to clone_func.
-      format : input and output format.
-      out_file : output file name. Automatically generated from the input file if None.
-      out_args : common output argument dictionary from parseCommonArgs.
-      nproc : the number of processQueue processes;
-              if None defaults to the number of CPUs.
-      queue_size : maximum size of the argument queue;
-                   if None defaults to 2*nproc.
-
-    Returns:
-      dict: dictionary of output pass and fail files.
-    """
-    # Print parameter info
-    log = OrderedDict()
-    log['START'] = 'DefineClones'
-    log['FILE'] = os.path.basename(db_file)
-    log['SEQ_FIELD'] = seq_field
-    log['V_FIELD'] = v_field
-    log['J_FIELD'] = j_field
-    log['MAX_MISSING'] = max_missing
-    log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None
-    for k in sorted(group_args):
-        log[k.upper()] = group_args[k]
-    for k in sorted(clone_args):
-        if k != 'dist_mat':  log[k.upper()] = clone_args[k]
-    log['NPROC'] = nproc
-    printLog(log)
-
-    # Define format operators
-    try:
-        reader, writer, schema = getFormatOperators(format)
-    except ValueError:
-        printError('Invalid format %s.' % format)
-
-    # Translate to Receptor attribute names
-    seq_field = schema.toReceptor(seq_field)
-    v_field = schema.toReceptor(v_field)
-    j_field = schema.toReceptor(j_field)
-    if group_fields is not None:
-        group_fields = [schema.toReceptor(f) for f in group_fields]
-
-    # Define feeder function and arguments
-    group_args['group_fields'] = group_fields
-    group_args['v_field'] = v_field
-    group_args['j_field'] = j_field
-    feed_args = {'db_file': db_file,
-                 'reader': reader,
-                 'group_func': group_func,
-                 'group_args': group_args}
-
-    # Define worker function and arguments
-    filter_args = {'seq_field': seq_field,
-                   'v_field': v_field,
-                   'j_field': j_field,
-                   'max_missing': max_missing}
-    clone_args['seq_field'] = seq_field
-    work_args = {'process_func': clone_func,
-                 'process_args': clone_args,
-                 'filter_func': filterMissing,
-                 'filter_args': filter_args}
-
-    # Define collector function and arguments
-    out_fields = getDbFields(db_file, add=schema.fromReceptor('clone'), reader=reader)
-    out_args['out_type'] = schema.out_type
-    collect_args = {'db_file': db_file,
-                    'fields': out_fields,
-                    'writer': writer,
-                    'out_file': out_file,
-                    'out_args': out_args}
-
-    # Call process manager
-    result = manageProcesses(feed_func=feedDbQueue, work_func=processDbQueue, collect_func=collectQueue,
-                             feed_args=feed_args, work_args=work_args, collect_args=collect_args,
-                             nproc=nproc, queue_size=queue_size)
-
-    # Print log
-    result['log']['END'] = 'DefineClones'
-    printLog(result['log'])
-    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}
-
-    return output
-
-
-def getArgParser():
-    """
-    Defines the ArgumentParser
-
-    Arguments:
-    None
-
-    Returns:
-    an ArgumentParser object
-    """
-    # Define input and output fields
-    fields = dedent(
-             '''
-             output files:
-                 clone-pass
-                     database with assigned clonal group numbers.
-                 clone-fail
-                     database with records failing clonal grouping.
-
-             required fields:
-                 SEQUENCE_ID, V_CALL, J_CALL, JUNCTION
-
-             output fields:
-                 CLONE
-              ''')
-    # Define argument parser
-    parser = ArgumentParser(description=__doc__, epilog=fields,
-                            parents=[getCommonArgParser(format=False, multiproc=True)],
-                            formatter_class=CommonHelpFormatter, add_help=False)
-
-    # Distance cloning method
-    group = parser.add_argument_group('cloning arguments')
-    group.add_argument('--sf', action='store', dest='seq_field', default=default_junction_field,
-                        help='Field to be used to calculate distance between records.')
-    group.add_argument('--vf', action='store', dest='v_field', default=default_v_field,
-                        help='Field containing the germline V segment call.')
-    group.add_argument('--jf', action='store', dest='j_field', default=default_j_field,
-                        help='Field containing the germline J segment call.')
-    group.add_argument('--gf', nargs='+', action='store', dest='group_fields', default=None,
-                        help='Additional fields to use for grouping clones aside from V, J and junction length.')
-    group.add_argument('--mode', action='store', dest='mode',
-                        choices=('allele', 'gene'), default=default_index_mode,
-                        help='''Specifies whether to use the V(D)J allele or gene for
-                             initial grouping.''')
-    group.add_argument('--act', action='store', dest='action',
-                        choices=('first', 'set'), default=default_index_action,
-                        help='''Specifies how to handle multiple V(D)J assignments for initial grouping.
-                             The "first" action will use only the first gene listed.
-                             The "set" action will use all gene assignments and construct a larger gene
-                             grouping composed of any sequences sharing an assignment or linked to another
-                             sequence by a common assignment (similar to single-linkage).''')
-    group.add_argument('--model', action='store', dest='model',
-                        choices=choices_distance_model,
-                        default=default_distance_model,
-                        help='''Specifies which substitution model to use for calculating distance
-                             between sequences. The "ham" model is nucleotide Hamming distance and
-                             "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
-                             human specific single nucleotide and 5-mer content models, respectively,
-                             from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
-                             mouse specific single nucleotide and 5-mer content models, respectively,
-                             from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
-                             deprecated models provided backwards compatibility with the "m1n" and
-                             "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
-                             5-mer models should be considered experimental.''')
-    group.add_argument('--dist', action='store', dest='distance', type=float,
-                        default=default_distance,
-                        help='The distance threshold for clonal grouping')
-    group.add_argument('--norm', action='store', dest='norm',
-                        choices=('len', 'mut', 'none'), default=default_norm,
-                        help='''Specifies how to normalize distances. One of none
-                             (do not normalize), len (normalize by length),
-                             or mut (normalize by number of mutations between sequences).''')
-    group.add_argument('--sym', action='store', dest='sym',
-                        choices=('avg', 'min'), default=default_sym,
-                        help='''Specifies how to combine asymmetric distances. One of avg
-                             (average of A->B and B->A) or min (minimum of A->B and B->A).''')
-    group.add_argument('--link', action='store', dest='linkage',
-                        choices=('single', 'average', 'complete'), default=default_linkage,
-                        help='''Type of linkage to use for hierarchical clustering.''')
-    group.add_argument('--maxmiss', action='store', dest='max_missing', type=int,
-                        default=default_max_missing,
-                        help='''The maximum number of non-ACGT characters (gaps or Ns) to
-                             permit in the junction sequence before excluding the record
-                             from clonal assignment. Note, under single linkage
-                             non-informative positions can create artifactual links
-                             between unrelated sequences. Use with caution.''')
-    parser.set_defaults(group_func=groupByGene)
-    parser.set_defaults(clone_func=distanceClones)
-
-    return parser
-
-
-if __name__ == '__main__':
-    """
-    Parses command line arguments and calls main function
-    """
-    # Parse arguments
-    parser = getArgParser()
-    checkArgs(parser)
-    args = parser.parse_args()
-    args_dict = parseCommonArgs(args)
-
-    # # Set default fields if not specified.
-    # default_fields = {'seq_field': default_junction_field,
-    #                   'v_field': default_v_field,
-    #                   'j_field': default_j_field}
-    #
-    # # Default Change-O fields
-    # if args_dict['format'] == 'changeo':
-    #     for f in default_fields:
-    #         if args_dict[f] is None:  args_dict[f] = default_fields[f]
-    #         else: args_dict[f] = args_dict[f].upper()
-    #
-    # # Default AIRR fields
-    # if args_dict['format'] == 'airr':
-    #     for f in default_fields:
-    #         if args_dict[f] is None:  args_dict[f] = ChangeoSchema.toAIRR(default_fields[f])
-    #         else: args_dict[f] = args_dict[f].lower()
-
-    # Define grouping and cloning function arguments
-    args_dict['group_args'] = {'action': args_dict['action'],
-                               'mode':args_dict['mode']}
-    args_dict['clone_args'] = {'model':  args_dict['model'],
-                               'distance':  args_dict['distance'],
-                               'norm': args_dict['norm'],
-                               'sym': args_dict['sym'],
-                               'linkage': args_dict['linkage']}
-
-    # Get distance matrix
-    try:
-        args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']]
-    except KeyError:
-        printError('Unrecognized distance model: %s' % args_dict['model'])
-
-    # Clean argument dictionary
-    del args_dict['action']
-    del args_dict['mode']
-    del args_dict['model']
-    del args_dict['distance']
-    del args_dict['norm']
-    del args_dict['sym']
-    del args_dict['linkage']
-
-    # Clean arguments dictionary
-    del args_dict['db_files']
-    if 'out_files' in args_dict: del args_dict['out_files']
-
-    # Call main function for each input file
-    for i, f in enumerate(args.__dict__['db_files']):
-        args_dict['db_file'] = f
-        args_dict['out_file'] = args.__dict__['out_files'][i] \
-            if args.__dict__['out_files'] else None
-        defineClones(**args_dict)
--- a/change_o/MakeDb.py	Mon Aug 31 11:20:08 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,885 +0,0 @@
-#!/usr/bin/env python3
-"""
-Create tab-delimited database file to store sequence alignment information
-"""
-
-# Info
-__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
-from changeo import __version__, __date__
-
-# Imports
-import os
-import re
-import csv
-from argparse import ArgumentParser
-from collections import OrderedDict
-from textwrap import dedent
-from time import time
-from Bio import SeqIO
-
-# Presto and changeo imports
-from presto.Annotation import parseAnnotation
-from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile
-from changeo.Defaults import default_format, default_out_args
-from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
-from changeo.Alignment import RegionDefinition
-from changeo.Gene import buildGermline
-from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \
-                       AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader
-from changeo.Receptor import ChangeoSchema, AIRRSchema
-
-# 10X Receptor attributes
-cellranger_base = ['cell', 'c_call', 'conscount', 'umicount']
-cellranger_extended = ['cell', 'c_call', 'conscount', 'umicount',
-                       'v_call_10x', 'd_call_10x', 'j_call_10x',
-                       'junction_10x', 'junction_10x_aa']
-
-def readCellRanger(cellranger_file, fields=cellranger_base):
-    """
-    Load a Cell Ranger annotation table
-
-    Arguments:
-      cellranger_file (str): path to the annotation file.
-      fields (list): list of fields to keep.
-
-    Returns:
-      dict: dict of dicts with contig_id as the primary key.
-    """
-    # Mapping of 10X annotations to Receptor attributes
-    cellranger_map = {'cell':  'barcode',
-                      'c_call': 'c_gene',
-                      'locus': 'chain',
-                      'conscount': 'reads',
-                      'umicount': 'umis',
-                      'v_call_10x': 'v_gene',
-                      'd_call_10x': 'd_gene',
-                      'j_call_10x': 'j_gene',
-                      'junction_10x': 'cdr3_nt',
-                      'junction_10x_aa': 'cdr3'}
-
-    # Function to parse individual fields
-    def _parse(x):
-        return '' if x == 'None' else x
-
-    # Generate annotation dictionary
-    ann_dict = {}
-    with open(cellranger_file) as csv_file:
-        # Detect delimiters
-        dialect = csv.Sniffer().sniff(csv_file.readline())
-        csv_file.seek(0)
-        # Read in annotation file
-        csv_reader = csv.DictReader(csv_file, dialect=dialect)
-
-        # Generate annotation dictionary
-        for row in csv_reader:
-            ann_dict[row['contig_id']] = {f: _parse(row[cellranger_map[f]]) for f in fields}
-
-    return ann_dict
-
-
-def addGermline(receptor, references, amino_acid=False):
-    """
-    Add full length germline to Receptor object
-
-    Arguments:
-      receptor (changeo.Receptor.Receptor): Receptor object to modify.
-      references (dict): dictionary of IMGT-gapped references sequences.
-      amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline
-
-    Returns:
-      changeo.Receptor.Receptor: modified Receptor with the germline sequence added.
-    """
-    if amino_acid:
-        __, germlines, __ = buildGermline(receptor, references, seq_field='sequence_aa_imgt',
-                                          amino_acid=True)
-        germline_seq = None if germlines is None else germlines['full']
-        receptor.setField('germline_aa_imgt', germline_seq)
-    else:
-        __, germlines, __ = buildGermline(receptor, references, amino_acid=False)
-        germline_seq = None if germlines is None else germlines['full']
-        receptor.setField('germline_imgt', germline_seq)
-
-    return receptor
-
-
-def getIDforIMGT(seq_file):
-    """
-    Create a sequence ID translation using IMGT truncation.
-
-    Arguments:
-      seq_file : a fasta file of sequences input to IMGT.
-
-    Returns:
-      dict : a dictionary of with the IMGT truncated ID as the key and the full sequence description as the value.
-    """
-
-    # Create a sequence ID translation using IDs truncate up to space or 50 chars
-    ids = {}
-    for rec in readSeqFile(seq_file):
-        if len(rec.description) <= 50:
-            id_key = rec.description
-        else:
-            id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
-        ids.update({id_key: rec.description})
-
-    return ids
-
-
-def getSeqDict(seq_file):
-    """
-    Create a dictionary from a sequence file.
-
-    Arguments:
-      seq_file : sequence file.
-
-    Returns:
-        dict : sequence description as keys with Bio.SeqRecords as values.
-    """
-    seq_dict = SeqIO.to_dict(readSeqFile(seq_file), key_function=lambda x: x.description)
-
-    return seq_dict
-
-
-def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None,
-            amino_acid=False, partial=False, asis_id=True, regions='default',
-            writer=AIRRWriter, out_file=None, out_args=default_out_args):
-    """
-    Writes parsed records to an output file
-
-    Arguments:
-      records : a iterator of Receptor objects containing alignment data.
-      fields : a list of ordered field names to write.
-      aligner_file : input file name.
-      total_count : number of records (for progress bar).
-      id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
-      annotations : additional annotation dictionary.
-      amino_acid : if True do verification on amino acid fields.
-      partial : if True put incomplete alignments in the pass file.
-      asis_id : if ID is to be parsed for pRESTO output with default delimiters.
-      regions (str): name of the IMGT FWR/CDR region definitions to use.
-      writer : writer class.
-      out_file : output file name. Automatically generated from the input file if None.
-      out_args : common output argument dictionary from parseCommonArgs.
-
-    Returns:
-      None
-    """
-    # Wrapper for opening handles and writers
-    def _open(x, f, writer=writer, out_file=out_file):
-        if out_file is not None and x == 'pass':
-            handle = open(out_file, 'w')
-        else:
-            handle = getOutputHandle(aligner_file,
-                                     out_label='db-%s' % x,
-                                     out_dir=out_args['out_dir'],
-                                     out_name=out_args['out_name'],
-                                     out_type=out_args['out_type'])
-        return handle, writer(handle, fields=f)
-
-    # Function to convert fasta header annotations to changeo columns
-    def _changeo(f, header):
-        h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f]
-        f.extend(h)
-        return f
-
-    def _airr(f, header):
-        h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f]
-        f.extend(h)
-        return f
-
-    # Function to verify IMGT-gapped sequence and junction concur
-    def _imgt_check(rec):
-        try:
-            if amino_acid:
-                rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions)
-                x, y = rd.positions['junction']
-                check = (rec.junction_aa == rec.sequence_aa_imgt[x:y])
-            else:
-                rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions)
-                x, y = rd.positions['junction']
-                check = (rec.junction == rec.sequence_imgt[x:y])
-        except (TypeError, AttributeError):
-            check = False
-        return check
-
-    # Function to check for valid records strictly
-    def _strict(rec):
-        if amino_acid:
-            valid = [rec.v_call and rec.v_call != 'None',
-                     rec.j_call and rec.j_call != 'None',
-                     rec.functional is not None,
-                     rec.sequence_aa_imgt,
-                     rec.junction_aa,
-                     _imgt_check(rec)]
-        else:
-            valid = [rec.v_call and rec.v_call != 'None',
-                     rec.j_call and rec.j_call != 'None',
-                     rec.functional is not None,
-                     rec.sequence_imgt,
-                     rec.junction,
-                     _imgt_check(rec)]
-        return all(valid)
-
-    # Function to check for valid records loosely
-    def _gentle(rec):
-        valid = [rec.v_call and rec.v_call != 'None',
-                 rec.d_call and rec.d_call != 'None',
-                 rec.j_call and rec.j_call != 'None']
-        return any(valid)
-
-    # Set writer class and annotation conversion function
-    if writer == ChangeoWriter:
-        _annotate = _changeo
-    elif writer == AIRRWriter:
-        _annotate = _airr
-    else:
-        printError('Invalid output writer.')
-
-    # Additional annotation (e.g. 10X cell calls)
-    # _append_table = None
-    # if cellranger_file is not None:
-    #     with open(cellranger_file) as csv_file:
-    #         # Read in annotation file (use Sniffer to discover file delimiters)
-    #         dialect = csv.Sniffer().sniff(csv_file.readline())
-    #         csv_file.seek(0)
-    #         csv_reader = csv.DictReader(csv_file, dialect = dialect)
-    #
-    #         # Generate annotation dictionary
-    #         anntab_dict = {entry['contig_id']: {cellranger_map[field]: entry[field] \
-    #                        for field in cellranger_map.keys()} for entry in csv_reader}
-    #
-    #     fields = _annotate(fields, cellranger_map.values())
-    #     _append_table = lambda sequence_id: anntab_dict[sequence_id]
-
-    # Set pass criteria
-    _pass = _gentle if partial else _strict
-
-    # Define log handle
-    if out_args['log_file'] is None:
-        log_handle = None
-    else:
-        log_handle = open(out_args['log_file'], 'w')
-
-    # Initialize handles, writers and counters
-    pass_handle, pass_writer = None, None
-    fail_handle, fail_writer = None, None
-    pass_count, fail_count = 0, 0
-    start_time = time()
-
-    # Validate and write output
-    printProgress(0, total_count, 0.05, start_time=start_time)
-    for i, record in enumerate(records, start=1):
-        # Replace sequence description with full string, if required
-        if id_dict is not None and record.sequence_id in id_dict:
-            record.sequence_id = id_dict[record.sequence_id]
-
-        # Parse sequence description into new columns
-        if not asis_id:
-            try:
-                ann_raw = parseAnnotation(record.sequence_id)
-                record.sequence_id = ann_raw.pop('ID')
-
-                # Convert to Receptor fields
-                ann_parsed = OrderedDict()
-                for k, v in ann_raw.items():
-                    ann_parsed[ChangeoSchema.toReceptor(k)] = v
-
-                # Add annotations to Receptor and update field list
-                record.setDict(ann_parsed, parse=True)
-                if i == 1:  fields = _annotate(fields, ann_parsed.keys())
-            except IndexError:
-                # Could not parse pRESTO-style annotations so fall back to no parse
-                asis_id = True
-                printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.')
-
-        # Add supplemental annotation fields
-        # if _append_table is not None:
-        #     record.setDict(_append_table(record.sequence_id), parse=True)
-        if annotations is not None:
-            record.setDict(annotations[record.sequence_id], parse=True)
-            if i == 1:  fields = _annotate(fields, annotations[record.sequence_id].keys())
-
-        # Count pass or fail and write to appropriate file
-        if _pass(record):
-            pass_count += 1
-            # Write row to pass file
-            try:
-                pass_writer.writeReceptor(record)
-            except AttributeError:
-                # Open pass file and writer
-                pass_handle, pass_writer = _open('pass', fields)
-                pass_writer.writeReceptor(record)
-        else:
-            fail_count += 1
-            # Write row to fail file if specified
-            if out_args['failed']:
-                try:
-                    fail_writer.writeReceptor(record)
-                except AttributeError:
-                    # Open fail file and writer
-                    fail_handle, fail_writer = _open('fail', fields)
-                    fail_writer.writeReceptor(record)
-
-        # Write log
-        if log_handle is not None:
-            log = OrderedDict([('ID', record.sequence_id),
-                               ('V_CALL', record.v_call),
-                               ('D_CALL', record.d_call),
-                               ('J_CALL', record.j_call),
-                               ('PRODUCTIVE', record.functional)])
-            if not _imgt_check(record) and not amino_acid:
-                log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.'
-            printLog(log, log_handle)
-
-        # Print progress
-        printProgress(i, total_count, 0.05, start_time=start_time)
-
-    # Print console log
-    log = OrderedDict()
-    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
-    log['PASS'] = pass_count
-    log['FAIL'] = fail_count
-    log['END'] = 'MakeDb'
-    printLog(log)
-
-    # Close file handles
-    output = {'pass': None, 'fail': None}
-    if pass_handle is not None:
-        output['pass'] = pass_handle.name
-        pass_handle.close()
-    if fail_handle is not None:
-        output['fail'] = fail_handle.name
-        fail_handle.close()
-
-    return output
-
-
-def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
-              extended=False, format=default_format, out_file=None, out_args=default_out_args):
-    """
-    Main for IMGT aligned sample sequences.
-
-    Arguments:
-      aligner_file : zipped file or unzipped folder output by IMGT.
-      seq_file : FASTA file input to IMGT (from which to get seqID).
-      repo : folder with germline repertoire files.
-      partial : If True put incomplete alignments in the pass file.
-      asis_id : if ID is to be parsed for pRESTO output with default delimiters.
-      extended : if True add alignment score, FWR, CDR and junction fields to output file.
-      format : output format. one of 'changeo' or 'airr'.
-      out_file : output file name. Automatically generated from the input file if None.
-      out_args : common output argument dictionary from parseCommonArgs.
-
-    Returns:
-      dict : names of the 'pass' and 'fail' output files.
-    """
-    # Print parameter info
-    log = OrderedDict()
-    log['START'] = 'MakeDb'
-    log['COMMAND'] = 'imgt'
-    log['ALIGNER_FILE'] = aligner_file
-    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
-    log['ASIS_ID'] = asis_id
-    log['PARTIAL'] = partial
-    log['EXTENDED'] = extended
-    printLog(log)
-
-    start_time = time()
-    printMessage('Loading files', start_time=start_time, width=20)
-
-    # Extract IMGT files
-    temp_dir, imgt_files = extractIMGT(aligner_file)
-
-    # Count records in IMGT files
-    total_count = countDbFile(imgt_files['summary'])
-
-    # Get (parsed) IDs from fasta file submitted to IMGT
-    id_dict = getIDforIMGT(seq_file) if seq_file else {}
-
-    # Load supplementary annotation table
-    if cellranger_file is not None:
-        f = cellranger_extended if extended else cellranger_base
-        annotations = readCellRanger(cellranger_file, fields=f)
-    else:
-        annotations = None
-
-    printMessage('Done', start_time=start_time, end=True, width=20)
-
-    # Define format operators
-    try:
-        __, writer, schema = getFormatOperators(format)
-    except ValueError:
-        printError('Invalid format %s.' % format)
-    out_args['out_type'] = schema.out_type
-
-    # Define output fields
-    fields = list(schema.required)
-    if extended:
-        custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema)
-        fields.extend(custom)
-
-    # Parse IMGT output and write db
-    with open(imgt_files['summary'], 'r') as summary_handle, \
-            open(imgt_files['gapped'], 'r') as gapped_handle, \
-            open(imgt_files['ntseq'], 'r') as ntseq_handle, \
-            open(imgt_files['junction'], 'r') as junction_handle:
-
-        # Open parser
-        parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle)
-
-        # Add germline sequence
-        if repo is None:
-            germ_iter = parse_iter
-        else:
-            references = readGermlines(repo)
-            # Check for IMGT-gaps in germlines
-            if all('...' not in x for x in references.values()):
-                printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
-            germ_iter = (addGermline(x, references) for x in parse_iter)
-
-        # Write db
-        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
-                         annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial,
-                         writer=writer, out_file=out_file, out_args=out_args)
-
-    # Cleanup temp directory
-    temp_dir.cleanup()
-
-    return output
-
-
-def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False,
-                 asis_id=True, asis_calls=False, extended=False, regions='default',
-                 format='changeo', out_file=None, out_args=default_out_args):
-    """
-    Main for IgBLAST aligned sample sequences.
-
-    Arguments:
-      aligner_file (str): IgBLAST output file to process.
-      seq_file (str): fasta file input to IgBlast (from which to get sequence).
-      repo (str): folder with germline repertoire files.
-      amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False.
-      partial : If True put incomplete alignments in the pass file.
-      asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters.
-      asis_calls (bool): if True do not parse gene calls for allele names.
-      extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output.
-      regions (str): name of the IMGT FWR/CDR region definitions to use.
-      format (str): output format. one of 'changeo' or 'airr'.
-      out_file (str): output file name. Automatically generated from the input file if None.
-      out_args (dict): common output argument dictionary from parseCommonArgs.
-
-    Returns:
-      dict : names of the 'pass' and 'fail' output files.
-    """
-    # Print parameter info
-    log = OrderedDict()
-    log['START'] = 'MakeDB'
-    log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast'
-    log['ALIGNER_FILE'] = os.path.basename(aligner_file)
-    log['SEQ_FILE'] = os.path.basename(seq_file)
-    log['ASIS_ID'] = asis_id
-    log['ASIS_CALLS'] = asis_calls
-    log['PARTIAL'] = partial
-    log['EXTENDED'] = extended
-    printLog(log)
-
-    # Set amino acid conditions
-    if amino_acid:
-        format = '%s-aa' % format
-        parser = IgBLASTReaderAA
-    else:
-        parser = IgBLASTReader
-
-    # Start
-    start_time = time()
-    printMessage('Loading files', start_time=start_time, width=20)
-
-    # Count records in sequence file
-    total_count = countSeqFile(seq_file)
-
-    # Get input sequence dictionary
-    seq_dict = getSeqDict(seq_file)
-
-    # Create germline repo dictionary
-    references = readGermlines(repo, asis=asis_calls)
-
-    # Load supplementary annotation table
-    if cellranger_file is not None:
-        f = cellranger_extended if extended else cellranger_base
-        annotations = readCellRanger(cellranger_file, fields=f)
-    else:
-        annotations = None
-
-    printMessage('Done', start_time=start_time, end=True, width=20)
-
-    # Check for IMGT-gaps in germlines
-    if all('...' not in x for x in references.values()):
-        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
-
-    # Define format operators
-    try:
-        __, writer, schema = getFormatOperators(format)
-    except ValueError:
-        printError('Invalid format %s.' % format)
-    out_args['out_type'] = schema.out_type
-
-    # Define output fields
-    fields = list(schema.required)
-    if extended:
-        custom = parser.customFields(schema=schema)
-        fields.extend(custom)
-
-    # Parse and write output
-    with open(aligner_file, 'r') as f:
-        parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls)
-        germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
-        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
-                         annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id,
-                         regions=regions, writer=writer, out_file=out_file, out_args=out_args)
-
-    return output
-
-
-def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True,
-              extended=False, format=default_format, out_file=None, out_args=default_out_args):
-    """
-    Main for iHMMuneAlign aligned sample sequences.
-
-    Arguments:
-      aligner_file : iHMMune-Align output file to process.
-      seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
-      repo : folder with germline repertoire files.
-      partial : If True put incomplete alignments in the pass file.
-      asis_id : if ID is to be parsed for pRESTO output with default delimiters.
-      extended : if True parse alignment scores, FWR and CDR region fields.
-      format : output format. One of 'changeo' or 'airr'.
-      out_file : output file name. Automatically generated from the input file if None.
-      out_args : common output argument dictionary from parseCommonArgs.
-
-    Returns:
-      dict : names of the 'pass' and 'fail' output files.
-    """
-    # Print parameter info
-    log = OrderedDict()
-    log['START'] = 'MakeDB'
-    log['COMMAND'] = 'ihmm'
-    log['ALIGNER_FILE'] = os.path.basename(aligner_file)
-    log['SEQ_FILE'] = os.path.basename(seq_file)
-    log['ASIS_ID'] = asis_id
-    log['PARTIAL'] = partial
-    log['EXTENDED'] = extended
-    printLog(log)
-
-    start_time = time()
-    printMessage('Loading files', start_time=start_time, width=20)
-
-    # Count records in sequence file
-    total_count = countSeqFile(seq_file)
-
-    # Get input sequence dictionary
-    seq_dict = getSeqDict(seq_file)
-
-    # Create germline repo dictionary
-    references = readGermlines(repo)
-
-    # Load supplementary annotation table
-    if cellranger_file is not None:
-        f = cellranger_extended if extended else cellranger_base
-        annotations = readCellRanger(cellranger_file, fields=f)
-    else:
-        annotations = None
-
-    printMessage('Done', start_time=start_time, end=True, width=20)
-
-    # Check for IMGT-gaps in germlines
-    if all('...' not in x for x in references.values()):
-        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
-
-    # Define format operators
-    try:
-        __, writer, schema = getFormatOperators(format)
-    except ValueError:
-        printError('Invalid format %s.' % format)
-    out_args['out_type'] = schema.out_type
-
-    # Define output fields
-    fields = list(schema.required)
-    if extended:
-        custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema)
-        fields.extend(custom)
-
-    # Parse and write output
-    with open(aligner_file, 'r') as f:
-        parse_iter = IHMMuneReader(f, seq_dict, references)
-        germ_iter = (addGermline(x, references) for x in parse_iter)
-        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
-                        annotations=annotations, asis_id=asis_id, partial=partial,
-                        writer=writer, out_file=out_file, out_args=out_args)
-
-    return output
-
-
-def getArgParser():
-    """
-    Defines the ArgumentParser.
-
-    Returns:
-      argparse.ArgumentParser
-    """
-    fields = dedent(
-             '''
-              output files:
-                  db-pass
-                      database of alignment records with functionality information,
-                      V and J calls, and a junction region.
-                  db-fail
-                      database with records that fail due to no productivity information,
-                      no gene V assignment, no J assignment, or no junction region.
-
-              universal output fields:
-                 sequence_id, sequence, sequence_alignment, germline_alignment,
-                 rev_comp, productive, stop_codon, vj_in_frame, locus,
-                 v_call, d_call, j_call, junction, junction_length, junction_aa,
-                 v_sequence_start, v_sequence_end, v_germline_start, v_germline_end,
-                 d_sequence_start, d_sequence_end, d_germline_start, d_germline_end,
-                 j_sequence_start, j_sequence_end, j_germline_start, j_germline_end,
-                 np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3
-
-              imgt specific output fields:
-                  n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
-                  d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
-
-              igblast specific output fields:
-                  v_score, v_identity, v_support, v_cigar,
-                  d_score, d_identity, d_support, d_cigar,
-                  j_score, j_identity, j_support, j_cigar
-
-              ihmm specific output fields:
-                  vdj_score
-
-              10X specific output fields:
-                  cell_id, c_call, consensus_count, umi_count,
-                  v_call_10x, d_call_10x, j_call_10x,
-                  junction_10x, junction_10x_aa
-              ''')
-
-    # Define ArgumentParser
-    parser = ArgumentParser(description=__doc__, epilog=fields,
-                            formatter_class=CommonHelpFormatter, add_help=False)
-    group_help = parser.add_argument_group('help')
-    group_help.add_argument('--version', action='version',
-                            version='%(prog)s:' + ' %s %s' %(__version__, __date__))
-    group_help.add_argument('-h', '--help', action='help', help='show this help message and exit')
-    subparsers = parser.add_subparsers(title='subcommands', dest='command',
-                                       help='Aligner used', metavar='')
-    # TODO:  This is a temporary fix for Python issue 9253
-    subparsers.required = True
-
-    # Parent parser
-    parser_parent = getCommonArgParser(db_in=False)
-
-    # igblastn output parser
-    parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
-                                           formatter_class=CommonHelpFormatter, add_help=False,
-                                           help='Process igblastn output.',
-                                           description='Process igblastn output.')
-    group_igblast = parser_igblast.add_argument_group('aligner parsing arguments')
-    group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
-                                required=True,
-                                help='''IgBLAST output files in format 7 with query sequence
-                                     (igblastn argument \'-outfmt "7 std qseq sseq btop"\').''')
-    group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
-                                help='''List of folders and/or fasta files containing
-                                     the same germline set used in the IgBLAST alignment. These
-                                     reference sequences must contain IMGT-numbering spacers (gaps)
-                                     in the V segment.''')
-    group_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
-                                required=True,
-                                help='''List of input FASTA files (with .fasta, .fna or .fa
-                                     extension), containing sequences.''')
-    group_igblast.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
-                                help='''Table file containing 10X annotations (with .csv or .tsv
-                                     extension).''')
-    group_igblast.add_argument('--asis-id', action='store_true', dest='asis_id',
-                                help='''Specify to prevent input sequence headers from being parsed
-                                     to add new columns to database. Parsing of sequence headers requires
-                                     headers to be in the pRESTO annotation format, so this should be specified
-                                     when sequence headers are incompatible with the pRESTO annotation scheme.
-                                     Note, unrecognized header formats will default to this behavior.''')
-    group_igblast.add_argument('--asis-calls', action='store_true', dest='asis_calls',
-                                help='''Specify to prevent gene calls from being parsed into standard allele names
-                                     in both the IgBLAST output and reference database. Note, this requires
-                                     the sequence identifiers in the reference sequence set and the IgBLAST
-                                     database to be exact string matches.''')
-    group_igblast.add_argument('--partial', action='store_true', dest='partial',
-                                help='''If specified, include incomplete V(D)J alignments in
-                                     the pass file instead of the fail file. An incomplete alignment
-                                     is defined as a record for which a valid IMGT-gapped sequence
-                                     cannot be built or that is missing a V gene assignment,
-                                     J gene assignment, junction region, or productivity call.''')
-    group_igblast.add_argument('--extended', action='store_true', dest='extended',
-                               help='''Specify to include additional aligner specific fields in the output.
-                                    Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar,
-                                    fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
-    group_igblast.add_argument('--regions', action='store', dest='regions',
-                               choices=('default', 'rhesus-igl'), default='default',
-                               help='''IMGT CDR and FWR boundary definition to use.''')
-    parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False)
-
-    # igblastp output parser
-    parser_igblast_aa = subparsers.add_parser('igblast-aa', parents=[parser_parent],
-                                           formatter_class=CommonHelpFormatter, add_help=False,
-                                           help='Process igblastp output.',
-                                           description='Process igblastp output.')
-    group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments')
-    group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files',
-                                  required=True,
-                                  help='''IgBLAST output files in format 7 with query sequence
-                                       (igblastp argument \'-outfmt "7 std qseq sseq btop"\').''')
-    group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
-                                  help='''List of folders and/or fasta files containing
-                                       the same germline set used in the IgBLAST alignment. These
-                                       reference sequences must contain IMGT-numbering spacers (gaps)
-                                       in the V segment.''')
-    group_igblast_aa.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True,
-                                  help='''List of input FASTA files (with .fasta, .fna or .fa
-                                       extension), containing sequences.''')
-    group_igblast_aa.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
-                                  help='''Table file containing 10X annotations (with .csv or .tsv extension).''')
-    group_igblast_aa.add_argument('--asis-id', action='store_true', dest='asis_id',
-                                  help='''Specify to prevent input sequence headers from being parsed
-                                       to add new columns to database. Parsing of sequence headers requires
-                                       headers to be in the pRESTO annotation format, so this should be specified
-                                       when sequence headers are incompatible with the pRESTO annotation scheme.
-                                       Note, unrecognized header formats will default to this behavior.''')
-    group_igblast_aa.add_argument('--asis-calls', action='store_true', dest='asis_calls',
-                                  help='''Specify to prevent gene calls from being parsed into standard allele names
-                                       in both the IgBLAST output and reference database. Note, this requires
-                                       the sequence identifiers in the reference sequence set and the IgBLAST
-                                       database to be exact string matches.''')
-    group_igblast_aa.add_argument('--extended', action='store_true', dest='extended',
-                                  help='''Specify to include additional aligner specific fields in the output.
-                                       Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''')
-    group_igblast_aa.add_argument('--regions', action='store', dest='regions',
-                                  choices=('default', 'rhesus-igl'), default='default',
-                                  help='''IMGT CDR and FWR boundary definition to use.''')
-    parser_igblast_aa.set_defaults(func=parseIgBLAST, partial=True, amino_acid=True)
-
-
-    # IMGT aligner
-    parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter, add_help=False,
-                                        help='''Process IMGT/HighV-Quest output
-                                             (does not work with V-QUEST).''',
-                                        description='''Process IMGT/HighV-Quest output
-                                             (does not work with V-QUEST).''')
-    group_imgt = parser_imgt.add_argument_group('aligner parsing arguments')
-    group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files',
-                            help='''Either zipped IMGT output files (.zip or .txz) or a
-                                 folder containing unzipped IMGT output files (which must
-                                 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
-                                 and 6_Junction).''')
-    group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False,
-                            help='''List of FASTA files (with .fasta, .fna or .fa
-                                  extension) that were submitted to IMGT/HighV-QUEST.
-                                  If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST
-                                  will not be corrected.''')
-    group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False,
-                            help='''List of folders and/or fasta files containing
-                                 the germline sequence set used by IMGT/HighV-QUEST.
-                                 These reference sequences must contain IMGT-numbering spacers (gaps)
-                                 in the V segment. If unspecified, the germline sequence reconstruction
-                                 will not be included in the output.''')
-    group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
-                            help='''Table file containing 10X annotations (with .csv or .tsv
-                                 extension).''')
-    group_imgt.add_argument('--asis-id', action='store_true', dest='asis_id',
-                            help='''Specify to prevent input sequence headers from being parsed
-                                 to add new columns to database. Parsing of sequence headers requires
-                                 headers to be in the pRESTO annotation format, so this should be specified
-                                 when sequence headers are incompatible with the pRESTO annotation scheme.
-                                 Note, unrecognized header formats will default to this behavior.''')
-    group_imgt.add_argument('--partial', action='store_true', dest='partial',
-                            help='''If specified, include incomplete V(D)J alignments in
-                                 the pass file instead of the fail file. An incomplete alignment
-                                 is defined as a record that is missing a V gene assignment,
-                                 J gene assignment, junction region, or productivity call.''')
-    group_imgt.add_argument('--extended', action='store_true', dest='extended',
-                            help='''Specify to include additional aligner specific fields in the output.
-                                 Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4,
-                                 cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
-                                 p3d_length, p5j_length and d_frame.''')
-    parser_imgt.set_defaults(func=parseIMGT)
-
-    # iHMMuneAlign Aligner
-    parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter, add_help=False,
-                                        help='Process iHMMune-Align output.',
-                                        description='Process iHMMune-Align output.')
-    group_ihmm = parser_ihmm.add_argument_group('aligner parsing arguments')
-    group_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_files',
-                             required=True,
-                             help='''iHMMune-Align output file.''')
-    group_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
-                             help='''List of folders and/or FASTA files containing
-                                   the set of germline sequences used by iHMMune-Align. These
-                                   reference sequences must contain IMGT-numbering spacers (gaps)
-                                   in the V segment.''')
-    group_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
-                             required=True,
-                             help='''List of input FASTA files (with .fasta, .fna or .fa
-                                  extension) containing sequences.''')
-    group_ihmm.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
-                                help='''Table file containing 10X annotations (with .csv or .tsv
-                                     extension).''')
-    group_ihmm.add_argument('--asis-id', action='store_true', dest='asis_id',
-                             help='''Specify to prevent input sequence headers from being parsed
-                                  to add new columns to database. Parsing of sequence headers requires
-                                  headers to be in the pRESTO annotation format, so this should be specified
-                                  when sequence headers are incompatible with the pRESTO annotation scheme.
-                                  Note, unrecognized header formats will default to this behavior.''')
-    group_ihmm.add_argument('--partial', action='store_true', dest='partial',
-                             help='''If specified, include incomplete V(D)J alignments in
-                                  the pass file instead of the fail file. An incomplete alignment
-                                     is defined as a record for which a valid IMGT-gapped sequence
-                                     cannot be built or that is missing a V gene assignment,
-                                     J gene assignment, junction region, or productivity call.''')
-    group_ihmm.add_argument('--extended', action='store_true', dest='extended',
-                             help='''Specify to include additional aligner specific fields in the output.
-                                  Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;
-                                  adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
-    parser_ihmm.set_defaults(func=parseIHMM)
-
-    return parser
-
-
-if __name__ == "__main__":
-    """
-    Parses command line arguments and calls main
-    """
-    parser = getArgParser()
-    checkArgs(parser)
-    args = parser.parse_args()
-    args_dict = parseCommonArgs(args, in_arg='aligner_files')
-
-    # Set no ID parsing if sequence files are not provided
-    if 'seq_files' in args_dict and not args_dict['seq_files']:
-        args_dict['asis_id'] = True
-
-    # Delete
-    if 'aligner_files' in args_dict: del args_dict['aligner_files']
-    if 'seq_files' in args_dict: del args_dict['seq_files']
-    if 'out_files' in args_dict: del args_dict['out_files']
-    if 'command' in args_dict: del args_dict['command']
-    if 'func' in args_dict: del args_dict['func']
-
-    # Call main
-    for i, f in enumerate(args.__dict__['aligner_files']):
-        args_dict['aligner_file'] = f
-        args_dict['seq_file'] = args.__dict__['seq_files'][i] \
-                                if args.__dict__['seq_files'] else None
-        args_dict['out_file'] = args.__dict__['out_files'][i] \
-                                if args.__dict__['out_files'] else None
-        args_dict['cellranger_file'] = args.__dict__['cellranger_file'][i] \
-                                if args.__dict__['cellranger_file'] else None
-        args.func(**args_dict)
--- a/change_o/define_clones.sh	Mon Aug 31 11:20:08 2020 -0400
+++ b/change_o/define_clones.sh	Tue Sep 01 16:03:44 2020 -0400
@@ -21,7 +21,7 @@
 	output=${10}
 	output2=${11}

-	python3 $dir/DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+	DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link

 	Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
 else
@@ -29,7 +29,7 @@
 	output=$4
 	output2=$5

-	python3 $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+	DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method

 	Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
 fi
--- a/change_o/makedb.sh	Mon Aug 31 11:20:08 2020 -0400
+++ b/change_o/makedb.sh	Tue Sep 01 16:03:44 2020 -0400
@@ -29,7 +29,7 @@

 echo "makedb: $PWD/outdir"

-python3 $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions

 mv $PWD/outdir/output_db-pass.tab $output