changeset 77:58d2377b507d draft

Uploaded
author davidvanzessen
date Wed, 19 Jun 2019 04:31:44 -0400 (2019-06-19)
parents a93136637bea
children aff3ba86ef7a
files README.md change_o/DefineClones.py change_o/MakeDb.py merge_and_filter.r shm_csr.xml
diffstat 5 files changed, 23 insertions(+), 1688 deletions(-)
--- a/README.md	Tue Jun 18 04:47:44 2019 -0400
+++ b/README.md	Wed Jun 19 04:31:44 2019 -0400
@@ -1,6 +1,7 @@
 # SHM CSR
 
-Somatic hypermutation and class switch recombination pipeline 
+Somatic hypermutation and class switch recombination pipeline.  
+The docker version can be found [here](https://github.com/ErasmusMC-Bioinformatics/ARGalaxy-docker).
 
 # Dependencies
 --------------------
@@ -9,4 +10,4 @@
 [Baseline](http://selection.med.yale.edu/baseline/)  
 [R data.table](https://cran.r-project.org/web/packages/data.table/data.table.pdf)
 [R ggplot2](https://cran.r-project.org/web/packages/ggplot2/ggplot2.pdf)
-[R reshape2](https://cran.r-project.org/web/packages/reshape/reshape.pdf)
\ No newline at end of file
+[R reshape2](https://cran.r-project.org/web/packages/reshape/reshape.pdf)
--- a/change_o/DefineClones.py	Tue Jun 18 04:47:44 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1123 +0,0 @@
-#!/usr/bin/env python3
-"""
-Assign Ig sequences into clones
-"""
-# Info
-__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
-from changeo import __version__, __date__
-
-# Imports
-import os
-import re
-import sys
-import csv
-import numpy as np
-from argparse import ArgumentParser
-from collections import OrderedDict
-from itertools import chain
-from textwrap import dedent
-from time import time
-from Bio import pairwise2
-from Bio.Seq import translate
-
-# Presto and changeo imports
-from presto.Defaults import default_out_args
-from presto.IO import getFileType, getOutputHandle, printLog, printProgress
-from presto.Multiprocessing import manageProcesses
-from presto.Sequence import getDNAScoreDict
-from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
-from changeo.Distance import distance_models, calcDistances, formClusters
-from changeo.IO import getDbWriter, readDbFile, countDbFile
-from changeo.Multiprocessing import DbData, DbResult
-
-## Set maximum field size for csv.reader
-csv.field_size_limit(sys.maxsize)
-
-# Defaults
-default_translate = False
-default_distance = 0.0
-default_index_mode = 'gene'
-default_index_action = 'set'
-default_bygroup_model = 'ham'
-default_hclust_model = 'chen2010'
-default_seq_field = 'JUNCTION'
-default_norm = 'len'
-default_sym = 'avg'
-default_linkage = 'single'
-choices_bygroup_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f', 'mk_rs1nf', 'mk_rs5nf', 'hs1f_compat', 'm1n_compat')
-
-
-def indexByIdentity(index, key, rec, fields=None):
-    """
-    Updates a preclone index with a simple key
-
-    Arguments:
-    index = preclone index from indexJunctions
-    key = index key
-    rec = IgRecord to add to the index
-    fields = additional annotation fields to use to group preclones;
-             if None use only V, J and junction length
-
-    Returns:
-    None. Updates index with new key and records.
-    """
-    index.setdefault(tuple(key), []).append(rec)
-
-
-def indexByUnion(index, key, rec, fields=None):
-    """
-    Updates a preclone index with the union of nested keys
-
-    Arguments:
-    index = preclone index from indexJunctions
-    key = index key
-    rec = IgRecord to add to the index
-    fields = additional annotation fields to use to group preclones;
-             if None use only V, J and junction length
-
-    Returns:
-    None. Updates index with new key and records.
-    """
-    # List of values for this/new key
-    val = [rec]
-    f_range = list(range(2, 3 + (len(fields) if fields else 0)))
-
-    # See if field/junction length combination exists in index
-    outer_dict = index
-    for field in f_range:
-        try:
-            outer_dict = outer_dict[key[field]]
-        except (KeyError):
-            outer_dict = None
-            break
-    # If field combination exists, look through Js
-    j_matches = []
-    if outer_dict is not None:
-        for j in outer_dict.keys():
-            if not set(key[1]).isdisjoint(set(j)):
-                key[1] = tuple(set(key[1]).union(set(j)))
-                j_matches += [j]
-    # If J overlap exists, look through Vs for each J
-    for j in j_matches:
-        v_matches = []
-        # Collect V matches for this J
-        for v in outer_dict[j].keys():
-            if not set(key[0]).isdisjoint(set(v)):
-                key[0] = tuple(set(key[0]).union(set(v)))
-                v_matches += [v]
-        # If there are V overlaps for this J, pop them out
-        if v_matches:
-            val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))
-            # If the J dict is now empty, remove it
-            if not outer_dict[j]:
-                outer_dict.pop(j, None)
-
-    # Add value(s) into index nested dictionary
-    # OMG Python pointers are the best!
-    # Add field dictionaries into index
-    outer_dict = index
-    for field in f_range:
-        outer_dict.setdefault(key[field], {})
-        outer_dict = outer_dict[key[field]]
-    # Add J, then V into index
-    if key[1] in outer_dict:
-        outer_dict[key[1]].update({key[0]: val})
-    else:
-        outer_dict[key[1]] = {key[0]: val}
-
-
-def indexJunctions(db_iter, fields=None, mode=default_index_mode,
-                   action=default_index_action):
-    """
-    Identifies preclonal groups by V, J and junction length
-
-    Arguments: 
-    db_iter = an iterator of IgRecords defined by readDbFile
-    fields = additional annotation fields to use to group preclones;
-             if None use only V, J and junction length
-    mode = specificity of alignment call to use for assigning preclones;
-           one of ('allele', 'gene')
-    action = how to handle multiple value fields when assigning preclones;
-             one of ('first', 'set')
-    
-    Returns: 
-    a dictionary of {(V, J, junction length):[IgRecords]}
-    """
-    # print(fields)
-    # Define functions for grouping keys
-    if mode == 'allele' and fields is None:
-        def _get_key(rec, act):
-            return [rec.getVAllele(act), rec.getJAllele(act),
-                    None if rec.junction is None else len(rec.junction)]
-    elif mode == 'gene' and fields is None:
-        def _get_key(rec, act):  
-            return [rec.getVGene(act), rec.getJGene(act),
-                    None if rec.junction is None else len(rec.junction)]
-    elif mode == 'allele' and fields is not None:
-        def _get_key(rec, act):
-            vdj = [rec.getVAllele(act), rec.getJAllele(act),
-                    None if rec.junction is None else len(rec.junction)]
-            ann = [rec.toDict().get(k, None) for k in fields]
-            return list(chain(vdj, ann))
-    elif mode == 'gene' and fields is not None:
-        def _get_key(rec, act):
-            vdj = [rec.getVGene(act), rec.getJGene(act),
-                    None if rec.junction is None else len(rec.junction)]
-            ann = [rec.toDict().get(k, None) for k in fields]
-            return list(chain(vdj, ann))
-
-    # Function to flatten nested dictionary
-    def _flatten_dict(d, parent_key=''):
-        items = []
-        for k, v in d.items():
-            new_key = parent_key + [k] if parent_key else [k]
-            if isinstance(v, dict):
-                items.extend(_flatten_dict(v, new_key).items())
-            else:
-                items.append((new_key, v))
-        flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items}
-        return flat_dict
-
-    if action == 'first':
-        index_func = indexByIdentity
-    elif action == 'set':
-        index_func = indexByUnion
-    else:
-        sys.stderr.write('Unrecognized action: %s.\n' % action)
-
-    start_time = time()
-    clone_index = {}
-    rec_count = 0
-    for rec in db_iter:
-        key = _get_key(rec, action)
-
-        # Print progress
-        if rec_count == 0:
-            print('PROGRESS> Grouping sequences')
-
-        printProgress(rec_count, step=1000, start_time=start_time)
-        rec_count += 1
-
-        # Assign passed preclone records to their key; assign failed records to the None key
-        if all([k is not None and k != '' for k in key]):
-            # Update index dictionary
-            index_func(clone_index, key, rec, fields)
-        else:
-            clone_index.setdefault(None, []).append(rec)
-
-    printProgress(rec_count, step=1000, start_time=start_time, end=True)
-
-    if action == 'set':
-        clone_index = _flatten_dict(clone_index)
-
-    return clone_index
-
-
-def distanceClones(records, model=default_bygroup_model, distance=default_distance,
-                   dist_mat=None, norm=default_norm, sym=default_sym,
-                   linkage=default_linkage, seq_field=default_seq_field):
-    """
-    Separates a set of IgRecords into clones
-
-    Arguments: 
-    records = an iterator of IgRecords
-    model = substitution model used to calculate distance
-    distance = the distance threshold to assign clonal groups
-    dist_mat = pandas DataFrame of pairwise nucleotide or amino acid distances
-    norm = normalization method
-    sym = symmetry method
-    linkage = type of linkage
-    seq_field = sequence field used to calculate distance between records
-
-    Returns: 
-    a dictionary of lists defining {clone number: [IgRecords clonal group]}
-    """
-    # Get distance matrix if not provided
-    if dist_mat is None:
-        try:
-            dist_mat = distance_models[model]
-        except KeyError:
-            sys.exit('Unrecognized distance model: %s' % model)
-
-    # TODO:  can be cleaned up with abstract model class
-    # Determine length of n-mers
-    if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
-        nmer_len = 1
-    elif model in ['hh_s5f', 'mk_rs5nf']:
-        nmer_len = 5
-    else:
-        sys.exit('Unrecognized distance model: %s.\n' % model)
-
-    # Define unique junction mapping
-    seq_map = {}
-    for ig in records:
-        seq = ig.getSeqField(seq_field)
-        # Check if sequence length is 0
-        if len(seq) == 0:
-            return None
-
-        seq = re.sub('[\.-]', 'N', str(seq))
-        if model == 'aa':  seq = translate(seq)
-
-        seq_map.setdefault(seq, []).append(ig)
-
-    # Process records
-    if len(seq_map) == 1:
-        return {1:records}
-
-    # Define sequences
-    seqs = list(seq_map.keys())
-
-    # Calculate pairwise distance matrix
-    dists = calcDistances(seqs, nmer_len, dist_mat, sym=sym, norm=norm)
-
-    # Perform hierarchical clustering
-    clusters = formClusters(dists, linkage, distance)
-
-    # Turn clusters into clone dictionary
-    clone_dict = {}
-    for i, c in enumerate(clusters):
-        clone_dict.setdefault(c, []).extend(seq_map[seqs[i]])
-
-    return clone_dict
-
-
-def distChen2010(records):
-    """
-    Calculate pairwise distances as defined in Chen 2010
-    
-    Arguments:
-    records = list of IgRecords where first is query to be compared to others in list
-    
-    Returns:
-    list of distances
-    """
-    # Pull out query sequence and V/J information
-    query = records.popitem(last=False)
-    query_cdr3 = query.junction[3:-3]
-    query_v_allele = query.getVAllele()
-    query_v_gene = query.getVGene()
-    query_v_family = query.getVFamily()
-    query_j_allele = query.getJAllele()
-    query_j_gene = query.getJGene()
-    # Create alignment scoring dictionary
-    score_dict = getDNAScoreDict()
-    
-    scores = [0]*len(records)    
-    for i in range(len(records)):
-        ld = pairwise2.align.globalds(query_cdr3, records[i].junction[3:-3],
-                                      score_dict, -1, -1, one_alignment_only=True)
-        # Check V similarity
-        if records[i].getVAllele() == query_v_allele: ld += 0
-        elif records[i].getVGene() == query_v_gene: ld += 1
-        elif records[i].getVFamily() == query_v_family: ld += 3
-        else: ld += 5
-        # Check J similarity
-        if records[i].getJAllele() == query_j_allele: ld += 0
-        elif records[i].getJGene() == query_j_gene: ld += 1
-        else: ld += 3
-        # Divide by length
-        scores[i] = ld/max(len(records[i].junction[3:-3]), len(query_cdr3))
-        
-    return scores
-
-
-def distAdemokun2011(records):
-    """
-    Calculate pairwise distances as defined in Ademokun 2011
-    
-    Arguments:
-    records = list of IgRecords where first is query to be compared to others in list
-    
-    Returns:
-    list of distances
-    """
-    # Pull out query sequence and V family information
-    query = records.popitem(last=False)
-    query_cdr3 = query.junction[3:-3]
-    query_v_family = query.getVFamily()
-    # Create alignment scoring dictionary
-    score_dict = getDNAScoreDict()
-    
-    scores = [0]*len(records)    
-    for i in range(len(records)):
-        
-        if abs(len(query_cdr3) - len(records[i].junction[3:-3])) > 10:
-            scores[i] = 1
-        elif query_v_family != records[i].getVFamily(): 
-            scores[i] = 1
-        else: 
-            ld = pairwise2.align.globalds(query_cdr3, records[i].junction[3:-3], 
-                                          score_dict, -1, -1, one_alignment_only=True)
-            scores[i] = ld/min(len(records[i].junction[3:-3]), len(query_cdr3))
-    
-    return scores
-
-
-def hierClust(dist_mat, method='chen2010'):
-    """
-    Calculate hierarchical clustering
-    
-    Arguments:
-    dist_mat = square-formed distance matrix of pairwise CDR3 comparisons
-    
-    Returns:
-    list of cluster ids
-    """
-    if method == 'chen2010':
-        clusters = formClusters(dist_mat, 'average', 0.32)
-    elif method == 'ademokun2011':
-        clusters = formClusters(dist_mat, 'complete', 0.25)
-    else: clusters = np.ones(dist_mat.shape[0])
-        
-    return clusters
-
-# TODO:  Merge duplicate feed, process and collect functions.
-def feedQueue(alive, data_queue, db_file, group_func, group_args={}):
-    """
-    Feeds the data queue with Ig records
-
-    Arguments: 
-    alive = a multiprocessing.Value boolean controlling whether processing continues
-            if False exit process
-    data_queue = a multiprocessing.Queue to hold data for processing
-    db_file = the Ig record database file
-    group_func = the function to use for assigning preclones
-    group_args = a dictionary of arguments to pass to group_func
-    
-    Returns: 
-    None
-    """
-    # Open input file and perform grouping
-    try:
-        # Iterate over Ig records and assign groups
-        db_iter = readDbFile(db_file)
-        clone_dict = group_func(db_iter, **group_args)
-    except:
-        #sys.stderr.write('Exception in feeder grouping step\n')
-        alive.value = False
-        raise
-    
-    # Add groups to data queue
-    try:
-        #print 'START FEED', alive.value
-        # Iterate over groups and feed data queue
-        clone_iter = iter(clone_dict.items())
-        while alive.value:
-            # Get data from queue
-            if data_queue.full():  continue
-            else:  data = next(clone_iter, None)
-            # Exit upon reaching end of iterator
-            if data is None:  break
-            #print "FEED", alive.value, k
-            
-            # Feed queue
-            data_queue.put(DbData(*data))
-        else:
-            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
-                             % os.getpid())
-            return None
-    except:
-        #sys.stderr.write('Exception in feeder queue feeding step\n')
-        alive.value = False
-        raise
-
-    return None
-
-
-def feedQueueClust(alive, data_queue, db_file, group_func=None, group_args={}):
-    """
-    Feeds the data queue with Ig records
-
-    Arguments: 
-    alive = a multiprocessing.Value boolean controlling whether processing continues
-            if False exit process
-    data_queue = a multiprocessing.Queue to hold data for processing
-    db_file = the Ig record database file
-    
-    Returns: 
-    None
-    """
-    # Open input file and perform grouping
-    try:
-        # Iterate over Ig records and order by junction length
-        records = {}
-        db_iter = readDbFile(db_file)
-        for rec in db_iter:
-            records[rec.id] = rec
-        records = OrderedDict(sorted(list(records.items()), key=lambda i: i[1].junction_length))
-        dist_dict = {}
-        for __ in range(len(records)):
-            k,v = records.popitem(last=False)
-            dist_dict[k] = [v] + list(records.values())
-    except:
-        #sys.stderr.write('Exception in feeder grouping step\n')
-        alive.value = False
-        raise
-    
-    # Add groups to data queue
-    try:
-        # print 'START FEED', alive.value
-        # Iterate over groups and feed data queue
-        dist_iter = iter(dist_dict.items())
-        while alive.value:
-            # Get data from queue
-            if data_queue.full():  continue
-            else:  data = next(dist_iter, None)
-            # Exit upon reaching end of iterator
-            if data is None:  break
-            #print "FEED", alive.value, k
-            
-            # Feed queue
-            data_queue.put(DbData(*data))
-        else:
-            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
-                             % os.getpid())
-            return None
-    except:
-        #sys.stderr.write('Exception in feeder queue feeding step\n')
-        alive.value = False
-        raise
-
-    return None
-
-
-def processQueue(alive, data_queue, result_queue, clone_func, clone_args):
-    """
-    Pulls from data queue, performs calculations, and feeds results queue
-
-    Arguments: 
-    alive = a multiprocessing.Value boolean controlling whether processing continues
-            if False exit process
-    data_queue = a multiprocessing.Queue holding data to process
-    result_queue = a multiprocessing.Queue to hold processed results
-    clone_func = the function to call for clonal assignment
-    clone_args = a dictionary of arguments to pass to clone_func
-
-    Returns: 
-    None
-    """
-    try:
-        # Iterate over the data queue until the sentinel object is reached
-        while alive.value:
-            # Get data from queue
-            if data_queue.empty():  continue
-            else:  data = data_queue.get()
-            # Exit upon reaching sentinel
-            if data is None:  break
-
-            # Define result object for iteration and get data records
-            records = data.data
-            # print(data.id)
-            result = DbResult(data.id, records)
-
-            # Check for invalid data (due to failed indexing) and add failed result
-            if not data:
-                result_queue.put(result)
-                continue
-
-            # Add V(D)J to log
-            result.log['ID'] = ','.join([str(x) for x in data.id])
-            result.log['VALLELE'] = ','.join(set([(r.getVAllele() or '') for r in records]))
-            result.log['DALLELE'] = ','.join(set([(r.getDAllele() or '') for r in records]))
-            result.log['JALLELE'] = ','.join(set([(r.getJAllele() or '') for r in records]))
-            result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in records]))
-            result.log['SEQUENCES'] = len(records)
-             
-            # Check for preclone failure and assign clones
-            clones = clone_func(records, **clone_args) if data else None
-
-            # import cProfile
-            # prof = cProfile.Profile()
-            # clones = prof.runcall(clone_func, records, **clone_args)
-            # prof.dump_stats('worker-%d.prof' % os.getpid())
-
-            if clones is not None:
-                result.results = clones
-                result.valid = True
-                result.log['CLONES'] = len(clones)
-            else:
-                result.log['CLONES'] = 0
-  
-            # Feed results to result queue
-            result_queue.put(result)
-        else:
-            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
-                             % os.getpid())
-            return None
-    except:
-        #sys.stderr.write('Exception in worker\n')
-        alive.value = False
-        raise
-    
-    return None
-
-
-def processQueueClust(alive, data_queue, result_queue, clone_func, clone_args):
-    """
-    Pulls from data queue, performs calculations, and feeds results queue
-
-    Arguments: 
-    alive = a multiprocessing.Value boolean controlling whether processing continues
-            if False exit process
-    data_queue = a multiprocessing.Queue holding data to process
-    result_queue = a multiprocessing.Queue to hold processed results
-    clone_func = the function to call for calculating pairwise distances between sequences
-    clone_args = a dictionary of arguments to pass to clone_func
-
-    Returns: 
-    None
-    """
-    
-    try:
-        # print 'START WORK', alive.value
-        # Iterate over the data queue until the sentinel object is reached
-        while alive.value:
-            # Get data from queue
-            if data_queue.empty():  continue
-            else:  data = data_queue.get()
-            # Exit upon reaching sentinel
-            if data is None:  break
-            # print "WORK", alive.value, data['id']
-
-            # Define result object for iteration and get data records
-            records = data.data
-            result = DbResult(data.id, records)
-             
-            # Create row of distance matrix and check for error
-            dist_row = clone_func(records, **clone_args) if data else None
-            if dist_row is not None:
-                result.results = dist_row
-                result.valid = True
-  
-            # Feed results to result queue
-            result_queue.put(result)
-        else:
-            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
-                             % os.getpid())
-            return None
-    except:
-        #sys.stderr.write('Exception in worker\n')
-        alive.value = False
-        raise
-    
-    return None
-
-
-def collectQueue(alive, result_queue, collect_queue, db_file, out_args, cluster_func=None, cluster_args={}):
-    """
-    Assembles results from a queue of individual sequence results and manages log/file I/O
-
-    Arguments: 
-    alive = a multiprocessing.Value boolean controlling whether processing continues
-            if False exit process
-    result_queue = a multiprocessing.Queue holding processQueue results
-    collect_queue = a multiprocessing.Queue to store collector return values
-    db_file = the input database file name
-    out_args = common output argument dictionary from parseCommonArgs
-    cluster_func = the function to call for carrying out clustering on distance matrix
-    cluster_args = a dictionary of arguments to pass to cluster_func
-    
-    Returns: 
-    None
-    (adds 'log' and 'out_files' to collect_dict)
-    """
-    # Open output files
-    try:
-        # Count records and define output format 
-        out_type = getFileType(db_file) if out_args['out_type'] is None \
-                   else out_args['out_type']
-        result_count = countDbFile(db_file)
-        
-        # Define successful output handle
-        pass_handle = getOutputHandle(db_file, 
-                                      out_label='clone-pass', 
-                                      out_dir=out_args['out_dir'], 
-                                      out_name=out_args['out_name'], 
-                                      out_type=out_type)
-        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')
-        
-        # Define failed alignment output handle
-        if out_args['failed']:
-            fail_handle = getOutputHandle(db_file,
-                                          out_label='clone-fail', 
-                                          out_dir=out_args['out_dir'], 
-                                          out_name=out_args['out_name'], 
-                                          out_type=out_type)
-            fail_writer = getDbWriter(fail_handle, db_file)
-        else:
-            fail_handle = None
-            fail_writer = None
-
-        # Define log handle
-        if out_args['log_file'] is None:  
-            log_handle = None
-        else:  
-            log_handle = open(out_args['log_file'], 'w')
-    except:
-        #sys.stderr.write('Exception in collector file opening step\n')
-        alive.value = False
-        raise
-
-    # Get results from queue and write to files
-    try:
-        #print 'START COLLECT', alive.value
-        # Iterate over the results queue until the sentinel object is reached
-        start_time = time()
-        rec_count = clone_count = pass_count = fail_count = 0
-        while alive.value:
-            # Get result from queue
-            if result_queue.empty():  continue
-            else:  result = result_queue.get()
-            # Exit upon reaching sentinel
-            if result is None:  break
-            #print "COLLECT", alive.value, result['id']
-            
-            # Print progress for previous iteration and update record count
-            if rec_count == 0:
-                print('PROGRESS> Assigning clones')
-            printProgress(rec_count, result_count, 0.05, start_time) 
-            rec_count += len(result.data)
-            
-            # Write passed and failed records
-            if result:
-                for clone in result.results.values():
-                    clone_count += 1
-                    for i, rec in enumerate(clone):
-                        rec.annotations['CLONE'] = clone_count
-                        pass_writer.writerow(rec.toDict())
-                        pass_count += 1
-                        result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
-    
-            else:
-                for i, rec in enumerate(result.data):
-                    if fail_writer is not None: fail_writer.writerow(rec.toDict())
-                    fail_count += 1
-                    result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)
-                    
-            # Write log
-            printLog(result.log, handle=log_handle)
-        else:
-            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
-                             % os.getpid())
-            return None
-        
-        # Print total counts
-        printProgress(rec_count, result_count, 0.05, start_time)
-
-        # Close file handles
-        pass_handle.close()
-        if fail_handle is not None:  fail_handle.close()
-        if log_handle is not None:  log_handle.close()
-                
-        # Update return list
-        log = OrderedDict()
-        log['OUTPUT'] = os.path.basename(pass_handle.name)
-        log['CLONES'] = clone_count
-        log['RECORDS'] = rec_count
-        log['PASS'] = pass_count
-        log['FAIL'] = fail_count
-        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
-        collect_queue.put(collect_dict)
-    except:
-        #sys.stderr.write('Exception in collector result processing step\n')
-        alive.value = False
-        raise
-
-    return None
-
-
-def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args, cluster_func, cluster_args):
-    """
-    Assembles results from a queue of individual sequence results and manages log/file I/O
-
-    Arguments: 
-    alive = a multiprocessing.Value boolean controlling whether processing continues
-            if False exit process
-    result_queue = a multiprocessing.Queue holding processQueue results
-    collect_queue = a multiprocessing.Queue to store collector return values
-    db_file = the input database file name
-    out_args = common output argument dictionary from parseCommonArgs
-    cluster_func = the function to call for carrying out clustering on distance matrix
-    cluster_args = a dictionary of arguments to pass to cluster_func
-    
-    Returns: 
-    None
-    (adds 'log' and 'out_files' to collect_dict)
-    """
-    # Open output files
-    try:
-               
-        # Iterate over Ig records to count and order by junction length
-        result_count = 0
-        records = {}
-        # print 'Reading file...'
-        db_iter = readDbFile(db_file)
-        for rec in db_iter:
-            records[rec.id] = rec
-            result_count += 1
-        records = OrderedDict(sorted(list(records.items()), key=lambda i: i[1].junction_length))
-                
-        # Define empty matrix to store assembled results
-        dist_mat = np.zeros((result_count,result_count))
-        
-        # Count records and define output format 
-        out_type = getFileType(db_file) if out_args['out_type'] is None \
-                   else out_args['out_type']
-                   
-        # Define successful output handle
-        pass_handle = getOutputHandle(db_file, 
-                                      out_label='clone-pass', 
-                                      out_dir=out_args['out_dir'], 
-                                      out_name=out_args['out_name'], 
-                                      out_type=out_type)
-        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')
-        
-        # Define failed cloning output handle
-        if out_args['failed']:
-            fail_handle = getOutputHandle(db_file,
-                                          out_label='clone-fail', 
-                                          out_dir=out_args['out_dir'], 
-                                          out_name=out_args['out_name'], 
-                                          out_type=out_type)
-            fail_writer = getDbWriter(fail_handle, db_file)
-        else:
-            fail_handle = None
-            fail_writer = None
-
-        # Open log file
-        if out_args['log_file'] is None:
-            log_handle = None
-        else:
-            log_handle = open(out_args['log_file'], 'w')
-    except:
-        alive.value = False
-        raise
-    
-    try:
-        # Iterate over the results queue until the sentinel object is reached
-        start_time = time()
-        row_count = rec_count = 0
-        while alive.value:
-            # Get result from queue
-            if result_queue.empty():  continue
-            else:  result = result_queue.get()
-            # Exit upon reaching sentinel
-            if result is None:  break
-
-            # Print progress for previous iteration
-            if row_count == 0:
-                print('PROGRESS> Assigning clones')
-            printProgress(row_count, result_count, 0.05, start_time)
-            
-            # Update counts for iteration
-            row_count += 1
-            rec_count += len(result)
-            
-            # Add result row to distance matrix
-            if result:
-                dist_mat[list(range(result_count-len(result),result_count)),result_count-len(result)] = result.results
-                
-        else:
-            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
-                             % os.getpid())
-            return None    
-        
-        # Calculate linkage and carry out clustering
-        # print dist_mat
-        clusters = cluster_func(dist_mat, **cluster_args) if dist_mat is not None else None
-        clones = {}
-        # print clusters
-        for i, c in enumerate(clusters):
-            clones.setdefault(c, []).append(records[list(records.keys())[i]])
-        
-        # Write passed and failed records
-        clone_count = pass_count = fail_count = 0
-        if clones:
-            for clone in clones.values():
-                clone_count += 1
-                for i, rec in enumerate(clone):
-                    rec.annotations['CLONE'] = clone_count
-                    pass_writer.writerow(rec.toDict())
-                    pass_count += 1
-                    #result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
-
-        else:
-            for i, rec in enumerate(result.data):
-                fail_writer.writerow(rec.toDict())
-                fail_count += 1
-                #result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)
-        
-        # Print final progress
-        printProgress(row_count, result_count, 0.05, start_time)
-    
-        # Close file handles
-        pass_handle.close()
-        if fail_handle is not None:  fail_handle.close()
-        if log_handle is not None:  log_handle.close()
-                
-        # Update return list
-        log = OrderedDict()
-        log['OUTPUT'] = os.path.basename(pass_handle.name)
-        log['CLONES'] = clone_count
-        log['RECORDS'] = rec_count
-        log['PASS'] = pass_count
-        log['FAIL'] = fail_count
-        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
-        collect_queue.put(collect_dict)
-    except:
-        alive.value = False
-        raise
-    
-    return None
-
-
-def defineClones(db_file, feed_func, work_func, collect_func, clone_func, cluster_func=None,
-                 group_func=None, group_args={}, clone_args={}, cluster_args={}, 
-                 out_args=default_out_args, nproc=None, queue_size=None):
-    """
-    Define clonally related sequences
-    
-    Arguments:
-    db_file = filename of input database
-    feed_func = the function that feeds the queue
-    work_func = the worker function that will run on each CPU
-    collect_func = the function that collects results from the workers
-    group_func = the function to use for assigning preclones
-    clone_func = the function to use for determining clones within preclonal groups
-    group_args = a dictionary of arguments to pass to group_func
-    clone_args = a dictionary of arguments to pass to clone_func
-    out_args = common output argument dictionary from parseCommonArgs
-    nproc = the number of processQueue processes;
-            if None defaults to the number of CPUs
-    queue_size = maximum size of the argument queue;
-                 if None defaults to 2*nproc    
-    
-    Returns:
-    a list of successful output file names
-    """
-    # Print parameter info
-    log = OrderedDict()
-    log['START'] = 'DefineClones'
-    log['DB_FILE'] = os.path.basename(db_file)
-    if group_func is not None:
-        log['GROUP_FUNC'] = group_func.__name__
-        log['GROUP_ARGS'] = group_args
-    log['CLONE_FUNC'] = clone_func.__name__
-
-    # TODO:  this is yucky, but can be fixed by using a model class
-    clone_log = clone_args.copy()
-    if 'dist_mat' in clone_log:  del clone_log['dist_mat']
-    log['CLONE_ARGS'] = clone_log
-
-    if cluster_func is not None:
-        log['CLUSTER_FUNC'] = cluster_func.__name__
-        log['CLUSTER_ARGS'] = cluster_args
-    log['NPROC'] = nproc
-    printLog(log)
-    
-    # Define feeder function and arguments
-    feed_args = {'db_file': db_file,
-                 'group_func': group_func, 
-                 'group_args': group_args}
-    # Define worker function and arguments
-    work_args = {'clone_func': clone_func, 
-                 'clone_args': clone_args}
-    # Define collector function and arguments
-    collect_args = {'db_file': db_file,
-                    'out_args': out_args,
-                    'cluster_func': cluster_func,
-                    'cluster_args': cluster_args}
-    
-    # Call process manager
-    result = manageProcesses(feed_func, work_func, collect_func, 
-                             feed_args, work_args, collect_args, 
-                             nproc, queue_size)
-        
-    # Print log
-    result['log']['END'] = 'DefineClones'
-    printLog(result['log'])
-    
-    return result['out_files']
-
-
-def getArgParser():
-    """
-    Defines the ArgumentParser
-
-    Arguments: 
-    None
-                      
-    Returns: 
-    an ArgumentParser object
-    """
-    # Define input and output fields
-    fields = dedent(
-             '''
-             output files:
-                 clone-pass
-                     database with assigned clonal group numbers.
-                 clone-fail
-                     database with records failing clonal grouping.
-
-             required fields:
-                 SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION
-
-                 <field>
-                     sequence field specified by the --sf parameter
-                
-             output fields:
-                 CLONE
-              ''')
-
-    # Define ArgumentParser
-    parser = ArgumentParser(description=__doc__, epilog=fields,
-                            formatter_class=CommonHelpFormatter)
-    parser.add_argument('--version', action='version',
-                        version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
-    subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='',
-                                       help='Cloning method')
-    # TODO:  This is a temporary fix for Python issue 9253
-    subparsers.required = True
-    
-    # Parent parser    
-    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True, 
-                                       multiproc=True)
-    
-    # Distance cloning method
-    parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent],
-                                           formatter_class=CommonHelpFormatter,
-                                           help='''Defines clones as having same V assignment,
-                                                J assignment, and junction length with
-                                                specified substitution distance model.''',
-                                           description='''Defines clones as having same V assignment,
-                                                       J assignment, and junction length with
-                                                       specified substitution distance model.''')
-    parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
-                             help='Additional fields to use for grouping clones (non VDJ)')
-    parser_bygroup.add_argument('--mode', action='store', dest='mode', 
-                             choices=('allele', 'gene'), default=default_index_mode,
-                             help='''Specifies whether to use the V(D)J allele or gene for
-                                  initial grouping.''')
-    parser_bygroup.add_argument('--act', action='store', dest='action',
-                             choices=('first', 'set'), default=default_index_action,
-                             help='''Specifies how to handle multiple V(D)J assignments
-                                  for initial grouping.''')
-    parser_bygroup.add_argument('--model', action='store', dest='model', 
-                             choices=choices_bygroup_model,
-                             default=default_bygroup_model,
-                             help='''Specifies which substitution model to use for calculating distance
-                                  between sequences. The "ham" model is nucleotide Hamming distance and
-                                  "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
-                                  human specific single nucleotide and 5-mer content models, respectively,
-                                  from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
-                                  mouse specific single nucleotide and 5-mer content models, respectively,
-                                  from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
-                                  deprecated models provided for backwards compatibility with the "m1n" and
-                                  "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
-                                  5-mer models should be considered experimental.''')
-    parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float, 
-                             default=default_distance,
-                             help='The distance threshold for clonal grouping')
-    parser_bygroup.add_argument('--norm', action='store', dest='norm',
-                             choices=('len', 'mut', 'none'), default=default_norm,
-                             help='''Specifies how to normalize distances. One of none
-                                  (do not normalize), len (normalize by length),
-                                  or mut (normalize by number of mutations between sequences).''')
-    parser_bygroup.add_argument('--sym', action='store', dest='sym',
-                             choices=('avg', 'min'), default=default_sym,
-                             help='''Specifies how to combine asymmetric distances. One of avg
-                                  (average of A->B and B->A) or min (minimum of A->B and B->A).''')
-    parser_bygroup.add_argument('--link', action='store', dest='linkage',
-                             choices=('single', 'average', 'complete'), default=default_linkage,
-                             help='''Type of linkage to use for hierarchical clustering.''')
-    parser_bygroup.add_argument('--sf', action='store', dest='seq_field',
-                                default=default_seq_field,
-                                help='''The name of the field to be used to calculate
-                                     distance between records''')
-    parser_bygroup.set_defaults(feed_func=feedQueue)
-    parser_bygroup.set_defaults(work_func=processQueue)
-    parser_bygroup.set_defaults(collect_func=collectQueue)  
-    parser_bygroup.set_defaults(group_func=indexJunctions)  
-    parser_bygroup.set_defaults(clone_func=distanceClones)
-    
-    # Chen2010
-    parser_chen = subparsers.add_parser('chen2010', parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter,
-                                        help='''Defines clones by method specified in Chen, 2010.''',
-                                        description='''Defines clones by method specified in Chen, 2010.''')
-    parser_chen.set_defaults(feed_func=feedQueueClust)
-    parser_chen.set_defaults(work_func=processQueueClust)
-    parser_chen.set_defaults(collect_func=collectQueueClust)
-    parser_chen.set_defaults(cluster_func=hierClust)
-
-    # Ademokun2011
-    parser_ade = subparsers.add_parser('ademokun2011', parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter,
-                                        help='''Defines clones by method specified in Ademokun, 2011.''',
-                                        description='''Defines clones by method specified in Ademokun, 2011.''')
-    parser_ade.set_defaults(feed_func=feedQueueClust)
-    parser_ade.set_defaults(work_func=processQueueClust)
-    parser_ade.set_defaults(collect_func=collectQueueClust)
-    parser_ade.set_defaults(cluster_func=hierClust)
-        
-    return parser
-
-
-if __name__ == '__main__':
-    """
-    Parses command line arguments and calls main function
-    """
-    # Parse arguments
-    parser = getArgParser()
-    args = parser.parse_args()
-    args_dict = parseCommonArgs(args)
-    # Convert case of fields
-    if 'seq_field' in args_dict:
-        args_dict['seq_field'] = args_dict['seq_field'].upper()
-    if 'fields' in args_dict and args_dict['fields'] is not None:  
-        args_dict['fields'] = [f.upper() for f in args_dict['fields']]
-    
-    # Define clone_args
-    if args.command == 'bygroup':
-        args_dict['group_args'] = {'fields': args_dict['fields'],
-                                   'action': args_dict['action'], 
-                                   'mode':args_dict['mode']}
-        args_dict['clone_args'] = {'model':  args_dict['model'],
-                                   'distance':  args_dict['distance'],
-                                   'norm': args_dict['norm'],
-                                   'sym': args_dict['sym'],
-                                   'linkage': args_dict['linkage'],
-                                   'seq_field': args_dict['seq_field']}
-
-        # Get distance matrix
-        try:
-            args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']]
-        except KeyError:
-            sys.exit('Unrecognized distance model: %s' % args_dict['model'])
-
-        del args_dict['fields']
-        del args_dict['action']
-        del args_dict['mode']
-        del args_dict['model']
-        del args_dict['distance']
-        del args_dict['norm']
-        del args_dict['sym']
-        del args_dict['linkage']
-        del args_dict['seq_field']
-
-    # Define clone_args
-    if args.command == 'chen2010':
-        args_dict['clone_func'] = distChen2010
-        args_dict['cluster_args'] = {'method': args.command }
-
-    if args.command == 'ademokun2011':
-        args_dict['clone_func'] = distAdemokun2011
-        args_dict['cluster_args'] = {'method': args.command }
-    
-    # Call defineClones
-    del args_dict['command']
-    del args_dict['db_files']
-    for f in args.__dict__['db_files']:
-        args_dict['db_file'] = f
-        defineClones(**args_dict)
--- a/change_o/MakeDb.py	Tue Jun 18 04:47:44 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,555 +0,0 @@
-#!/usr/bin/env python3
-"""
-Create tab-delimited database file to store sequence alignment information
-"""
-# Info
-__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
-from changeo import __version__, __date__
-
-# Imports
-import os
-import sys
-from argparse import ArgumentParser
-from collections import OrderedDict
-from textwrap import dedent
-from time import time
-from Bio import SeqIO
-
-# Presto and changeo imports
-from presto.Defaults import default_out_args
-from presto.Annotation import parseAnnotation
-from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile
-from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
-from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo
-from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT
-
-
-def getFilePrefix(aligner_output, out_args):
-    """
-    Get file name prefix and create output directory
-
-    Arguments:
-      aligner_output : aligner output file or directory.
-      out_args : dictionary of output arguments.
-
-    Returns:
-        str : file name prefix.
-    """
-    # Determine output directory
-    if not out_args['out_dir']:
-        out_dir = os.path.dirname(os.path.abspath(aligner_output))
-    else:
-        out_dir = os.path.abspath(out_args['out_dir'])
-        if not os.path.exists(out_dir):
-            os.mkdir(out_dir)
-
-    # Determine file prefix
-    if out_args['out_name']:
-        file_prefix = out_args['out_name']
-    else:
-        file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0]
-
-    return os.path.join(out_dir, file_prefix)
-
-
-def getSeqDict(seq_file):
-    """
-    Create a dictionary from a sequence file.
-
-    Arguments:
-      seq_file : sequence file.
-
-    Returns:
-        dict : sequence description as keys with Bio.SeqRecords as values.
-    """
-    seq_dict = SeqIO.to_dict(readSeqFile(seq_file),
-                             key_function=lambda x: x.description)
-
-    return seq_dict
-
-
-def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
-            out_args=default_out_args):
-    """
-    Writes tab-delimited database file in output directory.
-    
-    Arguments:
-      db : an iterator of IgRecord objects containing alignment data.
-      fields : a list of ordered field names to write.
-      file_prefix : directory and prefix for CLIP tab-delim file.
-      total_count : number of records (for progress bar).
-      id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
-      no_parse : if True, do not parse the sequence ID for pRESTO-style annotations (default delimiters).
-      partial : if True, put incomplete alignments in the pass file.
-      out_args : common output argument dictionary from parseCommonArgs.
-
-    Returns:
-      None
-    """
-    # Function to check for valid records strictly
-    def _pass_strict(rec):
-        valid = [rec.v_call and rec.v_call != 'None',
-                 rec.j_call and rec.j_call != 'None',
-                 rec.functional is not None,
-                 rec.seq_vdj,
-                 rec.junction]
-        return all(valid)
-
-    # Function to check for valid records loosely
-    def _pass_gentle(rec):
-        valid = [rec.v_call and rec.v_call != 'None',
-                 rec.d_call and rec.d_call != 'None',
-                 rec.j_call and rec.j_call != 'None']
-        return any(valid)
-
-    # Set pass criteria
-    _pass = _pass_gentle if partial else _pass_strict
-
-    # Define output file names
-    pass_file = '%s_db-pass.tab' % file_prefix
-    fail_file = '%s_db-fail.tab' % file_prefix
-
-    # Initiate handles, writers and counters
-    pass_handle = None
-    fail_handle = None
-    pass_writer = None
-    fail_writer = None
-    start_time = time()
-    rec_count = pass_count = fail_count = 0
-
-    # Validate and write output
-    printProgress(0, total_count, 0.05, start_time)
-    for i, record in enumerate(db, start=1):
-
-        # Replace sequence description with full string, if required
-        if id_dict is not None and record.id in id_dict:
-            record.id = id_dict[record.id]
-
-        # Parse sequence description into new columns
-        if not no_parse:
-            try:
-                record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
-                record.id = record.annotations['ID']
-                del record.annotations['ID']
-
-                # TODO:  This is not the best approach. should pass in output fields.
-                # If first record, use parsed description to define extra columns
-                if i == 1:  fields.extend(list(record.annotations.keys()))
-            except IndexError:
-                # Could not parse pRESTO-style annotations so fall back to no parse
-                no_parse = True
-                sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')
-
-        # Count pass or fail and write to appropriate file
-        if _pass(record):
-            # Open pass file
-            if pass_writer is None:
-                pass_handle = open(pass_file, 'wt')
-                pass_writer = getDbWriter(pass_handle, add_fields=fields)
-
-            # Write row to pass file
-            pass_count += 1
-            pass_writer.writerow(record.toDict())
-        else:
-            # Open failed file
-            if out_args['failed'] and fail_writer is None:
-                fail_handle = open(fail_file, 'wt')
-                fail_writer = getDbWriter(fail_handle, add_fields=fields)
-
-            # Write row to fail file if specified
-            fail_count += 1
-            if fail_writer is not None:
-                fail_writer.writerow(record.toDict())
-
-        # Print progress
-        printProgress(i, total_count, 0.05, start_time)
-
-    # Print console log
-    log = OrderedDict()
-    log['OUTPUT'] = pass_file
-    log['PASS'] = pass_count
-    log['FAIL'] = fail_count
-    log['END'] = 'MakeDb'
-    printLog(log)
-    
-    if pass_handle is not None: pass_handle.close()
-    if fail_handle is not None: fail_handle.close()
-
-
-# TODO:  may be able to merge with other mains
-def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False,
-              parse_scores=False, parse_regions=False, parse_junction=False,
-              out_args=default_out_args):
-    """
-    Main for IMGT aligned sample sequences.
-
-    Arguments:
-      aligner_output : zipped file or unzipped folder output by IMGT.
-      seq_file : FASTA file input to IMGT (from which to get seqID).
-      no_parse : if True, do not parse the sequence ID for pRESTO-style annotations (default delimiters).
-      partial : if True, put incomplete alignments in the pass file.
-      parse_scores : if True add alignment score fields to output file.
-      parse_regions : if True add FWR and CDR region fields to output file.
-      out_args : common output argument dictionary from parseCommonArgs.
-
-    Returns:
-      None
-    """
-    # Print parameter info
-    log = OrderedDict()
-    log['START'] = 'MakeDb'
-    log['ALIGNER'] = 'IMGT'
-    log['ALIGNER_OUTPUT'] = aligner_output
-    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
-    log['NO_PARSE'] = no_parse
-    log['PARTIAL'] = partial
-    log['SCORES'] = parse_scores
-    log['REGIONS'] = parse_regions
-    log['JUNCTION'] = parse_junction
-    printLog(log)
-
-    start_time = time()
-    printMessage('Loading sequence files', start_time=start_time, width=25)
-    # Extract IMGT files
-    temp_dir, imgt_files = extractIMGT(aligner_output)
-    # Count records in IMGT files
-    total_count = countDbFile(imgt_files['summary'])
-    # Get (parsed) IDs from fasta file submitted to IMGT
-    id_dict = getIDforIMGT(seq_file) if seq_file else {}
-    printMessage('Done', start_time=start_time, end=True, width=25)
-
-    # Parse IMGT output and write db
-    with open(imgt_files['summary'], 'r') as summary_handle, \
-            open(imgt_files['gapped'], 'r') as gapped_handle, \
-            open(imgt_files['ntseq'], 'r') as ntseq_handle, \
-            open(imgt_files['junction'], 'r') as junction_handle:
-        parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle,
-                                parse_scores=parse_scores, parse_regions=parse_regions,
-                                parse_junction=parse_junction)
-        file_prefix = getFilePrefix(aligner_output, out_args)
-        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict,
-                no_parse=no_parse, partial=partial, out_args=out_args)
-
-    # Cleanup temp directory
-    temp_dir.cleanup()
-
-    return None
-
-
-# TODO:  may be able to merge with other mains
-def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False,
-                 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False,
-                 out_args=default_out_args):
-    """
-    Main for IgBLAST aligned sample sequences.
-
-    Arguments:
-      aligner_output : IgBLAST output file to process.
-      seq_file : fasta file input to IgBlast (from which to get sequence).
-      repo : folder with germline repertoire files.
-      no_parse : if True, do not parse the sequence ID for pRESTO-style annotations (default delimiters).
-      partial : if True, put incomplete alignments in the pass file.
-      parse_regions : if True add FWR and CDR fields to output file.
-      parse_scores : if True add alignment score fields to output file.
-      parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST
-      out_args : common output argument dictionary from parseCommonArgs.
-
-    Returns:
-      None
-    """
-    # Print parameter info
-    log = OrderedDict()
-    log['START'] = 'MakeDB'
-    log['ALIGNER'] = 'IgBlast'
-    log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
-    log['SEQ_FILE'] = os.path.basename(seq_file)
-    log['NO_PARSE'] = no_parse
-    log['PARTIAL'] = partial
-    log['SCORES'] = parse_scores
-    log['REGIONS'] = parse_regions
-    printLog(log)
-
-    start_time = time()
-    printMessage('Loading sequence files', start_time=start_time, width=25)
-    # Count records in sequence file
-    total_count = countSeqFile(seq_file)
-    # Get input sequence dictionary
-    seq_dict = getSeqDict(seq_file)
-    # Create germline repo dictionary
-    repo_dict = readRepo(repo)
-    printMessage('Done', start_time=start_time, end=True, width=25)
-
-    # Parse and write output
-    with open(aligner_output, 'r') as f:
-        parse_iter = IgBLASTReader(f, seq_dict, repo_dict,
-                                   parse_scores=parse_scores, parse_regions=parse_regions,
-                                   parse_igblast_cdr3=parse_igblast_cdr3)
-        file_prefix = getFilePrefix(aligner_output, out_args)
-        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
-                no_parse=no_parse, partial=partial, out_args=out_args)
-
-    return None
-
-
-# TODO:  may be able to merge with other mains
-def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False,
-              parse_scores=False, parse_regions=False, out_args=default_out_args):
-    """
-    Main for iHMMuneAlign aligned sample sequences.
-
-    Arguments:
-      aligner_output : iHMMune-Align output file to process.
-      seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
-      repo : folder with germline repertoire files.
-      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
-      partial : If True put incomplete alignments in the pass file.
-      parse_scores : if True parse alignment scores.
-      parse_regions : if True add FWR and CDR region fields.
-      out_args : common output argument dictionary from parseCommonArgs.
-
-    Returns:
-      None
-    """
-    # Print parameter info
-    log = OrderedDict()
-    log['START'] = 'MakeDB'
-    log['ALIGNER'] = 'iHMMune-Align'
-    log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
-    log['SEQ_FILE'] = os.path.basename(seq_file)
-    log['NO_PARSE'] = no_parse
-    log['PARTIAL'] = partial
-    log['SCORES'] = parse_scores
-    log['REGIONS'] = parse_regions
-    printLog(log)
-
-    start_time = time()
-    printMessage('Loading sequence files', start_time=start_time, width=25)
-    # Count records in sequence file
-    total_count = countSeqFile(seq_file)
-    # Get input sequence dictionary
-    seq_dict = getSeqDict(seq_file)
-    # Create germline repo dictionary
-    repo_dict = readRepo(repo)
-    printMessage('Done', start_time=start_time, end=True, width=25)
-
-    # Parse and write output
-    with open(aligner_output, 'r') as f:
-        parse_iter = IHMMuneReader(f, seq_dict, repo_dict,
-                                   parse_scores=parse_scores, parse_regions=parse_regions)
-        file_prefix = getFilePrefix(aligner_output, out_args)
-        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
-                no_parse=no_parse, partial=partial, out_args=out_args)
-
-    return None
-
-
-def getArgParser():
-    """
-    Defines the ArgumentParser.
-
-    Returns: 
-      argparse.ArgumentParser
-    """
-    fields = dedent(
-             '''
-              output files:
-                  db-pass
-                      database of alignment records with functionality information,
-                      V and J calls, and a junction region.
-                  db-fail
-                      database with records that fail due to no functionality information
-                      (did not pass IMGT), no V call, no J call, or no junction region.
-
-              universal output fields:
-                  SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT,
-                  FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS,
-                  V_CALL, D_CALL, J_CALL,
-                  V_SEQ_START, V_SEQ_LENGTH,
-                  D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
-                  J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
-                  JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH,
-                  FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
-                  CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
-
-              imgt specific output fields:
-                  V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
-                  N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
-                  D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY
-
-              igblast specific output fields:
-                  V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
-                  V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP,
-                  J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP,
-                  CDR3_IGBLAST_NT, CDR3_IGBLAST_AA
-
-              ihmm specific output fields:
-                  V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
-                  HMM_SCORE
-              ''')
-                
-    # Define ArgumentParser
-    parser = ArgumentParser(description=__doc__, epilog=fields,
-                            formatter_class=CommonHelpFormatter)
-    parser.add_argument('--version', action='version',
-                        version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
-    subparsers = parser.add_subparsers(title='subcommands', dest='command',
-                                       help='Aligner used', metavar='')
-    # TODO:  This is a temporary fix for Python issue 9253
-    subparsers.required = True
-
-    # Parent parser    
-    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
-
-    # IgBlast Aligner
-    parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
-                                           formatter_class=CommonHelpFormatter,
-                                           help='Process IgBLAST output.',
-                                           description='Process IgBLAST output.')
-    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
-                                required=True,
-                                help='''IgBLAST output files in format 7 with query sequence
-                                     (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
-    parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
-                                help='''List of folders and/or fasta files containing
-                                     IMGT-gapped germline sequences corresponding to the
-                                     set of germlines used in the IgBLAST alignment.''')
-    parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
-                                required=True,
-                                help='''List of input FASTA files (with .fasta, .fna or .fa
-                                     extension), containing sequences.''')
-    parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
-                                help='''Specify to prevent input sequence headers from being parsed
-                                    to add new columns to database. Parsing of sequence headers requires
-                                    headers to be in the pRESTO annotation format, so this should be specified
-                                    when sequence headers are incompatible with the pRESTO annotation scheme.
-                                    Note, unrecognized header formats will default to this behavior.''')
-    parser_igblast.add_argument('--partial', action='store_true', dest='partial',
-                                help='''If specified, include incomplete V(D)J alignments in
-                                     the pass file instead of the fail file.''')
-    parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores',
-                                help='''Specify if alignment score metrics should be
-                                     included in the output. Adds the V_SCORE, V_IDENTITY,
-                                     V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
-                                     J_BTOP, and J_EVALUE columns.''')
-    parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions',
-                                help='''Specify if IMGT FWR and CDRs should be
-                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
-                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
-                                     CDR3_IMGT columns.''')
-    parser_igblast.add_argument('--cdr3', action='store_true',
-                                dest='parse_igblast_cdr3', 
-                                help='''Specify if the CDR3 sequences generated by IgBLAST 
-                                     should be included in the output. Adds the columns
-                                     CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST
-                                     version 1.5 or greater.''')
-    parser_igblast.set_defaults(func=parseIgBLAST)
-
-    # IMGT aligner
-    parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter,
-                                        help='''Process IMGT/HighV-Quest output
-                                             (does not work with V-QUEST).''',
-                                        description='''Process IMGT/HighV-Quest output
-                                             (does not work with V-QUEST).''')
-    parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
-                             help='''Either zipped IMGT output files (.zip or .txz) or a
-                                  folder containing unzipped IMGT output files (which must
-                                  include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
-                                  and 6_Junction).''')
-    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
-                             required=False,
-                             help='''List of input FASTA files (with .fasta, .fna or .fa
-                                  extension) containing sequences.''')
-    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', 
-                             help='''Specify to prevent input sequence headers from being parsed
-                                  to add new columns to database. Parsing of sequence headers requires
-                                  headers to be in the pRESTO annotation format, so this should be specified
-                                  when sequence headers are incompatible with the pRESTO annotation scheme.
-                                  Note, unrecognized header formats will default to this behavior.''')
-    parser_imgt.add_argument('--partial', action='store_true', dest='partial',
-                             help='''If specified, include incomplete V(D)J alignments in
-                                  the pass file instead of the fail file.''')
-    parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores',
-                             help='''Specify if alignment score metrics should be
-                                  included in the output. Adds the V_SCORE, V_IDENTITY,
-                                  J_SCORE, and J_IDENTITY columns.''')
-    parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions',
-                             help='''Specify if IMGT FWRs and CDRs should be
-                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
-                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
-                                  CDR3_IMGT columns.''')
-    parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction',
-                             help='''Specify if detailed junction fields should be
-                                  included in the output. Adds the columns 
-                                  N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH,
-                                  P5J_LENGTH, D_FRAME.''')
-    parser_imgt.set_defaults(func=parseIMGT)
-
-    # iHMMuneAlign Aligner
-    parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter,
-                                        help='Process iHMMune-Align output.',
-                                        description='Process iHMMune-Align output.')
-    parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
-                             required=True,
-                             help='''iHMMune-Align output file.''')
-    parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
-                             help='''List of folders and/or FASTA files containing
-                                  IMGT-gapped germline sequences corresponding to the
-                                  set of germlines used in the iHMMune-Align alignment.''')
-    parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
-                             required=True,
-                             help='''List of input FASTA files (with .fasta, .fna or .fa
-                                  extension) containing sequences.''')
-    parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse',
-                             help='''Specify to prevent input sequence headers from being parsed
-                                  to add new columns to database. Parsing of sequence headers requires
-                                  headers to be in the pRESTO annotation format, so this should be specified
-                                  when sequence headers are incompatible with the pRESTO annotation scheme.
-                                  Note, unrecognized header formats will default to this behavior.''')
-    parser_ihmm.add_argument('--partial', action='store_true', dest='partial',
-                             help='''If specified, include incomplete V(D)J alignments in
-                                  the pass file instead of the fail file.''')
-    parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores',
-                             help='''Specify if alignment score metrics should be
-                                  included in the output. Adds the path score of the
-                                  iHMMune-Align hidden Markov model to HMM_SCORE.''')
-    parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions',
-                             help='''Specify if IMGT FWRs and CDRs should be
-                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
-                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
-                                  CDR3_IMGT columns.''')
-    parser_ihmm.set_defaults(func=parseIHMM)
-
-    return parser
-    
-    
-if __name__ == "__main__":
-    """
-    Parses command line arguments and calls main
-    """
-    parser = getArgParser()    
-    args = parser.parse_args()
-    args_dict = parseCommonArgs(args, in_arg='aligner_outputs')
-
-    # Set no ID parsing if sequence files are not provided
-    if 'seq_files' in args_dict and not args_dict['seq_files']:
-        args_dict['no_parse'] = True
-
-    # Delete
-    if 'seq_files' in args_dict: del args_dict['seq_files']
-    if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs']
-    if 'command' in args_dict: del args_dict['command']
-    if 'func' in args_dict: del args_dict['func']           
-    
-    if args.command == 'imgt':
-        for i in range(len(args.__dict__['aligner_outputs'])):
-            args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
-            args_dict['seq_file'] = args.__dict__['seq_files'][i] \
-                                    if args.__dict__['seq_files'] else None
-            args.func(**args_dict)
-    elif args.command == 'igblast' or args.command == 'ihmm':
-        for i in range(len(args.__dict__['aligner_outputs'])):
-            args_dict['aligner_output'] =  args.__dict__['aligner_outputs'][i]
-            args_dict['seq_file'] = args.__dict__['seq_files'][i]
-            args.func(**args_dict)
--- a/merge_and_filter.r	Tue Jun 18 04:47:44 2019 -0400
+++ b/merge_and_filter.r	Wed Jun 19 04:31:44 2019 -0400
@@ -53,6 +53,10 @@
 hotspots = fix_column_names(hotspots)
 AAs = fix_column_names(AAs)
 
+if(!("Sequence.number" %in% names(summ))){ 
+	summ["Sequence.number"] = 1:nrow(summ)
+}
+
 if(method == "blastn"){
 	#"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
 	gene_identification = gene_identification[!duplicated(gene_identification$qseqid),]
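The new Sequence.number fallback above simply numbers the rows of the summary table when that column is absent. A minimal standalone sketch (the contents of `summ` are made up for illustration):

    # hypothetical stand-in for the IMGT summary table, lacking Sequence.number
    summ <- data.frame(Sequence.ID = c("seq1", "seq2", "seq3"))

    # same fallback as in the change above: number the rows if the column is missing
    if (!("Sequence.number" %in% names(summ))) {
        summ["Sequence.number"] <- 1:nrow(summ)
    }
    print(summ$Sequence.number)  # 1 2 3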
@@ -140,9 +144,6 @@
 names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq")
 result = merge(result, sequences, by="Sequence.ID", all.x=T)
 
-print("sequences files columns")
-print("CDR3.IMGT")
-
 AAs = AAs[,c("Sequence.ID", "CDR3.IMGT")]
 names(AAs) = c("Sequence.ID", "CDR3.IMGT.AA")
 result = merge(result, AAs, by="Sequence.ID", all.x=T)
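The surrounding lines use the same rename-then-left-join idiom for each per-region table. A small self-contained sketch with made-up sequence IDs, showing that `all.x=T` keeps rows that have no amino-acid CDR3 entry (they receive NA):

    AAs    <- data.frame(Sequence.ID = c("s1", "s2"), CDR3.IMGT = c("ARDY", "ARGG"))
    result <- data.frame(Sequence.ID = c("s1", "s2", "s3"))

    AAs <- AAs[, c("Sequence.ID", "CDR3.IMGT")]
    names(AAs) <- c("Sequence.ID", "CDR3.IMGT.AA")

    # left join: s3 has no AA record and ends up with CDR3.IMGT.AA = NA
    result <- merge(result, AAs, by = "Sequence.ID", all.x = TRUE)
    print(result)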
@@ -172,10 +173,17 @@
 
 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
 
-print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T)))
-print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T)))
-print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T)))
-print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T)))
+missing.FR1 = result$FR1.IMGT.seq == "" | is.na(result$FR1.IMGT.seq)
+missing.CDR1 = result$CDR1.IMGT.seq == "" | is.na(result$CDR1.IMGT.seq)
+missing.FR2 = result$FR2.IMGT.seq == "" | is.na(result$FR2.IMGT.seq)
+missing.CDR2 = result$CDR2.IMGT.seq == "" | is.na(result$CDR2.IMGT.seq)
+missing.FR3 = result$FR3.IMGT.seq == "" | is.na(result$FR3.IMGT.seq)
+
+print(paste("Number of empty CDR1 sequences:", sum(missing.FR1)))
+print(paste("Number of empty FR2 sequences:", sum(missing.CDR1)))
+print(paste("Number of empty CDR2 sequences:", sum(missing.FR2)))
+print(paste("Number of empty FR3 sequences:", sum(missing.CDR2)))
+print(paste("Number of empty FR3 sequences:", sum(missing.FR3)))
 
 if(empty.region.filter == "leader"){
 	result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
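The rewritten counts above treat NA the same as an empty string, which the old `na.rm=T` version silently dropped. A minimal sketch with made-up region values showing the difference:

    # toy FR2 column: one empty string and one NA
    FR2 <- c("GGTCAA", "", NA, "TGCAAA")

    print(sum(FR2 == "", na.rm = TRUE))   # 1: the old count ignores the NA entry
    print(sum(FR2 == "" | is.na(FR2)))    # 2: the new count treats NA as missing too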
@@ -217,6 +225,7 @@
 
 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
 
+
 if(filter.unique != "no"){
 	clmns = names(result)
 	if(filter.unique == "remove_vjaa"){
@@ -291,6 +300,9 @@
 
 matched.sequences.count = nrow(matched.sequences)
 unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))
+if(matched.sequences.count <= unmatched.sequences.count){
+	print("WARNING: no or very few matched (sub)class sequences!")
+}
 
 filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))
 filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))
--- a/shm_csr.xml	Tue Jun 18 04:47:44 2019 -0400
+++ b/shm_csr.xml	Wed Jun 19 04:31:44 2019 -0400
@@ -8,7 +8,7 @@
 		<requirement type="package" version="0.5.0">r-scales</requirement>
 		<requirement type="package" version="3.4_5">r-seqinr</requirement>
 		<requirement type="package" version="1.11.4">r-data.table</requirement>
-		<requirement type="package" version="0.4.5">changeo</requirement>
+		<!--<requirement type="package" version="0.4.5">changeo</requirement>-->
 	</requirements>
 	<command interpreter="bash">
 		#if str ( $filter_unique.filter_unique_select ) == "remove":