Mercurial > repos > davidvanzessen > shm_csr

--- a/change_o/DefineClones.py	Tue May 16 08:52:48 2017 -0400
+++ b/change_o/DefineClones.py	Tue May 23 08:32:58 2017 -0400
@@ -10,6 +10,7 @@
 import os
 import re
 import sys
+import csv
 import numpy as np
 from argparse import ArgumentParser
 from collections import OrderedDict
@@ -25,56 +26,108 @@
 from presto.Multiprocessing import manageProcesses
 from presto.Sequence import getDNAScoreDict
 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
-from changeo.Distance import getDNADistMatrix, getAADistMatrix, \
-                             hs1f_model, m1n_model, hs5f_model, \
-                             calcDistances, formClusters
+from changeo.Distance import distance_models, calcDistances, formClusters
 from changeo.IO import getDbWriter, readDbFile, countDbFile
 from changeo.Multiprocessing import DbData, DbResult

+## Set maximum field size for csv.reader
+csv.field_size_limit(sys.maxsize)
+
 # Defaults
 default_translate = False
 default_distance = 0.0
-default_bygroup_model = 'hs1f'
+default_index_mode = 'gene'
+default_index_action = 'set'
+default_bygroup_model = 'ham'
 default_hclust_model = 'chen2010'
 default_seq_field = 'JUNCTION'
 default_norm = 'len'
 default_sym = 'avg'
 default_linkage = 'single'
-
-# TODO:  should be in Distance, but need to be after function definitions
-# Amino acid Hamming distance
-aa_model = getAADistMatrix(mask_dist=1, gap_dist=0)
-
-# DNA Hamming distance
-ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0)
+choices_bygroup_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f', 'mk_rs1nf', 'mk_rs5nf', 'hs1f_compat', 'm1n_compat')


-# TODO:  this function is an abstraction to facilitate later cleanup
-def getModelMatrix(model):
+def indexByIdentity(index, key, rec, fields=None):
     """
-    Simple wrapper to get distance matrix from model name
+    Updates a preclone index with a simple key
+
+    Arguments:
+    index = preclone index from indexJunctions
+    key = index key
+    rec = IgRecord to add to the index
+    fields = additional annotation fields to use to group preclones;
+             if None use only V, J and junction length
+
+    Returns:
+    None. Updates index with new key and records.
+    """
+    index.setdefault(tuple(key), []).append(rec)
+
+
+def indexByUnion(index, key, rec, fields=None):
+    """
+    Updates a preclone index with the union of nested keys

     Arguments:
-    model = model name
+    index = preclone index from indexJunctions
+    key = index key
+    rec = IgRecord to add to the index
+    fields = additional annotation fields to use to group preclones;
+             if None use only V, J and junction length

-    Return:
-    a pandas.DataFrame containing the character distance matrix
+    Returns:
+    None. Updates index with new key and records.
     """
-    if model == 'aa':
-        return(aa_model)
-    elif model == 'ham':
-        return(ham_model)
-    elif model == 'm1n':
-        return(m1n_model)
-    elif model == 'hs1f':
-        return(hs1f_model)
-    elif model == 'hs5f':
-        return(hs5f_model)
+    # List of values for this/new key
+    val = [rec]
+    f_range = list(range(2, 3 + (len(fields) if fields else 0)))
+
+    # See if field/junction length combination exists in index
+    outer_dict = index
+    for field in f_range:
+        try:
+            outer_dict = outer_dict[key[field]]
+        except (KeyError):
+            outer_dict = None
+            break
+    # If field combination exists, look through Js
+    j_matches = []
+    if outer_dict is not None:
+        for j in outer_dict.keys():
+            if not set(key[1]).isdisjoint(set(j)):
+                key[1] = tuple(set(key[1]).union(set(j)))
+                j_matches += [j]
+    # If J overlap exists, look through Vs for each J
+    for j in j_matches:
+        v_matches = []
+        # Collect V matches for this J
+        for v in outer_dict[j].keys():
+            if not set(key[0]).isdisjoint(set(v)):
+                key[0] = tuple(set(key[0]).union(set(v)))
+                v_matches += [v]
+        # If there are V overlaps for this J, pop them out
+        if v_matches:
+            val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))
+            # If the J dict is now empty, remove it
+            if not outer_dict[j]:
+                outer_dict.pop(j, None)
+
+    # Add value(s) into index nested dictionary
+    # OMG Python pointers are the best!
+    # Add field dictionaries into index
+    outer_dict = index
+    for field in f_range:
+        outer_dict.setdefault(key[field], {})
+        outer_dict = outer_dict[key[field]]
+    # Add J, then V into index
+    if key[1] in outer_dict:
+        outer_dict[key[1]].update({key[0]: val})
     else:
-        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
+        outer_dict[key[1]] = {key[0]: val}


-def indexJunctions(db_iter, fields=None, mode='gene', action='first'):
+def indexJunctions(db_iter, fields=None, mode=default_index_mode,
+                   action=default_index_action):
     """
     Identifies preclonal groups by V, J and junction length

@@ -90,27 +143,47 @@
     Returns:
     a dictionary of {(V, J, junction length):[IgRecords]}
     """
+    # print(fields)
     # Define functions for grouping keys
     if mode == 'allele' and fields is None:
         def _get_key(rec, act):
-            return (rec.getVAllele(act), rec.getJAllele(act),
-                    None if rec.junction is None else len(rec.junction))
+            return [rec.getVAllele(act), rec.getJAllele(act),
+                    None if rec.junction is None else len(rec.junction)]
     elif mode == 'gene' and fields is None:
         def _get_key(rec, act):
-            return (rec.getVGene(act), rec.getJGene(act),
-                    None if rec.junction is None else len(rec.junction))
+            return [rec.getVGene(act), rec.getJGene(act),
+                    None if rec.junction is None else len(rec.junction)]
     elif mode == 'allele' and fields is not None:
         def _get_key(rec, act):
             vdj = [rec.getVAllele(act), rec.getJAllele(act),
                     None if rec.junction is None else len(rec.junction)]
             ann = [rec.toDict().get(k, None) for k in fields]
-            return tuple(chain(vdj, ann))
+            return list(chain(vdj, ann))
     elif mode == 'gene' and fields is not None:
         def _get_key(rec, act):
             vdj = [rec.getVGene(act), rec.getJGene(act),
                     None if rec.junction is None else len(rec.junction)]
             ann = [rec.toDict().get(k, None) for k in fields]
-            return tuple(chain(vdj, ann))
+            return list(chain(vdj, ann))
+
+    # Function to flatten nested dictionary
+    def _flatten_dict(d, parent_key=''):
+        items = []
+        for k, v in d.items():
+            new_key = parent_key + [k] if parent_key else [k]
+            if isinstance(v, dict):
+                items.extend(_flatten_dict(v, new_key).items())
+            else:
+                items.append((new_key, v))
+        flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items}
+        return flat_dict
+
+    if action == 'first':
+        index_func = indexByIdentity
+    elif action == 'set':
+        index_func = indexByUnion
+    else:
+        sys.stderr.write('Unrecognized action: %s.\n' % action)

     start_time = time()
     clone_index = {}
@@ -127,35 +200,16 @@

         # Assigned passed preclone records to key and failed to index None
         if all([k is not None and k != '' for k in key]):
-            #print key
-            # TODO:  Has much slow. Should have less slow.
-            if action == 'set':
-
-                f_range = list(range(2, 3 + (len(fields) if fields else 0)))
-                vdj_range = list(range(2))
-
-                # Check for any keys that have matching columns and junction length and overlapping genes/alleles
-                to_remove = []
-                if len(clone_index) > (1 if None in clone_index else 0) and key not in clone_index:
-                    key = list(key)
-                    for k in clone_index:
-                        if k is not None and all([key[i] == k[i] for i in f_range]):
-                            if all([not set(key[i]).isdisjoint(set(k[i])) for i in vdj_range]):
-                                for i in vdj_range:  key[i] = tuple(set(key[i]).union(set(k[i])))
-                                to_remove.append(k)
-
-                # Remove original keys, replace with union of all genes/alleles and append values to new key
-                val = [rec]
-                val += list(chain(*(clone_index.pop(k) for k in to_remove)))
-                clone_index[tuple(key)] = clone_index.get(tuple(key),[]) + val
-
-            elif action == 'first':
-                clone_index.setdefault(key, []).append(rec)
+            # Update index dictionary
+            index_func(clone_index, key, rec, fields)
         else:
             clone_index.setdefault(None, []).append(rec)

     printProgress(rec_count, step=1000, start_time=start_time, end=True)

+    if action == 'set':
+        clone_index = _flatten_dict(clone_index)
+
     return clone_index


@@ -179,15 +233,20 @@
     a dictionary of lists defining {clone number: [IgRecords clonal group]}
     """
     # Get distance matrix if not provided
-    if dist_mat is None:  dist_mat = getModelMatrix(model)
+    if dist_mat is None:
+        try:
+            dist_mat = distance_models[model]
+        except KeyError:
+            sys.exit('Unrecognized distance model: %s' % args_dict['model'])

+    # TODO:  can be cleaned up with abstract model class
     # Determine length of n-mers
-    if model in ['hs1f', 'm1n', 'aa', 'ham']:
+    if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
         nmer_len = 1
-    elif model in ['hs5f']:
+    elif model in ['hh_s5f', 'mk_rs5nf']:
         nmer_len = 5
     else:
-        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
+        sys.exit('Unrecognized distance model: %s.\n' % model)

     # Define unique junction mapping
     seq_map = {}
@@ -197,7 +256,7 @@
         if len(seq) == 0:
             return None

-        seq = re.sub('[\.-]','N', str(seq))
+        seq = re.sub('[\.-]', 'N', str(seq))
         if model == 'aa':  seq = translate(seq)

         seq_map.setdefault(seq, []).append(ig)
@@ -210,7 +269,7 @@
     seqs = list(seq_map.keys())

     # Calculate pairwise distance matrix
-    dists = calcDistances(seqs, nmer_len, dist_mat, norm, sym)
+    dists = calcDistances(seqs, nmer_len, dist_mat, sym=sym, norm=norm)

     # Perform hierarchical clustering
     clusters = formClusters(dists, linkage, distance)
@@ -449,6 +508,7 @@

             # Define result object for iteration and get data records
             records = data.data
+            # print(data.id)
             result = DbResult(data.id, records)

             # Check for invalid data (due to failed indexing) and add failed result
@@ -901,7 +961,7 @@
                      database with records failing clonal grouping.

              required fields:
-                 SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION_LENGTH
+                 SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION

                  <field>
                      sequence field specified by the --sf parameter
@@ -926,32 +986,36 @@

     # Distance cloning method
     parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter,
-                                        help='''Defines clones as having same V assignment,
-                                              J assignment, and junction length with
-                                              specified substitution distance model.''')
+                                           formatter_class=CommonHelpFormatter,
+                                           help='''Defines clones as having same V assignment,
+                                                J assignment, and junction length with
+                                                specified substitution distance model.''',
+                                           description='''Defines clones as having same V assignment,
+                                                       J assignment, and junction length with
+                                                       specified substitution distance model.''')
     parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
                              help='Additional fields to use for grouping clones (non VDJ)')
     parser_bygroup.add_argument('--mode', action='store', dest='mode',
-                             choices=('allele', 'gene'), default='gene',
+                             choices=('allele', 'gene'), default=default_index_mode,
                              help='''Specifies whether to use the V(D)J allele or gene for
                                   initial grouping.''')
-    parser_bygroup.add_argument('--act', action='store', dest='action', default='set',
-                             choices=('first', 'set'),
+    parser_bygroup.add_argument('--act', action='store', dest='action',
+                             choices=('first', 'set'), default=default_index_action,
                              help='''Specifies how to handle multiple V(D)J assignments
                                   for initial grouping.''')
     parser_bygroup.add_argument('--model', action='store', dest='model',
-                             choices=('aa', 'ham', 'm1n', 'hs1f', 'hs5f'),
+                             choices=choices_bygroup_model,
                              default=default_bygroup_model,
-                             help='''Specifies which substitution model to use for
-                                  calculating distance between sequences. Where m1n is the
-                                  mouse single nucleotide transition/trasversion model
-                                  of Smith et al, 1996; hs1f is the human single
-                                  nucleotide model derived from Yaari et al, 2013; hs5f
-                                  is the human S5F model of Yaari et al, 2013; ham is
-                                  nucleotide Hamming distance; and aa is amino acid
-                                  Hamming distance. The hs5f data should be
-                                  considered experimental.''')
+                             help='''Specifies which substitution model to use for calculating distance
+                                  between sequences. The "ham" model is nucleotide Hamming distance and
+                                  "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
+                                  human specific single nucleotide and 5-mer content models, respectively,
+                                  from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
+                                  mouse specific single nucleotide and 5-mer content models, respectively,
+                                  from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
+                                  deprecated models provided backwards compatibility with the "m1n" and
+                                  "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
+                                  5-mer models should be considered experimental.''')
     parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float,
                              default=default_distance,
                              help='The distance threshold for clonal grouping')
@@ -977,22 +1041,25 @@
     parser_bygroup.set_defaults(group_func=indexJunctions)
     parser_bygroup.set_defaults(clone_func=distanceClones)

-
-    # Hierarchical clustering cloning method
-    parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent],
+    # Chen2010
+    parser_chen = subparsers.add_parser('chen2010', parents=[parser_parent],
                                         formatter_class=CommonHelpFormatter,
-                                        help='Defines clones by specified distance metric on CDR3s and \
-                                              cutting of hierarchical clustering tree')
-#     parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
-#                              help='Fields to use for grouping clones (non VDJ)')
-    parser_hclust.add_argument('--method', action='store', dest='method',
-                             choices=('chen2010', 'ademokun2011'), default=default_hclust_model,
-                             help='Specifies which cloning method to use for calculating distance \
-                                   between CDR3s, computing linkage, and cutting clusters')
-    parser_hclust.set_defaults(feed_func=feedQueueClust)
-    parser_hclust.set_defaults(work_func=processQueueClust)
-    parser_hclust.set_defaults(collect_func=collectQueueClust)
-    parser_hclust.set_defaults(cluster_func=hierClust)
+                                        help='''Defines clones by method specified in Chen, 2010.''',
+                                        description='''Defines clones by method specified in Chen, 2010.''')
+    parser_chen.set_defaults(feed_func=feedQueueClust)
+    parser_chen.set_defaults(work_func=processQueueClust)
+    parser_chen.set_defaults(collect_func=collectQueueClust)
+    parser_chen.set_defaults(cluster_func=hierClust)
+
+    # Ademokun2011
+    parser_ade = subparsers.add_parser('ademokun2011', parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter,
+                                        help='''Defines clones by method specified in Ademokun, 2011.''',
+                                        description='''Defines clones by method specified in Ademokun, 2011.''')
+    parser_ade.set_defaults(feed_func=feedQueueClust)
+    parser_ade.set_defaults(work_func=processQueueClust)
+    parser_ade.set_defaults(collect_func=collectQueueClust)
+    parser_ade.set_defaults(cluster_func=hierClust)

     return parser

@@ -1023,8 +1090,11 @@
                                    'linkage': args_dict['linkage'],
                                    'seq_field': args_dict['seq_field']}

-        # TODO:  can be cleaned up with abstract model class
-        args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model'])
+        # Get distance matrix
+        try:
+            args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']]
+        except KeyError:
+            sys.exit('Unrecognized distance model: %s' % args_dict['model'])

         del args_dict['fields']
         del args_dict['action']
@@ -1037,16 +1107,17 @@
         del args_dict['seq_field']

     # Define clone_args
-    if args.command == 'hclust':
-        dist_funcs = {'chen2010':distChen2010, 'ademokun2011':distAdemokun2011}
-        args_dict['clone_func'] = dist_funcs[args_dict['method']]
-        args_dict['cluster_args'] = {'method':  args_dict['method']}
-        #del args_dict['fields']
-        del args_dict['method']
+    if args.command == 'chen2010':
+        args_dict['clone_func'] = distChen2010
+        args_dict['cluster_args'] = {'method': args.command }
+
+    if args.command == 'ademokun2011':
+        args_dict['clone_func'] = distAdemokun2011
+        args_dict['cluster_args'] = {'method': args.command }

     # Call defineClones
     del args_dict['command']
     del args_dict['db_files']
     for f in args.__dict__['db_files']:
         args_dict['db_file'] = f
-        defineClones(**args_dict)
\ No newline at end of file
+        defineClones(**args_dict)
--- a/change_o/MakeDb.py	Tue May 16 08:52:48 2017 -0400
+++ b/change_o/MakeDb.py	Tue May 23 08:32:58 2017 -0400
@@ -7,765 +7,164 @@
 from changeo import __version__, __date__

 # Imports
-import csv
 import os
-import re
 import sys
-import pandas as pd
-import tarfile
-import zipfile
 from argparse import ArgumentParser
 from collections import OrderedDict
-from itertools import groupby
-from shutil import rmtree
-from tempfile import mkdtemp
 from textwrap import dedent
 from time import time
 from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Alphabet import IUPAC

 # Presto and changeo imports
 from presto.Defaults import default_out_args
 from presto.Annotation import parseAnnotation
-from presto.IO import countSeqFile, printLog, printProgress
+from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile
 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
-from changeo.IO import getDbWriter, countDbFile, getRepo
-from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
-                             j_allele_regex
-
-# Default parameters
-default_delimiter = ('\t', ',', '-')
-
-
-def gapV(ig_dict, repo_dict):
-    """
-    Insert gaps into V region and update alignment information
-
-    Arguments:
-      ig_dict : Dictionary of parsed IgBlast output
-      repo_dict : Dictionary of IMGT gapped germline sequences
-
-    Returns:
-      dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
-    """
-
-    seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ']
-
-    # Find gapped germline V segment
-    vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
-    vkey = (vgene, )
-    #TODO: Figure out else case
-    if vkey in repo_dict:
-        vgap = repo_dict[vkey]
-        # Iterate over gaps in the germline segment
-        gaps = re.finditer(r'\.', vgap)
-        gapcount = int(ig_dict['V_GERM_START_VDJ'])-1
-        for gap in gaps:
-            i = gap.start()
-            # Break if gap begins after V region
-            if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
-                break
-            # Insert gap into IMGT sequence
-            seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
-            # Update gap counter
-            gapcount += 1
-        ig_dict['SEQUENCE_IMGT'] = seq_imgt
-        # Update IMGT positioning information for V
-        ig_dict['V_GERM_START_IMGT'] = 1
-        ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount
-
-    return ig_dict
+from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo
+from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT


-def getIMGTJunc(ig_dict, repo_dict):
-    """
-    Identify junction region by IMGT definition
-
-    Arguments:
-      ig_dict : Dictionary of parsed IgBlast output
-      repo_dict : Dictionary of IMGT gapped germline sequences
-
-    Returns:
-      dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields
-    """
-    # Find germline J segment
-    jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
-    jkey = (jgene, )
-    #TODO: Figure out else case
-    if jkey in repo_dict:
-        # Get germline J sequence
-        jgerm = repo_dict[jkey]
-        jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1]
-        # Look for (F|W)GXG aa motif in nt sequence
-        motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
-        aa_end = len(ig_dict['SEQUENCE_IMGT'])
-        #TODO: Figure out else case
-        if motif:
-            # print('\n', motif.group())
-            aa_end = motif.start() - len(jgerm) + 3
-        # Add fields to dict
-        ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
-        ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])
-
-    return ig_dict
-
-
-def getRegions(ig_dict):
+def getFilePrefix(aligner_output, out_args):
     """
-    Identify FWR and CDR regions by IMGT definition
-
-    Arguments:
-      ig_dict : Dictionary of parsed alignment output
-
-    Returns:
-      dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
-             CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields
-    """
-    try:
-        seq_len = len(ig_dict['SEQUENCE_IMGT'])
-        ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)]
-    except (KeyError, IndexError):
-        return ig_dict
-
-    try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)]
-    except (IndexError): return ig_dict
-
-    try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)]
-    except (IndexError): return ig_dict
-
-    try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)]
-    except (IndexError): return ig_dict
-
-    try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)]
-    except (IndexError): return ig_dict
-
-    try:
-        cdr3_end = 306 + ig_dict['JUNCTION_LENGTH']
-        ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end]
-        ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:]
-    except (KeyError, IndexError):
-        return ig_dict
-
-    return ig_dict
-
-
-def getSeqforIgBlast(seq_file):
-    """
-    Fetch input sequences for IgBlast queries
+    Get file name prefix and create output directory

     Arguments:
-    seq_file = a fasta file of sequences input to IgBlast
+      aligner_output : aligner output file or directory.
+      out_args : dictionary of output arguments.

     Returns:
-    a dictionary of {ID:Seq}
+        str : file name prefix.
     """
-
-    seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)
-
-    # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
-    seqs = {}
-    for seq in seq_dict.values():
-        seqs.update({seq.description:str(seq.seq)})
-
-    return seqs
-
+    # Determine output directory
+    if not out_args['out_dir']:
+        out_dir = os.path.dirname(os.path.abspath(aligner_output))
+    else:
+        out_dir = os.path.abspath(out_args['out_dir'])
+        if not os.path.exists(out_dir):
+            os.mkdir(out_dir)

-def findLine(handle, query):
-    """
-    Finds line with query string in file
+    # Determine file prefix
+    if out_args['out_name']:
+        file_prefix = out_args['out_name']
+    else:
+        file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0]

-    Arguments:
-    handle = file handle in which to search for line
-    query = query string for which to search in file
-
-    Returns:
-    line from handle in which query string was found
-    """
-    for line in handle:
-        if(re.match(query, line)):
-            return line
+    return os.path.join(out_dir, file_prefix)


-def extractIMGT(imgt_output):
-    """
-    Extract necessary files from IMGT results, zipped or unzipped
-
-    Arguments:
-    imgt_output = zipped file or unzipped folder output by IMGT
-
-    Returns:
-    sorted list of filenames from which information will be read
+def getSeqDict(seq_file):
     """
-    #file_ext = os.path.splitext(imgt_output)[1].lower()
-    imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
-    temp_dir = mkdtemp()
-    if zipfile.is_zipfile(imgt_output):
-        # Open zip file
-        imgt_zip = zipfile.ZipFile(imgt_output, 'r')
-        # Extract required files
-        imgt_files = sorted([n for n in imgt_zip.namelist() \
-                             if os.path.basename(n).startswith(imgt_flags)])
-        imgt_zip.extractall(temp_dir, imgt_files)
-        # Define file list
-        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
-    elif os.path.isdir(imgt_output):
-        # Find required files in folder
-        folder_files = []
-        for root, dirs, files in os.walk(imgt_output):
-            folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files])
-        # Define file list
-        imgt_files = sorted([n for n in folder_files \
-                             if os.path.basename(n).startswith(imgt_flags)])
-    elif tarfile.is_tarfile(imgt_output):
-        # Open zip file
-        imgt_tar = tarfile.open(imgt_output, 'r')
-        # Extract required files
-        imgt_files = sorted([n for n in imgt_tar.getnames() \
-                             if os.path.basename(n).startswith(imgt_flags)])
-        imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files])
-        # Define file list
-        imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
-    else:
-        sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.')
-
-    if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files
-        sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
-    elif len(imgt_files) < len(imgt_flags):
-        sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output)
-
-    return temp_dir, imgt_files
-
-
-# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
-def readOneIgBlastResult(block):
-    """
-    Parse a single IgBLAST query result
+    Create a dictionary from a sequence file.

     Arguments:
-    block =  itertools groupby object of single result
-
-    Returns:
-    None if no results, otherwise list of DataFrames for each result block
-    """
-    results = list()
-    i = 0
-    for match, subblock in groupby(block, lambda l: l=='\n'):
-        if not match:
-            # Strip whitespace and comments
-            sub = [s.strip() for s in subblock if not s.startswith('#')]
-
-            # Continue on empty block
-            if not sub:  continue
-            else:  i += 1
-
-            # Split by tabs
-            sub = [s.split('\t') for s in sub]
-
-            # Append list for "V-(D)-J rearrangement summary" (i == 1)
-            # And "V-(D)-J junction details" (i == 2)
-            # Otherwise append DataFrame of subblock
-            if i == 1 or i == 2:
-                results.append(sub[0])
-            else:
-                df = pd.DataFrame(sub)
-                if not df.empty: results.append(df)
-
-    return results if results else None
-
-
-# TODO:  needs more speeds. pandas is probably to blame.
-def readIgBlast(igblast_output, seq_dict, repo_dict,
-                score_fields=False, region_fields=False):
-    """
-    Reads IgBlast output
-
-    Arguments:
-    igblast_output = IgBlast output file (format 7)
-    seq_dict = a dictionary of {ID:Seq} from input fasta file
-    repo_dict = dictionary of IMGT gapped germline sequences
-    score_fields = if True parse alignment scores
-    region_fields = if True add FWR and CDR region fields
+      seq_file : sequence file.

     Returns:
-    a generator of dictionaries containing alignment data
+        dict : sequence description as keys with Bio.SeqRecords as values.
     """
-
-    # Open IgBlast output file
-    with open(igblast_output) as f:
-        # Iterate over individual results (separated by # IGBLASTN)
-        for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
-            block = list(block)
-            if not k1:
-                # TODO: move query name extraction into block parser readOneIgBlastResult().
-                # Extract sequence ID
-                query_name = ' '.join(block[0].strip().split(' ')[2:])
-                # Initialize db_gen to have ID and input sequence
-                db_gen = {'SEQUENCE_ID':     query_name,
-                          'SEQUENCE_INPUT':  seq_dict[query_name]}
-
-                # Parse further sub-blocks
-                block_list = readOneIgBlastResult(block)
-
-                # TODO: this is indented pretty far.  should be a separate function. or several functions.
-                # If results exist, parse further to obtain full db_gen
-                if block_list is not None:
-                    # Parse quality information
-                    db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
-                    db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
-                    db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
-                    if block_list[0][-1] == '-':
-                        db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
-                                                           IUPAC.ambiguous_dna).reverse_complement())
-
-                    # Parse V, D, and J calls
-                    call_str = ' '.join(block_list[0])
-                    v_call = parseAllele(call_str, v_allele_regex, action='list')
-                    d_call = parseAllele(call_str, d_allele_regex, action='list')
-                    j_call = parseAllele(call_str, j_allele_regex, action='list')
-                    db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
-                    db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
-                    db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'
-
-                    # Parse junction sequence
-                    # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1]))
-                    # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ'])
-
-                    # TODO:  IgBLAST does a stupid and doesn't output block #3 sometimes. why?
-                    # TODO:  maybe we should fail these. they look craptastic.
-                    #pd.set_option('display.width', 500)
-                    #print query_name, len(block_list), hit_idx
-                    #for i, x in enumerate(block_list):
-                    #    print '[%i]' % i
-                    #    print x
-
-                    # Parse segment start and stop positions
-                    hit_df = block_list[-1]
-
-                    # Alignment info block
-                    #  0:  segment
-                    #  1:  query id
-                    #  2:  subject id
-                    #  3:  % identity
-                    #  4:  alignment length
-                    #  5:  mismatches
-                    #  6:  gap opens
-                    #  7:  gaps
-                    #  8:  q. start
-                    #  9:  q. end
-                    # 10:  s. start
-                    # 11:  s. end
-                    # 12:  evalue
-                    # 13:  bit score
-                    # 14:  query seq
-                    # 15:  subject seq
-                    # 16:  btop
-
-                    # If V call exists, parse V alignment information
-                    seq_vdj = ''
-                    if v_call is not None:
-                        v_align = hit_df[hit_df[0] == 'V'].iloc[0]
-                        # Germline positions
-                        db_gen['V_GERM_START_VDJ'] = int(v_align[10])
-                        db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
-                        # Query sequence positions
-                        db_gen['V_SEQ_START'] = int(v_align[8])
-                        db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1
-
-                        if int(v_align[6]) == 0:
-                            db_gen['INDELS'] = 'F'
-                        else:
-                            db_gen['INDELS'] = 'T'
-                            # Set functional to none so record gets tossed (junction will be wrong)
-                            # db_gen['FUNCTIONAL'] = None
-
-                        # V alignment scores
-                        if score_fields:
-                            try: db_gen['V_SCORE'] = float(v_align[13])
-                            except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
-
-                            try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
-                            except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
-
-                            try: db_gen['V_EVALUE'] = float(v_align[12])
-                            except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'
-
-                            try: db_gen['V_BTOP'] = v_align[16]
-                            except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'
+    seq_dict = SeqIO.to_dict(readSeqFile(seq_file),
+                             key_function=lambda x: x.description)

-                        # Update VDJ sequence, removing insertions
-                        start = 0
-                        for m in re.finditer(r'-', v_align[15]):
-                            ins = m.start()
-                            seq_vdj += v_align[14][start:ins]
-                            start = ins + 1
-                        seq_vdj += v_align[14][start:]
-
-                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
-                    # If D call exists, parse D alignment information
-                    if d_call is not None:
-                        d_align = hit_df[hit_df[0] == 'D'].iloc[0]
-
-                        # TODO:  this is kinda gross.  not sure how else to fix the alignment overlap problem though.
-                        # Determine N-region length and amount of J overlap with V or D alignment
-                        overlap = 0
-                        if v_call is not None:
-                            n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
-                            if n1_len < 0:
-                                db_gen['N1_LENGTH'] = 0
-                                overlap = abs(n1_len)
-                            else:
-                                db_gen['N1_LENGTH'] = n1_len
-                                n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1)
-                                n1_end = int(d_align[8])-1
-                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
-
-                        # Query sequence positions
-                        db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
-                        db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)
-
-                        # Germline positions
-                        db_gen['D_GERM_START'] = int(d_align[10]) + overlap
-                        db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)
-
-                        # Update VDJ sequence, removing insertions
-                        start = overlap
-                        for m in re.finditer(r'-', d_align[15]):
-                            ins = m.start()
-                            seq_vdj += d_align[14][start:ins]
-                            start = ins + 1
-                        seq_vdj += d_align[14][start:]
-
-                    # TODO:  needs to check that the V results are present before trying to determine N1_LENGTH from them.
-                    # If J call exists, parse J alignment information
-                    if j_call is not None:
-                        j_align = hit_df[hit_df[0] == 'J'].iloc[0]
-
-                        # TODO:  this is kinda gross.  not sure how else to fix the alignment overlap problem though.
-                        # Determine N-region length and amount of J overlap with V or D alignment
-                        overlap = 0
-                        if d_call is not None:
-                            n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
-                            if n2_len < 0:
-                                db_gen['N2_LENGTH'] = 0
-                                overlap = abs(n2_len)
-                            else:
-                                db_gen['N2_LENGTH'] = n2_len
-                                n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1)
-                                n2_end = int(j_align[8])-1
-                                seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
-                        elif v_call is not None:
-                            n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
-                            if n1_len < 0:
-                                db_gen['N1_LENGTH'] = 0
-                                overlap = abs(n1_len)
-                            else:
-                                db_gen['N1_LENGTH'] = n1_len
-                                n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1)
-                                n1_end = int(j_align[8])-1
-                                seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
-                        else:
-                            db_gen['N1_LENGTH'] = 0
-
-                        # Query positions
-                        db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
-                        db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)
-
-                        # Germline positions
-                        db_gen['J_GERM_START'] = int(j_align[10]) + overlap
-                        db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)
-
-                        # J alignment scores
-                        if score_fields:
-                            try: db_gen['J_SCORE'] = float(j_align[13])
-                            except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
-
-                            try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
-                            except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
-
-                            try: db_gen['J_EVALUE'] = float(j_align[12])
-                            except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'
-
-                            try: db_gen['J_BTOP'] = j_align[16]
-                            except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'
-
-                        # Update VDJ sequence, removing insertions
-                        start = overlap
-                        for m in re.finditer(r'-', j_align[15]):
-                            ins = m.start()
-                            seq_vdj += j_align[14][start:ins]
-                            start = ins + 1
-                        seq_vdj += j_align[14][start:]
-
-                    db_gen['SEQUENCE_VDJ'] = seq_vdj
-
-                    # Create IMGT-gapped sequence and infer IMGT junction
-                    if v_call is not None:
-                        db_gen = gapV(db_gen, repo_dict)
-                        if j_call is not None:
-                            db_gen = getIMGTJunc(db_gen, repo_dict)
-
-                    # FWR and CDR regions
-                    if region_fields: getRegions(db_gen)
-
-                yield IgRecord(db_gen)
+    return seq_dict


-# TODO:  should be more readable
-def readIMGT(imgt_files, score_fields=False, region_fields=False):
-    """
-    Reads IMGT/HighV-Quest output
-
-    Arguments:
-    imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6
-    score_fields = if True parse alignment scores
-    region_fields = if True add FWR and CDR region fields
-
-    Returns:
-    a generator of dictionaries containing alignment data
+def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
+            out_args=default_out_args):
     """
-    imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files]
-    # Create a dictionary for each sequence alignment and yield its generator
-    for sm, gp, nt, jn in zip(*imgt_iters):
-        if len(set([sm['Sequence ID'],
-                    gp['Sequence ID'],
-                    nt['Sequence ID'],
-                    jn['Sequence ID']])) != 1:
-            sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
-                     % sm['Sequence ID'])
-
-        db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
-                  'SEQUENCE_INPUT': sm['Sequence']}
-
-        if 'No results' not in sm['Functionality']:
-            db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) +
-                                                 ('unprod' in sm['Functionality'])]
-            db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) +
-                                               ('out-of-frame' in sm['JUNCTION frame'])],
-            db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) +
-                                           ('unprod' in sm['Functionality'])]
-            db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'],
-                                                         'missing' in sm['V-REGION potential ins/del']))) +
-                                                         ('unprod' in sm['Functionality'])]
-            db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'],
-                                              sm['V-REGION insertions'],
-                                              sm['V-REGION deletions']))]
-
-            db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
-            db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']
-
-            db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
-            db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
-            db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))
-
-            v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
-            db_gen['V_SEQ_START'] = nt['V-REGION start']
-            db_gen['V_SEQ_LENGTH'] = v_seq_length
-            db_gen['V_GERM_START_IMGT'] = 1
-            db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0
-
-            db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
-                                                       jn['N-REGION-nt nb'],
-                                                       jn['N1-REGION-nt nb'],
-                                                       jn["P5'D-nt nb"]] if i)
-            db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
-                                                         jn["P3'V-nt nb"],
-                                                         jn['N-REGION-nt nb'],
-                                                         jn['N1-REGION-nt nb'],
-                                                         jn["P5'D-nt nb"]] if i)
-            db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
-            db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
-            db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
-            db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
-                                                       jn['N2-REGION-nt nb'],
-                                                       jn["P5'J-nt nb"]] if i)
-
-            db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length,
-                                                         jn["P3'V-nt nb"],
-                                                         jn['N-REGION-nt nb'],
-                                                         jn['N1-REGION-nt nb'],
-                                                         jn["P5'D-nt nb"],
-                                                         jn["D-REGION-nt nb"],
-                                                         jn["P3'D-nt nb"],
-                                                         jn['N2-REGION-nt nb'],
-                                                         jn["P5'J-nt nb"]] if i)
-            db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
-            db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
-            db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0
-
-            db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
-            db_gen['JUNCTION'] = jn['JUNCTION']
-
-            # Alignment scores
-            if score_fields:
-                try:  db_gen['V_SCORE'] = float(sm['V-REGION score'])
-                except (TypeError, ValueError):  db_gen['V_SCORE'] = 'None'
-
-                try:  db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
-                except (TypeError, ValueError):  db_gen['V_IDENTITY'] = 'None'
-
-                try:  db_gen['J_SCORE'] = float(sm['J-REGION score'])
-                except (TypeError, ValueError):  db_gen['J_SCORE'] = 'None'
-
-                try:  db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
-                except (TypeError, ValueError):  db_gen['J_IDENTITY'] = 'None'
-
-            # FWR and CDR regions
-            if region_fields: getRegions(db_gen)
-        else:
-            db_gen['V_CALL'] = 'None'
-            db_gen['D_CALL'] = 'None'
-            db_gen['J_CALL'] = 'None'
-
-        yield IgRecord(db_gen)
-
-
-def getIDforIMGT(seq_file):
-    """
-    Create a sequence ID translation using IMGT truncation
-
-    Arguments:
-    seq_file = a fasta file of sequences input to IMGT
-
-    Returns:
-    a dictionary of {truncated ID: full seq description}
-    """
-
-    # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
-    ids = {}
-    for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)):
-        if len(rec.description) <= 50:
-            id_key = rec.description
-        else:
-            id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50])
-        ids.update({id_key:rec.description})
-
-    return ids
-
-
-def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
-            score_fields=False, region_fields=False, out_args=default_out_args):
-    """
-    Writes tab-delimited database file in output directory
+    Writes tab-delimited database file in output directory.

     Arguments:
-    db_gen = a generator of IgRecord objects containing alignment data
-    file_prefix = directory and prefix for CLIP tab-delim file
-    total_count = number of records (for progress bar)
-    id_dict = a dictionary of {IMGT ID: full seq description}
-    no_parse = if ID is to be parsed for pRESTO output with default delimiters
-    score_fields = if True add alignment score fields to output file
-    region_fields = if True add FWR and CDR region fields to output file
-    out_args = common output argument dictionary from parseCommonArgs
+      db : a iterator of IgRecord objects containing alignment data.
+      fields : a list of ordered field names to write.
+      file_prefix : directory and prefix for CLIP tab-delim file.
+      total_count : number of records (for progress bar).
+      id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
+      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
+      partial : if True put incomplete alignments in the pass file.
+      out_args : common output argument dictionary from parseCommonArgs.

     Returns:
-    None
+      None
     """
-    pass_file = "%s_db-pass.tab" % file_prefix
-    fail_file = "%s_db-fail.tab" % file_prefix
-    ordered_fields = ['SEQUENCE_ID',
-                      'SEQUENCE_INPUT',
-                      'FUNCTIONAL',
-                      'IN_FRAME',
-                      'STOP',
-                      'MUTATED_INVARIANT',
-                      'INDELS',
-                      'V_CALL',
-                      'D_CALL',
-                      'J_CALL',
-                      'SEQUENCE_VDJ',
-                      'SEQUENCE_IMGT',
-                      'V_SEQ_START',
-                      'V_SEQ_LENGTH',
-                      'V_GERM_START_VDJ',
-                      'V_GERM_LENGTH_VDJ',
-                      'V_GERM_START_IMGT',
-                      'V_GERM_LENGTH_IMGT',
-                      'N1_LENGTH',
-                      'D_SEQ_START',
-                      'D_SEQ_LENGTH',
-                      'D_GERM_START',
-                      'D_GERM_LENGTH',
-                      'N2_LENGTH',
-                      'J_SEQ_START',
-                      'J_SEQ_LENGTH',
-                      'J_GERM_START',
-                      'J_GERM_LENGTH',
-                      'JUNCTION_LENGTH',
-                      'JUNCTION']
+    # Function to check for valid records strictly
+    def _pass_strict(rec):
+        valid = [rec.v_call and rec.v_call != 'None',
+                 rec.j_call and rec.j_call != 'None',
+                 rec.functional is not None,
+                 rec.seq_vdj,
+                 rec.junction]
+        return all(valid)

-    if score_fields:
-        ordered_fields.extend(['V_SCORE',
-                               'V_IDENTITY',
-                               'V_EVALUE',
-                               'V_BTOP',
-                               'J_SCORE',
-                               'J_IDENTITY',
-                               'J_EVALUE',
-                               'J_BTOP'])
+    # Function to check for valid records loosely
+    def _pass_gentle(rec):
+        valid = [rec.v_call and rec.v_call != 'None',
+                 rec.d_call and rec.d_call != 'None',
+                 rec.j_call and rec.j_call != 'None']
+        return any(valid)

-    if region_fields:
-        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
-                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])
-
+    # Set pass criteria
+    _pass = _pass_gentle if partial else _pass_strict

-    # TODO:  This is not the best approach. should pass in output fields.
-    # Initiate passed handle
-    pass_handle = None
+    # Define output file names
+    pass_file = '%s_db-pass.tab' % file_prefix
+    fail_file = '%s_db-fail.tab' % file_prefix

-    # Open failed file
-    if out_args['failed']:
-        fail_handle = open(fail_file, 'wt')
-        fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
-    else:
-        fail_handle = None
-        fail_writer = None
-
-    # Initialize counters and file
+    # Initiate handles, writers and counters
+    pass_handle = None
+    fail_handle = None
     pass_writer = None
+    fail_writer = None
     start_time = time()
     rec_count = pass_count = fail_count = 0
-    for record in db_gen:
-        #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
-        printProgress(rec_count, total_count, 0.05, start_time)
-        rec_count += 1

-        # Count pass or fail
-        if (record.v_call == 'None' and record.j_call == 'None') or \
-                record.functional is None or \
-                not record.seq_vdj or \
-                not record.junction:
-            # print(record.v_call, record.j_call, record.functional, record.junction)
-            fail_count += 1
-            if fail_writer is not None: fail_writer.writerow(record.toDict())
-            continue
-        else:
-            pass_count += 1
-
-        # Build sample sequence description
-        if record.id in id_dict:
+    # Validate and write output
+    printProgress(0, total_count, 0.05, start_time)
+    for i, record in enumerate(db, start=1):
+
+        # Replace sequence description with full string, if required
+        if id_dict is not None and record.id in id_dict:
             record.id = id_dict[record.id]

         # Parse sequence description into new columns
         if not no_parse:
-            record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
-            record.id = record.annotations['ID']
-            del record.annotations['ID']
+            try:
+                record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
+                record.id = record.annotations['ID']
+                del record.annotations['ID']
+
+                # TODO:  This is not the best approach. should pass in output fields.
+                # If first record, use parsed description to define extra columns
+                if i == 1:  fields.extend(list(record.annotations.keys()))
+            except IndexError:
+                # Could not parse pRESTO-style annotations so fall back to no parse
+                no_parse = True
+                sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')

-        # TODO:  This is not the best approach. should pass in output fields.
-        # If first sequence, use parsed description to create new columns and initialize writer
-        if pass_writer is None:
-            if not no_parse:  ordered_fields.extend(list(record.annotations.keys()))
-            pass_handle = open(pass_file, 'wt')
-            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)
+        # Count pass or fail and write to appropriate file
+        if _pass(record):
+            # Open pass file
+            if pass_writer is None:
+                pass_handle = open(pass_file, 'wt')
+                pass_writer = getDbWriter(pass_handle, add_fields=fields)

-        # Write row to tab-delim CLIP file
-        pass_writer.writerow(record.toDict())
-
-    # Print log
-    #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
-    printProgress(rec_count, total_count, 0.05, start_time)
+            # Write row to pass file
+            pass_count += 1
+            pass_writer.writerow(record.toDict())
+        else:
+            # Open failed file
+            if out_args['failed'] and fail_writer is None:
+                fail_handle = open(fail_file, 'wt')
+                fail_writer = getDbWriter(fail_handle, add_fields=fields)

+            # Write row to fail file if specified
+            fail_count += 1
+            if fail_writer is not None:
+                fail_writer.writerow(record.toDict())
+
+        # Print progress
+        printProgress(i, total_count, 0.05, start_time)
+
+    # Print consol log
     log = OrderedDict()
     log['OUTPUT'] = pass_file
     log['PASS'] = pass_count
@@ -777,146 +176,215 @@
     if fail_handle is not None: fail_handle.close()


-# TODO:  may be able to merge with parseIMGT
-def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
-                 region_fields=False, out_args=default_out_args):
+# TODO:  may be able to merge with other mains
+def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False,
+              parse_scores=False, parse_regions=False, parse_junction=False,
+              out_args=default_out_args):
     """
-    Main for IgBlast aligned sample sequences
+    Main for IMGT aligned sample sequences.

     Arguments:
-    igblast_output = IgBlast output file to process
-    seq_file = fasta file input to IgBlast (from which to get sequence)
-    repo = folder with germline repertoire files
-    no_parse = if ID is to be parsed for pRESTO output with default delimiters
-    score_fields = if True add alignment score fields to output file
-    region_fields = if True add FWR and CDR region fields to output file
-    out_args = common output argument dictionary from parseCommonArgs
+      aligner_output : zipped file or unzipped folder output by IMGT.
+      seq_file : FASTA file input to IMGT (from which to get seqID).
+      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
+      partial : If True put incomplete alignments in the pass file.
+      parse_scores : if True add alignment score fields to output file.
+      parse_regions : if True add FWR and CDR region fields to output file.
+      out_args : common output argument dictionary from parseCommonArgs.

     Returns:
-    None
+      None
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'MakeDb'
+    log['ALIGNER'] = 'IMGT'
+    log['ALIGNER_OUTPUT'] = aligner_output
+    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
+    log['NO_PARSE'] = no_parse
+    log['PARTIAL'] = partial
+    log['SCORES'] = parse_scores
+    log['REGIONS'] = parse_regions
+    log['JUNCTION'] = parse_junction
+    printLog(log)
+
+    start_time = time()
+    printMessage('Loading sequence files', start_time=start_time, width=25)
+    # Extract IMGT files
+    temp_dir, imgt_files = extractIMGT(aligner_output)
+    # Count records in IMGT files
+    total_count = countDbFile(imgt_files['summary'])
+    # Get (parsed) IDs from fasta file submitted to IMGT
+    id_dict = getIDforIMGT(seq_file) if seq_file else {}
+    printMessage('Done', start_time=start_time, end=True, width=25)
+
+    # Parse IMGT output and write db
+    with open(imgt_files['summary'], 'r') as summary_handle, \
+            open(imgt_files['gapped'], 'r') as gapped_handle, \
+            open(imgt_files['ntseq'], 'r') as ntseq_handle, \
+            open(imgt_files['junction'], 'r') as junction_handle:
+        parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle,
+                                parse_scores=parse_scores, parse_regions=parse_regions,
+                                parse_junction=parse_junction)
+        file_prefix = getFilePrefix(aligner_output, out_args)
+        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict,
+                no_parse=no_parse, partial=partial, out_args=out_args)
+
+    # Cleanup temp directory
+    temp_dir.cleanup()
+
+    return None
+
+
+# TODO:  may be able to merge with other mains
+def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False,
+                 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False,
+                 out_args=default_out_args):
+    """
+    Main for IgBLAST aligned sample sequences.
+
+    Arguments:
+      aligner_output : IgBLAST output file to process.
+      seq_file : fasta file input to IgBlast (from which to get sequence).
+      repo : folder with germline repertoire files.
+      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
+      partial : If True put incomplete alignments in the pass file.
+      parse_regions : if True add FWR and CDR fields to output file.
+      parse_scores : if True add alignment score fields to output file.
+      parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST
+      out_args : common output argument dictionary from parseCommonArgs.
+
+    Returns:
+      None
     """
     # Print parameter info
     log = OrderedDict()
     log['START'] = 'MakeDB'
     log['ALIGNER'] = 'IgBlast'
-    log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
+    log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
     log['SEQ_FILE'] = os.path.basename(seq_file)
     log['NO_PARSE'] = no_parse
-    log['SCORE_FIELDS'] = score_fields
-    log['REGION_FIELDS'] = region_fields
+    log['PARTIAL'] = partial
+    log['SCORES'] = parse_scores
+    log['REGIONS'] = parse_regions
     printLog(log)

+    start_time = time()
+    printMessage('Loading sequence files', start_time=start_time, width=25)
+    # Count records in sequence file
+    total_count = countSeqFile(seq_file)
     # Get input sequence dictionary
-    seq_dict = getSeqforIgBlast(seq_file)
+    seq_dict = getSeqDict(seq_file)
+    # Create germline repo dictionary
+    repo_dict = readRepo(repo)
+    printMessage('Done', start_time=start_time, end=True, width=25)

-    # Formalize out_dir and file-prefix
-    if not out_args['out_dir']:
-        out_dir = os.path.split(igblast_output)[0]
-    else:
-        out_dir = os.path.abspath(out_args['out_dir'])
-        if not os.path.exists(out_dir):  os.mkdir(out_dir)
-    if out_args['out_name']:
-        file_prefix = out_args['out_name']
-    else:
-        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
-    file_prefix = os.path.join(out_dir, file_prefix)
+    # Parse and write output
+    with open(aligner_output, 'r') as f:
+        parse_iter = IgBLASTReader(f, seq_dict, repo_dict,
+                                   parse_scores=parse_scores, parse_regions=parse_regions,
+                                   parse_igblast_cdr3=parse_igblast_cdr3)
+        file_prefix = getFilePrefix(aligner_output, out_args)
+        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
+                no_parse=no_parse, partial=partial, out_args=out_args)

-    total_count = countSeqFile(seq_file)
-
-    # Create
-    repo_dict = getRepo(repo)
-    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
-                               score_fields=score_fields, region_fields=region_fields)
-    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
-            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+    return None


-# TODO:  may be able to merge with parseIgBlast
-def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
-              region_fields=False, out_args=default_out_args):
+# TODO:  may be able to merge with other mains
+def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False,
+              parse_scores=False, parse_regions=False, out_args=default_out_args):
     """
-    Main for IMGT aligned sample sequences
+    Main for iHMMuneAlign aligned sample sequences.

     Arguments:
-    imgt_output = zipped file or unzipped folder output by IMGT
-    seq_file = FASTA file input to IMGT (from which to get seqID)
-    no_parse = if ID is to be parsed for pRESTO output with default delimiters
-    score_fields = if True add alignment score fields to output file
-    region_fields = if True add FWR and CDR region fields to output file
-    out_args = common output argument dictionary from parseCommonArgs
-
-    Returns:
-    None
+      aligner_output : iHMMune-Align output file to process.
+      seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
+      repo : folder with germline repertoire files.
+      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
+      partial : If True put incomplete alignments in the pass file.
+      parse_scores : if True parse alignment scores.
+      parse_regions : if True add FWR and CDR region fields.
+      out_args : common output argument dictionary from parseCommonArgs.
+
+    Returns:
+      None
     """
     # Print parameter info
     log = OrderedDict()
-    log['START'] = 'MakeDb'
-    log['ALIGNER'] = 'IMGT'
-    log['ALIGN_RESULTS'] = imgt_output
-    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
+    log['START'] = 'MakeDB'
+    log['ALIGNER'] = 'iHMMune-Align'
+    log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
+    log['SEQ_FILE'] = os.path.basename(seq_file)
     log['NO_PARSE'] = no_parse
-    log['SCORE_FIELDS'] = score_fields
-    log['REGION_FIELDS'] = region_fields
+    log['PARTIAL'] = partial
+    log['SCORES'] = parse_scores
+    log['REGIONS'] = parse_regions
     printLog(log)
-
-    # Get individual IMGT result files
-    temp_dir, imgt_files = extractIMGT(imgt_output)
-
-    # Formalize out_dir and file-prefix
-    if not out_args['out_dir']:
-        out_dir = os.path.dirname(os.path.abspath(imgt_output))
-    else:
-        out_dir = os.path.abspath(out_args['out_dir'])
-        if not os.path.exists(out_dir):  os.mkdir(out_dir)
-    if out_args['out_name']:
-        file_prefix = out_args['out_name']
-    else:
-        file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
-    file_prefix = os.path.join(out_dir, file_prefix)

-    total_count = countDbFile(imgt_files[0])
-
-    # Get (parsed) IDs from fasta file submitted to IMGT
-    id_dict = getIDforIMGT(seq_file) if seq_file else {}
-
-    # Create
-    imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
-                         region_fields=region_fields)
-    writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
-            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+    start_time = time()
+    printMessage('Loading sequence files', start_time=start_time, width=25)
+    # Count records in sequence file
+    total_count = countSeqFile(seq_file)
+    # Get input sequence dictionary
+    seq_dict = getSeqDict(seq_file)
+    # Create germline repo dictionary
+    repo_dict = readRepo(repo)
+    printMessage('Done', start_time=start_time, end=True, width=25)

-    # Delete temp directory
-    rmtree(temp_dir)
+    # Parse and write output
+    with open(aligner_output, 'r') as f:
+        parse_iter = IHMMuneReader(f, seq_dict, repo_dict,
+                                   parse_scores=parse_scores, parse_regions=parse_regions)
+        file_prefix = getFilePrefix(aligner_output, out_args)
+        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
+                no_parse=no_parse, partial=partial, out_args=out_args)
+
+    return None


 def getArgParser():
     """
-    Defines the ArgumentParser
+    Defines the ArgumentParser.

-    Arguments:
-    None
-
     Returns:
-    an ArgumentParser object
+      argparse.ArgumentParser
     """
     fields = dedent(
              '''
               output files:
                   db-pass
-                      database of parsed alignment records.
+                      database of alignment records with functionality information,
+                      V and J calls, and a junction region.
                   db-fail
-                      database with records failing alignment.
+                      database with records that fail due to no functionality information
+                      (did not pass IMGT), no V call, no J call, or no junction region.

-              output fields:
-                  SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
-                  INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
-                  V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
-                  V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
-                  D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
+              universal output fields:
+                  SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT,
+                  FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS,
+                  V_CALL, D_CALL, J_CALL,
+                  V_SEQ_START, V_SEQ_LENGTH,
+                  D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
                   J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
-                  JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
-                  J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
-                  FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
+                  JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH,
+                  FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
+                  CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
+
+              imgt specific output fields:
+                  V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
+                  N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
+                  D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY,
+
+              igblast specific output fields:
+                  V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
+                  V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP,
+                  J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP.
+                  CDR3_IGBLAST_NT, CDR3_IGBLAST_AA
+
+              ihmm specific output fields:
+                  V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
+                  HMM_SCORE
               ''')

     # Define ArgumentParser
@@ -933,11 +401,11 @@
     parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)

     # IgBlast Aligner
-    parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
-                                           parents=[parser_parent],
-                                           formatter_class=CommonHelpFormatter)
-    parser_igblast.set_defaults(func=parseIgBlast)
-    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+    parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
+                                           formatter_class=CommonHelpFormatter,
+                                           help='Process IgBLAST output.',
+                                           description='Process IgBLAST output.')
+    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
                                 required=True,
                                 help='''IgBLAST output files in format 7 with query sequence
                                      (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
@@ -947,50 +415,112 @@
                                      set of germlines used in the IgBLAST alignment.''')
     parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
                                 required=True,
-                                help='List of input FASTA files containing sequences')
+                                help='''List of input FASTA files (with .fasta, .fna or .fa
+                                     extension), containing sequences.''')
     parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
-                                help='''Specify if input IDs should not be parsed to add
-                                     new columns to database.''')
-    parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
+                                help='''Specify to prevent input sequence headers from being parsed
+                                    to add new columns to database. Parsing of sequence headers requires
+                                    headers to be in the pRESTO annotation format, so this should be specified
+                                    when sequence headers are incompatible with the pRESTO annotation scheme.
+                                    Note, unrecognized header formats will default to this behavior.''')
+    parser_igblast.add_argument('--partial', action='store_true', dest='partial',
+                                help='''If specified, include incomplete V(D)J alignments in
+                                     the pass file instead of the fail file.''')
+    parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores',
                                 help='''Specify if alignment score metrics should be
                                      included in the output. Adds the V_SCORE, V_IDENTITY,
                                      V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
                                      J_BTOP, and J_EVALUE columns.''')
-    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
-                                help='''Specify if IMGT framework and CDR regions should be
+    parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions',
+                                help='''Specify if IMGT FWR and CDRs should be
                                      included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
                                      FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
                                      CDR3_IMGT columns.''')
-
+    parser_igblast.add_argument('--cdr3', action='store_true',
+                                dest='parse_igblast_cdr3',
+                                help='''Specify if the CDR3 sequences generated by IgBLAST
+                                     should be included in the output. Adds the columns
+                                     CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST
+                                     version 1.5 or greater.''')
+    parser_igblast.set_defaults(func=parseIgBLAST)
+
     # IMGT aligner
-    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
-                                        parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter)
-    imgt_arg_group =  parser_imgt.add_mutually_exclusive_group(required=True)
-    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
-                                help='''Either zipped IMGT output files (.zip) or a folder
-                                     containing unzipped IMGT output files (which must
-                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
-                                     and 6_Junction).''')
+    parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter,
+                                        help='''Process IMGT/HighV-Quest output
+                                             (does not work with V-QUEST).''',
+                                        description='''Process IMGT/HighV-Quest output
+                                             (does not work with V-QUEST).''')
+    parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
+                             help='''Either zipped IMGT output files (.zip or .txz) or a
+                                  folder containing unzipped IMGT output files (which must
+                                  include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
+                                  and 6_Junction).''')
     parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
                              required=False,
-                             help='List of input FASTA files containing sequences')
+                             help='''List of input FASTA files (with .fasta, .fna or .fa
+                                  extension) containing sequences.''')
     parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
-                             help='''Specify if input IDs should not be parsed to add new
-                                  columns to database.''')
-    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
+                             help='''Specify to prevent input sequence headers from being parsed
+                                  to add new columns to database. Parsing of sequence headers requires
+                                  headers to be in the pRESTO annotation format, so this should be specified
+                                  when sequence headers are incompatible with the pRESTO annotation scheme.
+                                  Note, unrecognized header formats will default to this behavior.''')
+    parser_imgt.add_argument('--partial', action='store_true', dest='partial',
+                             help='''If specified, include incomplete V(D)J alignments in
+                                  the pass file instead of the fail file.''')
+    parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores',
                              help='''Specify if alignment score metrics should be
                                   included in the output. Adds the V_SCORE, V_IDENTITY,
-                                  J_SCORE and J_IDENTITY. Note, this will also add
-                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
-                                  but they will be empty for IMGT output.''')
-    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
-                             help='''Specify if IMGT framework and CDR regions should be
+                                  J_SCORE and J_IDENTITY.''')
+    parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions',
+                             help='''Specify if IMGT FWRs and CDRs should be
                                   included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
                                   FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
                                   CDR3_IMGT columns.''')
+    parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction',
+                             help='''Specify if detailed junction fields should be
+                                  included in the output. Adds the columns
+                                  N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH,
+                                  P5J_LENGTH, D_FRAME.''')
     parser_imgt.set_defaults(func=parseIMGT)

+    # iHMMuneAlign Aligner
+    parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter,
+                                        help='Process iHMMune-Align output.',
+                                        description='Process iHMMune-Align output.')
+    parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
+                             required=True,
+                             help='''iHMMune-Align output file.''')
+    parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
+                             help='''List of folders and/or FASTA files containing
+                                  IMGT-gapped germline sequences corresponding to the
+                                  set of germlines used in the IgBLAST alignment.''')
+    parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
+                             required=True,
+                             help='''List of input FASTA files (with .fasta, .fna or .fa
+                                  extension) containing sequences.''')
+    parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse',
+                             help='''Specify to prevent input sequence headers from being parsed
+                                  to add new columns to database. Parsing of sequence headers requires
+                                  headers to be in the pRESTO annotation format, so this should be specified
+                                  when sequence headers are incompatible with the pRESTO annotation scheme.
+                                  Note, unrecognized header formats will default to this behavior.''')
+    parser_ihmm.add_argument('--partial', action='store_true', dest='partial',
+                             help='''If specified, include incomplete V(D)J alignments in
+                                  the pass file instead of the fail file.''')
+    parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores',
+                             help='''Specify if alignment score metrics should be
+                                  included in the output. Adds the path score of the
+                                  iHMMune-Align hidden Markov model to HMM_SCORE.''')
+    parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions',
+                             help='''Specify if IMGT FWRs and CDRs should be
+                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                  CDR3_IMGT columns.''')
+    parser_ihmm.set_defaults(func=parseIHMM)
+
     return parser


@@ -1000,7 +530,7 @@
     """
     parser = getArgParser()
     args = parser.parse_args()
-    args_dict = parseCommonArgs(args, in_arg='aligner_files')
+    args_dict = parseCommonArgs(args, in_arg='aligner_outputs')

     # Set no ID parsing if sequence files are not provided
     if 'seq_files' in args_dict and not args_dict['seq_files']:
@@ -1008,18 +538,18 @@

     # Delete
     if 'seq_files' in args_dict: del args_dict['seq_files']
-    if 'aligner_files' in args_dict: del args_dict['aligner_files']
+    if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs']
     if 'command' in args_dict: del args_dict['command']
     if 'func' in args_dict: del args_dict['func']

     if args.command == 'imgt':
-        for i in range(len(args.__dict__['aligner_files'])):
-            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
+        for i in range(len(args.__dict__['aligner_outputs'])):
+            args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
             args_dict['seq_file'] = args.__dict__['seq_files'][i] \
                                     if args.__dict__['seq_files'] else None
             args.func(**args_dict)
-    elif args.command == 'igblast':
-        for i in range(len(args.__dict__['aligner_files'])):
-            args_dict['igblast_output'] =  args.__dict__['aligner_files'][i]
+    elif args.command == 'igblast' or args.command == 'ihmm':
+        for i in range(len(args.__dict__['aligner_outputs'])):
+            args_dict['aligner_output'] =  args.__dict__['aligner_outputs'][i]
             args_dict['seq_file'] = args.__dict__['seq_files'][i]
             args.func(**args_dict)