Mercurial > repos > davidvanzessen > shm_csr
changeset 52:22dddabe3637 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 23 May 2017 08:32:58 -0400 |
parents | 8fa8836bd605 |
children | 3be28ac82909 |
files | change_o/DefineClones.py change_o/MakeDb.py |
diffstat | 2 files changed, 547 insertions(+), 946 deletions(-) [+] |
line wrap: on
line diff
--- a/change_o/DefineClones.py Tue May 16 08:52:48 2017 -0400 +++ b/change_o/DefineClones.py Tue May 23 08:32:58 2017 -0400 @@ -10,6 +10,7 @@ import os import re import sys +import csv import numpy as np from argparse import ArgumentParser from collections import OrderedDict @@ -25,56 +26,108 @@ from presto.Multiprocessing import manageProcesses from presto.Sequence import getDNAScoreDict from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs -from changeo.Distance import getDNADistMatrix, getAADistMatrix, \ - hs1f_model, m1n_model, hs5f_model, \ - calcDistances, formClusters +from changeo.Distance import distance_models, calcDistances, formClusters from changeo.IO import getDbWriter, readDbFile, countDbFile from changeo.Multiprocessing import DbData, DbResult +## Set maximum field size for csv.reader +csv.field_size_limit(sys.maxsize) + # Defaults default_translate = False default_distance = 0.0 -default_bygroup_model = 'hs1f' +default_index_mode = 'gene' +default_index_action = 'set' +default_bygroup_model = 'ham' default_hclust_model = 'chen2010' default_seq_field = 'JUNCTION' default_norm = 'len' default_sym = 'avg' default_linkage = 'single' - -# TODO: should be in Distance, but need to be after function definitions -# Amino acid Hamming distance -aa_model = getAADistMatrix(mask_dist=1, gap_dist=0) - -# DNA Hamming distance -ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0) +choices_bygroup_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f', 'mk_rs1nf', 'mk_rs5nf', 'hs1f_compat', 'm1n_compat') -# TODO: this function is an abstraction to facilitate later cleanup -def getModelMatrix(model): +def indexByIdentity(index, key, rec, fields=None): """ - Simple wrapper to get distance matrix from model name + Updates a preclone index with a simple key + + Arguments: + index = preclone index from indexJunctions + key = index key + rec = IgRecord to add to the index + fields = additional annotation fields to use to group preclones; + if None use only V, J and junction length + + Returns: + None. Updates index with new key and records. + """ + index.setdefault(tuple(key), []).append(rec) + + +def indexByUnion(index, key, rec, fields=None): + """ + Updates a preclone index with the union of nested keys Arguments: - model = model name + index = preclone index from indexJunctions + key = index key + rec = IgRecord to add to the index + fields = additional annotation fields to use to group preclones; + if None use only V, J and junction length - Return: - a pandas.DataFrame containing the character distance matrix + Returns: + None. Updates index with new key and records. """ - if model == 'aa': - return(aa_model) - elif model == 'ham': - return(ham_model) - elif model == 'm1n': - return(m1n_model) - elif model == 'hs1f': - return(hs1f_model) - elif model == 'hs5f': - return(hs5f_model) + # List of values for this/new key + val = [rec] + f_range = list(range(2, 3 + (len(fields) if fields else 0))) + + # See if field/junction length combination exists in index + outer_dict = index + for field in f_range: + try: + outer_dict = outer_dict[key[field]] + except (KeyError): + outer_dict = None + break + # If field combination exists, look through Js + j_matches = [] + if outer_dict is not None: + for j in outer_dict.keys(): + if not set(key[1]).isdisjoint(set(j)): + key[1] = tuple(set(key[1]).union(set(j))) + j_matches += [j] + # If J overlap exists, look through Vs for each J + for j in j_matches: + v_matches = [] + # Collect V matches for this J + for v in outer_dict[j].keys(): + if not set(key[0]).isdisjoint(set(v)): + key[0] = tuple(set(key[0]).union(set(v))) + v_matches += [v] + # If there are V overlaps for this J, pop them out + if v_matches: + val += list(chain(*(outer_dict[j].pop(v) for v in v_matches))) + # If the J dict is now empty, remove it + if not outer_dict[j]: + outer_dict.pop(j, None) + + # Add value(s) into index nested dictionary + # OMG Python pointers are the best! + # Add field dictionaries into index + outer_dict = index + for field in f_range: + outer_dict.setdefault(key[field], {}) + outer_dict = outer_dict[key[field]] + # Add J, then V into index + if key[1] in outer_dict: + outer_dict[key[1]].update({key[0]: val}) else: - sys.stderr.write('Unrecognized distance model: %s.\n' % model) + outer_dict[key[1]] = {key[0]: val} -def indexJunctions(db_iter, fields=None, mode='gene', action='first'): +def indexJunctions(db_iter, fields=None, mode=default_index_mode, + action=default_index_action): """ Identifies preclonal groups by V, J and junction length @@ -90,27 +143,47 @@ Returns: a dictionary of {(V, J, junction length):[IgRecords]} """ + # print(fields) # Define functions for grouping keys if mode == 'allele' and fields is None: def _get_key(rec, act): - return (rec.getVAllele(act), rec.getJAllele(act), - None if rec.junction is None else len(rec.junction)) + return [rec.getVAllele(act), rec.getJAllele(act), + None if rec.junction is None else len(rec.junction)] elif mode == 'gene' and fields is None: def _get_key(rec, act): - return (rec.getVGene(act), rec.getJGene(act), - None if rec.junction is None else len(rec.junction)) + return [rec.getVGene(act), rec.getJGene(act), + None if rec.junction is None else len(rec.junction)] elif mode == 'allele' and fields is not None: def _get_key(rec, act): vdj = [rec.getVAllele(act), rec.getJAllele(act), None if rec.junction is None else len(rec.junction)] ann = [rec.toDict().get(k, None) for k in fields] - return tuple(chain(vdj, ann)) + return list(chain(vdj, ann)) elif mode == 'gene' and fields is not None: def _get_key(rec, act): vdj = [rec.getVGene(act), rec.getJGene(act), None if rec.junction is None else len(rec.junction)] ann = [rec.toDict().get(k, None) for k in fields] - return tuple(chain(vdj, ann)) + return list(chain(vdj, ann)) + + # Function to flatten nested dictionary + def _flatten_dict(d, parent_key=''): + items = [] + for k, v in d.items(): + new_key = parent_key + [k] if parent_key else [k] + if isinstance(v, dict): + items.extend(_flatten_dict(v, new_key).items()) + else: + items.append((new_key, v)) + flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items} + return flat_dict + + if action == 'first': + index_func = indexByIdentity + elif action == 'set': + index_func = indexByUnion + else: + sys.stderr.write('Unrecognized action: %s.\n' % action) start_time = time() clone_index = {} @@ -127,35 +200,16 @@ # Assigned passed preclone records to key and failed to index None if all([k is not None and k != '' for k in key]): - #print key - # TODO: Has much slow. Should have less slow. - if action == 'set': - - f_range = list(range(2, 3 + (len(fields) if fields else 0))) - vdj_range = list(range(2)) - - # Check for any keys that have matching columns and junction length and overlapping genes/alleles - to_remove = [] - if len(clone_index) > (1 if None in clone_index else 0) and key not in clone_index: - key = list(key) - for k in clone_index: - if k is not None and all([key[i] == k[i] for i in f_range]): - if all([not set(key[i]).isdisjoint(set(k[i])) for i in vdj_range]): - for i in vdj_range: key[i] = tuple(set(key[i]).union(set(k[i]))) - to_remove.append(k) - - # Remove original keys, replace with union of all genes/alleles and append values to new key - val = [rec] - val += list(chain(*(clone_index.pop(k) for k in to_remove))) - clone_index[tuple(key)] = clone_index.get(tuple(key),[]) + val - - elif action == 'first': - clone_index.setdefault(key, []).append(rec) + # Update index dictionary + index_func(clone_index, key, rec, fields) else: clone_index.setdefault(None, []).append(rec) printProgress(rec_count, step=1000, start_time=start_time, end=True) + if action == 'set': + clone_index = _flatten_dict(clone_index) + return clone_index @@ -179,15 +233,20 @@ a dictionary of lists defining {clone number: [IgRecords clonal group]} """ # Get distance matrix if not provided - if dist_mat is None: dist_mat = getModelMatrix(model) + if dist_mat is None: + try: + dist_mat = distance_models[model] + except KeyError: + sys.exit('Unrecognized distance model: %s' % args_dict['model']) + # TODO: can be cleaned up with abstract model class # Determine length of n-mers - if model in ['hs1f', 'm1n', 'aa', 'ham']: + if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']: nmer_len = 1 - elif model in ['hs5f']: + elif model in ['hh_s5f', 'mk_rs5nf']: nmer_len = 5 else: - sys.stderr.write('Unrecognized distance model: %s.\n' % model) + sys.exit('Unrecognized distance model: %s.\n' % model) # Define unique junction mapping seq_map = {} @@ -197,7 +256,7 @@ if len(seq) == 0: return None - seq = re.sub('[\.-]','N', str(seq)) + seq = re.sub('[\.-]', 'N', str(seq)) if model == 'aa': seq = translate(seq) seq_map.setdefault(seq, []).append(ig) @@ -210,7 +269,7 @@ seqs = list(seq_map.keys()) # Calculate pairwise distance matrix - dists = calcDistances(seqs, nmer_len, dist_mat, norm, sym) + dists = calcDistances(seqs, nmer_len, dist_mat, sym=sym, norm=norm) # Perform hierarchical clustering clusters = formClusters(dists, linkage, distance) @@ -449,6 +508,7 @@ # Define result object for iteration and get data records records = data.data + # print(data.id) result = DbResult(data.id, records) # Check for invalid data (due to failed indexing) and add failed result @@ -901,7 +961,7 @@ database with records failing clonal grouping. required fields: - SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION_LENGTH + SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION <field> sequence field specified by the --sf parameter @@ -926,32 +986,36 @@ # Distance cloning method parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent], - formatter_class=CommonHelpFormatter, - help='''Defines clones as having same V assignment, - J assignment, and junction length with - specified substitution distance model.''') + formatter_class=CommonHelpFormatter, + help='''Defines clones as having same V assignment, + J assignment, and junction length with + specified substitution distance model.''', + description='''Defines clones as having same V assignment, + J assignment, and junction length with + specified substitution distance model.''') parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None, help='Additional fields to use for grouping clones (non VDJ)') parser_bygroup.add_argument('--mode', action='store', dest='mode', - choices=('allele', 'gene'), default='gene', + choices=('allele', 'gene'), default=default_index_mode, help='''Specifies whether to use the V(D)J allele or gene for initial grouping.''') - parser_bygroup.add_argument('--act', action='store', dest='action', default='set', - choices=('first', 'set'), + parser_bygroup.add_argument('--act', action='store', dest='action', + choices=('first', 'set'), default=default_index_action, help='''Specifies how to handle multiple V(D)J assignments for initial grouping.''') parser_bygroup.add_argument('--model', action='store', dest='model', - choices=('aa', 'ham', 'm1n', 'hs1f', 'hs5f'), + choices=choices_bygroup_model, default=default_bygroup_model, - help='''Specifies which substitution model to use for - calculating distance between sequences. Where m1n is the - mouse single nucleotide transition/trasversion model - of Smith et al, 1996; hs1f is the human single - nucleotide model derived from Yaari et al, 2013; hs5f - is the human S5F model of Yaari et al, 2013; ham is - nucleotide Hamming distance; and aa is amino acid - Hamming distance. The hs5f data should be - considered experimental.''') + help='''Specifies which substitution model to use for calculating distance + between sequences. The "ham" model is nucleotide Hamming distance and + "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are + human specific single nucleotide and 5-mer content models, respectively, + from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are + mouse specific single nucleotide and 5-mer content models, respectively, + from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are + deprecated models provided backwards compatibility with the "m1n" and + "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both + 5-mer models should be considered experimental.''') parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float, default=default_distance, help='The distance threshold for clonal grouping') @@ -977,22 +1041,25 @@ parser_bygroup.set_defaults(group_func=indexJunctions) parser_bygroup.set_defaults(clone_func=distanceClones) - - # Hierarchical clustering cloning method - parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent], + # Chen2010 + parser_chen = subparsers.add_parser('chen2010', parents=[parser_parent], formatter_class=CommonHelpFormatter, - help='Defines clones by specified distance metric on CDR3s and \ - cutting of hierarchical clustering tree') -# parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None, -# help='Fields to use for grouping clones (non VDJ)') - parser_hclust.add_argument('--method', action='store', dest='method', - choices=('chen2010', 'ademokun2011'), default=default_hclust_model, - help='Specifies which cloning method to use for calculating distance \ - between CDR3s, computing linkage, and cutting clusters') - parser_hclust.set_defaults(feed_func=feedQueueClust) - parser_hclust.set_defaults(work_func=processQueueClust) - parser_hclust.set_defaults(collect_func=collectQueueClust) - parser_hclust.set_defaults(cluster_func=hierClust) + help='''Defines clones by method specified in Chen, 2010.''', + description='''Defines clones by method specified in Chen, 2010.''') + parser_chen.set_defaults(feed_func=feedQueueClust) + parser_chen.set_defaults(work_func=processQueueClust) + parser_chen.set_defaults(collect_func=collectQueueClust) + parser_chen.set_defaults(cluster_func=hierClust) + + # Ademokun2011 + parser_ade = subparsers.add_parser('ademokun2011', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='''Defines clones by method specified in Ademokun, 2011.''', + description='''Defines clones by method specified in Ademokun, 2011.''') + parser_ade.set_defaults(feed_func=feedQueueClust) + parser_ade.set_defaults(work_func=processQueueClust) + parser_ade.set_defaults(collect_func=collectQueueClust) + parser_ade.set_defaults(cluster_func=hierClust) return parser @@ -1023,8 +1090,11 @@ 'linkage': args_dict['linkage'], 'seq_field': args_dict['seq_field']} - # TODO: can be cleaned up with abstract model class - args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model']) + # Get distance matrix + try: + args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']] + except KeyError: + sys.exit('Unrecognized distance model: %s' % args_dict['model']) del args_dict['fields'] del args_dict['action'] @@ -1037,16 +1107,17 @@ del args_dict['seq_field'] # Define clone_args - if args.command == 'hclust': - dist_funcs = {'chen2010':distChen2010, 'ademokun2011':distAdemokun2011} - args_dict['clone_func'] = dist_funcs[args_dict['method']] - args_dict['cluster_args'] = {'method': args_dict['method']} - #del args_dict['fields'] - del args_dict['method'] + if args.command == 'chen2010': + args_dict['clone_func'] = distChen2010 + args_dict['cluster_args'] = {'method': args.command } + + if args.command == 'ademokun2011': + args_dict['clone_func'] = distAdemokun2011 + args_dict['cluster_args'] = {'method': args.command } # Call defineClones del args_dict['command'] del args_dict['db_files'] for f in args.__dict__['db_files']: args_dict['db_file'] = f - defineClones(**args_dict) \ No newline at end of file + defineClones(**args_dict)
--- a/change_o/MakeDb.py Tue May 16 08:52:48 2017 -0400 +++ b/change_o/MakeDb.py Tue May 23 08:32:58 2017 -0400 @@ -7,765 +7,164 @@ from changeo import __version__, __date__ # Imports -import csv import os -import re import sys -import pandas as pd -import tarfile -import zipfile from argparse import ArgumentParser from collections import OrderedDict -from itertools import groupby -from shutil import rmtree -from tempfile import mkdtemp from textwrap import dedent from time import time from Bio import SeqIO -from Bio.Seq import Seq -from Bio.Alphabet import IUPAC # Presto and changeo imports from presto.Defaults import default_out_args from presto.Annotation import parseAnnotation -from presto.IO import countSeqFile, printLog, printProgress +from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs -from changeo.IO import getDbWriter, countDbFile, getRepo -from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \ - j_allele_regex - -# Default parameters -default_delimiter = ('\t', ',', '-') - - -def gapV(ig_dict, repo_dict): - """ - Insert gaps into V region and update alignment information - - Arguments: - ig_dict : Dictionary of parsed IgBlast output - repo_dict : Dictionary of IMGT gapped germline sequences - - Returns: - dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields - """ - - seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ'] - - # Find gapped germline V segment - vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first') - vkey = (vgene, ) - #TODO: Figure out else case - if vkey in repo_dict: - vgap = repo_dict[vkey] - # Iterate over gaps in the germline segment - gaps = re.finditer(r'\.', vgap) - gapcount = int(ig_dict['V_GERM_START_VDJ'])-1 - for gap in gaps: - i = gap.start() - # Break if gap begins after V region - if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount: - break - # Insert gap into IMGT sequence - seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:] - # Update gap counter - gapcount += 1 - ig_dict['SEQUENCE_IMGT'] = seq_imgt - # Update IMGT positioning information for V - ig_dict['V_GERM_START_IMGT'] = 1 - ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount - - return ig_dict +from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo +from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT -def getIMGTJunc(ig_dict, repo_dict): - """ - Identify junction region by IMGT definition - - Arguments: - ig_dict : Dictionary of parsed IgBlast output - repo_dict : Dictionary of IMGT gapped germline sequences - - Returns: - dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields - """ - # Find germline J segment - jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first') - jkey = (jgene, ) - #TODO: Figure out else case - if jkey in repo_dict: - # Get germline J sequence - jgerm = repo_dict[jkey] - jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1] - # Look for (F|W)GXG aa motif in nt sequence - motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm) - aa_end = len(ig_dict['SEQUENCE_IMGT']) - #TODO: Figure out else case - if motif: - # print('\n', motif.group()) - aa_end = motif.start() - len(jgerm) + 3 - # Add fields to dict - ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end] - ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION']) - - return ig_dict - - -def getRegions(ig_dict): +def getFilePrefix(aligner_output, out_args): """ - Identify FWR and CDR regions by IMGT definition - - Arguments: - ig_dict : Dictionary of parsed alignment output - - Returns: - dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, - CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields - """ - try: - seq_len = len(ig_dict['SEQUENCE_IMGT']) - ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)] - except (KeyError, IndexError): - return ig_dict - - try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)] - except (IndexError): return ig_dict - - try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)] - except (IndexError): return ig_dict - - try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)] - except (IndexError): return ig_dict - - try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)] - except (IndexError): return ig_dict - - try: - cdr3_end = 306 + ig_dict['JUNCTION_LENGTH'] - ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end] - ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:] - except (KeyError, IndexError): - return ig_dict - - return ig_dict - - -def getSeqforIgBlast(seq_file): - """ - Fetch input sequences for IgBlast queries + Get file name prefix and create output directory Arguments: - seq_file = a fasta file of sequences input to IgBlast + aligner_output : aligner output file or directory. + out_args : dictionary of output arguments. Returns: - a dictionary of {ID:Seq} + str : file name prefix. """ - - seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna) - - # Create a seq_dict ID translation using IDs truncate up to space or 50 chars - seqs = {} - for seq in seq_dict.values(): - seqs.update({seq.description:str(seq.seq)}) - - return seqs - + # Determine output directory + if not out_args['out_dir']: + out_dir = os.path.dirname(os.path.abspath(aligner_output)) + else: + out_dir = os.path.abspath(out_args['out_dir']) + if not os.path.exists(out_dir): + os.mkdir(out_dir) -def findLine(handle, query): - """ - Finds line with query string in file + # Determine file prefix + if out_args['out_name']: + file_prefix = out_args['out_name'] + else: + file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0] - Arguments: - handle = file handle in which to search for line - query = query string for which to search in file - - Returns: - line from handle in which query string was found - """ - for line in handle: - if(re.match(query, line)): - return line + return os.path.join(out_dir, file_prefix) -def extractIMGT(imgt_output): - """ - Extract necessary files from IMGT results, zipped or unzipped - - Arguments: - imgt_output = zipped file or unzipped folder output by IMGT - - Returns: - sorted list of filenames from which information will be read +def getSeqDict(seq_file): """ - #file_ext = os.path.splitext(imgt_output)[1].lower() - imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction') - temp_dir = mkdtemp() - if zipfile.is_zipfile(imgt_output): - # Open zip file - imgt_zip = zipfile.ZipFile(imgt_output, 'r') - # Extract required files - imgt_files = sorted([n for n in imgt_zip.namelist() \ - if os.path.basename(n).startswith(imgt_flags)]) - imgt_zip.extractall(temp_dir, imgt_files) - # Define file list - imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] - elif os.path.isdir(imgt_output): - # Find required files in folder - folder_files = [] - for root, dirs, files in os.walk(imgt_output): - folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files]) - # Define file list - imgt_files = sorted([n for n in folder_files \ - if os.path.basename(n).startswith(imgt_flags)]) - elif tarfile.is_tarfile(imgt_output): - # Open zip file - imgt_tar = tarfile.open(imgt_output, 'r') - # Extract required files - imgt_files = sorted([n for n in imgt_tar.getnames() \ - if os.path.basename(n).startswith(imgt_flags)]) - imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files]) - # Define file list - imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] - else: - sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.') - - if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files - sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output) - elif len(imgt_files) < len(imgt_flags): - sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output) - - return temp_dir, imgt_files - - -# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks -def readOneIgBlastResult(block): - """ - Parse a single IgBLAST query result + Create a dictionary from a sequence file. Arguments: - block = itertools groupby object of single result - - Returns: - None if no results, otherwise list of DataFrames for each result block - """ - results = list() - i = 0 - for match, subblock in groupby(block, lambda l: l=='\n'): - if not match: - # Strip whitespace and comments - sub = [s.strip() for s in subblock if not s.startswith('#')] - - # Continue on empty block - if not sub: continue - else: i += 1 - - # Split by tabs - sub = [s.split('\t') for s in sub] - - # Append list for "V-(D)-J rearrangement summary" (i == 1) - # And "V-(D)-J junction details" (i == 2) - # Otherwise append DataFrame of subblock - if i == 1 or i == 2: - results.append(sub[0]) - else: - df = pd.DataFrame(sub) - if not df.empty: results.append(df) - - return results if results else None - - -# TODO: needs more speeds. pandas is probably to blame. -def readIgBlast(igblast_output, seq_dict, repo_dict, - score_fields=False, region_fields=False): - """ - Reads IgBlast output - - Arguments: - igblast_output = IgBlast output file (format 7) - seq_dict = a dictionary of {ID:Seq} from input fasta file - repo_dict = dictionary of IMGT gapped germline sequences - score_fields = if True parse alignment scores - region_fields = if True add FWR and CDR region fields + seq_file : sequence file. Returns: - a generator of dictionaries containing alignment data + dict : sequence description as keys with Bio.SeqRecords as values. """ - - # Open IgBlast output file - with open(igblast_output) as f: - # Iterate over individual results (separated by # IGBLASTN) - for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)): - block = list(block) - if not k1: - # TODO: move query name extraction into block parser readOneIgBlastResult(). - # Extract sequence ID - query_name = ' '.join(block[0].strip().split(' ')[2:]) - # Initialize db_gen to have ID and input sequence - db_gen = {'SEQUENCE_ID': query_name, - 'SEQUENCE_INPUT': seq_dict[query_name]} - - # Parse further sub-blocks - block_list = readOneIgBlastResult(block) - - # TODO: this is indented pretty far. should be a separate function. or several functions. - # If results exist, parse further to obtain full db_gen - if block_list is not None: - # Parse quality information - db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F' - db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F' - db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F' - if block_list[0][-1] == '-': - db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'], - IUPAC.ambiguous_dna).reverse_complement()) - - # Parse V, D, and J calls - call_str = ' '.join(block_list[0]) - v_call = parseAllele(call_str, v_allele_regex, action='list') - d_call = parseAllele(call_str, d_allele_regex, action='list') - j_call = parseAllele(call_str, j_allele_regex, action='list') - db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None' - db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None' - db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None' - - # Parse junction sequence - # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1])) - # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ']) - - # TODO: IgBLAST does a stupid and doesn't output block #3 sometimes. why? - # TODO: maybe we should fail these. they look craptastic. - #pd.set_option('display.width', 500) - #print query_name, len(block_list), hit_idx - #for i, x in enumerate(block_list): - # print '[%i]' % i - # print x - - # Parse segment start and stop positions - hit_df = block_list[-1] - - # Alignment info block - # 0: segment - # 1: query id - # 2: subject id - # 3: % identity - # 4: alignment length - # 5: mismatches - # 6: gap opens - # 7: gaps - # 8: q. start - # 9: q. end - # 10: s. start - # 11: s. end - # 12: evalue - # 13: bit score - # 14: query seq - # 15: subject seq - # 16: btop - - # If V call exists, parse V alignment information - seq_vdj = '' - if v_call is not None: - v_align = hit_df[hit_df[0] == 'V'].iloc[0] - # Germline positions - db_gen['V_GERM_START_VDJ'] = int(v_align[10]) - db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1 - # Query sequence positions - db_gen['V_SEQ_START'] = int(v_align[8]) - db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1 - - if int(v_align[6]) == 0: - db_gen['INDELS'] = 'F' - else: - db_gen['INDELS'] = 'T' - # Set functional to none so record gets tossed (junction will be wrong) - # db_gen['FUNCTIONAL'] = None - - # V alignment scores - if score_fields: - try: db_gen['V_SCORE'] = float(v_align[13]) - except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' - - try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0 - except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' - - try: db_gen['V_EVALUE'] = float(v_align[12]) - except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None' - - try: db_gen['V_BTOP'] = v_align[16] - except (TypeError, ValueError): db_gen['V_BTOP'] = 'None' + seq_dict = SeqIO.to_dict(readSeqFile(seq_file), + key_function=lambda x: x.description) - # Update VDJ sequence, removing insertions - start = 0 - for m in re.finditer(r'-', v_align[15]): - ins = m.start() - seq_vdj += v_align[14][start:ins] - start = ins + 1 - seq_vdj += v_align[14][start:] - - # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. - # If D call exists, parse D alignment information - if d_call is not None: - d_align = hit_df[hit_df[0] == 'D'].iloc[0] - - # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. - # Determine N-region length and amount of J overlap with V or D alignment - overlap = 0 - if v_call is not None: - n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) - if n1_len < 0: - db_gen['N1_LENGTH'] = 0 - overlap = abs(n1_len) - else: - db_gen['N1_LENGTH'] = n1_len - n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1) - n1_end = int(d_align[8])-1 - seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] - - # Query sequence positions - db_gen['D_SEQ_START'] = int(d_align[8]) + overlap - db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0) - - # Germline positions - db_gen['D_GERM_START'] = int(d_align[10]) + overlap - db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0) - - # Update VDJ sequence, removing insertions - start = overlap - for m in re.finditer(r'-', d_align[15]): - ins = m.start() - seq_vdj += d_align[14][start:ins] - start = ins + 1 - seq_vdj += d_align[14][start:] - - # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. - # If J call exists, parse J alignment information - if j_call is not None: - j_align = hit_df[hit_df[0] == 'J'].iloc[0] - - # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. - # Determine N-region length and amount of J overlap with V or D alignment - overlap = 0 - if d_call is not None: - n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH']) - if n2_len < 0: - db_gen['N2_LENGTH'] = 0 - overlap = abs(n2_len) - else: - db_gen['N2_LENGTH'] = n2_len - n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1) - n2_end = int(j_align[8])-1 - seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end] - elif v_call is not None: - n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) - if n1_len < 0: - db_gen['N1_LENGTH'] = 0 - overlap = abs(n1_len) - else: - db_gen['N1_LENGTH'] = n1_len - n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1) - n1_end = int(j_align[8])-1 - seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] - else: - db_gen['N1_LENGTH'] = 0 - - # Query positions - db_gen['J_SEQ_START'] = int(j_align[8]) + overlap - db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0) - - # Germline positions - db_gen['J_GERM_START'] = int(j_align[10]) + overlap - db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0) - - # J alignment scores - if score_fields: - try: db_gen['J_SCORE'] = float(j_align[13]) - except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' - - try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0 - except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' - - try: db_gen['J_EVALUE'] = float(j_align[12]) - except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None' - - try: db_gen['J_BTOP'] = j_align[16] - except (TypeError, ValueError): db_gen['J_BTOP'] = 'None' - - # Update VDJ sequence, removing insertions - start = overlap - for m in re.finditer(r'-', j_align[15]): - ins = m.start() - seq_vdj += j_align[14][start:ins] - start = ins + 1 - seq_vdj += j_align[14][start:] - - db_gen['SEQUENCE_VDJ'] = seq_vdj - - # Create IMGT-gapped sequence and infer IMGT junction - if v_call is not None: - db_gen = gapV(db_gen, repo_dict) - if j_call is not None: - db_gen = getIMGTJunc(db_gen, repo_dict) - - # FWR and CDR regions - if region_fields: getRegions(db_gen) - - yield IgRecord(db_gen) + return seq_dict -# TODO: should be more readable -def readIMGT(imgt_files, score_fields=False, region_fields=False): - """ - Reads IMGT/HighV-Quest output - - Arguments: - imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6 - score_fields = if True parse alignment scores - region_fields = if True add FWR and CDR region fields - - Returns: - a generator of dictionaries containing alignment data +def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False, + out_args=default_out_args): """ - imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files] - # Create a dictionary for each sequence alignment and yield its generator - for sm, gp, nt, jn in zip(*imgt_iters): - if len(set([sm['Sequence ID'], - gp['Sequence ID'], - nt['Sequence ID'], - jn['Sequence ID']])) != 1: - sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \ - % sm['Sequence ID']) - - db_gen = {'SEQUENCE_ID': sm['Sequence ID'], - 'SEQUENCE_INPUT': sm['Sequence']} - - if 'No results' not in sm['Functionality']: - db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) + - ('unprod' in sm['Functionality'])] - db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) + - ('out-of-frame' in sm['JUNCTION frame'])], - db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) + - ('unprod' in sm['Functionality'])] - db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'], - 'missing' in sm['V-REGION potential ins/del']))) + - ('unprod' in sm['Functionality'])] - db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'], - sm['V-REGION insertions'], - sm['V-REGION deletions']))] - - db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION'] - db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION'] - - db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele'])) - db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele'])) - db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele'])) - - v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0 - db_gen['V_SEQ_START'] = nt['V-REGION start'] - db_gen['V_SEQ_LENGTH'] = v_seq_length - db_gen['V_GERM_START_IMGT'] = 1 - db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0 - - db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"], - jn['N-REGION-nt nb'], - jn['N1-REGION-nt nb'], - jn["P5'D-nt nb"]] if i) - db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length, - jn["P3'V-nt nb"], - jn['N-REGION-nt nb'], - jn['N1-REGION-nt nb'], - jn["P5'D-nt nb"]] if i) - db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) - db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1 - db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) - db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"], - jn['N2-REGION-nt nb'], - jn["P5'J-nt nb"]] if i) - - db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length, - jn["P3'V-nt nb"], - jn['N-REGION-nt nb'], - jn['N1-REGION-nt nb'], - jn["P5'D-nt nb"], - jn["D-REGION-nt nb"], - jn["P3'D-nt nb"], - jn['N2-REGION-nt nb'], - jn["P5'J-nt nb"]] if i) - db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0 - db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1 - db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0 - - db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0 - db_gen['JUNCTION'] = jn['JUNCTION'] - - # Alignment scores - if score_fields: - try: db_gen['V_SCORE'] = float(sm['V-REGION score']) - except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' - - try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0 - except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' - - try: db_gen['J_SCORE'] = float(sm['J-REGION score']) - except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' - - try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0 - except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' - - # FWR and CDR regions - if region_fields: getRegions(db_gen) - else: - db_gen['V_CALL'] = 'None' - db_gen['D_CALL'] = 'None' - db_gen['J_CALL'] = 'None' - - yield IgRecord(db_gen) - - -def getIDforIMGT(seq_file): - """ - Create a sequence ID translation using IMGT truncation - - Arguments: - seq_file = a fasta file of sequences input to IMGT - - Returns: - a dictionary of {truncated ID: full seq description} - """ - - # Create a seq_dict ID translation using IDs truncate up to space or 50 chars - ids = {} - for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)): - if len(rec.description) <= 50: - id_key = rec.description - else: - id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50]) - ids.update({id_key:rec.description}) - - return ids - - -def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, - score_fields=False, region_fields=False, out_args=default_out_args): - """ - Writes tab-delimited database file in output directory + Writes tab-delimited database file in output directory. Arguments: - db_gen = a generator of IgRecord objects containing alignment data - file_prefix = directory and prefix for CLIP tab-delim file - total_count = number of records (for progress bar) - id_dict = a dictionary of {IMGT ID: full seq description} - no_parse = if ID is to be parsed for pRESTO output with default delimiters - score_fields = if True add alignment score fields to output file - region_fields = if True add FWR and CDR region fields to output file - out_args = common output argument dictionary from parseCommonArgs + db : a iterator of IgRecord objects containing alignment data. + fields : a list of ordered field names to write. + file_prefix : directory and prefix for CLIP tab-delim file. + total_count : number of records (for progress bar). + id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. + no_parse : if ID is to be parsed for pRESTO output with default delimiters. + partial : if True put incomplete alignments in the pass file. + out_args : common output argument dictionary from parseCommonArgs. Returns: - None + None """ - pass_file = "%s_db-pass.tab" % file_prefix - fail_file = "%s_db-fail.tab" % file_prefix - ordered_fields = ['SEQUENCE_ID', - 'SEQUENCE_INPUT', - 'FUNCTIONAL', - 'IN_FRAME', - 'STOP', - 'MUTATED_INVARIANT', - 'INDELS', - 'V_CALL', - 'D_CALL', - 'J_CALL', - 'SEQUENCE_VDJ', - 'SEQUENCE_IMGT', - 'V_SEQ_START', - 'V_SEQ_LENGTH', - 'V_GERM_START_VDJ', - 'V_GERM_LENGTH_VDJ', - 'V_GERM_START_IMGT', - 'V_GERM_LENGTH_IMGT', - 'N1_LENGTH', - 'D_SEQ_START', - 'D_SEQ_LENGTH', - 'D_GERM_START', - 'D_GERM_LENGTH', - 'N2_LENGTH', - 'J_SEQ_START', - 'J_SEQ_LENGTH', - 'J_GERM_START', - 'J_GERM_LENGTH', - 'JUNCTION_LENGTH', - 'JUNCTION'] + # Function to check for valid records strictly + def _pass_strict(rec): + valid = [rec.v_call and rec.v_call != 'None', + rec.j_call and rec.j_call != 'None', + rec.functional is not None, + rec.seq_vdj, + rec.junction] + return all(valid) - if score_fields: - ordered_fields.extend(['V_SCORE', - 'V_IDENTITY', - 'V_EVALUE', - 'V_BTOP', - 'J_SCORE', - 'J_IDENTITY', - 'J_EVALUE', - 'J_BTOP']) + # Function to check for valid records loosely + def _pass_gentle(rec): + valid = [rec.v_call and rec.v_call != 'None', + rec.d_call and rec.d_call != 'None', + rec.j_call and rec.j_call != 'None'] + return any(valid) - if region_fields: - ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', - 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) - + # Set pass criteria + _pass = _pass_gentle if partial else _pass_strict - # TODO: This is not the best approach. should pass in output fields. - # Initiate passed handle - pass_handle = None + # Define output file names + pass_file = '%s_db-pass.tab' % file_prefix + fail_file = '%s_db-fail.tab' % file_prefix - # Open failed file - if out_args['failed']: - fail_handle = open(fail_file, 'wt') - fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) - else: - fail_handle = None - fail_writer = None - - # Initialize counters and file + # Initiate handles, writers and counters + pass_handle = None + fail_handle = None pass_writer = None + fail_writer = None start_time = time() rec_count = pass_count = fail_count = 0 - for record in db_gen: - #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) - printProgress(rec_count, total_count, 0.05, start_time) - rec_count += 1 - # Count pass or fail - if (record.v_call == 'None' and record.j_call == 'None') or \ - record.functional is None or \ - not record.seq_vdj or \ - not record.junction: - # print(record.v_call, record.j_call, record.functional, record.junction) - fail_count += 1 - if fail_writer is not None: fail_writer.writerow(record.toDict()) - continue - else: - pass_count += 1 - - # Build sample sequence description - if record.id in id_dict: + # Validate and write output + printProgress(0, total_count, 0.05, start_time) + for i, record in enumerate(db, start=1): + + # Replace sequence description with full string, if required + if id_dict is not None and record.id in id_dict: record.id = id_dict[record.id] # Parse sequence description into new columns if not no_parse: - record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) - record.id = record.annotations['ID'] - del record.annotations['ID'] + try: + record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) + record.id = record.annotations['ID'] + del record.annotations['ID'] + + # TODO: This is not the best approach. should pass in output fields. + # If first record, use parsed description to define extra columns + if i == 1: fields.extend(list(record.annotations.keys())) + except IndexError: + # Could not parse pRESTO-style annotations so fall back to no parse + no_parse = True + sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n') - # TODO: This is not the best approach. should pass in output fields. - # If first sequence, use parsed description to create new columns and initialize writer - if pass_writer is None: - if not no_parse: ordered_fields.extend(list(record.annotations.keys())) - pass_handle = open(pass_file, 'wt') - pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) + # Count pass or fail and write to appropriate file + if _pass(record): + # Open pass file + if pass_writer is None: + pass_handle = open(pass_file, 'wt') + pass_writer = getDbWriter(pass_handle, add_fields=fields) - # Write row to tab-delim CLIP file - pass_writer.writerow(record.toDict()) - - # Print log - #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) - printProgress(rec_count, total_count, 0.05, start_time) + # Write row to pass file + pass_count += 1 + pass_writer.writerow(record.toDict()) + else: + # Open failed file + if out_args['failed'] and fail_writer is None: + fail_handle = open(fail_file, 'wt') + fail_writer = getDbWriter(fail_handle, add_fields=fields) + # Write row to fail file if specified + fail_count += 1 + if fail_writer is not None: + fail_writer.writerow(record.toDict()) + + # Print progress + printProgress(i, total_count, 0.05, start_time) + + # Print consol log log = OrderedDict() log['OUTPUT'] = pass_file log['PASS'] = pass_count @@ -777,146 +176,215 @@ if fail_handle is not None: fail_handle.close() -# TODO: may be able to merge with parseIMGT -def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, - region_fields=False, out_args=default_out_args): +# TODO: may be able to merge with other mains +def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False, + parse_scores=False, parse_regions=False, parse_junction=False, + out_args=default_out_args): """ - Main for IgBlast aligned sample sequences + Main for IMGT aligned sample sequences. Arguments: - igblast_output = IgBlast output file to process - seq_file = fasta file input to IgBlast (from which to get sequence) - repo = folder with germline repertoire files - no_parse = if ID is to be parsed for pRESTO output with default delimiters - score_fields = if True add alignment score fields to output file - region_fields = if True add FWR and CDR region fields to output file - out_args = common output argument dictionary from parseCommonArgs + aligner_output : zipped file or unzipped folder output by IMGT. + seq_file : FASTA file input to IMGT (from which to get seqID). + no_parse : if ID is to be parsed for pRESTO output with default delimiters. + partial : If True put incomplete alignments in the pass file. + parse_scores : if True add alignment score fields to output file. + parse_regions : if True add FWR and CDR region fields to output file. + out_args : common output argument dictionary from parseCommonArgs. Returns: - None + None + """ + # Print parameter info + log = OrderedDict() + log['START'] = 'MakeDb' + log['ALIGNER'] = 'IMGT' + log['ALIGNER_OUTPUT'] = aligner_output + log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' + log['NO_PARSE'] = no_parse + log['PARTIAL'] = partial + log['SCORES'] = parse_scores + log['REGIONS'] = parse_regions + log['JUNCTION'] = parse_junction + printLog(log) + + start_time = time() + printMessage('Loading sequence files', start_time=start_time, width=25) + # Extract IMGT files + temp_dir, imgt_files = extractIMGT(aligner_output) + # Count records in IMGT files + total_count = countDbFile(imgt_files['summary']) + # Get (parsed) IDs from fasta file submitted to IMGT + id_dict = getIDforIMGT(seq_file) if seq_file else {} + printMessage('Done', start_time=start_time, end=True, width=25) + + # Parse IMGT output and write db + with open(imgt_files['summary'], 'r') as summary_handle, \ + open(imgt_files['gapped'], 'r') as gapped_handle, \ + open(imgt_files['ntseq'], 'r') as ntseq_handle, \ + open(imgt_files['junction'], 'r') as junction_handle: + parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle, + parse_scores=parse_scores, parse_regions=parse_regions, + parse_junction=parse_junction) + file_prefix = getFilePrefix(aligner_output, out_args) + writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict, + no_parse=no_parse, partial=partial, out_args=out_args) + + # Cleanup temp directory + temp_dir.cleanup() + + return None + + +# TODO: may be able to merge with other mains +def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False, + parse_regions=False, parse_scores=False, parse_igblast_cdr3=False, + out_args=default_out_args): + """ + Main for IgBLAST aligned sample sequences. + + Arguments: + aligner_output : IgBLAST output file to process. + seq_file : fasta file input to IgBlast (from which to get sequence). + repo : folder with germline repertoire files. + no_parse : if ID is to be parsed for pRESTO output with default delimiters. + partial : If True put incomplete alignments in the pass file. + parse_regions : if True add FWR and CDR fields to output file. + parse_scores : if True add alignment score fields to output file. + parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST + out_args : common output argument dictionary from parseCommonArgs. + + Returns: + None """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDB' log['ALIGNER'] = 'IgBlast' - log['ALIGN_RESULTS'] = os.path.basename(igblast_output) + log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) log['SEQ_FILE'] = os.path.basename(seq_file) log['NO_PARSE'] = no_parse - log['SCORE_FIELDS'] = score_fields - log['REGION_FIELDS'] = region_fields + log['PARTIAL'] = partial + log['SCORES'] = parse_scores + log['REGIONS'] = parse_regions printLog(log) + start_time = time() + printMessage('Loading sequence files', start_time=start_time, width=25) + # Count records in sequence file + total_count = countSeqFile(seq_file) # Get input sequence dictionary - seq_dict = getSeqforIgBlast(seq_file) + seq_dict = getSeqDict(seq_file) + # Create germline repo dictionary + repo_dict = readRepo(repo) + printMessage('Done', start_time=start_time, end=True, width=25) - # Formalize out_dir and file-prefix - if not out_args['out_dir']: - out_dir = os.path.split(igblast_output)[0] - else: - out_dir = os.path.abspath(out_args['out_dir']) - if not os.path.exists(out_dir): os.mkdir(out_dir) - if out_args['out_name']: - file_prefix = out_args['out_name'] - else: - file_prefix = os.path.basename(os.path.splitext(igblast_output)[0]) - file_prefix = os.path.join(out_dir, file_prefix) + # Parse and write output + with open(aligner_output, 'r') as f: + parse_iter = IgBLASTReader(f, seq_dict, repo_dict, + parse_scores=parse_scores, parse_regions=parse_regions, + parse_igblast_cdr3=parse_igblast_cdr3) + file_prefix = getFilePrefix(aligner_output, out_args) + writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, + no_parse=no_parse, partial=partial, out_args=out_args) - total_count = countSeqFile(seq_file) - - # Create - repo_dict = getRepo(repo) - igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict, - score_fields=score_fields, region_fields=region_fields) - writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse, - score_fields=score_fields, region_fields=region_fields, out_args=out_args) + return None -# TODO: may be able to merge with parseIgBlast -def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False, - region_fields=False, out_args=default_out_args): +# TODO: may be able to merge with other mains +def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False, + parse_scores=False, parse_regions=False, out_args=default_out_args): """ - Main for IMGT aligned sample sequences + Main for iHMMuneAlign aligned sample sequences. Arguments: - imgt_output = zipped file or unzipped folder output by IMGT - seq_file = FASTA file input to IMGT (from which to get seqID) - no_parse = if ID is to be parsed for pRESTO output with default delimiters - score_fields = if True add alignment score fields to output file - region_fields = if True add FWR and CDR region fields to output file - out_args = common output argument dictionary from parseCommonArgs - - Returns: - None + aligner_output : iHMMune-Align output file to process. + seq_file : fasta file input to iHMMuneAlign (from which to get sequence). + repo : folder with germline repertoire files. + no_parse : if ID is to be parsed for pRESTO output with default delimiters. + partial : If True put incomplete alignments in the pass file. + parse_scores : if True parse alignment scores. + parse_regions : if True add FWR and CDR region fields. + out_args : common output argument dictionary from parseCommonArgs. + + Returns: + None """ # Print parameter info log = OrderedDict() - log['START'] = 'MakeDb' - log['ALIGNER'] = 'IMGT' - log['ALIGN_RESULTS'] = imgt_output - log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' + log['START'] = 'MakeDB' + log['ALIGNER'] = 'iHMMune-Align' + log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output) + log['SEQ_FILE'] = os.path.basename(seq_file) log['NO_PARSE'] = no_parse - log['SCORE_FIELDS'] = score_fields - log['REGION_FIELDS'] = region_fields + log['PARTIAL'] = partial + log['SCORES'] = parse_scores + log['REGIONS'] = parse_regions printLog(log) - - # Get individual IMGT result files - temp_dir, imgt_files = extractIMGT(imgt_output) - - # Formalize out_dir and file-prefix - if not out_args['out_dir']: - out_dir = os.path.dirname(os.path.abspath(imgt_output)) - else: - out_dir = os.path.abspath(out_args['out_dir']) - if not os.path.exists(out_dir): os.mkdir(out_dir) - if out_args['out_name']: - file_prefix = out_args['out_name'] - else: - file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0] - file_prefix = os.path.join(out_dir, file_prefix) - total_count = countDbFile(imgt_files[0]) - - # Get (parsed) IDs from fasta file submitted to IMGT - id_dict = getIDforIMGT(seq_file) if seq_file else {} - - # Create - imgt_dict = readIMGT(imgt_files, score_fields=score_fields, - region_fields=region_fields) - writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse, - score_fields=score_fields, region_fields=region_fields, out_args=out_args) + start_time = time() + printMessage('Loading sequence files', start_time=start_time, width=25) + # Count records in sequence file + total_count = countSeqFile(seq_file) + # Get input sequence dictionary + seq_dict = getSeqDict(seq_file) + # Create germline repo dictionary + repo_dict = readRepo(repo) + printMessage('Done', start_time=start_time, end=True, width=25) - # Delete temp directory - rmtree(temp_dir) + # Parse and write output + with open(aligner_output, 'r') as f: + parse_iter = IHMMuneReader(f, seq_dict, repo_dict, + parse_scores=parse_scores, parse_regions=parse_regions) + file_prefix = getFilePrefix(aligner_output, out_args) + writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, + no_parse=no_parse, partial=partial, out_args=out_args) + + return None def getArgParser(): """ - Defines the ArgumentParser + Defines the ArgumentParser. - Arguments: - None - Returns: - an ArgumentParser object + argparse.ArgumentParser """ fields = dedent( ''' output files: db-pass - database of parsed alignment records. + database of alignment records with functionality information, + V and J calls, and a junction region. db-fail - database with records failing alignment. + database with records that fail due to no functionality information + (did not pass IMGT), no V call, no J call, or no junction region. - output fields: - SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, - INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT, - V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT, - V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH, - D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH, + universal output fields: + SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT, + FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS, + V_CALL, D_CALL, J_CALL, + V_SEQ_START, V_SEQ_LENGTH, + D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH, - JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP, - J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, - FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT + JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH, + FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, + CDR1_IMGT, CDR2_IMGT, CDR3_IMGT + + imgt specific output fields: + V_GERM_START_IMGT, V_GERM_LENGTH_IMGT, + N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH, + D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY, + + igblast specific output fields: + V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, + V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP, + J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP. + CDR3_IGBLAST_NT, CDR3_IGBLAST_AA + + ihmm specific output fields: + V_GERM_START_VDJ, V_GERM_LENGTH_VDJ, + HMM_SCORE ''') # Define ArgumentParser @@ -933,11 +401,11 @@ parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False) # IgBlast Aligner - parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output', - parents=[parser_parent], - formatter_class=CommonHelpFormatter) - parser_igblast.set_defaults(func=parseIgBlast) - parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', + parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='Process IgBLAST output.', + description='Process IgBLAST output.') + parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', required=True, help='''IgBLAST output files in format 7 with query sequence (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''') @@ -947,50 +415,112 @@ set of germlines used in the IgBLAST alignment.''') parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True, - help='List of input FASTA files containing sequences') + help='''List of input FASTA files (with .fasta, .fna or .fa + extension), containing sequences.''') parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse', - help='''Specify if input IDs should not be parsed to add - new columns to database.''') - parser_igblast.add_argument('--scores', action='store_true', dest='score_fields', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + parser_igblast.add_argument('--partial', action='store_true', dest='partial', + help='''If specified, include incomplete V(D)J alignments in + the pass file instead of the fail file.''') + parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores', help='''Specify if alignment score metrics should be included in the output. Adds the V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY, J_BTOP, and J_EVALUE columns.''') - parser_igblast.add_argument('--regions', action='store_true', dest='region_fields', - help='''Specify if IMGT framework and CDR regions should be + parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions', + help='''Specify if IMGT FWR and CDRs should be included in the output. Adds the FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT columns.''') - + parser_igblast.add_argument('--cdr3', action='store_true', + dest='parse_igblast_cdr3', + help='''Specify if the CDR3 sequences generated by IgBLAST + should be included in the output. Adds the columns + CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST + version 1.5 or greater.''') + parser_igblast.set_defaults(func=parseIgBLAST) + # IMGT aligner - parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output', - parents=[parser_parent], - formatter_class=CommonHelpFormatter) - imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True) - imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files', - help='''Either zipped IMGT output files (.zip) or a folder - containing unzipped IMGT output files (which must - include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, - and 6_Junction).''') + parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='''Process IMGT/HighV-Quest output + (does not work with V-QUEST).''', + description='''Process IMGT/HighV-Quest output + (does not work with V-QUEST).''') + parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', + help='''Either zipped IMGT output files (.zip or .txz) or a + folder containing unzipped IMGT output files (which must + include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, + and 6_Junction).''') parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False, - help='List of input FASTA files containing sequences') + help='''List of input FASTA files (with .fasta, .fna or .fa + extension) containing sequences.''') parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse', - help='''Specify if input IDs should not be parsed to add new - columns to database.''') - parser_imgt.add_argument('--scores', action='store_true', dest='score_fields', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + parser_imgt.add_argument('--partial', action='store_true', dest='partial', + help='''If specified, include incomplete V(D)J alignments in + the pass file instead of the fail file.''') + parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores', help='''Specify if alignment score metrics should be included in the output. Adds the V_SCORE, V_IDENTITY, - J_SCORE and J_IDENTITY. Note, this will also add - the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP, - but they will be empty for IMGT output.''') - parser_imgt.add_argument('--regions', action='store_true', dest='region_fields', - help='''Specify if IMGT framework and CDR regions should be + J_SCORE and J_IDENTITY.''') + parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions', + help='''Specify if IMGT FWRs and CDRs should be included in the output. Adds the FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT columns.''') + parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction', + help='''Specify if detailed junction fields should be + included in the output. Adds the columns + N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, + P5J_LENGTH, D_FRAME.''') parser_imgt.set_defaults(func=parseIMGT) + # iHMMuneAlign Aligner + parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='Process iHMMune-Align output.', + description='Process iHMMune-Align output.') + parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs', + required=True, + help='''iHMMune-Align output file.''') + parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, + help='''List of folders and/or FASTA files containing + IMGT-gapped germline sequences corresponding to the + set of germlines used in the IgBLAST alignment.''') + parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', + required=True, + help='''List of input FASTA files (with .fasta, .fna or .fa + extension) containing sequences.''') + parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse', + help='''Specify to prevent input sequence headers from being parsed + to add new columns to database. Parsing of sequence headers requires + headers to be in the pRESTO annotation format, so this should be specified + when sequence headers are incompatible with the pRESTO annotation scheme. + Note, unrecognized header formats will default to this behavior.''') + parser_ihmm.add_argument('--partial', action='store_true', dest='partial', + help='''If specified, include incomplete V(D)J alignments in + the pass file instead of the fail file.''') + parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores', + help='''Specify if alignment score metrics should be + included in the output. Adds the path score of the + iHMMune-Align hidden Markov model to HMM_SCORE.''') + parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions', + help='''Specify if IMGT FWRs and CDRs should be + included in the output. Adds the FWR1_IMGT, FWR2_IMGT, + FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and + CDR3_IMGT columns.''') + parser_ihmm.set_defaults(func=parseIHMM) + return parser @@ -1000,7 +530,7 @@ """ parser = getArgParser() args = parser.parse_args() - args_dict = parseCommonArgs(args, in_arg='aligner_files') + args_dict = parseCommonArgs(args, in_arg='aligner_outputs') # Set no ID parsing if sequence files are not provided if 'seq_files' in args_dict and not args_dict['seq_files']: @@ -1008,18 +538,18 @@ # Delete if 'seq_files' in args_dict: del args_dict['seq_files'] - if 'aligner_files' in args_dict: del args_dict['aligner_files'] + if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs'] if 'command' in args_dict: del args_dict['command'] if 'func' in args_dict: del args_dict['func'] if args.command == 'imgt': - for i in range(len(args.__dict__['aligner_files'])): - args_dict['imgt_output'] = args.__dict__['aligner_files'][i] + for i in range(len(args.__dict__['aligner_outputs'])): + args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] args_dict['seq_file'] = args.__dict__['seq_files'][i] \ if args.__dict__['seq_files'] else None args.func(**args_dict) - elif args.command == 'igblast': - for i in range(len(args.__dict__['aligner_files'])): - args_dict['igblast_output'] = args.__dict__['aligner_files'][i] + elif args.command == 'igblast' or args.command == 'ihmm': + for i in range(len(args.__dict__['aligner_outputs'])): + args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i] args_dict['seq_file'] = args.__dict__['seq_files'][i] args.func(**args_dict)