0
|
1 #!/usr/bin/env python3
|
|
2 """
|
|
3 Reconstructs germline sequences from alignment data
|
|
4 """
|
|
5 # Info
|
|
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
|
|
7 from changeo import __version__, __date__
|
|
8
|
|
9 # Imports
|
|
10 import os
|
|
11 import sys
|
|
12 from argparse import ArgumentParser
|
|
13 from collections import OrderedDict
|
|
14 from textwrap import dedent
|
|
15 from time import time
|
|
16
|
|
17 # Presto and change imports
|
|
18 from presto.Defaults import default_out_args
|
|
19 from presto.IO import getOutputHandle, printLog, printProgress
|
|
20 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
|
|
21 from changeo.IO import getDbWriter, readDbFile, countDbFile, readRepo
|
|
22 from changeo.Receptor import allele_regex, parseAllele
|
|
23
|
|
24 # Defaults
|
|
25 default_germ_types = 'dmask'
|
|
26 default_v_field = 'V_CALL'
|
|
27 default_seq_field = 'SEQUENCE_IMGT'
|
|
28
|
|
29
|
|
30 def joinGermline(align, repo_dict, germ_types, v_field, seq_field):
|
|
31 """
|
|
32 Join gapped germline sequences aligned with sample sequences
|
|
33
|
|
34 Arguments:
|
|
35 align = iterable yielding dictionaries of sample sequence data
|
|
36 repo_dict = dictionary of IMGT gapped germline sequences
|
|
37 germ_types = types of germline sequences to be output
|
|
38 (full germline, D-region masked, only V-region germline)
|
|
39 v_field = field in which to look for V call
|
|
40 seq_field = field in which to look for sequence
|
|
41
|
|
42 Returns:
|
|
43 dictionary of germline_type: germline_sequence
|
|
44 """
|
|
45 j_field = 'J_CALL'
|
|
46 germlines = {'full': '', 'dmask': '', 'vonly': '', 'regions': ''}
|
|
47 result_log = OrderedDict()
|
|
48 result_log['ID'] = align['SEQUENCE_ID']
|
|
49
|
|
50 # Find germline V-region gene
|
|
51 if v_field == 'V_CALL_GENOTYPED':
|
|
52 vgene = parseAllele(align[v_field], allele_regex, 'list')
|
|
53 vkey = vgene
|
|
54 else:
|
|
55 vgene = parseAllele(align[v_field], allele_regex, 'first')
|
|
56 vkey = (vgene, )
|
|
57
|
|
58 try:
|
|
59 int(align['P3V_LENGTH'])
|
|
60 int(align['N1_LENGTH'])
|
|
61 int(align['P5D_LENGTH'])
|
|
62 int(align['P3D_LENGTH'])
|
|
63 int(align['N2_LENGTH'])
|
|
64 int(align['P5J_LENGTH'])
|
|
65 except:
|
|
66 regions_style = 'IgBLAST'
|
|
67 else:
|
|
68 regions_style = 'IMGT'
|
|
69
|
|
70 # Build V-region germline
|
|
71 if vgene is not None:
|
|
72 result_log['V_CALL'] = ','.join(vkey)
|
|
73 if vkey in repo_dict:
|
|
74 vseq = repo_dict[vkey]
|
|
75 # Germline start
|
|
76 try: vstart = int(align['V_GERM_START_IMGT']) - 1
|
|
77 except (TypeError, ValueError): vstart = 0
|
|
78 # Germline length
|
|
79 try: vlen = int(align['V_GERM_LENGTH_IMGT'])
|
|
80 except (TypeError, ValueError): vlen = 0
|
|
81 # TODO: not sure what this line is doing here. it no make no sense.
|
|
82 vpad = vlen - len(vseq[vstart:])
|
|
83 if vpad < 0: vpad = 0
|
|
84 germ_vseq = vseq[vstart:(vstart + vlen)] + ('N' * vpad)
|
|
85 else:
|
|
86 result_log['ERROR'] = 'Germline %s not in repertoire' % ','.join(vkey)
|
|
87 return result_log, germlines
|
|
88 else:
|
|
89 result_log['V_CALL'] = None
|
|
90 try: vlen = int(align['V_GERM_LENGTH_IMGT'])
|
|
91 except (TypeError, ValueError): vlen = 0
|
|
92 germ_vseq = 'N' * vlen
|
|
93
|
|
94 # Find germline D-region gene
|
|
95 dgene = parseAllele(align['D_CALL'], allele_regex, 'first')
|
|
96
|
|
97 # Build D-region germline
|
|
98 if dgene is not None:
|
|
99 result_log['D_CALL'] = dgene
|
|
100 dkey = (dgene, )
|
|
101 if dkey in repo_dict:
|
|
102 dseq = repo_dict[dkey]
|
|
103 # Germline start
|
|
104 try: dstart = int(align['D_GERM_START']) - 1
|
|
105 except (TypeError, ValueError): dstart = 0
|
|
106 # Germline length
|
|
107 try: dlen = int(align['D_GERM_LENGTH'])
|
|
108 except (TypeError, ValueError): dlen = 0
|
|
109 germ_dseq = repo_dict[dkey][dstart:(dstart + dlen)]
|
|
110 else:
|
|
111 result_log['ERROR'] = 'Germline %s not in repertoire' % dgene
|
|
112 return result_log, germlines
|
|
113 else:
|
|
114 result_log['D_CALL'] = None
|
|
115 germ_dseq = ''
|
|
116
|
|
117 # Find germline J-region gene
|
|
118 jgene = parseAllele(align[j_field], allele_regex, 'first')
|
|
119
|
|
120 # Build D-region germline
|
|
121 if jgene is not None:
|
|
122 result_log['J_CALL'] = jgene
|
|
123 jkey = (jgene, )
|
|
124 if jkey in repo_dict:
|
|
125 jseq = repo_dict[jkey]
|
|
126 # Germline start
|
|
127 try: jstart = int(align['J_GERM_START']) - 1
|
|
128 except (TypeError, ValueError): jstart = 0
|
|
129 # Germline length
|
|
130 try: jlen = int(align['J_GERM_LENGTH'])
|
|
131 except (TypeError, ValueError): jlen = 0
|
|
132 # TODO: not sure what this line is doing either
|
|
133 jpad = jlen - len(jseq[jstart:])
|
|
134 if jpad < 0: jpad = 0
|
|
135 germ_jseq = jseq[jstart:(jstart + jlen)] + ('N' * jpad)
|
|
136 else:
|
|
137 result_log['ERROR'] = 'Germline %s not in repertoire' % jgene
|
|
138 return result_log, germlines
|
|
139 else:
|
|
140 result_log['J_CALL'] = None
|
|
141 try: jlen = int(align['J_GERM_LENGTH'])
|
|
142 except (TypeError, ValueError): jlen = 0
|
|
143 germ_jseq = 'N' * jlen
|
|
144
|
|
145 # Assemble pieces starting with V-region
|
|
146 germ_seq = germ_vseq
|
|
147 regions = 'V' * len(germ_vseq)
|
|
148
|
|
149 try:
|
|
150 np1_len = int(align['NP1_LENGTH'])
|
|
151 except (TypeError, ValueError):
|
|
152 np1_len = 0
|
|
153
|
|
154 # NP nucleotide additions after V
|
|
155 if regions_style == 'IMGT':
|
|
156 # P nucleotide additions
|
|
157 try:
|
|
158 p3v_len = int(align['P3V_LENGTH'])
|
|
159 except (TypeError, ValueError):
|
|
160 p3v_len = 0
|
|
161 if p3v_len < 0:
|
|
162 result_log['ERROR'] = 'P3V_LENGTH is negative'
|
|
163 return result_log, germlines
|
|
164
|
|
165 regions += 'P' * p3v_len
|
|
166
|
|
167 # N1 nucleotide additions
|
|
168 try:
|
|
169 n1_len = int(align['N1_LENGTH'])
|
|
170 except (TypeError, ValueError):
|
|
171 n1_len = 0
|
|
172 if n1_len < 0:
|
|
173 result_log['ERROR'] = 'N1_LENGTH is negative'
|
|
174 return result_log, germlines
|
|
175
|
|
176 regions += 'N' * n1_len
|
|
177
|
|
178 # P nucleotide additions before D
|
|
179 try: p5d_len = int(align['P5D_LENGTH'])
|
|
180 except (TypeError, ValueError): p5d_len = 0
|
|
181 if p5d_len < 0:
|
|
182 result_log['ERROR'] = 'P5D_LENGTH is negative'
|
|
183 return result_log, germlines
|
|
184
|
|
185 regions += 'P' * p5d_len
|
|
186 else:
|
|
187 # IgBLAST style
|
|
188 # PNP nucleotide additions after V
|
|
189 if np1_len < 0:
|
|
190 result_log['ERROR'] = 'NP1_LENGTH is negative'
|
|
191 return result_log, germlines
|
|
192
|
|
193 regions += 'N' * np1_len
|
|
194
|
|
195 germ_seq += 'N' * np1_len
|
|
196
|
|
197 # Add D-region
|
|
198 germ_seq += germ_dseq
|
|
199 regions += 'D' * len(germ_dseq)
|
|
200
|
|
201 #print 'VD>', germ_seq, '\nVD>', regions
|
|
202
|
|
203 try:
|
|
204 np2_len = int(align['NP2_LENGTH'])
|
|
205 except (TypeError, ValueError):
|
|
206 np2_len = 0
|
|
207
|
|
208 # NP nucleotide additions before J
|
|
209 if regions_style == 'IMGT':
|
|
210 # P nucleotide additions
|
|
211 try:
|
|
212 p3d_len = int(align['P3D_LENGTH'])
|
|
213 except (TypeError, ValueError):
|
|
214 p3d_len = 0
|
|
215 if p3d_len < 0:
|
|
216 result_log['ERROR'] = 'P3D_LENGTH is negative'
|
|
217 return result_log, germlines
|
|
218
|
|
219 regions += 'P' * p3d_len
|
|
220
|
|
221 # N2 nucleotide additions
|
|
222 try:
|
|
223 n2_len = int(align['N2_LENGTH'])
|
|
224 except (TypeError, ValueError):
|
|
225 n2_len = 0
|
|
226 if n2_len < 0:
|
|
227 result_log['ERROR'] = 'N2_LENGTH is negative'
|
|
228 return result_log, germlines
|
|
229
|
|
230 regions += 'N' * n2_len
|
|
231
|
|
232 # P nucleotide additions
|
|
233 try:
|
|
234 p5j_len = int(align['P5J_LENGTH'])
|
|
235 except (TypeError, ValueError):
|
|
236 p5j_len = 0
|
|
237 if p5j_len < 0:
|
|
238 result_log['ERROR'] = 'P5J_LENGTH is negative'
|
|
239 return result_log, germlines
|
|
240
|
|
241 regions += 'P' * p5j_len
|
|
242 else:
|
|
243 # IgBLAST style
|
|
244 # NP nucleotide additions
|
|
245 if np2_len < 0:
|
|
246 result_log['ERROR'] = 'NP2_LENGTH is negative'
|
|
247 return result_log, germlines
|
|
248
|
|
249 regions += 'N' * np2_len
|
|
250
|
|
251 germ_seq += 'N' * np2_len
|
|
252
|
|
253 # Add J-region
|
|
254 germ_seq += germ_jseq
|
|
255 regions += 'J' * len(germ_jseq)
|
|
256
|
|
257 #print('\nREGIONS>',regions,'\n')
|
|
258
|
|
259 # Define return germlines
|
|
260 germlines['full'] = germ_seq
|
|
261 germlines['regions'] = regions
|
|
262
|
|
263 if 'dmask' in germ_types:
|
|
264 germlines['dmask'] = germ_seq[:len(germ_vseq)] + \
|
|
265 'N' * (len(germ_seq) - len(germ_vseq) - len(germ_jseq)) + \
|
|
266 germ_seq[-len(germ_jseq):]
|
|
267 if 'vonly' in germ_types:
|
|
268 germlines['vonly'] = germ_vseq
|
|
269
|
|
270 # Check that input and germline sequence match
|
|
271 if len(align[seq_field]) == 0:
|
|
272 result_log['ERROR'] = 'Sequence is missing from %s column' % seq_field
|
|
273 elif len(germlines['full']) != len(align[seq_field]):
|
|
274 result_log['ERROR'] = 'Germline sequence is %d nucleotides longer than input sequence' % \
|
|
275 (len(germlines['full']) - len(align[seq_field]))
|
|
276
|
|
277 # Convert to uppercase
|
|
278 for k, v in germlines.items(): germlines[k] = v.upper()
|
|
279
|
|
280 return result_log, germlines
|
|
281
|
|
282
|
|
283 def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
|
|
284 """
|
|
285 Write germline sequences to tab-delimited database file
|
|
286
|
|
287 Arguments:
|
|
288 db_file = input tab-delimited database file
|
|
289 repo = folder with germline repertoire files
|
|
290 germ_types = types of germline sequences to be output
|
|
291 (full germline, D-region masked, only V-region germline)
|
|
292 v_field = field in which to look for V call
|
|
293 seq_field = field in which to look for sequence
|
|
294 out_args = arguments for output preferences
|
|
295
|
|
296 Returns:
|
|
297 None
|
|
298 """
|
|
299 # Print parameter info
|
|
300 log = OrderedDict()
|
|
301 log['START'] = 'CreateGermlines'
|
|
302 log['DB_FILE'] = os.path.basename(db_file)
|
|
303 log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
|
|
304 log['CLONED'] = 'False'
|
|
305 log['V_FIELD'] = v_field
|
|
306 log['SEQ_FIELD'] = seq_field
|
|
307 printLog(log)
|
|
308
|
|
309 # Get repertoire and open Db reader
|
|
310 repo_dict = readRepo(repo)
|
|
311 reader = readDbFile(db_file, ig=False)
|
|
312
|
|
313 # Exit if V call field does not exist in reader
|
|
314 if v_field not in reader.fieldnames:
|
|
315 sys.exit('Error: V field does not exist in input database file.')
|
|
316
|
|
317 # Define log handle
|
|
318 if out_args['log_file'] is None:
|
|
319 log_handle = None
|
|
320 else:
|
|
321 log_handle = open(out_args['log_file'], 'w')
|
|
322
|
|
323 add_fields = []
|
|
324 seq_type = seq_field.split('_')[-1]
|
|
325 if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
|
|
326 if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
|
|
327 if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
|
|
328 if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS']
|
|
329
|
|
330 # Create output file handle and Db writer
|
|
331 pass_handle = getOutputHandle(db_file, 'germ-pass',
|
|
332 out_dir=out_args['out_dir'],
|
|
333 out_name=out_args['out_name'],
|
|
334 out_type=out_args['out_type'])
|
|
335 pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)
|
|
336
|
|
337 if out_args['failed']:
|
|
338 fail_handle = getOutputHandle(db_file, 'germ-fail',
|
|
339 out_dir=out_args['out_dir'],
|
|
340 out_name=out_args['out_name'],
|
|
341 out_type=out_args['out_type'])
|
|
342 fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields)
|
|
343 else:
|
|
344 fail_handle = None
|
|
345 fail_writer = None
|
|
346
|
|
347 # Initialize time and total count for progress bar
|
|
348 start_time = time()
|
|
349 rec_count = countDbFile(db_file)
|
|
350 pass_count = fail_count = 0
|
|
351 # Iterate over rows
|
|
352 for i, row in enumerate(reader):
|
|
353 # Print progress
|
|
354 printProgress(i, rec_count, 0.05, start_time)
|
|
355
|
|
356 result_log, germlines = joinGermline(row, repo_dict, germ_types, v_field, seq_field)
|
|
357
|
|
358 # Add germline field(s) to dictionary
|
|
359 if 'full' in germ_types: row['GERMLINE_' + seq_type] = germlines['full']
|
|
360 if 'dmask' in germ_types: row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
|
|
361 if 'vonly' in germ_types: row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
|
|
362 if 'regions' in germ_types: row['GERMLINE_REGIONS'] = germlines['regions']
|
|
363
|
|
364 # Write row to pass or fail file
|
|
365 if 'ERROR' in result_log:
|
|
366 fail_count += 1
|
|
367 if fail_writer is not None: fail_writer.writerow(row)
|
|
368 else:
|
|
369 result_log['SEQUENCE'] = row[seq_field]
|
|
370 result_log['GERMLINE'] = germlines['full']
|
|
371 result_log['REGIONS'] = germlines['regions']
|
|
372
|
|
373 pass_count += 1
|
|
374 pass_writer.writerow(row)
|
|
375 printLog(result_log, handle=log_handle)
|
|
376
|
|
377 # Print log
|
|
378 printProgress(i + 1, rec_count, 0.05, start_time)
|
|
379 log = OrderedDict()
|
|
380 log['OUTPUT'] = os.path.basename(pass_handle.name)
|
|
381 log['RECORDS'] = rec_count
|
|
382 log['PASS'] = pass_count
|
|
383 log['FAIL'] = fail_count
|
|
384 log['END'] = 'CreateGermlines'
|
|
385 printLog(log)
|
|
386
|
|
387 # Close file handles
|
|
388 pass_handle.close()
|
|
389 if fail_handle is not None: fail_handle.close()
|
|
390 if log_handle is not None: log_handle.close()
|
|
391
|
|
392
|
|
393 def makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field,
|
|
394 seq_field, counts, writers, out_args):
|
|
395 """
|
|
396 Determine consensus clone sequence and create germline for clone
|
|
397
|
|
398 Arguments:
|
|
399 clone = clone ID
|
|
400 clone_dict = iterable yielding dictionaries of sequence data from clone
|
|
401 repo_dict = dictionary of IMGT gapped germline sequences
|
|
402 germ_types = types of germline sequences to be output
|
|
403 (full germline, D-region masked, only V-region germline)
|
|
404 v_field = field in which to look for V call
|
|
405 seq_field = field in which to look for sequence
|
|
406 counts = dictionary of pass counter and fail counter
|
|
407 writers = dictionary with pass and fail DB writers
|
|
408 out_args = arguments for output preferences
|
|
409
|
|
410 Returns:
|
|
411 None
|
|
412 """
|
|
413 seq_type = seq_field.split('_')[-1]
|
|
414 j_field = 'J_CALL'
|
|
415
|
|
416 # Create dictionaries to count observed V/J calls
|
|
417 v_dict = OrderedDict()
|
|
418 j_dict = OrderedDict()
|
|
419
|
|
420 # Find longest sequence in clone
|
|
421 max_length = 0
|
|
422 for val in clone_dict.values():
|
|
423 v = val[v_field]
|
|
424 v_dict[v] = v_dict.get(v, 0) + 1
|
|
425 j = val[j_field]
|
|
426 j_dict[j] = j_dict.get(j, 0) + 1
|
|
427 if len(val[seq_field]) > max_length:
|
|
428 max_length = len(val[seq_field])
|
|
429
|
|
430 # Consensus V and J having most observations
|
|
431 v_cons = [k for k in list(v_dict.keys()) if v_dict[k] == max(v_dict.values())]
|
|
432 j_cons = [k for k in list(j_dict.keys()) if j_dict[k] == max(j_dict.values())]
|
|
433
|
|
434 # Consensus sequence(s) with consensus V/J calls and longest sequence
|
|
435 cons = [val for val in list(clone_dict.values()) \
|
|
436 if val.get(v_field, '') in v_cons and \
|
|
437 val.get(j_field, '') in j_cons and \
|
|
438 len(val[seq_field]) == max_length]
|
|
439
|
|
440 # Sequence(s) with consensus V/J are not longest
|
|
441 if not cons:
|
|
442 # Sequence(s) with consensus V/J (not longest)
|
|
443 cons = [val for val in list(clone_dict.values()) \
|
|
444 if val.get(v_field, '') in v_cons and val.get(j_field, '') in j_cons]
|
|
445
|
|
446 # No sequence has both consensus V and J call
|
|
447 if not cons:
|
|
448 result_log = OrderedDict()
|
|
449 result_log['ID'] = clone
|
|
450 result_log['V_CALL'] = ','.join(v_cons)
|
|
451 result_log['J_CALL'] = ','.join(j_cons)
|
|
452 result_log['ERROR'] = 'No consensus sequence for clone found'
|
|
453 else:
|
|
454 # Pad end of consensus sequence with gaps to make it the max length
|
|
455 cons = cons[0]
|
|
456 cons['J_GERM_LENGTH'] = str(int(cons['J_GERM_LENGTH'] or 0) + max_length - len(cons[seq_field]))
|
|
457 cons[seq_field] += '.' * (max_length - len(cons[seq_field]))
|
|
458 result_log, germlines = joinGermline(cons, repo_dict, germ_types, v_field, seq_field)
|
|
459 result_log['ID'] = clone
|
|
460 result_log['CONSENSUS'] = cons['SEQUENCE_ID']
|
|
461 else:
|
|
462 cons = cons[0]
|
|
463 result_log, germlines = joinGermline(cons, repo_dict, germ_types, v_field, seq_field)
|
|
464 result_log['ID'] = clone
|
|
465 result_log['CONSENSUS'] = cons['SEQUENCE_ID']
|
|
466
|
|
467 # Write sequences of clone
|
|
468 for val in clone_dict.values():
|
|
469 if 'ERROR' not in result_log:
|
|
470 # Update lengths padded to longest sequence in clone
|
|
471 val['J_GERM_LENGTH'] = str(int(val['J_GERM_LENGTH'] or 0) + max_length - len(val[seq_field]))
|
|
472 val[seq_field] += '.' * (max_length - len(val[seq_field]))
|
|
473
|
|
474 # Add column(s) to tab-delimited database file
|
|
475 if 'full' in germ_types: val['GERMLINE_' + seq_type] = germlines['full']
|
|
476 if 'dmask' in germ_types: val['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
|
|
477 if 'vonly' in germ_types: val['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
|
|
478 if 'regions' in germ_types: val['GERMLINE_REGIONS'] = germlines['regions']
|
|
479
|
|
480 # Add field
|
|
481 val['GERMLINE_V_CALL'] = result_log['V_CALL']
|
|
482 val['GERMLINE_D_CALL'] = result_log['D_CALL']
|
|
483 val['GERMLINE_J_CALL'] = result_log['J_CALL']
|
|
484
|
|
485 result_log['SEQUENCE'] = cons[seq_field]
|
|
486 result_log['GERMLINE'] = germlines['full']
|
|
487 result_log['REGIONS'] = germlines['regions']
|
|
488
|
|
489 # Write to pass file
|
|
490 counts['pass'] += 1
|
|
491 writers['pass'].writerow(val)
|
|
492 else:
|
|
493 # Write to fail file
|
|
494 counts['fail'] += 1
|
|
495 if writers['fail'] is not None:
|
|
496 writers['fail'].writerow(val)
|
|
497 # Return log
|
|
498 return result_log
|
|
499
|
|
500
|
|
501 def assembleCloneGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
|
|
502 """
|
|
503 Assemble one germline sequence for each clone in a tab-delimited database file
|
|
504
|
|
505 Arguments:
|
|
506 db_file = input tab-delimited database file
|
|
507 repo = folder with germline repertoire files
|
|
508 germ_types = types of germline sequences to be output
|
|
509 (full germline, D-region masked, only V-region germline)
|
|
510 v_field = field in which to look for V call
|
|
511 seq_field = field in which to look for sequence
|
|
512 out_args = arguments for output preferences
|
|
513
|
|
514 Returns:
|
|
515 None
|
|
516 """
|
|
517 # Print parameter info
|
|
518 log = OrderedDict()
|
|
519 log['START'] = 'CreateGermlines'
|
|
520 log['DB_FILE'] = os.path.basename(db_file)
|
|
521 log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
|
|
522 log['CLONED'] = 'True'
|
|
523 log['V_FIELD'] = v_field
|
|
524 log['SEQ_FIELD'] = seq_field
|
|
525 printLog(log)
|
|
526
|
|
527 # Get repertoire and open Db reader
|
|
528 repo_dict = readRepo(repo)
|
|
529 reader = readDbFile(db_file, ig=False)
|
|
530
|
|
531 # Exit if V call field does not exist in reader
|
|
532 if v_field not in reader.fieldnames:
|
|
533 sys.exit('Error: V field does not exist in input database file.')
|
|
534
|
|
535 # Define log handle
|
|
536 if out_args['log_file'] is None:
|
|
537 log_handle = None
|
|
538 else:
|
|
539 log_handle = open(out_args['log_file'], 'w')
|
|
540
|
|
541 add_fields = []
|
|
542 seq_type = seq_field.split('_')[-1]
|
|
543 if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
|
|
544 if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
|
|
545 if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
|
|
546 if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS']
|
|
547
|
|
548 add_fields += ['GERMLINE_V_CALL']
|
|
549 add_fields += ['GERMLINE_D_CALL']
|
|
550 add_fields += ['GERMLINE_J_CALL']
|
|
551
|
|
552 # Create output file handle and Db writer
|
|
553 writers = {}
|
|
554 pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'],
|
|
555 out_name=out_args['out_name'], out_type=out_args['out_type'])
|
|
556 writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields)
|
|
557
|
|
558 if out_args['failed']:
|
|
559 fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'],
|
|
560 out_name=out_args['out_name'], out_type=out_args['out_type'])
|
|
561 writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields)
|
|
562 else:
|
|
563 fail_handle = None
|
|
564 writers['fail'] = None
|
|
565
|
|
566 # Initialize time and total count for progress bar
|
|
567 start_time = time()
|
|
568 rec_count = countDbFile(db_file)
|
|
569 counts = {}
|
|
570 clone_count = counts['pass'] = counts['fail'] = 0
|
|
571 # Iterate over rows
|
|
572 clone = 'initial'
|
|
573 clone_dict = OrderedDict()
|
|
574 for i, row in enumerate(reader):
|
|
575 # Print progress
|
|
576 printProgress(i, rec_count, 0.05, start_time)
|
|
577
|
|
578 # Clone isn't over yet
|
|
579 if row.get('CLONE', '') == clone:
|
|
580 clone_dict[i] = row
|
|
581 # Clone just finished
|
|
582 elif clone_dict:
|
|
583 clone_count += 1
|
|
584 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types,
|
|
585 v_field, seq_field, counts, writers, out_args)
|
|
586 printLog(result_log, handle=log_handle)
|
|
587 # Now deal with current row (first of next clone)
|
|
588 clone = row['CLONE']
|
|
589 clone_dict = OrderedDict([(i, row)])
|
|
590 # Last case is only for first row of file
|
|
591 else:
|
|
592 clone = row['CLONE']
|
|
593 clone_dict = OrderedDict([(i, row)])
|
|
594
|
|
595 clone_count += 1
|
|
596 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field,
|
|
597 seq_field, counts, writers, out_args)
|
|
598 printLog(result_log, handle=log_handle)
|
|
599
|
|
600 # Print log
|
|
601 printProgress(i + 1, rec_count, 0.05, start_time)
|
|
602 log = OrderedDict()
|
|
603 log['OUTPUT'] = os.path.basename(pass_handle.name)
|
|
604 log['CLONES'] = clone_count
|
|
605 log['RECORDS'] = rec_count
|
|
606 log['PASS'] = counts['pass']
|
|
607 log['FAIL'] = counts['fail']
|
|
608 log['END'] = 'CreateGermlines'
|
|
609 printLog(log)
|
|
610
|
|
611 # Close file handles
|
|
612 pass_handle.close()
|
|
613 if fail_handle is not None: fail_handle.close()
|
|
614 if log_handle is not None: log_handle.close()
|
|
615
|
|
616
|
|
617 def getArgParser():
|
|
618 """
|
|
619 Defines the ArgumentParser
|
|
620
|
|
621 Arguments:
|
|
622 None
|
|
623
|
|
624 Returns:
|
|
625 an ArgumentParser object
|
|
626 """
|
|
627 # Define input and output field help message
|
|
628 fields = dedent(
|
|
629 '''
|
|
630 output files:
|
|
631 germ-pass
|
|
632 database with assigned germline sequences.
|
|
633 germ-fail
|
|
634 database with records failing germline assignment.
|
|
635
|
|
636 required fields:
|
|
637 SEQUENCE_ID, SEQUENCE_VDJ or SEQUENCE_IMGT,
|
|
638 V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL,
|
|
639 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
|
|
640 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
|
|
641 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
|
|
642 NP1_LENGTH, NP2_LENGTH
|
|
643
|
|
644 optional fields:
|
|
645 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
|
|
646 CLONE
|
|
647
|
|
648
|
|
649 output fields:
|
|
650 GERMLINE_VDJ, GERMLINE_VDJ_D_MASK, GERMLINE_VDJ_V_REGION,
|
|
651 GERMLINE_IMGT, GERMLINE_IMGT_D_MASK, GERMLINE_IMGT_V_REGION,
|
|
652 GERMLINE_V_CALL, GERMLINE_D_CALL, GERMLINE_J_CALL,
|
|
653 GERMLINE_REGIONS
|
|
654 ''')
|
|
655
|
|
656 # Parent parser
|
|
657 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
|
|
658 annotation=False)
|
|
659 # Define argument parser
|
|
660 parser = ArgumentParser(description=__doc__, epilog=fields,
|
|
661 parents=[parser_parent],
|
|
662 formatter_class=CommonHelpFormatter)
|
|
663 parser.add_argument('--version', action='version',
|
|
664 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
|
|
665
|
|
666 parser.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
|
|
667 help='''List of folders and/or fasta files (with .fasta, .fna or .fa
|
|
668 extension) with germline sequences.''')
|
|
669 parser.add_argument('-g', action='store', dest='germ_types', default=default_germ_types,
|
|
670 nargs='+', choices=('full', 'dmask', 'vonly', 'regions'),
|
|
671 help='''Specify type(s) of germlines to include full germline,
|
|
672 germline with D-region masked, or germline for V region only.''')
|
|
673 parser.add_argument('--cloned', action='store_true', dest='cloned',
|
|
674 help='''Specify to create only one germline per clone. Assumes input file is
|
|
675 sorted by clone column, and will not yield correct results if the data
|
|
676 is unsorted. Note, if allele calls are ambiguous within a clonal group,
|
|
677 this will place the germline call used for the entire clone within the
|
|
678 GERMLINE_V_CALL, GERMLINE_D_CALL and GERMLINE_J_CALL fields.''')
|
|
679 parser.add_argument('--vf', action='store', dest='v_field', default=default_v_field,
|
|
680 help='Specify field to use for germline V call')
|
|
681 parser.add_argument('--sf', action='store', dest='seq_field', default=default_seq_field,
|
|
682 help='Specify field to use for sequence')
|
|
683
|
|
684 return parser
|
|
685
|
|
686
|
|
687 if __name__ == "__main__":
|
|
688 """
|
|
689 Parses command line arguments and calls main
|
|
690 """
|
|
691
|
|
692 # Parse command line arguments
|
|
693 parser = getArgParser()
|
|
694 checkArgs(parser)
|
|
695 args = parser.parse_args()
|
|
696 args_dict = parseCommonArgs(args)
|
|
697 del args_dict['db_files']
|
|
698 del args_dict['cloned']
|
|
699 args_dict['v_field'] = args_dict['v_field'].upper()
|
|
700 args_dict['seq_field'] = args_dict['seq_field'].upper()
|
|
701
|
|
702 for f in args.__dict__['db_files']:
|
|
703 args_dict['db_file'] = f
|
|
704 if args.__dict__['cloned']:
|
|
705 assembleCloneGermline(**args_dict)
|
|
706 else:
|
|
707 assembleEachGermline(**args_dict)
|