comparison CreateGermlines.py @ 0:183edf446dcf draft default tip

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 07:44:27 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:183edf446dcf
1 #!/usr/bin/env python3
2 """
3 Reconstructs germline sequences from alignment data
4 """
5 # Info
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
7 from changeo import __version__, __date__
8
9 # Imports
10 import os
11 import sys
12 from argparse import ArgumentParser
13 from collections import OrderedDict
14 from textwrap import dedent
15 from time import time
16
17 # Presto and change imports
18 from presto.Defaults import default_out_args
19 from presto.IO import getOutputHandle, printLog, printProgress
20 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
21 from changeo.IO import getDbWriter, readDbFile, countDbFile, readRepo
22 from changeo.Receptor import allele_regex, parseAllele
23
24 # Defaults
25 default_germ_types = 'dmask'
26 default_v_field = 'V_CALL'
27 default_seq_field = 'SEQUENCE_IMGT'
28
29
30 def joinGermline(align, repo_dict, germ_types, v_field, seq_field):
31 """
32 Join gapped germline sequences aligned with sample sequences
33
34 Arguments:
35 align = iterable yielding dictionaries of sample sequence data
36 repo_dict = dictionary of IMGT gapped germline sequences
37 germ_types = types of germline sequences to be output
38 (full germline, D-region masked, only V-region germline)
39 v_field = field in which to look for V call
40 seq_field = field in which to look for sequence
41
42 Returns:
43 dictionary of germline_type: germline_sequence
44 """
45 j_field = 'J_CALL'
46 germlines = {'full': '', 'dmask': '', 'vonly': '', 'regions': ''}
47 result_log = OrderedDict()
48 result_log['ID'] = align['SEQUENCE_ID']
49
50 # Find germline V-region gene
51 if v_field == 'V_CALL_GENOTYPED':
52 vgene = parseAllele(align[v_field], allele_regex, 'list')
53 vkey = vgene
54 else:
55 vgene = parseAllele(align[v_field], allele_regex, 'first')
56 vkey = (vgene, )
57
58 try:
59 int(align['P3V_LENGTH'])
60 int(align['N1_LENGTH'])
61 int(align['P5D_LENGTH'])
62 int(align['P3D_LENGTH'])
63 int(align['N2_LENGTH'])
64 int(align['P5J_LENGTH'])
65 except:
66 regions_style = 'IgBLAST'
67 else:
68 regions_style = 'IMGT'
69
70 # Build V-region germline
71 if vgene is not None:
72 result_log['V_CALL'] = ','.join(vkey)
73 if vkey in repo_dict:
74 vseq = repo_dict[vkey]
75 # Germline start
76 try: vstart = int(align['V_GERM_START_IMGT']) - 1
77 except (TypeError, ValueError): vstart = 0
78 # Germline length
79 try: vlen = int(align['V_GERM_LENGTH_IMGT'])
80 except (TypeError, ValueError): vlen = 0
81 # TODO: not sure what this line is doing here. it no make no sense.
82 vpad = vlen - len(vseq[vstart:])
83 if vpad < 0: vpad = 0
84 germ_vseq = vseq[vstart:(vstart + vlen)] + ('N' * vpad)
85 else:
86 result_log['ERROR'] = 'Germline %s not in repertoire' % ','.join(vkey)
87 return result_log, germlines
88 else:
89 result_log['V_CALL'] = None
90 try: vlen = int(align['V_GERM_LENGTH_IMGT'])
91 except (TypeError, ValueError): vlen = 0
92 germ_vseq = 'N' * vlen
93
94 # Find germline D-region gene
95 dgene = parseAllele(align['D_CALL'], allele_regex, 'first')
96
97 # Build D-region germline
98 if dgene is not None:
99 result_log['D_CALL'] = dgene
100 dkey = (dgene, )
101 if dkey in repo_dict:
102 dseq = repo_dict[dkey]
103 # Germline start
104 try: dstart = int(align['D_GERM_START']) - 1
105 except (TypeError, ValueError): dstart = 0
106 # Germline length
107 try: dlen = int(align['D_GERM_LENGTH'])
108 except (TypeError, ValueError): dlen = 0
109 germ_dseq = repo_dict[dkey][dstart:(dstart + dlen)]
110 else:
111 result_log['ERROR'] = 'Germline %s not in repertoire' % dgene
112 return result_log, germlines
113 else:
114 result_log['D_CALL'] = None
115 germ_dseq = ''
116
117 # Find germline J-region gene
118 jgene = parseAllele(align[j_field], allele_regex, 'first')
119
120 # Build D-region germline
121 if jgene is not None:
122 result_log['J_CALL'] = jgene
123 jkey = (jgene, )
124 if jkey in repo_dict:
125 jseq = repo_dict[jkey]
126 # Germline start
127 try: jstart = int(align['J_GERM_START']) - 1
128 except (TypeError, ValueError): jstart = 0
129 # Germline length
130 try: jlen = int(align['J_GERM_LENGTH'])
131 except (TypeError, ValueError): jlen = 0
132 # TODO: not sure what this line is doing either
133 jpad = jlen - len(jseq[jstart:])
134 if jpad < 0: jpad = 0
135 germ_jseq = jseq[jstart:(jstart + jlen)] + ('N' * jpad)
136 else:
137 result_log['ERROR'] = 'Germline %s not in repertoire' % jgene
138 return result_log, germlines
139 else:
140 result_log['J_CALL'] = None
141 try: jlen = int(align['J_GERM_LENGTH'])
142 except (TypeError, ValueError): jlen = 0
143 germ_jseq = 'N' * jlen
144
145 # Assemble pieces starting with V-region
146 germ_seq = germ_vseq
147 regions = 'V' * len(germ_vseq)
148
149 try:
150 np1_len = int(align['NP1_LENGTH'])
151 except (TypeError, ValueError):
152 np1_len = 0
153
154 # NP nucleotide additions after V
155 if regions_style == 'IMGT':
156 # P nucleotide additions
157 try:
158 p3v_len = int(align['P3V_LENGTH'])
159 except (TypeError, ValueError):
160 p3v_len = 0
161 if p3v_len < 0:
162 result_log['ERROR'] = 'P3V_LENGTH is negative'
163 return result_log, germlines
164
165 regions += 'P' * p3v_len
166
167 # N1 nucleotide additions
168 try:
169 n1_len = int(align['N1_LENGTH'])
170 except (TypeError, ValueError):
171 n1_len = 0
172 if n1_len < 0:
173 result_log['ERROR'] = 'N1_LENGTH is negative'
174 return result_log, germlines
175
176 regions += 'N' * n1_len
177
178 # P nucleotide additions before D
179 try: p5d_len = int(align['P5D_LENGTH'])
180 except (TypeError, ValueError): p5d_len = 0
181 if p5d_len < 0:
182 result_log['ERROR'] = 'P5D_LENGTH is negative'
183 return result_log, germlines
184
185 regions += 'P' * p5d_len
186 else:
187 # IgBLAST style
188 # PNP nucleotide additions after V
189 if np1_len < 0:
190 result_log['ERROR'] = 'NP1_LENGTH is negative'
191 return result_log, germlines
192
193 regions += 'N' * np1_len
194
195 germ_seq += 'N' * np1_len
196
197 # Add D-region
198 germ_seq += germ_dseq
199 regions += 'D' * len(germ_dseq)
200
201 #print 'VD>', germ_seq, '\nVD>', regions
202
203 try:
204 np2_len = int(align['NP2_LENGTH'])
205 except (TypeError, ValueError):
206 np2_len = 0
207
208 # NP nucleotide additions before J
209 if regions_style == 'IMGT':
210 # P nucleotide additions
211 try:
212 p3d_len = int(align['P3D_LENGTH'])
213 except (TypeError, ValueError):
214 p3d_len = 0
215 if p3d_len < 0:
216 result_log['ERROR'] = 'P3D_LENGTH is negative'
217 return result_log, germlines
218
219 regions += 'P' * p3d_len
220
221 # N2 nucleotide additions
222 try:
223 n2_len = int(align['N2_LENGTH'])
224 except (TypeError, ValueError):
225 n2_len = 0
226 if n2_len < 0:
227 result_log['ERROR'] = 'N2_LENGTH is negative'
228 return result_log, germlines
229
230 regions += 'N' * n2_len
231
232 # P nucleotide additions
233 try:
234 p5j_len = int(align['P5J_LENGTH'])
235 except (TypeError, ValueError):
236 p5j_len = 0
237 if p5j_len < 0:
238 result_log['ERROR'] = 'P5J_LENGTH is negative'
239 return result_log, germlines
240
241 regions += 'P' * p5j_len
242 else:
243 # IgBLAST style
244 # NP nucleotide additions
245 if np2_len < 0:
246 result_log['ERROR'] = 'NP2_LENGTH is negative'
247 return result_log, germlines
248
249 regions += 'N' * np2_len
250
251 germ_seq += 'N' * np2_len
252
253 # Add J-region
254 germ_seq += germ_jseq
255 regions += 'J' * len(germ_jseq)
256
257 #print('\nREGIONS>',regions,'\n')
258
259 # Define return germlines
260 germlines['full'] = germ_seq
261 germlines['regions'] = regions
262
263 if 'dmask' in germ_types:
264 germlines['dmask'] = germ_seq[:len(germ_vseq)] + \
265 'N' * (len(germ_seq) - len(germ_vseq) - len(germ_jseq)) + \
266 germ_seq[-len(germ_jseq):]
267 if 'vonly' in germ_types:
268 germlines['vonly'] = germ_vseq
269
270 # Check that input and germline sequence match
271 if len(align[seq_field]) == 0:
272 result_log['ERROR'] = 'Sequence is missing from %s column' % seq_field
273 elif len(germlines['full']) != len(align[seq_field]):
274 result_log['ERROR'] = 'Germline sequence is %d nucleotides longer than input sequence' % \
275 (len(germlines['full']) - len(align[seq_field]))
276
277 # Convert to uppercase
278 for k, v in germlines.items(): germlines[k] = v.upper()
279
280 return result_log, germlines
281
282
283 def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
284 """
285 Write germline sequences to tab-delimited database file
286
287 Arguments:
288 db_file = input tab-delimited database file
289 repo = folder with germline repertoire files
290 germ_types = types of germline sequences to be output
291 (full germline, D-region masked, only V-region germline)
292 v_field = field in which to look for V call
293 seq_field = field in which to look for sequence
294 out_args = arguments for output preferences
295
296 Returns:
297 None
298 """
299 # Print parameter info
300 log = OrderedDict()
301 log['START'] = 'CreateGermlines'
302 log['DB_FILE'] = os.path.basename(db_file)
303 log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
304 log['CLONED'] = 'False'
305 log['V_FIELD'] = v_field
306 log['SEQ_FIELD'] = seq_field
307 printLog(log)
308
309 # Get repertoire and open Db reader
310 repo_dict = readRepo(repo)
311 reader = readDbFile(db_file, ig=False)
312
313 # Exit if V call field does not exist in reader
314 if v_field not in reader.fieldnames:
315 sys.exit('Error: V field does not exist in input database file.')
316
317 # Define log handle
318 if out_args['log_file'] is None:
319 log_handle = None
320 else:
321 log_handle = open(out_args['log_file'], 'w')
322
323 add_fields = []
324 seq_type = seq_field.split('_')[-1]
325 if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
326 if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
327 if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
328 if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS']
329
330 # Create output file handle and Db writer
331 pass_handle = getOutputHandle(db_file, 'germ-pass',
332 out_dir=out_args['out_dir'],
333 out_name=out_args['out_name'],
334 out_type=out_args['out_type'])
335 pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)
336
337 if out_args['failed']:
338 fail_handle = getOutputHandle(db_file, 'germ-fail',
339 out_dir=out_args['out_dir'],
340 out_name=out_args['out_name'],
341 out_type=out_args['out_type'])
342 fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields)
343 else:
344 fail_handle = None
345 fail_writer = None
346
347 # Initialize time and total count for progress bar
348 start_time = time()
349 rec_count = countDbFile(db_file)
350 pass_count = fail_count = 0
351 # Iterate over rows
352 for i, row in enumerate(reader):
353 # Print progress
354 printProgress(i, rec_count, 0.05, start_time)
355
356 result_log, germlines = joinGermline(row, repo_dict, germ_types, v_field, seq_field)
357
358 # Add germline field(s) to dictionary
359 if 'full' in germ_types: row['GERMLINE_' + seq_type] = germlines['full']
360 if 'dmask' in germ_types: row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
361 if 'vonly' in germ_types: row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
362 if 'regions' in germ_types: row['GERMLINE_REGIONS'] = germlines['regions']
363
364 # Write row to pass or fail file
365 if 'ERROR' in result_log:
366 fail_count += 1
367 if fail_writer is not None: fail_writer.writerow(row)
368 else:
369 result_log['SEQUENCE'] = row[seq_field]
370 result_log['GERMLINE'] = germlines['full']
371 result_log['REGIONS'] = germlines['regions']
372
373 pass_count += 1
374 pass_writer.writerow(row)
375 printLog(result_log, handle=log_handle)
376
377 # Print log
378 printProgress(i + 1, rec_count, 0.05, start_time)
379 log = OrderedDict()
380 log['OUTPUT'] = os.path.basename(pass_handle.name)
381 log['RECORDS'] = rec_count
382 log['PASS'] = pass_count
383 log['FAIL'] = fail_count
384 log['END'] = 'CreateGermlines'
385 printLog(log)
386
387 # Close file handles
388 pass_handle.close()
389 if fail_handle is not None: fail_handle.close()
390 if log_handle is not None: log_handle.close()
391
392
393 def makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field,
394 seq_field, counts, writers, out_args):
395 """
396 Determine consensus clone sequence and create germline for clone
397
398 Arguments:
399 clone = clone ID
400 clone_dict = iterable yielding dictionaries of sequence data from clone
401 repo_dict = dictionary of IMGT gapped germline sequences
402 germ_types = types of germline sequences to be output
403 (full germline, D-region masked, only V-region germline)
404 v_field = field in which to look for V call
405 seq_field = field in which to look for sequence
406 counts = dictionary of pass counter and fail counter
407 writers = dictionary with pass and fail DB writers
408 out_args = arguments for output preferences
409
410 Returns:
411 None
412 """
413 seq_type = seq_field.split('_')[-1]
414 j_field = 'J_CALL'
415
416 # Create dictionaries to count observed V/J calls
417 v_dict = OrderedDict()
418 j_dict = OrderedDict()
419
420 # Find longest sequence in clone
421 max_length = 0
422 for val in clone_dict.values():
423 v = val[v_field]
424 v_dict[v] = v_dict.get(v, 0) + 1
425 j = val[j_field]
426 j_dict[j] = j_dict.get(j, 0) + 1
427 if len(val[seq_field]) > max_length:
428 max_length = len(val[seq_field])
429
430 # Consensus V and J having most observations
431 v_cons = [k for k in list(v_dict.keys()) if v_dict[k] == max(v_dict.values())]
432 j_cons = [k for k in list(j_dict.keys()) if j_dict[k] == max(j_dict.values())]
433
434 # Consensus sequence(s) with consensus V/J calls and longest sequence
435 cons = [val for val in list(clone_dict.values()) \
436 if val.get(v_field, '') in v_cons and \
437 val.get(j_field, '') in j_cons and \
438 len(val[seq_field]) == max_length]
439
440 # Sequence(s) with consensus V/J are not longest
441 if not cons:
442 # Sequence(s) with consensus V/J (not longest)
443 cons = [val for val in list(clone_dict.values()) \
444 if val.get(v_field, '') in v_cons and val.get(j_field, '') in j_cons]
445
446 # No sequence has both consensus V and J call
447 if not cons:
448 result_log = OrderedDict()
449 result_log['ID'] = clone
450 result_log['V_CALL'] = ','.join(v_cons)
451 result_log['J_CALL'] = ','.join(j_cons)
452 result_log['ERROR'] = 'No consensus sequence for clone found'
453 else:
454 # Pad end of consensus sequence with gaps to make it the max length
455 cons = cons[0]
456 cons['J_GERM_LENGTH'] = str(int(cons['J_GERM_LENGTH'] or 0) + max_length - len(cons[seq_field]))
457 cons[seq_field] += '.' * (max_length - len(cons[seq_field]))
458 result_log, germlines = joinGermline(cons, repo_dict, germ_types, v_field, seq_field)
459 result_log['ID'] = clone
460 result_log['CONSENSUS'] = cons['SEQUENCE_ID']
461 else:
462 cons = cons[0]
463 result_log, germlines = joinGermline(cons, repo_dict, germ_types, v_field, seq_field)
464 result_log['ID'] = clone
465 result_log['CONSENSUS'] = cons['SEQUENCE_ID']
466
467 # Write sequences of clone
468 for val in clone_dict.values():
469 if 'ERROR' not in result_log:
470 # Update lengths padded to longest sequence in clone
471 val['J_GERM_LENGTH'] = str(int(val['J_GERM_LENGTH'] or 0) + max_length - len(val[seq_field]))
472 val[seq_field] += '.' * (max_length - len(val[seq_field]))
473
474 # Add column(s) to tab-delimited database file
475 if 'full' in germ_types: val['GERMLINE_' + seq_type] = germlines['full']
476 if 'dmask' in germ_types: val['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
477 if 'vonly' in germ_types: val['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
478 if 'regions' in germ_types: val['GERMLINE_REGIONS'] = germlines['regions']
479
480 # Add field
481 val['GERMLINE_V_CALL'] = result_log['V_CALL']
482 val['GERMLINE_D_CALL'] = result_log['D_CALL']
483 val['GERMLINE_J_CALL'] = result_log['J_CALL']
484
485 result_log['SEQUENCE'] = cons[seq_field]
486 result_log['GERMLINE'] = germlines['full']
487 result_log['REGIONS'] = germlines['regions']
488
489 # Write to pass file
490 counts['pass'] += 1
491 writers['pass'].writerow(val)
492 else:
493 # Write to fail file
494 counts['fail'] += 1
495 if writers['fail'] is not None:
496 writers['fail'].writerow(val)
497 # Return log
498 return result_log
499
500
501 def assembleCloneGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
502 """
503 Assemble one germline sequence for each clone in a tab-delimited database file
504
505 Arguments:
506 db_file = input tab-delimited database file
507 repo = folder with germline repertoire files
508 germ_types = types of germline sequences to be output
509 (full germline, D-region masked, only V-region germline)
510 v_field = field in which to look for V call
511 seq_field = field in which to look for sequence
512 out_args = arguments for output preferences
513
514 Returns:
515 None
516 """
517 # Print parameter info
518 log = OrderedDict()
519 log['START'] = 'CreateGermlines'
520 log['DB_FILE'] = os.path.basename(db_file)
521 log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
522 log['CLONED'] = 'True'
523 log['V_FIELD'] = v_field
524 log['SEQ_FIELD'] = seq_field
525 printLog(log)
526
527 # Get repertoire and open Db reader
528 repo_dict = readRepo(repo)
529 reader = readDbFile(db_file, ig=False)
530
531 # Exit if V call field does not exist in reader
532 if v_field not in reader.fieldnames:
533 sys.exit('Error: V field does not exist in input database file.')
534
535 # Define log handle
536 if out_args['log_file'] is None:
537 log_handle = None
538 else:
539 log_handle = open(out_args['log_file'], 'w')
540
541 add_fields = []
542 seq_type = seq_field.split('_')[-1]
543 if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
544 if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
545 if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
546 if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS']
547
548 add_fields += ['GERMLINE_V_CALL']
549 add_fields += ['GERMLINE_D_CALL']
550 add_fields += ['GERMLINE_J_CALL']
551
552 # Create output file handle and Db writer
553 writers = {}
554 pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'],
555 out_name=out_args['out_name'], out_type=out_args['out_type'])
556 writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields)
557
558 if out_args['failed']:
559 fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'],
560 out_name=out_args['out_name'], out_type=out_args['out_type'])
561 writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields)
562 else:
563 fail_handle = None
564 writers['fail'] = None
565
566 # Initialize time and total count for progress bar
567 start_time = time()
568 rec_count = countDbFile(db_file)
569 counts = {}
570 clone_count = counts['pass'] = counts['fail'] = 0
571 # Iterate over rows
572 clone = 'initial'
573 clone_dict = OrderedDict()
574 for i, row in enumerate(reader):
575 # Print progress
576 printProgress(i, rec_count, 0.05, start_time)
577
578 # Clone isn't over yet
579 if row.get('CLONE', '') == clone:
580 clone_dict[i] = row
581 # Clone just finished
582 elif clone_dict:
583 clone_count += 1
584 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types,
585 v_field, seq_field, counts, writers, out_args)
586 printLog(result_log, handle=log_handle)
587 # Now deal with current row (first of next clone)
588 clone = row['CLONE']
589 clone_dict = OrderedDict([(i, row)])
590 # Last case is only for first row of file
591 else:
592 clone = row['CLONE']
593 clone_dict = OrderedDict([(i, row)])
594
595 clone_count += 1
596 result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field,
597 seq_field, counts, writers, out_args)
598 printLog(result_log, handle=log_handle)
599
600 # Print log
601 printProgress(i + 1, rec_count, 0.05, start_time)
602 log = OrderedDict()
603 log['OUTPUT'] = os.path.basename(pass_handle.name)
604 log['CLONES'] = clone_count
605 log['RECORDS'] = rec_count
606 log['PASS'] = counts['pass']
607 log['FAIL'] = counts['fail']
608 log['END'] = 'CreateGermlines'
609 printLog(log)
610
611 # Close file handles
612 pass_handle.close()
613 if fail_handle is not None: fail_handle.close()
614 if log_handle is not None: log_handle.close()
615
616
617 def getArgParser():
618 """
619 Defines the ArgumentParser
620
621 Arguments:
622 None
623
624 Returns:
625 an ArgumentParser object
626 """
627 # Define input and output field help message
628 fields = dedent(
629 '''
630 output files:
631 germ-pass
632 database with assigned germline sequences.
633 germ-fail
634 database with records failing germline assignment.
635
636 required fields:
637 SEQUENCE_ID, SEQUENCE_VDJ or SEQUENCE_IMGT,
638 V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL,
639 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
640 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
641 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
642 NP1_LENGTH, NP2_LENGTH
643
644 optional fields:
645 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
646 CLONE
647
648
649 output fields:
650 GERMLINE_VDJ, GERMLINE_VDJ_D_MASK, GERMLINE_VDJ_V_REGION,
651 GERMLINE_IMGT, GERMLINE_IMGT_D_MASK, GERMLINE_IMGT_V_REGION,
652 GERMLINE_V_CALL, GERMLINE_D_CALL, GERMLINE_J_CALL,
653 GERMLINE_REGIONS
654 ''')
655
656 # Parent parser
657 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
658 annotation=False)
659 # Define argument parser
660 parser = ArgumentParser(description=__doc__, epilog=fields,
661 parents=[parser_parent],
662 formatter_class=CommonHelpFormatter)
663 parser.add_argument('--version', action='version',
664 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
665
666 parser.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
667 help='''List of folders and/or fasta files (with .fasta, .fna or .fa
668 extension) with germline sequences.''')
669 parser.add_argument('-g', action='store', dest='germ_types', default=default_germ_types,
670 nargs='+', choices=('full', 'dmask', 'vonly', 'regions'),
671 help='''Specify type(s) of germlines to include full germline,
672 germline with D-region masked, or germline for V region only.''')
673 parser.add_argument('--cloned', action='store_true', dest='cloned',
674 help='''Specify to create only one germline per clone. Assumes input file is
675 sorted by clone column, and will not yield correct results if the data
676 is unsorted. Note, if allele calls are ambiguous within a clonal group,
677 this will place the germline call used for the entire clone within the
678 GERMLINE_V_CALL, GERMLINE_D_CALL and GERMLINE_J_CALL fields.''')
679 parser.add_argument('--vf', action='store', dest='v_field', default=default_v_field,
680 help='Specify field to use for germline V call')
681 parser.add_argument('--sf', action='store', dest='seq_field', default=default_seq_field,
682 help='Specify field to use for sequence')
683
684 return parser
685
686
687 if __name__ == "__main__":
688 """
689 Parses command line arguments and calls main
690 """
691
692 # Parse command line arguments
693 parser = getArgParser()
694 checkArgs(parser)
695 args = parser.parse_args()
696 args_dict = parseCommonArgs(args)
697 del args_dict['db_files']
698 del args_dict['cloned']
699 args_dict['v_field'] = args_dict['v_field'].upper()
700 args_dict['seq_field'] = args_dict['seq_field'].upper()
701
702 for f in args.__dict__['db_files']:
703 args_dict['db_file'] = f
704 if args.__dict__['cloned']:
705 assembleCloneGermline(**args_dict)
706 else:
707 assembleEachGermline(**args_dict)