annotate change_o/MakeDb.py @ 66:43a1aa648537 draft

Uploaded
author davidvanzessen
date Thu, 07 Dec 2017 03:44:38 -0500
parents 22dddabe3637
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
1 #!/usr/bin/env python3
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
2 """
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
3 Create tab-delimited database file to store sequence alignment information
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
4 """
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
5 # Info
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
7 from changeo import __version__, __date__
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
8
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
9 # Imports
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
10 import os
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
11 import sys
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
12 from argparse import ArgumentParser
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
13 from collections import OrderedDict
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
14 from textwrap import dedent
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
15 from time import time
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
16 from Bio import SeqIO
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
17
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
18 # Presto and changeo imports
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
19 from presto.Defaults import default_out_args
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
20 from presto.Annotation import parseAnnotation
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
21 from presto.IO import countSeqFile, printLog, printMessage, printProgress, readSeqFile
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
22 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
23 from changeo.IO import countDbFile, extractIMGT, getDbWriter, readRepo
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
24 from changeo.Parsers import IgBLASTReader, IMGTReader, IHMMuneReader, getIDforIMGT
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
25
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
26
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
27 def getFilePrefix(aligner_output, out_args):
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
28 """
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
29 Get file name prefix and create output directory
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
30
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
31 Arguments:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
32 aligner_output : aligner output file or directory.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
33 out_args : dictionary of output arguments.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
34
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
35 Returns:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
36 str : file name prefix.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
37 """
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
38 # Determine output directory
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
39 if not out_args['out_dir']:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
40 out_dir = os.path.dirname(os.path.abspath(aligner_output))
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
41 else:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
42 out_dir = os.path.abspath(out_args['out_dir'])
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
43 if not os.path.exists(out_dir):
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
44 os.mkdir(out_dir)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
45
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
46 # Determine file prefix
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
47 if out_args['out_name']:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
48 file_prefix = out_args['out_name']
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
49 else:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
50 file_prefix = os.path.splitext(os.path.split(os.path.abspath(aligner_output))[1])[0]
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
51
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
52 return os.path.join(out_dir, file_prefix)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
53
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
54
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
55 def getSeqDict(seq_file):
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
56 """
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
57 Create a dictionary from a sequence file.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
58
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
59 Arguments:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
60 seq_file : sequence file.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
61
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
62 Returns:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
63 dict : sequence description as keys with Bio.SeqRecords as values.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
64 """
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
65 seq_dict = SeqIO.to_dict(readSeqFile(seq_file),
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
66 key_function=lambda x: x.description)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
67
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
68 return seq_dict
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
69
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
70
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
71 def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
72 out_args=default_out_args):
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
73 """
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
74 Writes tab-delimited database file in output directory.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
75
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
76 Arguments:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
77 db : a iterator of IgRecord objects containing alignment data.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
78 fields : a list of ordered field names to write.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
79 file_prefix : directory and prefix for CLIP tab-delim file.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
80 total_count : number of records (for progress bar).
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
81 id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
82 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
83 partial : if True put incomplete alignments in the pass file.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
84 out_args : common output argument dictionary from parseCommonArgs.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
85
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
86 Returns:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
87 None
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
88 """
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
89 # Function to check for valid records strictly
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
90 def _pass_strict(rec):
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
91 valid = [rec.v_call and rec.v_call != 'None',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
92 rec.j_call and rec.j_call != 'None',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
93 rec.functional is not None,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
94 rec.seq_vdj,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
95 rec.junction]
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
96 return all(valid)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
97
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
98 # Function to check for valid records loosely
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
99 def _pass_gentle(rec):
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
100 valid = [rec.v_call and rec.v_call != 'None',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
101 rec.d_call and rec.d_call != 'None',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
102 rec.j_call and rec.j_call != 'None']
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
103 return any(valid)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
104
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
105 # Set pass criteria
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
106 _pass = _pass_gentle if partial else _pass_strict
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
107
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
108 # Define output file names
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
109 pass_file = '%s_db-pass.tab' % file_prefix
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
110 fail_file = '%s_db-fail.tab' % file_prefix
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
111
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
112 # Initiate handles, writers and counters
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
113 pass_handle = None
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
114 fail_handle = None
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
115 pass_writer = None
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
116 fail_writer = None
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
117 start_time = time()
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
118 rec_count = pass_count = fail_count = 0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
119
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
120 # Validate and write output
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
121 printProgress(0, total_count, 0.05, start_time)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
122 for i, record in enumerate(db, start=1):
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
123
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
124 # Replace sequence description with full string, if required
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
125 if id_dict is not None and record.id in id_dict:
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
126 record.id = id_dict[record.id]
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
127
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
128 # Parse sequence description into new columns
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
129 if not no_parse:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
130 try:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
131 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
132 record.id = record.annotations['ID']
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
133 del record.annotations['ID']
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
134
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
135 # TODO: This is not the best approach. should pass in output fields.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
136 # If first record, use parsed description to define extra columns
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
137 if i == 1: fields.extend(list(record.annotations.keys()))
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
138 except IndexError:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
139 # Could not parse pRESTO-style annotations so fall back to no parse
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
140 no_parse = True
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
141 sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
142
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
143 # Count pass or fail and write to appropriate file
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
144 if _pass(record):
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
145 # Open pass file
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
146 if pass_writer is None:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
147 pass_handle = open(pass_file, 'wt')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
148 pass_writer = getDbWriter(pass_handle, add_fields=fields)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
149
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
150 # Write row to pass file
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
151 pass_count += 1
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
152 pass_writer.writerow(record.toDict())
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
153 else:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
154 # Open failed file
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
155 if out_args['failed'] and fail_writer is None:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
156 fail_handle = open(fail_file, 'wt')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
157 fail_writer = getDbWriter(fail_handle, add_fields=fields)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
158
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
159 # Write row to fail file if specified
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
160 fail_count += 1
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
161 if fail_writer is not None:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
162 fail_writer.writerow(record.toDict())
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
163
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
164 # Print progress
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
165 printProgress(i, total_count, 0.05, start_time)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
166
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
167 # Print consol log
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
168 log = OrderedDict()
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
169 log['OUTPUT'] = pass_file
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
170 log['PASS'] = pass_count
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
171 log['FAIL'] = fail_count
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
172 log['END'] = 'MakeDb'
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
173 printLog(log)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
174
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
175 if pass_handle is not None: pass_handle.close()
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
176 if fail_handle is not None: fail_handle.close()
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
177
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
178
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
179 # TODO: may be able to merge with other mains
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
180 def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
181 parse_scores=False, parse_regions=False, parse_junction=False,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
182 out_args=default_out_args):
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
183 """
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
184 Main for IMGT aligned sample sequences.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
185
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
186 Arguments:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
187 aligner_output : zipped file or unzipped folder output by IMGT.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
188 seq_file : FASTA file input to IMGT (from which to get seqID).
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
189 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
190 partial : If True put incomplete alignments in the pass file.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
191 parse_scores : if True add alignment score fields to output file.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
192 parse_regions : if True add FWR and CDR region fields to output file.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
193 out_args : common output argument dictionary from parseCommonArgs.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
194
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
195 Returns:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
196 None
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
197 """
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
198 # Print parameter info
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
199 log = OrderedDict()
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
200 log['START'] = 'MakeDb'
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
201 log['ALIGNER'] = 'IMGT'
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
202 log['ALIGNER_OUTPUT'] = aligner_output
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
203 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
204 log['NO_PARSE'] = no_parse
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
205 log['PARTIAL'] = partial
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
206 log['SCORES'] = parse_scores
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
207 log['REGIONS'] = parse_regions
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
208 log['JUNCTION'] = parse_junction
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
209 printLog(log)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
210
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
211 start_time = time()
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
212 printMessage('Loading sequence files', start_time=start_time, width=25)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
213 # Extract IMGT files
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
214 temp_dir, imgt_files = extractIMGT(aligner_output)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
215 # Count records in IMGT files
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
216 total_count = countDbFile(imgt_files['summary'])
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
217 # Get (parsed) IDs from fasta file submitted to IMGT
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
218 id_dict = getIDforIMGT(seq_file) if seq_file else {}
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
219 printMessage('Done', start_time=start_time, end=True, width=25)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
220
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
221 # Parse IMGT output and write db
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
222 with open(imgt_files['summary'], 'r') as summary_handle, \
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
223 open(imgt_files['gapped'], 'r') as gapped_handle, \
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
224 open(imgt_files['ntseq'], 'r') as ntseq_handle, \
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
225 open(imgt_files['junction'], 'r') as junction_handle:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
226 parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
227 parse_scores=parse_scores, parse_regions=parse_regions,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
228 parse_junction=parse_junction)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
229 file_prefix = getFilePrefix(aligner_output, out_args)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
230 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count, id_dict=id_dict,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
231 no_parse=no_parse, partial=partial, out_args=out_args)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
232
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
233 # Cleanup temp directory
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
234 temp_dir.cleanup()
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
235
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
236 return None
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
237
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
238
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
239 # TODO: may be able to merge with other mains
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
240 def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
241 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
242 out_args=default_out_args):
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
243 """
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
244 Main for IgBLAST aligned sample sequences.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
245
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
246 Arguments:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
247 aligner_output : IgBLAST output file to process.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
248 seq_file : fasta file input to IgBlast (from which to get sequence).
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
249 repo : folder with germline repertoire files.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
250 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
251 partial : If True put incomplete alignments in the pass file.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
252 parse_regions : if True add FWR and CDR fields to output file.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
253 parse_scores : if True add alignment score fields to output file.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
254 parse_igblast_cdr3 : if True parse CDR3 sequences generated by IgBLAST
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
255 out_args : common output argument dictionary from parseCommonArgs.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
256
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
257 Returns:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
258 None
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
259 """
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
260 # Print parameter info
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
261 log = OrderedDict()
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
262 log['START'] = 'MakeDB'
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
263 log['ALIGNER'] = 'IgBlast'
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
264 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
265 log['SEQ_FILE'] = os.path.basename(seq_file)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
266 log['NO_PARSE'] = no_parse
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
267 log['PARTIAL'] = partial
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
268 log['SCORES'] = parse_scores
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
269 log['REGIONS'] = parse_regions
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
270 printLog(log)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
271
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
272 start_time = time()
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
273 printMessage('Loading sequence files', start_time=start_time, width=25)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
274 # Count records in sequence file
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
275 total_count = countSeqFile(seq_file)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
276 # Get input sequence dictionary
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
277 seq_dict = getSeqDict(seq_file)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
278 # Create germline repo dictionary
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
279 repo_dict = readRepo(repo)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
280 printMessage('Done', start_time=start_time, end=True, width=25)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
281
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
282 # Parse and write output
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
283 with open(aligner_output, 'r') as f:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
284 parse_iter = IgBLASTReader(f, seq_dict, repo_dict,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
285 parse_scores=parse_scores, parse_regions=parse_regions,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
286 parse_igblast_cdr3=parse_igblast_cdr3)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
287 file_prefix = getFilePrefix(aligner_output, out_args)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
288 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
289 no_parse=no_parse, partial=partial, out_args=out_args)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
290
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
291 return None
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
292
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
293
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
294 # TODO: may be able to merge with other mains
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
295 def parseIHMM(aligner_output, seq_file, repo, no_parse=True, partial=False,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
296 parse_scores=False, parse_regions=False, out_args=default_out_args):
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
297 """
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
298 Main for iHMMuneAlign aligned sample sequences.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
299
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
300 Arguments:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
301 aligner_output : iHMMune-Align output file to process.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
302 seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
303 repo : folder with germline repertoire files.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
304 no_parse : if ID is to be parsed for pRESTO output with default delimiters.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
305 partial : If True put incomplete alignments in the pass file.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
306 parse_scores : if True parse alignment scores.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
307 parse_regions : if True add FWR and CDR region fields.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
308 out_args : common output argument dictionary from parseCommonArgs.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
309
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
310 Returns:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
311 None
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
312 """
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
313 # Print parameter info
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
314 log = OrderedDict()
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
315 log['START'] = 'MakeDB'
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
316 log['ALIGNER'] = 'iHMMune-Align'
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
317 log['ALIGNER_OUTPUT'] = os.path.basename(aligner_output)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
318 log['SEQ_FILE'] = os.path.basename(seq_file)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
319 log['NO_PARSE'] = no_parse
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
320 log['PARTIAL'] = partial
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
321 log['SCORES'] = parse_scores
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
322 log['REGIONS'] = parse_regions
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
323 printLog(log)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
324
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
325 start_time = time()
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
326 printMessage('Loading sequence files', start_time=start_time, width=25)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
327 # Count records in sequence file
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
328 total_count = countSeqFile(seq_file)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
329 # Get input sequence dictionary
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
330 seq_dict = getSeqDict(seq_file)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
331 # Create germline repo dictionary
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
332 repo_dict = readRepo(repo)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
333 printMessage('Done', start_time=start_time, end=True, width=25)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
334
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
335 # Parse and write output
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
336 with open(aligner_output, 'r') as f:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
337 parse_iter = IHMMuneReader(f, seq_dict, repo_dict,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
338 parse_scores=parse_scores, parse_regions=parse_regions)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
339 file_prefix = getFilePrefix(aligner_output, out_args)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
340 writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
341 no_parse=no_parse, partial=partial, out_args=out_args)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
342
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
343 return None
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
344
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
345
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
346 def getArgParser():
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
347 """
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
348 Defines the ArgumentParser.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
349
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
350 Returns:
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
351 argparse.ArgumentParser
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
352 """
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
353 fields = dedent(
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
354 '''
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
355 output files:
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
356 db-pass
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
357 database of alignment records with functionality information,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
358 V and J calls, and a junction region.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
359 db-fail
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
360 database with records that fail due to no functionality information
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
361 (did not pass IMGT), no V call, no J call, or no junction region.
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
362
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
363 universal output fields:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
364 SEQUENCE_ID, SEQUENCE_INPUT, SEQUENCE_VDJ, SEQUENCE_IMGT,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
365 FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT, INDELS,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
366 V_CALL, D_CALL, J_CALL,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
367 V_SEQ_START, V_SEQ_LENGTH,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
368 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH,
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
369 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
370 JUNCTION_LENGTH, JUNCTION, NP1_LENGTH, NP2_LENGTH,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
371 FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
372 CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
373
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
374 imgt specific output fields:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
375 V_GERM_START_IMGT, V_GERM_LENGTH_IMGT,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
376 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH, P5J_LENGTH,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
377 D_FRAME, V_SCORE, V_IDENTITY, J_SCORE, J_IDENTITY,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
378
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
379 igblast specific output fields:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
380 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
381 V_EVALUE, V_SCORE, V_IDENTITY, V_BTOP,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
382 J_EVALUE, J_SCORE, J_IDENTITY, J_BTOP.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
383 CDR3_IGBLAST_NT, CDR3_IGBLAST_AA
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
384
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
385 ihmm specific output fields:
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
386 V_GERM_START_VDJ, V_GERM_LENGTH_VDJ,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
387 HMM_SCORE
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
388 ''')
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
389
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
390 # Define ArgumentParser
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
391 parser = ArgumentParser(description=__doc__, epilog=fields,
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
392 formatter_class=CommonHelpFormatter)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
393 parser.add_argument('--version', action='version',
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
394 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
395 subparsers = parser.add_subparsers(title='subcommands', dest='command',
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
396 help='Aligner used', metavar='')
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
397 # TODO: This is a temporary fix for Python issue 9253
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
398 subparsers.required = True
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
399
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
400 # Parent parser
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
401 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
402
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
403 # IgBlast Aligner
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
404 parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
405 formatter_class=CommonHelpFormatter,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
406 help='Process IgBLAST output.',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
407 description='Process IgBLAST output.')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
408 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
409 required=True,
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
410 help='''IgBLAST output files in format 7 with query sequence
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
411 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
412 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
413 help='''List of folders and/or fasta files containing
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
414 IMGT-gapped germline sequences corresponding to the
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
415 set of germlines used in the IgBLAST alignment.''')
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
416 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
417 required=True,
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
418 help='''List of input FASTA files (with .fasta, .fna or .fa
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
419 extension), containing sequences.''')
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
420 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
421 help='''Specify to prevent input sequence headers from being parsed
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
422 to add new columns to database. Parsing of sequence headers requires
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
423 headers to be in the pRESTO annotation format, so this should be specified
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
424 when sequence headers are incompatible with the pRESTO annotation scheme.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
425 Note, unrecognized header formats will default to this behavior.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
426 parser_igblast.add_argument('--partial', action='store_true', dest='partial',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
427 help='''If specified, include incomplete V(D)J alignments in
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
428 the pass file instead of the fail file.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
429 parser_igblast.add_argument('--scores', action='store_true', dest='parse_scores',
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
430 help='''Specify if alignment score metrics should be
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
431 included in the output. Adds the V_SCORE, V_IDENTITY,
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
432 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
433 J_BTOP, and J_EVALUE columns.''')
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
434 parser_igblast.add_argument('--regions', action='store_true', dest='parse_regions',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
435 help='''Specify if IMGT FWR and CDRs should be
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
436 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
437 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
438 CDR3_IMGT columns.''')
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
439 parser_igblast.add_argument('--cdr3', action='store_true',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
440 dest='parse_igblast_cdr3',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
441 help='''Specify if the CDR3 sequences generated by IgBLAST
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
442 should be included in the output. Adds the columns
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
443 CDR3_IGBLAST_NT and CDR3_IGBLAST_AA. Requires IgBLAST
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
444 version 1.5 or greater.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
445 parser_igblast.set_defaults(func=parseIgBLAST)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
446
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
447 # IMGT aligner
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
448 parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
449 formatter_class=CommonHelpFormatter,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
450 help='''Process IMGT/HighV-Quest output
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
451 (does not work with V-QUEST).''',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
452 description='''Process IMGT/HighV-Quest output
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
453 (does not work with V-QUEST).''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
454 parser_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
455 help='''Either zipped IMGT output files (.zip or .txz) or a
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
456 folder containing unzipped IMGT output files (which must
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
457 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
458 and 6_Junction).''')
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
459 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
460 required=False,
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
461 help='''List of input FASTA files (with .fasta, .fna or .fa
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
462 extension) containing sequences.''')
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
463 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
464 help='''Specify to prevent input sequence headers from being parsed
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
465 to add new columns to database. Parsing of sequence headers requires
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
466 headers to be in the pRESTO annotation format, so this should be specified
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
467 when sequence headers are incompatible with the pRESTO annotation scheme.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
468 Note, unrecognized header formats will default to this behavior.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
469 parser_imgt.add_argument('--partial', action='store_true', dest='partial',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
470 help='''If specified, include incomplete V(D)J alignments in
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
471 the pass file instead of the fail file.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
472 parser_imgt.add_argument('--scores', action='store_true', dest='parse_scores',
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
473 help='''Specify if alignment score metrics should be
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
474 included in the output. Adds the V_SCORE, V_IDENTITY,
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
475 J_SCORE and J_IDENTITY.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
476 parser_imgt.add_argument('--regions', action='store_true', dest='parse_regions',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
477 help='''Specify if IMGT FWRs and CDRs should be
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
478 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
479 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
480 CDR3_IMGT columns.''')
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
481 parser_imgt.add_argument('--junction', action='store_true', dest='parse_junction',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
482 help='''Specify if detailed junction fields should be
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
483 included in the output. Adds the columns
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
484 N1_LENGTH, N2_LENGTH, P3V_LENGTH, P5D_LENGTH, P3D_LENGTH,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
485 P5J_LENGTH, D_FRAME.''')
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
486 parser_imgt.set_defaults(func=parseIMGT)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
487
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
488 # iHMMuneAlign Aligner
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
489 parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
490 formatter_class=CommonHelpFormatter,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
491 help='Process iHMMune-Align output.',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
492 description='Process iHMMune-Align output.')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
493 parser_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_outputs',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
494 required=True,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
495 help='''iHMMune-Align output file.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
496 parser_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
497 help='''List of folders and/or FASTA files containing
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
498 IMGT-gapped germline sequences corresponding to the
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
499 set of germlines used in the IgBLAST alignment.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
500 parser_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
501 required=True,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
502 help='''List of input FASTA files (with .fasta, .fna or .fa
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
503 extension) containing sequences.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
504 parser_ihmm.add_argument('--noparse', action='store_true', dest='no_parse',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
505 help='''Specify to prevent input sequence headers from being parsed
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
506 to add new columns to database. Parsing of sequence headers requires
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
507 headers to be in the pRESTO annotation format, so this should be specified
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
508 when sequence headers are incompatible with the pRESTO annotation scheme.
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
509 Note, unrecognized header formats will default to this behavior.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
510 parser_ihmm.add_argument('--partial', action='store_true', dest='partial',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
511 help='''If specified, include incomplete V(D)J alignments in
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
512 the pass file instead of the fail file.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
513 parser_ihmm.add_argument('--scores', action='store_true', dest='parse_scores',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
514 help='''Specify if alignment score metrics should be
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
515 included in the output. Adds the path score of the
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
516 iHMMune-Align hidden Markov model to HMM_SCORE.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
517 parser_ihmm.add_argument('--regions', action='store_true', dest='parse_regions',
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
518 help='''Specify if IMGT FWRs and CDRs should be
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
519 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
520 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
521 CDR3_IMGT columns.''')
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
522 parser_ihmm.set_defaults(func=parseIHMM)
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
523
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
524 return parser
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
525
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
526
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
527 if __name__ == "__main__":
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
528 """
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
529 Parses command line arguments and calls main
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
530 """
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
531 parser = getArgParser()
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
532 args = parser.parse_args()
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
533 args_dict = parseCommonArgs(args, in_arg='aligner_outputs')
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
534
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
535 # Set no ID parsing if sequence files are not provided
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
536 if 'seq_files' in args_dict and not args_dict['seq_files']:
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
537 args_dict['no_parse'] = True
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
538
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
539 # Delete
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
540 if 'seq_files' in args_dict: del args_dict['seq_files']
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
541 if 'aligner_outputs' in args_dict: del args_dict['aligner_outputs']
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
542 if 'command' in args_dict: del args_dict['command']
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
543 if 'func' in args_dict: del args_dict['func']
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
544
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
545 if args.command == 'imgt':
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
546 for i in range(len(args.__dict__['aligner_outputs'])):
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
547 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
548 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
549 if args.__dict__['seq_files'] else None
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
550 args.func(**args_dict)
52
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
551 elif args.command == 'igblast' or args.command == 'ihmm':
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
552 for i in range(len(args.__dict__['aligner_outputs'])):
22dddabe3637 Uploaded
davidvanzessen
parents: 0
diff changeset
553 args_dict['aligner_output'] = args.__dict__['aligner_outputs'][i]
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
554 args_dict['seq_file'] = args.__dict__['seq_files'][i]
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
555 args.func(**args_dict)