comparison dante.py @ 0:a5f1638b73be draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 08:01:42 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a5f1638b73be
1 #!/usr/bin/env python3
2
3 import numpy as np
4 import subprocess
5 import math
6 import time
7 from operator import itemgetter
8 from collections import Counter
9 from itertools import groupby
10 import os
11 import configuration
12 from tempfile import NamedTemporaryFile
13 import sys
14 import warnings
15 import shutil
16 from collections import defaultdict
17
18 np.set_printoptions(threshold=sys.maxsize)
19
20
21 def alignment_scoring():
22 ''' Create hash table for alignment similarity counting: for every
23 combination of aminoacids in alignment assign score from protein
24 scoring matrix defined in configuration file '''
25 score_dict = {}
26 with open(configuration.SC_MATRIX) as smatrix:
27 count = 1
28 for line in smatrix:
29 if not line.startswith("#"):
30 if count == 1:
31 aa_all = line.rstrip().replace(" ", "")
32 else:
33 count_aa = 1
34 line = list(filter(None, line.rstrip().split(" ")))
35 for aa in aa_all:
36 score_dict["{}{}".format(line[0], aa)] = line[count_aa]
37 count_aa += 1
38 count += 1
39 return score_dict
40
41
42 def characterize_fasta(QUERY, WIN_DOM):
43 ''' Find the sequences, their lengths, starts, ends and if
44 they exceed the window '''
45 with open(QUERY) as query:
46 headers = []
47 fasta_lengths = []
48 seq_starts = []
49 seq_ends = []
50 fasta_chunk_len = 0
51 count_line = 1
52 for line in query:
53 line = line.rstrip()
54 if line.startswith(">"):
55 headers.append(line.rstrip())
56 fasta_lengths.append(fasta_chunk_len)
57 fasta_chunk_len = 0
58 seq_starts.append(count_line + 1)
59 seq_ends.append(count_line - 1)
60 else:
61 fasta_chunk_len += len(line)
62 count_line += 1
63 seq_ends.append(count_line)
64 seq_ends = seq_ends[1:]
65 fasta_lengths.append(fasta_chunk_len)
66 fasta_lengths = fasta_lengths[1:]
67 # control if there are correct (unique) names for individual seqs:
68 # LASTAL takes seqs IDs till the first space which can then create problems with ambiguous records
69 if len(headers) > len(set([header.split(" ")[0] for header in headers
70 ])):
71 raise NameError(
72 '''Sequences in multifasta format are not named correctly:
73 seq IDs (before the first space) are the same''')
74
75 above_win = [idx
76 for idx, value in enumerate(fasta_lengths) if value > WIN_DOM]
77 below_win = [idx
78 for idx, value in enumerate(fasta_lengths)
79 if value <= WIN_DOM]
80 lens_above_win = np.array(fasta_lengths)[above_win]
81 return headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
82
83
84 def split_fasta(QUERY, WIN_DOM, step, headers, above_win, below_win,
85 lens_above_win, seq_starts, seq_ends):
86 ''' Create temporary file containing all sequences - the ones that exceed
87 the window are cut with a set overlap (greater than domain size with a reserve) '''
88 with open(QUERY, "r") as query:
89 count_fasta_divided = 0
90 count_fasta_not_divided = 0
91 ntf = NamedTemporaryFile(delete=False)
92 divided = np.array(headers)[above_win]
93 row_length = configuration.FASTA_LINE
94 for line in query:
95 line = line.rstrip()
96 if line.startswith(">") and line in divided:
97 stop_line = seq_ends[above_win[
98 count_fasta_divided]] - seq_starts[above_win[
99 count_fasta_divided]] + 1
100 count_line = 0
101 whole_seq = []
102 for line2 in query:
103 whole_seq.append(line2.rstrip())
104 count_line += 1
105 if count_line == stop_line:
106 break
107 whole_seq = "".join(whole_seq)
108 ## create list of starting positions for individual parts of a seq with a step given by a window and overlap
109 windows_starts = list(range(0, lens_above_win[
110 count_fasta_divided], step))
111 ## create list of ending positions (starting pos + window), the last element is the whole seq length
112 windows_ends = [
113 x + WIN_DOM
114 if x + WIN_DOM < lens_above_win[count_fasta_divided] else
115 lens_above_win[count_fasta_divided] for x in windows_starts
116 ]
117 count_part = 1
118 for start_part, end_part in zip(windows_starts, windows_ends):
119 seq_part = whole_seq[start_part:end_part]
120 if count_part == len(windows_starts):
121 ntf.write("{}_DANTE_PART{}_LAST:{}-{}\n{}\n".format(
122 line.split(" ")[0], count_part, start_part + 1,
123 end_part, "\n".join([seq_part[i:i + row_length]
124 for i in range(0, len(
125 seq_part), row_length)
126 ])).encode("utf-8"))
127 else:
128 ntf.write("{}_DANTE_PART{}:{}-{}\n{}\n".format(
129 line.split(" ")[0], count_part, start_part + 1,
130 end_part, "\n".join([seq_part[i:i + row_length]
131 for i in range(0, len(
132 seq_part), row_length)
133 ])).encode("utf-8"))
134 count_part += 1
135 count_fasta_divided += 1
136 elif line.startswith(">") and line not in divided:
137 length_seq = seq_ends[below_win[
138 count_fasta_not_divided]] - seq_starts[below_win[
139 count_fasta_not_divided]] + 1
140 ntf.write("{}\n{}".format(line, "".join([query.readline(
141 ) for x in range(length_seq)])).encode("utf-8"))
142 count_fasta_not_divided += 1
143 query_temp = ntf.name
144 ntf.close()
145 return query_temp
146
147
148 def domain_annotation(elements, CLASSIFICATION):
149 ''' Assign protein domain to each hit from protein database '''
150 domains = []
151 annotations = []
152 with open(CLASSIFICATION, "r") as cl_tbl:
153 annotation = {}
154 for line in cl_tbl:
155 record = line.rstrip().split("\t")
156 annotation[record[0]] = record[1:]
157 for i in range(len(elements)):
158 domains.append(elements[i].split("__")[0].split("-")[1])
159 element_name = "__".join(elements[i].split("__")[1:])
160 if element_name in annotation.keys():
161 annotations.append("|".join([elements[i].split("__")[0].split("-")[
162 1], ("|".join(annotation[element_name]))]))
163 else:
164 annotations.append("unknown|unknown")
165 return annotations
166
167
168 def hits_processing(seq_len, start, end, strand):
169 ''' Gain hits intervals separately for forward and reverse strand '''
170 reverse_strand_idx = np.where(strand == "-")[0]
171 if not reverse_strand_idx.any():
172 start_pos_plus = start + 1
173 end_pos_plus = end
174 regions_plus = list(zip(start_pos_plus, end_pos_plus))
175 regions_minus = []
176 else:
177 reverse_strand_idx = reverse_strand_idx[0]
178 start_pos_plus = start[0:reverse_strand_idx] + 1
179 end_pos_plus = end[0:reverse_strand_idx]
180 start_pos_minus = seq_len[0] - end[reverse_strand_idx:] + 1
181 end_pos_minus = seq_len[0] - start[reverse_strand_idx:]
182 regions_plus = list(zip(start_pos_plus, end_pos_plus))
183 regions_minus = list(zip(start_pos_minus, end_pos_minus))
184 return reverse_strand_idx, regions_plus, regions_minus
185
186
187 def overlapping_regions(input_data):
188 ''' Join all overlapping intervals(hits) to clusters (potential domains),
189 get list of start-end positions of individual hits within the interval,
190 list of minimus and maximums as well as the indices in the original
191 sequence_hits structure for the hits belonging to the same clusters '''
192 if input_data:
193 sorted_idx, sorted_data = zip(*sorted(
194 [(index, data) for index, data in enumerate(input_data)],
195 key=itemgetter(1)))
196 merged_ends = input_data[sorted_idx[0]][1]
197 intervals = []
198 data = []
199 output_intervals = []
200 output_data = []
201 for i, j in zip(sorted_idx, sorted_data):
202 if input_data[i][0] < merged_ends:
203 merged_ends = max(input_data[i][1], merged_ends)
204 intervals.append(i)
205 data.append(j)
206 else:
207 output_intervals.append(intervals)
208 output_data.append(data)
209 intervals = []
210 data = []
211 intervals.append(i)
212 data.append(j)
213 merged_ends = input_data[i][1]
214 output_intervals.append(intervals)
215 output_data.append(data)
216 mins = [x[0][0] for x in output_data]
217 maxs = [max(x, key=itemgetter(1))[1] for x in output_data]
218 else:
219 mins = []
220 maxs = []
221 output_intervals = []
222 output_data = []
223 return mins, maxs, output_data, output_intervals
224
225
226 def annotations_dict(annotations):
227 ''' Hash table where annotations of the hits within a clusters are the keys.
228 Each annotation has serial number assigned which indexes the row in the score_table '''
229 classes_dict = {classes: idx
230 for idx, classes in enumerate(set(annotations))}
231 return classes_dict
232
233
234 def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION):
235 ''' Score table is created based on the annotations occurance in the cluster.
236 Matrix axis y corresponds to individual annotations (indexed according to classes_dict),
237 axis x represents positions of analyzed seq in a given cluster.
238 For every hit within cluster, array of scores on the corresponding position
239 is recorded to the table in case if the score on certain position is so far the highest
240 for the certain position and certain annotation '''
241 classes_dict = annotations_dict(annotations)
242 score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int)
243 count = 0
244 for item in annotations:
245 saved_scores = score_matrix[classes_dict[item], data[count][0] - mins:
246 data[count][1] - mins + 1]
247 new_scores = [scores[count]] * len(saved_scores)
248 score_matrix[classes_dict[item], data[count][0] - mins:data[count][
249 1] - mins + 1] = [max(*pos_score)
250 for pos_score in zip(saved_scores, new_scores)]
251 count += 1
252 return score_matrix, classes_dict
253
254
255 def score_matrix_evaluation(score_matrix, classes_dict, THRESHOLD_SCORE):
256 ''' Score matrix is evaluated based on each position.
257 For every position the list of annotations with a score which reaches
258 certain percentage of the overal best score of the cluster are stored '''
259 ann_per_reg = []
260 overal_best_score_reg = max((score_matrix.max(axis=1)))
261 for position in score_matrix.T:
262 ## score threshold calculated as a percentage of the OVERALL best score in the cluster
263 threshold = overal_best_score_reg * THRESHOLD_SCORE / 100
264 above_th = [idx
265 for idx, score in enumerate(position)
266 if position[idx] >= threshold]
267 ## select unique annotations in one position that are above threshold
268 ann_per_pos = list(set(
269 [key for key, value in classes_dict.items() if value in above_th]))
270 ann_per_reg.append(ann_per_pos)
271 return ann_per_reg
272
273
274 def group_annot_regs(ann_per_reg):
275 ''' Get list of domains, annotations, longest common annotations and
276 counts of positions with certain annotation per regions '''
277 ## tranform list of lists (potential multiple annotations for every position ) to flat list of all annotations
278 all_annotations = [item for sublist in ann_per_reg for item in sublist]
279 unique_annotations = list(set(all_annotations))
280 ann_pos_counts = [all_annotations.count(x) for x in unique_annotations]
281 unique_annotations = list(set(
282 [item for sublist in ann_per_reg for item in sublist]))
283 domain_type = list(set([annotation.split("|")[0]
284 for annotation in unique_annotations]))
285 classification_list = [annotation.split("|")
286 for annotation in unique_annotations]
287 ann_substring = "|".join(os.path.commonprefix(classification_list))
288 domain_type = "/".join(domain_type)
289 return domain_type, ann_substring, unique_annotations, ann_pos_counts
290
291
292 def best_score(scores, region):
293 ''' From overlapping intervals take the one with the highest score '''
294 ## if more hits have the same best score take only the first one
295 best_idx = region[np.where(scores == max(scores))[0][0]]
296 best_idx_reg = np.where(scores == max(scores))[0][0]
297 return best_idx, best_idx_reg
298
299
300 def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts,
301 dom_start, dom_end, step, best_idx, annotation_best,
302 db_name_best, db_starts_best, db_ends_best, strand, score,
303 seq_id, db_seq, query_seq, domain_size, positions, gff):
304 ''' Record obtained information about domain corresponding to individual cluster to common gff file '''
305 best_start = positions[best_idx][0]
306 best_end = positions[best_idx][1]
307 best_score = score[best_idx]
308 ## proportion of length of the best hit to the whole region length found by base
309 length_proportion = int((best_end - best_start + 1) /
310 (dom_end - dom_start + 1) * 100)
311 db_seq_best = db_seq[best_idx]
312 query_seq_best = query_seq[best_idx]
313 domain_size_best = domain_size[best_idx]
314 [percent_ident, align_similarity, relat_align_len, relat_interrupt,
315 db_len_proportion
316 ] = filter_params(db_seq_best, query_seq_best, domain_size_best)
317 ann_substring = "|".join(ann_substring.split("|")[1:])
318 annotation_best = "|".join([db_name_best] + annotation_best.split("|")[1:])
319 if "DANTE_PART" in seq_id:
320 part = int(seq_id.split("DANTE_PART")[1].split(":")[0].split("_")[0])
321 dom_start = dom_start + (part - 1) * step
322 dom_end = dom_end + (part - 1) * step
323 best_start = best_start + (part - 1) * step
324 best_end = best_end + (part - 1) * step
325 if ann_substring is '':
326 ann_substring = "NONE(Annotations from different classes)"
327 if len(unique_annotations) > 1:
328 unique_annotations = ",".join(["{}[{}bp]".format(
329 ann, pos) for ann, pos in zip(unique_annotations, ann_pos_counts)])
330 else:
331 unique_annotations = unique_annotations[0]
332 if __name__ == '__main__':
333 SOURCE = configuration.SOURCE_DANTE
334 else:
335 SOURCE = configuration.SOURCE_PROFREP
336 if "/" in domain_type:
337 gff.write(
338 "{}\t{}\t{}\t{}\t{}\t.\t{}\t{}\tName={};Final_Classification=Ambiguous_domain;Region_Hits_Classifications_={}\n".format(
339 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
340 dom_end, strand, configuration.PHASE, domain_type,
341 unique_annotations))
342 else:
343 gff.write(
344 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
345 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
346 dom_end, best_score, strand, configuration.PHASE, domain_type,
347 ann_substring, unique_annotations, annotation_best, best_start,
348 best_end, length_proportion, db_starts_best, db_ends_best,
349 domain_size_best, db_seq_best, query_seq_best, percent_ident,
350 align_similarity, relat_align_len, relat_interrupt,
351 db_len_proportion))
352
353
354 def filter_params(db, query, protein_len):
355 ''' Calculate basic statistics of the quality of the alignment '''
356 score_dict = alignment_scoring()
357 num_ident = 0
358 count_interrupt = 0
359 count_similarity = 0
360 alignment_len = 0
361 for i, j in zip(db.upper(), query.upper()):
362 if i == j and i != "X":
363 num_ident += 1
364 if j == "/" or j == "\\" or j == "*":
365 count_interrupt += 1
366 if (i.isalpha() or i == "*") and (j.isalpha() or j == "*"):
367 if int(score_dict["{}{}".format(i, j)]) > 0:
368 count_similarity += 1
369 ## gapless alignment length proportional to the domain protein length
370 relat_align_len = round((len(db) - db.count("-")) / protein_len, 3)
371 ## proportional identical bases (except of X) to al.length
372 align_identity = round(num_ident / len(db), 2)
373 ## proportional count of positive scores from scoring matrix to al. length
374 align_similarity = round(count_similarity / len(db), 2)
375 ## number of interruptions per 100 bp
376 relat_interrupt = round(count_interrupt / math.ceil((len(query) / 100)), 2)
377 ## Proportion of alignment to the original length of protein domain from database (indels included)
378 db_len_proportion = round(len(db) / protein_len, 2)
379 return align_identity, align_similarity, relat_align_len, relat_interrupt, db_len_proportion
380
381
382 def line_generator(tab_pipe, maf_pipe, start):
383 ''' Yield individual lines of LASTAL stdout for single sequence '''
384 if hasattr(line_generator, "dom"):
385 seq_id = line_generator.dom.split("\t")[6]
386 yield line_generator.dom.encode("utf-8")
387 del line_generator.dom
388 line_tab = ""
389 for line_tab in tab_pipe:
390 line_tab = line_tab.decode("utf-8")
391 if not line_tab.startswith('#'):
392 if start:
393 if not ('seq_id' in locals() and
394 seq_id != line_tab.split("\t")[6]):
395 seq_id = line_tab.split("\t")[6]
396 start = False
397 line_maf = [maf_pipe.readline() for line_count in range(4)]
398 db_seq = line_maf[1].decode("utf-8").rstrip().split(" ")[-1]
399 alignment_seq = line_maf[2].decode("utf-8").rstrip().split(" ")[-1]
400 line = "{}\t{}\t{}".format(line_tab, db_seq, alignment_seq)
401 line_id = line.split("\t")[6]
402 if seq_id != line_id:
403 line_generator.dom = line
404 return
405 else:
406 yield line.encode("utf-8")
407 else:
408 maf_pipe.readline()
409 if line_tab == "":
410 raise RuntimeError
411 else:
412 return
413
414
415 def get_version(path, LAST_DB):
416 branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
417 shell=True,
418 cwd=path).decode('ascii').strip()
419 shorthash = subprocess.check_output("git log --pretty=format:'%h' -n 1 ",
420 shell=True,
421 cwd=path).decode('ascii').strip()
422 revcount = len(subprocess.check_output("git log --oneline",
423 shell=True,
424 cwd=path).decode('ascii').split())
425 version_string = (
426 "##-----------------------------------------------\n"
427 "##PIPELINE VERSION : "
428 "{branch}-rv-{revcount}({shorthash})\n"
429 "##PROTEIN DATABASE VERSION : {PD}\n"
430 "##-----------------------------------------------\n").format(
431 branch=branch,
432 shorthash=shorthash,
433 revcount=revcount,
434 PD=os.path.basename(LAST_DB))
435 return version_string
436
437
438 def write_info(dom_gff_tmp, version_string):
439 dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF))
440 dom_gff_tmp.write(version_string)
441
442
443 def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
444 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM):
445 ''' Search for protein domains using our protein database and external tool LAST,
446 stdout is parsed in real time and hits for a single sequence undergo further processing
447 - tabular format(TAB) to get info about position, score, orientation
448 - MAF format to gain alignment and original sequence
449 '''
450
451 step = WIN_DOM - OVERLAP_DOM
452 [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
453 ] = characterize_fasta(QUERY, WIN_DOM)
454 query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win,
455 below_win, lens_above_win, seq_starts, seq_ends)
456
457 ## TAB output contains all the alignment scores, positions, strands...
458 tab = subprocess.Popen(
459 "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f TAB".format(LAST_DB,
460 query_temp),
461 stdout=subprocess.PIPE,
462 shell=True)
463 ## MAF output contains alignment sequences
464 maf = subprocess.Popen(
465 "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f MAF".format(LAST_DB,
466 query_temp),
467 stdout=subprocess.PIPE,
468 shell=True)
469
470 tab_pipe = tab.stdout
471 maf_pipe = maf.stdout
472 maf_pipe.readline()
473
474 seq_ids = []
475 dom_tmp = NamedTemporaryFile(delete=False)
476 with open(dom_tmp.name, "w") as dom_gff_tmp:
477 path = os.path.dirname(os.path.realpath(__file__))
478 version_string = get_version(path, LAST_DB)
479 write_info(dom_gff_tmp, version_string)
480 gff = open(dom_tmp.name, "a")
481 start = True
482 while True:
483 try:
484 with warnings.catch_warnings():
485 warnings.simplefilter("ignore")
486 sequence_hits = np.genfromtxt(
487 line_generator(tab_pipe, maf_pipe, start),
488 names=
489 "score, name_db, start_db, al_size_db, strand_db, seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q, block1, block2, block3, db_seq, q_seq",
490 usecols=
491 "score, name_q, start_q, al_size_q, strand_q, seq_size_q, name_db, db_seq, q_seq, seq_size_db, start_db, al_size_db",
492 dtype=None,
493 comments=None)
494 except RuntimeError:
495 break
496 ## if there are no domains found
497 if sequence_hits.size is 0:
498 gff.write("##NO DOMAINS")
499 return [], [], [], []
500
501 ############# PARSING LASTAL OUTPUT ############################
502 sequence_hits = np.atleast_1d(sequence_hits)
503 score = sequence_hits['score'].astype("int")
504 seq_id = sequence_hits['name_q'][0].astype("str")
505 start_hit = sequence_hits['start_q'].astype("int")
506 end_hit = start_hit + sequence_hits['al_size_q'].astype("int")
507 strand = sequence_hits['strand_q'].astype("str")
508 seq_len = sequence_hits['seq_size_q'].astype("int")
509 domain_db = sequence_hits['name_db'].astype("str")
510 db_seq = sequence_hits['db_seq'].astype("str")
511 query_seq = sequence_hits['q_seq'].astype("str")
512 domain_size = sequence_hits['seq_size_db'].astype("int")
513 db_start = sequence_hits['start_db'].astype("int") + 1
514 db_end = sequence_hits['start_db'].astype("int") + sequence_hits[
515 'al_size_db'].astype("int")
516
517 [reverse_strand_idx, positions_plus, positions_minus
518 ] = hits_processing(seq_len, start_hit, end_hit, strand)
519 strand_gff = "+"
520 [mins_plus, maxs_plus, data_plus, indices_plus
521 ] = overlapping_regions(positions_plus)
522 [mins_minus, maxs_minus, data_minus, indices_minus
523 ] = overlapping_regions(positions_minus)
524 positions = positions_plus + positions_minus
525 indices_overal = indices_plus + [x + reverse_strand_idx
526 for x in indices_minus]
527 mins = mins_plus + mins_minus
528 maxs = maxs_plus + maxs_minus
529 data = data_plus + data_minus
530 ## process every region (cluster) of overlapping hits sequentially
531 count_region = 0
532 for region in indices_overal:
533 db_names = domain_db[np.array(region)]
534 db_starts = db_start[np.array(region)]
535 db_ends = db_end[np.array(region)]
536 scores = score[np.array(region)]
537 annotations = domain_annotation(db_names, CLASSIFICATION)
538 [score_matrix, classes_dict] = score_table(
539 mins[count_region], maxs[count_region], data[count_region],
540 annotations, scores, CLASSIFICATION)
541 ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict,
542 THRESHOLD_SCORE)
543 [domain_type, ann_substring, unique_annotations, ann_pos_counts
544 ] = group_annot_regs(ann_per_reg)
545 [best_idx, best_idx_reg] = best_score(scores, region)
546 annotation_best = annotations[best_idx_reg]
547 db_name_best = db_names[best_idx_reg]
548 db_starts_best = db_starts[best_idx_reg]
549 db_ends_best = db_ends[best_idx_reg]
550 if count_region == len(indices_plus):
551 strand_gff = "-"
552 create_gff3(domain_type, ann_substring, unique_annotations,
553 ann_pos_counts, mins[count_region], maxs[count_region],
554 step, best_idx, annotation_best, db_name_best,
555 db_starts_best, db_ends_best, strand_gff, score,
556 seq_id, db_seq, query_seq, domain_size, positions, gff)
557 count_region += 1
558 seq_ids.append(seq_id)
559 os.unlink(query_temp)
560 gff.close()
561 dom_tmp.close()
562 ## if any sequence from input data was split into windows, merge and adjust the data from individual windows
563 if any("DANTE_PART" in x for x in seq_ids):
564 adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step)
565 ## otherwise use the temporary output as the final domains gff
566 else:
567 shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN)
568 os.unlink(dom_tmp.name)
569
570
571 def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step):
572 ''' Original gff file is adjusted in case of containing cut parts
573 - for consecutive sequences overlap is divided to half with first half
574 of records(domains) belonging to the first sequence and second to the following one.
575 Duplicate domains going through the middle of the overlap are removed.
576 First and the last part (marked as LAST) of a certain sequence are
577 handled separately as the are overlapped from one side only '''
578
579 seq_id_all = []
580 class_dict = defaultdict(int)
581 seen = set()
582 with open(OUTPUT_DOMAIN, "w") as adjusted_gff:
583 with open(gff, "r") as primary_gff:
584 start = True
585 for line in primary_gff:
586 if line.startswith("#"):
587 adjusted_gff.write(line)
588 else:
589 split_line = line.split("\t")
590 classification = split_line[-1].split(";")[1].split("=")[1]
591 if start:
592 seq_id_all.append(split_line[0].split("_DANTE_PART")[
593 0])
594 start = False
595 seq_id = split_line[0].split("_DANTE_PART")[0]
596 if "DANTE_PART" in line:
597 line_without_id = "\t".join(split_line[1:])
598 part = int(split_line[0].split("_DANTE_PART")[1].split(
599 ":")[0].split("_")[0])
600 if seq_id != seq_id_all[-1]:
601 seq_id_all.append(seq_id)
602
603 ## first part of the sequence
604 if part == 1:
605 cut_end = WIN_DOM - OVERLAP_DOM / 2
606 if int(split_line[3]) <= cut_end <= int(split_line[
607 4]):
608 if line_without_id not in seen:
609 adjusted_gff.write("{}\t{}".format(
610 seq_id, line_without_id))
611 class_dict[classification] += 1
612 seen.add(line_without_id)
613 elif int(split_line[4]) < cut_end:
614 adjusted_gff.write("{}\t{}".format(
615 seq_id, line_without_id))
616 class_dict[classification] += 1
617
618 ## last part of the sequence
619 elif "LAST" in split_line[0]:
620 cut_start = OVERLAP_DOM / 2 + (part - 1) * step
621 if int(split_line[3]) <= cut_start <= int(
622 split_line[4]):
623 if line_without_id not in seen:
624 adjusted_gff.write("{}\t{}".format(
625 seq_id, line_without_id))
626 class_dict[classification] += 1
627 seen.add(line_without_id)
628 elif int(split_line[3]) > cut_start:
629 adjusted_gff.write("{}\t{}".format(
630 seq_id, line_without_id))
631 class_dict[classification] += 1
632
633 ## middle part of the sequence
634 else:
635 cut_start = OVERLAP_DOM / 2 + (part - 1) * step
636 cut_end = WIN_DOM - OVERLAP_DOM / 2 + (part -
637 1) * step
638 if int(split_line[3]) <= cut_start <= int(
639 split_line[4]) or int(split_line[
640 3]) <= cut_end <= int(split_line[4]):
641 if line_without_id not in seen:
642 adjusted_gff.write("{}\t{}".format(
643 seq_id, line_without_id))
644 class_dict[classification] += 1
645 seen.add(line_without_id)
646 elif int(split_line[3]) > cut_start and int(
647 split_line[4]) < cut_end:
648 adjusted_gff.write("{}\t{}".format(
649 seq_id, line_without_id))
650 class_dict[classification] += 1
651 ## not divived
652 else:
653 if seq_id != seq_id_all[-1]:
654 seq_id_all.append(seq_id)
655 adjusted_gff.write(line)
656 class_dict[classification] += 1
657
658
659 def main(args):
660
661 t = time.time()
662
663 QUERY = args.query
664 LAST_DB = args.protein_database
665 CLASSIFICATION = args.classification
666 OUTPUT_DOMAIN = args.domain_gff
667 NEW_LDB = args.new_ldb
668 OUTPUT_DIR = args.output_dir
669 THRESHOLD_SCORE = args.threshold_score
670 WIN_DOM = args.win_dom
671 OVERLAP_DOM = args.overlap_dom
672
673 if OUTPUT_DOMAIN is None:
674 OUTPUT_DOMAIN = configuration.DOMAINS_GFF
675 if os.path.isdir(LAST_DB):
676 LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE)
677 if os.path.isdir(CLASSIFICATION):
678 CLASSIFICATION = os.path.join(CLASSIFICATION, configuration.CLASS_FILE)
679
680 if NEW_LDB:
681 subprocess.call("lastdb -p -cR01 {} {}".format(LAST_DB, LAST_DB),
682 shell=True)
683
684 if OUTPUT_DIR and not os.path.exists(OUTPUT_DIR):
685 os.makedirs(OUTPUT_DIR)
686
687 if not os.path.isabs(OUTPUT_DOMAIN):
688 if OUTPUT_DIR is None:
689 OUTPUT_DIR = configuration.TMP
690 if not os.path.exists(OUTPUT_DIR):
691 os.makedirs(OUTPUT_DIR)
692 OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR,
693 os.path.basename(OUTPUT_DOMAIN))
694 domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
695 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM)
696
697 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))
698
699
700 if __name__ == "__main__":
701 import argparse
702 from argparse import RawDescriptionHelpFormatter
703
704 class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
705 argparse.RawDescriptionHelpFormatter):
706 pass
707
708 parser = argparse.ArgumentParser(
709 description=
710 '''Script performs similarity search on given DNA sequence(s) in (multi)fasta against our protein domains database of all Transposable element for certain group of organisms (Viridiplantae or Metazoans). Domains are subsequently annotated and classified - in case certain domain has multiple annotations assigned, classifation is derived from the common classification level of all of them. Domains search is accomplished engaging LASTAL alignment tool.
711
712 DEPENDENCIES:
713 - python 3.4 or higher with packages:
714 -numpy
715 - lastal 744 or higher [http://last.cbrc.jp/]
716 - configuration.py module
717
718 EXAMPLE OF USAGE:
719
720 ./protein_domains_pd.py -q PATH_TO_INPUT_SEQ -pdb PATH_TO_PROTEIN_DB -cs PATH_TO_CLASSIFICATION_FILE
721
722 When running for the first time with a new database use -nld option allowing lastal to create indexed database files:
723
724 -nld True
725
726 ''',
727 epilog="""""",
728 formatter_class=CustomFormatter)
729
730 requiredNamed = parser.add_argument_group('required named arguments')
731 requiredNamed.add_argument(
732 "-q",
733 "--query",
734 type=str,
735 required=True,
736 help=
737 'input DNA sequence to search for protein domains in a fasta format. Multifasta format allowed.')
738 requiredNamed.add_argument('-pdb',
739 "--protein_database",
740 type=str,
741 required=True,
742 help='protein domains database file')
743 requiredNamed.add_argument('-cs',
744 '--classification',
745 type=str,
746 required=True,
747 help='protein domains classification file')
748 parser.add_argument("-oug",
749 "--domain_gff",
750 type=str,
751 help="output domains gff format")
752 parser.add_argument(
753 "-nld",
754 "--new_ldb",
755 type=str,
756 default=False,
757 help=
758 "create indexed database files for lastal in case of working with new protein db")
759 parser.add_argument(
760 "-dir",
761 "--output_dir",
762 type=str,
763 help="specify if you want to change the output directory")
764 parser.add_argument(
765 "-thsc",
766 "--threshold_score",
767 type=int,
768 default=80,
769 help=
770 "percentage of the best score in the cluster to be tolerated when assigning annotations per base")
771 parser.add_argument(
772 "-wd",
773 "--win_dom",
774 type=int,
775 default=10000000,
776 help="window to process large input sequences sequentially")
777 parser.add_argument("-od",
778 "--overlap_dom",
779 type=int,
780 default=10000,
781 help="overlap of sequences in two consecutive windows")
782
783 args = parser.parse_args()
784 main(args)