#!/usr/bin/env python3

import subprocess
import csv
import time
import sys
import matplotlib
# matplotlib.use("PDF")
matplotlib.use("pdf")
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import multiprocessing
import argparse
import os
from functools import partial
from multiprocessing import Pool
from tempfile import NamedTemporaryFile
from operator import itemgetter
from itertools import groupby
import gff
import configuration
import visualization
import distutils
from distutils import dir_util
import tempfile
import re
from Bio import SeqIO
import pickle
import shutil
import warnings
import random
import numpy as np
import dante_gff_output_filtering as domains_filtering
import dante as protein_domains

t_profrep = time.time()
np.set_printoptions(threshold=sys.maxsize)
warnings.filterwarnings("ignore", module="matplotlib")


class Range():
    '''
    This class is used to check a float range in argparse
    '''

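    # Note: argparse validates "value in choices" via __eq__, so a single
    # Range(0.0, 1.0) instance in the choices list accepts any float in
    # that interval (see the -thl/-thi/-ths arguments below).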
    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __eq__(self, other):
        return self.start <= other <= self.end

    def __str__(self):
        return "float range {}..{}".format(self.start, self.end)

    def __repr__(self):
        return "float range {}..{}".format(self.start, self.end)


def get_version(path):
    branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
                                     shell=True,
                                     cwd=path).decode('ascii').strip()
    shorthash = subprocess.check_output("git log --pretty=format:'%h' -n 1 ",
                                        shell=True,
                                        cwd=path).decode('ascii').strip()
    revcount = len(subprocess.check_output("git log --oneline",
                                           shell=True,
                                           cwd=path).decode('ascii').split())
    version_string = ("-------------------------------------"
                      "-------------------------------------\n"
                      "PIPELINE VERSION : "
                      "{branch}-rv-{revcount}({shorthash})\n"
                      "-------------------------------------"
                      "-------------------------------------\n").format(
                          branch=branch,
                          shorthash=shorthash,
                          revcount=revcount, )
    return version_string


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected')


def check_fasta_id(QUERY):
    forbidden_ids = []
    headers = []
    for record in SeqIO.parse(QUERY, "fasta"):
        if any(x in record.id for x in configuration.FORBIDDEN_CHARS):
            forbidden_ids.append(record.id)
        headers.append(record.id)
    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in multifasta format are not named correctly:
            seq IDs (before the first space) are the same''')
    return forbidden_ids, headers


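# Each fasta record of the input is written to its own temporary file; the
# caller (main) removes these files with os.unlink once processing is done.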
def multifasta(QUERY):
    ''' Create single fasta temporary files to be processed sequentially '''
    PATTERN = ">"
    fasta_list = []
    with open(QUERY, "r") as fasta:
        reader = fasta.read()
        splitter = reader.split(PATTERN)[1:]
        for fasta_num, part in enumerate(splitter):
            ntf = NamedTemporaryFile(delete=False)
            ntf.write("{}{}".format(PATTERN, part).encode("utf-8"))
            fasta_list.append(ntf.name)
            ntf.close()
    return fasta_list


def fasta_read(subfasta):
    ''' Read fasta, get the header and the sequence without gaps '''
    sequence_lines = []
    with open(subfasta, "r") as fasta:
        header = fasta.readline().strip().split(" ")[0][1:]
        for line in fasta:
            clean_line = line.strip()
            if clean_line:
                sequence_lines.append(clean_line)
    sequence = "".join(sequence_lines)
    return header, sequence


def cluster_annotation(CL_ANNOTATION_TBL):
    ''' Create dictionary of known annotation classes and related clusters '''
    cl_annotations = {}
    annot_table = np.genfromtxt(CL_ANNOTATION_TBL, dtype=str)
    for line in annot_table:
        if line[1] in cl_annotations:
            cl_annotations[line[1]].append(line[0])
        else:
            cl_annotations[line[1]] = [line[0]]
    return list(cl_annotations.items()), list(cl_annotations.keys())


def read_annotation(CLS, cl_annotations_items):
    ''' Dictionary of known repeat classes and related reads '''
    reads_annotations = {}
    with open(CLS, "r") as cls_file:
        count = 0
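        # hitsort.cls alternates a cluster header line (">CL<number> ...")
        # with a line listing the read IDs of that cluster, hence the
        # odd/even line handling below.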
        for line in cls_file:
            line = line.rstrip()
            count += 1
            if count % 2 == 0:
                reads = re.split(r"\s+", line)
                for element in reads:
                    for key, value in cl_annotations_items:
                        if clust in value:
                            reads_annotations[element] = key
            else:
                clust = re.split(r"\s+", line)[0].split(">CL")[1]
    return reads_annotations


def annot_profile(annotation_keys, part):
    ''' Predefine dictionary of known annotations and partial sequence
    repetitive profiles defined by parallel process '''
    subprofile = {}
    for key in annotation_keys:
        subprofile[key] = [np.zeros(part, dtype=int),
                           np.zeros(part, dtype=int)]
    subprofile["ALL"] = [np.zeros(part, dtype=int), np.zeros(part, dtype=int)]
    return subprofile


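# The input sequence is scanned in windows of WINDOW bp whose start positions
# lie STEP = WINDOW - OVERLAP apart, so neighbouring windows share OVERLAP bp.
# Each window is BLASTed independently; the shared halves of the overlaps are
# trimmed again when the per-window subprofiles are merged (see the
# subprofile_first/middle/last helpers below).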
def parallel_process(WINDOW, OVERLAP, seq_length, annotation_keys,
                     reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                     BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER,
                     last_index, subsets_num, subset_index):
    ''' Run parallel function to process the input sequence in windows
    Run blast for subsequence defined by the input index and window size
    Create and increment subprofile vector based on reads aligned within window '''
    loc_start = subset_index + 1
    loc_end = subset_index + WINDOW
    if loc_end >= seq_length:
        loc_end = seq_length
        subprofile = annot_profile(annotation_keys, seq_length - loc_start + 1)
    else:
        subprofile = annot_profile(annotation_keys, WINDOW + 1)

    # Find HSP records for every window defined by query location and parse the tabular stdout:
    # 1. query, 2. database read, 3. alignment start, 4. alignment end, 5. bitscore
    p = subprocess.Popen(
        "blastn -query {} -query_loc {}-{} -db {} -evalue {} -word_size {} -dust {} -task {} -num_alignments {} -outfmt '6 qseqid sseqid qstart qend bitscore pident'".format(
            subfasta, loc_start, loc_end, BLAST_DB, E_VALUE, WORD_SIZE,
            DUST_FILTER, BLAST_TASK, MAX_ALIGNMENTS),
        stdout=subprocess.PIPE,
        shell=True)
    count_hits = 0
    for line in p.stdout:
        column = line.decode("utf-8").rstrip().split("\t")
        if float(column[4]) >= BITSCORE:
            count_hits += 1
            read = column[1]  # ID of individual aligned read
            if "reduce" in read:
                reads_representation = int(read.split("reduce")[-1])
            else:
                reads_representation = 1
            qstart = int(column[2])  # starting position of alignment
            qend = int(column[3])  # ending position of alignment
            if read in reads_annotations:
                annotation = reads_annotations[read]
            else:
                annotation = "ALL"
            subprofile[annotation][0][qstart - subset_index - 1:qend - subset_index] = (
                subprofile[annotation][0][qstart - subset_index - 1:qend - subset_index]
                + reads_representation)
            subprofile[annotation][1][qstart - subset_index - 1:qend - subset_index] = (
                subprofile[annotation][1][qstart - subset_index - 1:qend - subset_index]
                + float(column[5]) * reads_representation)
    subprofile["ALL"][0] = sum([item[0] for item in subprofile.values()])
    subprofile["ALL"][1] = sum([item[1] for item in subprofile.values()])
    for repeat in subprofile.keys():
        subprofile[repeat][1] = [int(round(quality / hits_num))
                                 if hits_num != 0 else quality
                                 for hits_num, quality in zip(
                                     subprofile[repeat][0],
                                     subprofile[repeat][1])]
    if subset_index == 0:
        if subsets_num == 1:
            subprf_name = subprofile_single(subprofile, subset_index)
        else:
            subprf_name = subprofile_first(subprofile, subset_index, WINDOW,
                                           OVERLAP)
    elif subset_index == last_index:
        subprf_name = subprofile_last(subprofile, subset_index, OVERLAP)
    else:
        subprf_name = subprofiles_middle(subprofile, subset_index, WINDOW,
                                         OVERLAP)
    return subprf_name


def subprofile_single(subprofile, subset_index):
    subprofile['idx'] = list(range(1, len(subprofile["ALL"][0]) + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofile_first(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][0:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][0:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + 1,
                                   subset_index + WINDOW - OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofiles_middle(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + WINDOW - OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofile_last(subprofile, subset_index, OVERLAP):
    len_subprofile = len(subprofile['ALL'][0])
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + len_subprofile + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


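# files_dict maps each repeat class to [path of its hits .wig file, list of
# sequence IDs already written to it, path of the matching *_qual.wig file];
# these wig tracks are later converted to BigWig for JBrowse.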
def concatenate_prof(subprofiles_all, files_dict, seq_id, HTML_DATA,
                     wig_files):
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
            exclude = set(["idx"])
            for key in set(individual_dict.keys()).difference(exclude):
                if any(individual_dict[key][0]):
                    indices = handle_zero_lines(individual_dict[key][0])
                    if key not in files_dict.keys():
                        prf_name = "{}/{}.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_qual_name = "{}/{}_qual.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_file = open(prf_name, "w")
                        prf_q_file = open(prf_qual_name, "w")
                        prf_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        prf_q_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                individual_dict[key][0][i]))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][1][i])))
                        files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                        wig_files.append(prf_file)
                        wig_files.append(prf_q_file)
                        prf_file.close()
                        prf_q_file.close()
                    else:
                        prf_name = files_dict[key][0]
                        prf_qual_name = files_dict[key][2]
                        with open(prf_name, "a") as prf_file, open(
                                prf_qual_name, "a") as prf_q_file:
                            if seq_id not in files_dict[key][1]:
                                prf_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                prf_q_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                files_dict[key][1].append(seq_id)
                            for i in indices:
                                prf_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    individual_dict[key][0][i]))
                                prf_q_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][1][i])))
    return files_dict, wig_files


def concatenate_prof_CN(CV, subprofiles_all, files_dict, seq_id, HTML_DATA,
                        wig_files):
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
            exclude = set(["idx"])
            for key in set(individual_dict.keys()).difference(exclude):
                if any(individual_dict[key][0]):
                    indices = handle_zero_lines(individual_dict[key][0])
                    if key not in files_dict.keys():
                        prf_name = "{}/{}.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_qual_name = "{}/{}_qual.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_file = open(prf_name, "w")
                        prf_q_file = open(prf_qual_name, "w")
                        prf_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        prf_q_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][0][i] / CV)))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][1][i])))
                        files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                        wig_files.append(prf_file)
                        wig_files.append(prf_q_file)
                        prf_file.close()
                        prf_q_file.close()
                    else:
                        prf_name = files_dict[key][0]
                        prf_qual_name = files_dict[key][2]
                        with open(prf_name, "a") as prf_file, open(
                                prf_qual_name, "a") as prf_q_file:
                            if seq_id not in files_dict[key][1]:
                                prf_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                prf_q_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                files_dict[key][1].append(seq_id)
                            for i in indices:
                                prf_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][0][i] / CV)))
                                prf_q_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][1][i])))
    return files_dict, wig_files


def handle_zero_lines(repeat_subhits):
    ''' Skip positions that contain only zeros, i.e. positions without any hit.
    Border zero positions are preserved so that the graphs plot correctly '''
    zero_idx = [idx for idx, val in enumerate(repeat_subhits) if val == 0]
    indices = [idx for idx, val in enumerate(repeat_subhits) if val != 0]
    zero_breakpoints = []
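    # Group consecutive zero positions: within a run of consecutive indices
    # the difference "enumeration index - value" is constant, so groupby on
    # that key yields one group per run; only the run borders are kept.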
    for key, group in groupby(
            enumerate(zero_idx),
            lambda index_item: index_item[0] - index_item[1]):
        group = list(map(itemgetter(1), group))
        zero_breakpoints.append(group[0])
        zero_breakpoints.append(group[-1])
    if indices:
        indices.extend(zero_breakpoints)
        indices = sorted(set(indices), key=int)
    else:
        indices = []
    return indices


def repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
                        xminimal, xmaximal, domains, seq_ids_dom, CN,
                        seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta, create gff file and profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        count_seq += 1
    return None


def repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA, CN,
                    seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta, create gff file and profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        plt.close()
        count_seq += 1
    return None


def html_output(total_length, seq_lengths_all, seq_names, HTML, DB_NAME, REF,
                REF_LINK):
    ''' Define html output with limited number of output pictures and link to JBrowse '''
    info = "\t\t".join(['<pre> {} [{} bp]</pre>'.format(seq_name, seq_length)
                        for seq_name, seq_length in zip(seq_names,
                                                        seq_lengths_all)])
    if REF:
        ref_part_1 = REF.split("-")[0]
        ref_part_2 = "-".join(REF.split("-")[1:]).split(". ")[0]
        ref_part_3 = ". ".join("-".join(REF.split("-")[1:]).split(". ")[1:])
        ref_string = '''<h6> {} - <a href="{}" target="_blank" >{}</a>. {}'''.format(
            ref_part_1, REF_LINK, ref_part_2, ref_part_3)
    else:
        ref_string = "Custom Data"
    pictures = "\n\t\t".join(['<img src="{}.png" width=1800>'.format(
        pic) for pic in range(len(seq_names))[:configuration.MAX_PIC_NUM]])
    html_str = configuration.HTML_STR.format(info, total_length, DB_NAME,
                                             pictures, ref_string)
    with open(HTML, "w") as html_file:
        html_file.write(html_str)


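# trackList.json keeps its track entries in a JSON array; everything up to the
# line closing that array ("]") is written back so extra BigWig track entries
# can be appended, and the removed tail is returned to be restored afterwards.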
def adjust_tracklist(jbrowse_data_path):
    starting_lines = []
    ending_lines = []
    end = False
    with open(
            os.path.join(jbrowse_data_path,
                         "trackList.json"), "r") as track_list:
        for line in track_list:
            if "]" not in line and not end:
                starting_lines.append(line)
            else:
                end = True
                ending_lines.append(line)
    with open(
            os.path.join(jbrowse_data_path,
                         "trackList.json"), "w") as track_list:
        for line in starting_lines:
            track_list.write(line)
    return ending_lines


def jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
                     total_length, JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert output data to be displayed as tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUT_DOMAIN_GFF, "--trackLabel",
                         "GFF_domains", "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        # Control the total length processed; if above the threshold, don't create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig",
                                 files_dict[repeat_id][0],
                                 os.path.join(HTML_DATA,
                                              configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None


def jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
                 JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert output data to be displayed as tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        ## Control the total length processed; if above the threshold, don't create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig",
                                 files_dict[repeat_id][0],
                                 os.path.join(HTML_DATA,
                                              configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None


def genome2coverage(GS, BLAST_DB):
    ''' Convert genome size to coverage '''
    num_of_reads = 0
    with open(BLAST_DB, "r") as reads_all:
        first_line = reads_all.readline()
        if first_line.startswith(">"):
            num_of_reads += 1
        first_seq = reads_all.readline().rstrip()
        for line in reads_all:
            if line.startswith(">"):
                num_of_reads += 1
    len_of_read = len(first_seq)
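    # Coverage = total read bases / genome size, assuming all reads have the
    # length of the first read; e.g. 10,000,000 reads of 100 bp over a 500 Mb
    # genome give CV = (10e6 * 100) / (500 * 1e6) = 2.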
    CV = (num_of_reads * len_of_read) / (GS * 1000000)  # GS in Mb
    return CV


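# The datasets table is expected to be tab-separated with the dataset ID in
# column 0, its name in column 1, read coverage in column 5, and the reference
# citation and link in columns 6 and 7 (as indexed below).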
def prepared_data(DB_ID):
    ''' Get prepared rep. annotation data from the table based on the selected species ID '''
    tbl = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        configuration.PROFREP_DATA, configuration.PROFREP_TBL)
    with open(tbl, "r") as datasets:
        for line in datasets:
            if line.split("\t")[0] == DB_ID:
                DB_NAME = line.split("\t")[1]
                CV = float(line.split("\t")[5])
                REF = line.split("\t")[6]
                REF_LINK = line.split("\t")[7]
    return DB_NAME, CV, REF, REF_LINK


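# Writes the "chrom sizes" file (sequence ID and length per line) that
# wigToBigWig needs when the wig profiles are converted to BigWig tracks.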
def seq_sizes_file(seq_ids, seq_lengths_all, HTML_DATA):
    chrom_sizes = os.path.join(HTML_DATA, configuration.CHROM_SIZES_FILE)
    with open(chrom_sizes, "w") as chroms:
        for seq_id, seq_length in zip(seq_ids, seq_lengths_all):
            chroms.write("{}\t{}\n".format(seq_id, seq_length))


def main(args):
    ## Command line arguments
    QUERY = args.query
    BLAST_DB = args.reads
    CL_ANNOTATION_TBL = args.ann_tbl
    CLS = args.cls
    BITSCORE = args.bit_score
    E_VALUE = args.e_value
    WORD_SIZE = args.word_size
    WINDOW = args.window
    OVERLAP = args.overlap
    BLAST_TASK = args.task
    MAX_ALIGNMENTS = args.max_alignments
    NEW_DB = args.new_db
    THRESHOLD = args.threshold_repeat
    THRESHOLD_SEGMENT = args.threshold_segment
    TH_IDENTITY = args.th_identity
    TH_LENGTH = args.th_length
    TH_INTERRUPT = args.interruptions
    TH_SIMILARITY = args.th_similarity
    TH_LEN_RATIO = args.max_len_proportion
    OUTPUT_GFF = args.output_gff
    DOMAINS = args.protein_domains
    LAST_DB = args.protein_database
    CLASSIFICATION = args.classification
    OUT_DOMAIN_GFF = args.domain_gff
    HTML = args.html_file
    HTML_DATA = args.html_path
    N_GFF = args.n_gff
    CN = args.copy_numbers
    GS = args.genome_size
    DB_ID = args.db_id
    THRESHOLD_SCORE = args.threshold_score
    WIN_DOM = args.win_dom
    OVERLAP_DOM = args.overlap_dom
    JBROWSE_BIN = args.jbrowse_bin
    DUST_FILTER = args.dust_filter
    LOG_FILE = args.log_file
    #JBROWSE_BIN = os.environ['JBROWSE_SOURCE_DIR']+"/bin"
    #if not JBROWSE_BIN:
    #    try:
    #        JBROWSE_BIN = os.environ['JBROWSE_BIN']
    #    except KeyError:
    #        raise ValueError('There was no path to the JBrowse bin found - set the environment variable JBROWSE_BIN or pass the argument explicitly')
    if CN and not DB_ID and not GS:
        raise ValueError(
            "Genome size missing - if you want to convert hits to copy numbers, please enter the --genome_size parameter")


    ## Check if there are forbidden characters in fasta IDs
    [forbidden_ids, headers] = check_fasta_id(QUERY)
    if forbidden_ids:
        ##################### USER ERROR ###############################
        raise UserWarning(
            "The following IDs contain forbidden characters ('/' or '\\') - PLEASE REPLACE OR DELETE THEM:\n{}".format(
                "\n".join(forbidden_ids)))

    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in multifasta format are not named correctly:
            seq IDs (before the first space) are the same''')

    ## Create new blast database of reads
    if NEW_DB:
        subprocess.call("makeblastdb -in {} -dbtype nucl".format(BLAST_DB),
                        shell=True)

    ## Parse prepared annotation data table
    if DB_ID:
        [DB_NAME, CV, REF, REF_LINK] = prepared_data(DB_ID)
    else:
        REF = None
        REF_LINK = None
        DB_NAME = "CUSTOM"

    ## Create dir to store outputs for html and JBrowse
    if not os.path.exists(HTML_DATA):
        os.makedirs(HTML_DATA)

    if not os.path.isabs(HTML):
        HTML = os.path.join(HTML_DATA, HTML)

    if not os.path.isabs(OUT_DOMAIN_GFF):
        OUT_DOMAIN_GFF = os.path.join(HTML_DATA, OUT_DOMAIN_GFF)

    if not os.path.isabs(LOG_FILE):
        LOG_FILE = os.path.join(HTML_DATA, LOG_FILE)

    if not os.path.isabs(N_GFF):
        N_GFF = os.path.join(HTML_DATA, N_GFF)

    if not os.path.isabs(OUTPUT_GFF):
        OUTPUT_GFF = os.path.join(HTML_DATA, OUTPUT_GFF)

    path = os.path.dirname(os.path.realpath(__file__))
    version_string = get_version(path)

    log = os.open(LOG_FILE, os.O_RDWR | os.O_CREAT)

    os.write(log, version_string.encode("utf-8"))

    ## Define parameters for parallel processing
    STEP = WINDOW - OVERLAP
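    # STEP is the spacing between the start positions of consecutive BLAST
    # windows; neighbouring windows therefore share OVERLAP bases, which are
    # trimmed again when the subprofiles are merged (subprofile_first/middle/last).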
    NUM_CORES = multiprocessing.cpu_count()
    os.write(log, "NUM_OF_CORES = {}\n".format(NUM_CORES).encode("utf-8"))

    ## Convert genome size to coverage
    if CN and GS:
        CV = genome2coverage(GS, BLAST_DB)
        os.write(log, "COVERAGE = {}\n".format(CV).encode("utf-8"))

    parallel_pool = Pool(NUM_CORES)

    ## Assign clusters to repetitive classes
    [cl_annotations_items, annotation_keys
     ] = cluster_annotation(CL_ANNOTATION_TBL)

    ## Assign reads to repetitive classes
    reads_annotations = read_annotation(CLS, cl_annotations_items)

    ## Detect all fasta sequences from the input
    fasta_list = multifasta(QUERY)
    headers = []
    files_dict = {}
    wig_files = []
    seq_count = 1
    start = 1
    total_length = 0
    seq_lengths_all = []
    Ngff = open(N_GFF, "w")
    Ngff.write("{}\n".format(configuration.HEADER_GFF))

    ## Find hits for each fasta sequence separately
    t_blast = time.time()
    for subfasta in fasta_list:
        [header, sequence] = fasta_read(subfasta)
        os.write(log, "Sequence {} is being processed...\n".format(
            header).encode("utf-8"))
        os.fsync(log)
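        # Collect 1-based positions of N/n bases and pass them to
        # gff.idx_ranges_N, which writes the corresponding N-region features
        # to the N GFF track.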
        indices_N = [indices + 1
                     for indices, n in enumerate(sequence)
                     if n == "n" or n == "N"]
        if indices_N:
            gff.idx_ranges_N(indices_N, configuration.N_segment, header, Ngff,
                             configuration.N_NAME, configuration.N_FEATURE)
        seq_length = len(sequence)
        headers.append(header)
        ## Create parallel process
        subset_index = list(range(0, seq_length, STEP))
        ## Situation when the penultimate window is not complete but is followed by another one
        if len(subset_index) > 1 and subset_index[-2] + WINDOW >= seq_length:
            subset_index = subset_index[:-1]
        last_index = subset_index[-1]
        index_range = range(len(subset_index))
        for chunk_index in index_range[0::configuration.MAX_FILES_SUBPROFILES]:
            multiple_param = partial(
                parallel_process, WINDOW, OVERLAP, seq_length, annotation_keys,
                reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER, last_index,
                len(subset_index))
            subprofiles_all = parallel_pool.map(multiple_param, subset_index[
                chunk_index:chunk_index + configuration.MAX_FILES_SUBPROFILES])
            ## Join partial profiles to the final profile of the sequence
            if CN:
                [files_dict, wig_files
                 ] = concatenate_prof_CN(CV, subprofiles_all, files_dict,
                                         header, HTML_DATA, wig_files)
            else:
                [files_dict, wig_files] = concatenate_prof(
                    subprofiles_all, files_dict, header, HTML_DATA, wig_files)
            for subprofile in subprofiles_all:
                os.unlink(subprofile)
        total_length += seq_length
        seq_lengths_all.append(seq_length)

    os.write(log, "ELAPSED_TIME_BLAST = {} s\n".format(time.time(
    ) - t_blast).encode("utf-8"))
    os.write(
        log,
        "TOTAL_LENGTH_ANALYZED = {} bp\n".format(total_length).encode("utf-8"))

    ## Close opened files
    for opened_file in wig_files:
        opened_file.close()
    Ngff.close()

    ## Create file containing sizes of sequences to convert wig to bigwig
    seq_sizes_file(headers, seq_lengths_all, HTML_DATA)

    ## Protein domains module
    t_domains = time.time()
    if DOMAINS:
        os.write(log, "Domains module has started...\n".encode("utf-8"))
        os.fsync(log)
        domains_primary = NamedTemporaryFile(delete=False)
        protein_domains.domain_search(QUERY, LAST_DB, CLASSIFICATION,
                                      domains_primary.name, THRESHOLD_SCORE,
                                      WIN_DOM, OVERLAP_DOM)
        domains_primary.close()
        [xminimal, xmaximal, domains, seq_ids_dom
         ] = domains_filtering.filter_qual_dom(
             domains_primary.name, OUT_DOMAIN_GFF, TH_IDENTITY, TH_SIMILARITY,
             TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, 'All', "")
        os.unlink(domains_primary.name)
        os.write(log, "ELAPSED_TIME_DOMAINS = {} s\n".format(time.time(
        ) - t_domains).encode("utf-8"))

        # Process individual sequences from the input file sequentially
        t_gff_vis = time.time()
        repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT,
                            HTML_DATA, xminimal, xmaximal, domains,
                            seq_ids_dom, CN, headers, seq_lengths_all,
                            files_dict)
        os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(time.time(
        ) - t_gff_vis).encode("utf-8"))
        os.fsync(log)
        # Prepare data for html output
        t_jbrowse = time.time()
        os.write(log, "JBrowse tracks are being prepared...\n".encode("utf-8"))
        os.fsync(log)
        jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
                         total_length, JBROWSE_BIN, files_dict)
        os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(time.time(
        ) - t_jbrowse).encode("utf-8"))
    else:
        # Process individual sequences from the input file sequentially
        t_gff_vis = time.time()
        repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
                        CN, headers, seq_lengths_all, files_dict)
        os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(time.time(
        ) - t_gff_vis).encode("utf-8"))

        # Prepare data for html output
        t_jbrowse = time.time()
        jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
                     JBROWSE_BIN, files_dict)
        os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(time.time(
        ) - t_jbrowse).encode("utf-8"))

    # Create HTML output
    t_html = time.time()
    os.write(
        log,
        "HTML output and JBrowse data structure are being prepared...\n".encode(
            "utf-8"))
    os.fsync(log)
    html_output(total_length, seq_lengths_all, headers, HTML, DB_NAME, REF,
                REF_LINK)
    os.write(log, "ELAPSED_TIME_HTML = {} s\n".format(time.time() -
                                                      t_html).encode("utf-8"))
    os.write(log, "ELAPSED_TIME_PROFREP = {} s\n".format(time.time(
    ) - t_profrep).encode("utf-8"))
    os.close(log)

    ## Clean up the temporary fasta files
    for subfasta in fasta_list:
        os.unlink(subfasta)


if __name__ == "__main__":

    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        pass

    # Default paths (command line usage)
    HTML = configuration.HTML
    DOMAINS_GFF = configuration.DOMAINS_GFF
    REPEATS_GFF = configuration.REPEATS_GFF
    N_GFF = configuration.N_GFF
    LOG_FILE = configuration.LOG_FILE
    PROFREP_OUTPUT_DIR = configuration.PROFREP_OUTPUT_DIR

    # Command line arguments
    parser = argparse.ArgumentParser(
        description='''

    DEPENDENCIES:
        - python 3.4 or higher with packages:
            - numpy
            - matplotlib
            - biopython
        - BLAST 2.2.28+ or higher
        - LAST 744 or higher
        - wigToBigWig
        - cd-hit
        - JBrowse - ! Only bin needed, does not have to be installed under a web server
        * ProfRep Modules:
            - gff.py
            - visualization.py
            - configuration.py
            - protein_domains.py
            - domains_filtering.py

    EXAMPLE OF USAGE:

        ./protein.py --query PATH_TO_DNA_SEQ --reads PATH_TO_READS --ann_tbl PATH_TO_CLUSTERS_CLASSIFICATION --cls PATH_TO_hitsort.cls [--new_db True]
        ''',
        epilog="""take a look at README for more detailed information""",
        formatter_class=CustomFormatter)

    Required = parser.add_argument_group('required arguments')
    altRequired = parser.add_argument_group(
        'alternative required arguments - prepared datasets')
    blastOpt = parser.add_argument_group('optional arguments - BLAST Search')
    parallelOpt = parser.add_argument_group(
        'optional arguments - Parallel Processing')
    protOpt = parser.add_argument_group('optional arguments - Protein Domains')
    outOpt = parser.add_argument_group('optional arguments - Output Paths')
    cnOpt = parser.add_argument_group(
        'optional arguments - Copy Numbers/Hits ')
    galaxyOpt = parser.add_argument_group(
        'optional arguments - Environment Variables')

    ################ INPUTS ############################################
    Required.add_argument('-q',
                          '--query',
                          type=str,
                          required=True,
                          help='input DNA sequence in (multi)fasta format')
    Required.add_argument('-rdb',
                          '--reads',
                          type=str,
                          required=True,
                          help='blast database of all sequencing reads')
    Required.add_argument(
        '-a',
        '--ann_tbl',
        type=str,
        required=True,
        help=
        'clusters annotation table, tab-separated cluster number and its classification')
    Required.add_argument(
        '-c',
        '--cls',
        type=str,
        required=True,
        help='cls file containing reads assigned to clusters (hitsort.cls)')
    altRequired.add_argument(
        '-id',
        '--db_id',
        type=str,
        help='annotation dataset ID (first column of datasets table)')

    ################ BLAST parameters ##################################
    blastOpt.add_argument('-bs',
                          '--bit_score',
                          type=float,
                          default=50,
                          help='bitscore threshold')
    blastOpt.add_argument(
        '-m',
        '--max_alignments',
        type=int,
        default=10000000,
        help=
        'blast filtering option: maximal number of alignments in the output')
    blastOpt.add_argument('-e',
                          '--e_value',
                          type=str,
                          default=0.1,
                          help='blast setting option: e-value')
    blastOpt.add_argument(
        '-df',
        '--dust_filter',
        type=str,
        default="'20 64 1'",
        help='dust filters low-complexity regions during BLAST search')
    blastOpt.add_argument(
        '-ws',
        '--word_size',
        type=int,
        default=11,
        help='blast search option: initial word size for alignment')
    blastOpt.add_argument('-t',
                          '--task',
                          type=str,
                          default="blastn",
                          help='type of blast to be triggered')
    blastOpt.add_argument(
        '-n',
        '--new_db',
        type=str2bool,
        default=True,
        help=
        'create a new blast database, USE THIS OPTION IF YOU RUN PROFREP WITH A NEW DATABASE FOR THE FIRST TIME')

    ############### PARALLEL PROCESSING ARGUMENTS ######################
    parallelOpt.add_argument(
        '-w',
        '--window',
        type=int,
        default=5000,
        help='sliding window size for parallel processing')
    parallelOpt.add_argument(
        '-o',
        '--overlap',
        type=int,
        default=150,
        help=
        'overlap for regions processed in parallel, set greater than a read size')

    ################ PROTEIN DOMAINS PARAMETERS ########################
    protOpt.add_argument('-pd',
                         '--protein_domains',
                         type=str2bool,
                         default=False,
                         help='use module for protein domains')
    protOpt.add_argument('-pdb',
                         '--protein_database',
                         type=str,
                         help='protein domains database')
    protOpt.add_argument('-cs',
                         '--classification',
                         type=str,
                         help='protein domains classification file')
    protOpt.add_argument(
        '-wd',
        '--win_dom',
        type=int,
        default=10000000,
        help=
        'protein domains module: sliding window to process large input sequences sequentially')
    protOpt.add_argument(
        '-od',
        '--overlap_dom',
        type=int,
        default=10000,
        help=
        'protein domains module: overlap of sequences in two consecutive windows')
    protOpt.add_argument(
        '-thsc',
        '--threshold_score',
        type=int,
        default=80,
        help=
        'protein domains module: percentage of the best score within the cluster to report significant domains')
    protOpt.add_argument("-thl",
                         "--th_length",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.8,
                         help="proportion of alignment length threshold")
    protOpt.add_argument("-thi",
                         "--th_identity",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.35,
                         help="proportion of alignment identity threshold")
    protOpt.add_argument(
        "-ths",
        "--th_similarity",
        type=float,
        choices=[Range(0.0, 1.0)],
        default=0.45,
        help="threshold for alignment proportional similarity")
    protOpt.add_argument(
        "-ir",
        "--interruptions",
        type=int,
        default=3,
        help=
        "interruptions (frameshifts + stop codons) tolerance threshold per 100 AA")
    protOpt.add_argument(
        "-mlen",
        "--max_len_proportion",
        type=float,
        default=1.2,
        help=
        "maximal proportion of alignment length to the original length of protein domain from database")

    ################ OUTPUTS ###########################################
    outOpt.add_argument('-lg',
                        '--log_file',
                        type=str,
                        default=LOG_FILE,
                        help='path to log file')
    outOpt.add_argument('-ouf',
                        '--output_gff',
                        type=str,
                        default=REPEATS_GFF,
                        help='path to output gff of repetitive regions')
    outOpt.add_argument('-oug',
                        '--domain_gff',
                        type=str,
                        default=DOMAINS_GFF,
                        help='path to output gff of protein domains')
    outOpt.add_argument('-oun',
                        '--n_gff',
                        type=str,
                        default=N_GFF,
                        help='path to output gff of N regions')
    outOpt.add_argument('-hf',
                        '--html_file',
                        type=str,
                        default=HTML,
                        help='path to output html file')
    outOpt.add_argument('-hp',
                        '--html_path',
                        type=str,
                        default=PROFREP_OUTPUT_DIR,
                        help='path to html extra files')

    ################ HITS/COPY NUMBERS ####################################
    cnOpt.add_argument('-cn',
                       '--copy_numbers',
                       type=str2bool,
                       default=False,
                       help='convert hits to copy numbers')
    cnOpt.add_argument(
        '-gs',
        '--genome_size',
        type=float,
        help=
        'genome size is required when converting hits to copy numbers with custom data')
    cnOpt.add_argument(
        '-thr',
        '--threshold_repeat',
        type=int,
        default=3,
        help=
        'threshold for hits/copy numbers per position to be considered repetitive')
    cnOpt.add_argument(
        '-thsg',
        '--threshold_segment',
        type=int,
        default=80,
        help='threshold for the length of repetitive segment to be reported')

    ################ JBrowse ##########################
    galaxyOpt.add_argument('-jb',
                           '--jbrowse_bin',
                           type=str,
                           help='path to JBrowse bin directory')

    args = parser.parse_args()
    main(args)