comparison profrep.py @ 0:a5f1638b73be draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 08:01:42 -0400
#!/usr/bin/env python3

import subprocess
import csv
import time
import sys
import matplotlib
# matplotlib.use("PDF")
matplotlib.use("pdf")
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import multiprocessing
import argparse
import os
from functools import partial
from multiprocessing import Pool
from tempfile import NamedTemporaryFile
from operator import itemgetter
from itertools import groupby
import gff
import configuration
import visualization
import distutils
from distutils import dir_util
import tempfile
import re
from Bio import SeqIO
import pickle
import shutil
import warnings
import random
import numpy as np
import dante_gff_output_filtering as domains_filtering
import dante as protein_domains

t_profrep = time.time()
np.set_printoptions(threshold=sys.maxsize)
warnings.filterwarnings("ignore", module="matplotlib")


class Range():
    '''
    This class is used to check a float range in argparse
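    (argparse tests "choices" membership via equality, so __eq__ reports
    True for any float within [start, end])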
    '''

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __eq__(self, other):
        return self.start <= other <= self.end

    def __str__(self):
        return "float range {}..{}".format(self.start, self.end)

    def __repr__(self):
        return "float range {}..{}".format(self.start, self.end)


def get_version(path):
    branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
                                     shell=True,
                                     cwd=path).decode('ascii').strip()
    shorthash = subprocess.check_output("git log --pretty=format:'%h' -n 1 ",
                                        shell=True,
                                        cwd=path).decode('ascii').strip()
    revcount = len(subprocess.check_output("git log --oneline",
                                           shell=True,
                                           cwd=path).decode('ascii').split())
    version_string = ("-------------------------------------"
                      "-------------------------------------\n"
                      "PIPELINE VERSION : "
                      "{branch}-rv-{revcount}({shorthash})\n"
                      "-------------------------------------"
                      "-------------------------------------\n").format(
                          branch=branch,
                          shorthash=shorthash,
                          revcount=revcount, )
    return (version_string)


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected')


def check_fasta_id(QUERY):
    forbidden_ids = []
    headers = []
    for record in SeqIO.parse(QUERY, "fasta"):
        if any(x in record.id for x in configuration.FORBIDDEN_CHARS):
            forbidden_ids.append(record.id)
        headers.append(record.id)
    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in the multifasta input are not named correctly:
            seq IDs (the part before the first space) must be unique''')
    return forbidden_ids, headers


def multifasta(QUERY):
    ''' Create single-fasta temporary files to be processed sequentially '''
    PATTERN = ">"
    fasta_list = []
    with open(QUERY, "r") as fasta:
        reader = fasta.read()
        splitter = reader.split(PATTERN)[1:]
        for fasta_num, part in enumerate(splitter):
            ntf = NamedTemporaryFile(delete=False)
            ntf.write("{}{}".format(PATTERN, part).encode("utf-8"))
            fasta_list.append(ntf.name)
            ntf.close()
    return fasta_list


def fasta_read(subfasta):
    ''' Read a fasta file; return the header and the sequence without line breaks '''
    sequence_lines = []
    with open(subfasta, "r") as fasta:
        header = fasta.readline().strip().split(" ")[0][1:]
        for line in fasta:
            clean_line = line.strip()
            if clean_line:
                sequence_lines.append(clean_line)
    sequence = "".join(sequence_lines)
    return header, sequence


def cluster_annotation(CL_ANNOTATION_TBL):
    ''' Create a dictionary of known annotation classes and related clusters '''
    cl_annotations = {}
    annot_table = np.genfromtxt(CL_ANNOTATION_TBL, dtype=str)
    for line in annot_table:
        if line[1] in cl_annotations:
            cl_annotations[line[1]].append(line[0])
        else:
            cl_annotations[line[1]] = [line[0]]
    return list(cl_annotations.items()), list(cl_annotations.keys())


def read_annotation(CLS, cl_annotations_items):
    ''' Build a dictionary of known repeat classes and related reads '''
    reads_annotations = {}
    with open(CLS, "r") as cls_file:
        count = 0
        for line in cls_file:
            line = line.rstrip()
            count += 1
            if count % 2 == 0:
                reads = re.split(r"\s+", line)
                for element in reads:
                    for key, value in cl_annotations_items:
                        if clust in value:
                            reads_annotations[element] = key
            else:
                clust = re.split(r"\s+", line)[0].split(">CL")[1]
    return reads_annotations


def annot_profile(annotation_keys, part):
    ''' Predefine the dictionary of known annotations and the partial
    sequence repetitive profiles filled in by the parallel process '''
    subprofile = {}
    for key in annotation_keys:
        subprofile[key] = [np.zeros(part, dtype=int),
                           np.zeros(part, dtype=int)]
    subprofile["ALL"] = [np.zeros(part, dtype=int), np.zeros(part, dtype=int)]
    return subprofile


def parallel_process(WINDOW, OVERLAP, seq_length, annotation_keys,
                     reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                     BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER,
                     last_index, subsets_num, subset_index):
    ''' Run the parallel function to process the input sequence in windows:
    run blast for the subsequence defined by the input index and window size,
    then create and increment the subprofile vector based on the reads
    aligned within the window '''
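    # subset_index is the 0-based start offset of this window within the
    # sequence; BLAST -query_loc coordinates are 1-based, hence the +1 below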
    loc_start = subset_index + 1
    loc_end = subset_index + WINDOW
    if loc_end >= seq_length:
        loc_end = seq_length
        subprofile = annot_profile(annotation_keys, seq_length - loc_start + 1)
    else:
        subprofile = annot_profile(annotation_keys, WINDOW + 1)

    # Find HSP records for every window defined by query location and parse
    # the tabular stdout:
    # 1. query ID, 2. database read ID, 3. alignment start, 4. alignment end,
    # 5. bitscore, 6. percent identity
    p = subprocess.Popen(
        "blastn -query {} -query_loc {}-{} -db {} -evalue {} -word_size {} -dust {} -task {} -num_alignments {} -outfmt '6 qseqid sseqid qstart qend bitscore pident'".format(
            subfasta, loc_start, loc_end, BLAST_DB, E_VALUE, WORD_SIZE,
            DUST_FILTER, BLAST_TASK, MAX_ALIGNMENTS),
        stdout=subprocess.PIPE,
        shell=True)
    count_hits = 0
    for line in p.stdout:
        column = line.decode("utf-8").rstrip().split("\t")
        if float(column[4]) >= BITSCORE:
            count_hits += 1
            read = column[1]  # ID of the individual aligned read
            if "reduce" in read:
                reads_representation = int(read.split("reduce")[-1])
            else:
                reads_representation = 1
            qstart = int(column[2])  # starting position of alignment
            qend = int(column[3])  # ending position of alignment
            if read in reads_annotations:
                annotation = reads_annotations[read]
            else:
                annotation = "ALL"
            subprofile[annotation][0][qstart - subset_index - 1:
                                      qend - subset_index] = \
                subprofile[annotation][0][qstart - subset_index - 1:
                                          qend - subset_index] \
                + reads_representation
            subprofile[annotation][1][qstart - subset_index - 1:
                                      qend - subset_index] = \
                subprofile[annotation][1][qstart - subset_index - 1:
                                          qend - subset_index] \
                + float(column[5]) * reads_representation
    subprofile["ALL"][0] = sum([item[0] for item in subprofile.values()])
    subprofile["ALL"][1] = sum([item[1] for item in subprofile.values()])
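    # The quality vector accumulated summed percent identities; divide by the
    # number of hits per position to get the average identity there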
    for repeat in subprofile.keys():
        subprofile[repeat][1] = [
            int(round(quality / hits_num)) if hits_num != 0 else quality
            for hits_num, quality in zip(subprofile[repeat][0],
                                         subprofile[repeat][1])
        ]
    if subset_index == 0:
        if subsets_num == 1:
            subprf_name = subprofile_single(subprofile, subset_index)
        else:
            subprf_name = subprofile_first(subprofile, subset_index, WINDOW,
                                           OVERLAP)
    elif subset_index == last_index:
        subprf_name = subprofile_last(subprofile, subset_index, OVERLAP)
    else:
        subprf_name = subprofiles_middle(subprofile, subset_index, WINDOW,
                                         OVERLAP)
    return subprf_name


def subprofile_single(subprofile, subset_index):
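    ''' Pickle the profile of a sequence short enough to fit in one window '''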
    subprofile['idx'] = list(range(1, len(subprofile["ALL"][0]) + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


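# The three functions below trim half of the OVERLAP from the shared edges of
# neighbouring windows (first window: right edge only, middle windows: both
# edges, last window: left edge only) so that the concatenated subprofiles
# tile the sequence exactly once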
def subprofile_first(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][0:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][0:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + 1, subset_index + WINDOW -
                                   OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofiles_middle(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + WINDOW - OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofile_last(subprofile, subset_index, OVERLAP):
    len_subprofile = len(subprofile['ALL'][0])
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + len_subprofile + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def concatenate_prof(subprofiles_all, files_dict, seq_id, HTML_DATA,
                     wig_files):
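    ''' Append the pickled subprofiles of one sequence to per-class wig
    files (hit counts and average qualities); the files are created on
    first use and registered in files_dict '''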
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
            exclude = set(["idx"])
            for key in set(individual_dict.keys()).difference(exclude):
                if any(individual_dict[key][0]):
                    indices = handle_zero_lines(individual_dict[key][0])
                    if key not in files_dict.keys():
                        prf_name = "{}/{}.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_qual_name = "{}/{}_qual.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_file = open(prf_name, "w")
                        prf_q_file = open(prf_qual_name, "w")
                        prf_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        prf_q_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                individual_dict[key][0][i]))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][1][i])))
                        files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                        wig_files.append(prf_file)
                        wig_files.append(prf_q_file)
                        prf_file.close()
                        prf_q_file.close()
                    else:
                        prf_name = files_dict[key][0]
                        prf_qual_name = files_dict[key][2]
                        with open(prf_name, "a") as prf_file, open(
                                prf_qual_name, "a") as prf_q_file:
                            if seq_id not in files_dict[key][1]:
                                prf_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                prf_q_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                files_dict[key][1].append(seq_id)
                            for i in indices:
                                prf_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    individual_dict[key][0][i]))
                                prf_q_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][1][i])))
    return files_dict, wig_files


def concatenate_prof_CN(CV, subprofiles_all, files_dict, seq_id, HTML_DATA,
                        wig_files):
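    ''' Same as concatenate_prof, but hit counts are divided by the coverage
    CV so that the wig files report copy numbers '''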
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
            exclude = set(["idx"])
            for key in set(individual_dict.keys()).difference(exclude):
                if any(individual_dict[key][0]):
                    indices = handle_zero_lines(individual_dict[key][0])
                    if key not in files_dict.keys():
                        prf_name = "{}/{}.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_qual_name = "{}/{}_qual.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_file = open(prf_name, "w")
                        prf_q_file = open(prf_qual_name, "w")
                        prf_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        prf_q_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][0][i] / CV)))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][1][i])))
                        files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                        wig_files.append(prf_file)
                        wig_files.append(prf_q_file)
                        prf_file.close()
                        prf_q_file.close()
                    else:
                        prf_name = files_dict[key][0]
                        prf_qual_name = files_dict[key][2]
                        with open(prf_name, "a") as prf_file, open(
                                prf_qual_name, "a") as prf_q_file:
                            if seq_id not in files_dict[key][1]:
                                prf_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                prf_q_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                files_dict[key][1].append(seq_id)
                            for i in indices:
                                prf_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][0][i] / CV)))
                                prf_q_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][1][i])))
    return files_dict, wig_files


def handle_zero_lines(repeat_subhits):
    ''' Drop positions that contain only zeros, i.e. positions without any
    hit; the border positions of each zero run are preserved so that the
    graphs are plotted correctly '''
    zero_idx = [idx for idx, val in enumerate(repeat_subhits) if val == 0]
    indices = [idx for idx, val in enumerate(repeat_subhits) if val != 0]
    zero_breakpoints = []
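    # enumerate(zero_idx) pairs each zero index with its position in the
    # list; index - value is constant within a run of consecutive zeros, so
    # groupby splits the zeros into contiguous runs and only the first and
    # last position of each run are kept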
    for key, group in groupby(
            enumerate(zero_idx),
            lambda index_item: index_item[0] - index_item[1]):
        group = list(map(itemgetter(1), group))
        zero_breakpoints.append(group[0])
        zero_breakpoints.append(group[-1])
    if indices:
        indices.extend(zero_breakpoints)
        indices = sorted(set(indices), key=int)
    else:
        indices = []
    return indices


def repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
                        xminimal, xmaximal, domains, seq_ids_dom, CN,
                        seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta sequence; create
    the gff file and the profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        count_seq += 1
    return None


def repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA, CN,
                    seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta sequence; create
    the gff file and the profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        plt.close()
        count_seq += 1
    return None


def html_output(total_length, seq_lengths_all, seq_names, HTML, DB_NAME, REF,
                REF_LINK):
    ''' Define the html output with a limited number of output pictures and
    a link to JBrowse '''
    info = "\t\t".join(['<pre> {} [{} bp]</pre>'.format(seq_name, seq_length)
                        for seq_name, seq_length in zip(seq_names,
                                                        seq_lengths_all)])
    if REF:
        ref_part_1 = REF.split("-")[0]
        ref_part_2 = "-".join(REF.split("-")[1:]).split(". ")[0]
        ref_part_3 = ". ".join("-".join(REF.split("-")[1:]).split(". ")[1:])
        ref_string = '''<h6> {} - <a href="{}" target="_blank" >{}</a>. {}'''.format(
            ref_part_1, REF_LINK, ref_part_2, ref_part_3)
    else:
        ref_string = "Custom Data"
    pictures = "\n\t\t".join(['<img src="{}.png" width=1800>'.format(
        pic) for pic in range(len(seq_names))[:configuration.MAX_PIC_NUM]])
    html_str = configuration.HTML_STR.format(info, total_length, DB_NAME,
                                             pictures, ref_string)
    with open(HTML, "w") as html_file:
        html_file.write(html_str)


def adjust_tracklist(jbrowse_data_path):
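    ''' Truncate trackList.json just before its closing bracket so that
    extra track definitions can be appended; return the removed tail lines
    so the caller can write them back afterwards '''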
    starting_lines = []
    ending_lines = []
    end = False
    with open(
            os.path.join(jbrowse_data_path,
                         "trackList.json"), "r") as track_list:
        for line in track_list:
            if "]" not in line and not end:
                starting_lines.append(line)
            else:
                end = True
                ending_lines.append(line)
    with open(
            os.path.join(jbrowse_data_path,
                         "trackList.json"), "w") as track_list:
        for line in starting_lines:
            track_list.write(line)
    return ending_lines


def jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
                     total_length, JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert the output data to be displayed
    as tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUT_DOMAIN_GFF, "--trackLabel",
                         "GFF_domains", "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        # Control the total length processed; if above the threshold, don't
        # create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig", files_dict[repeat_id][0],
                                 os.path.join(HTML_DATA,
                                              configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None


def jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
                 JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert the output data to be displayed
    as tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        ## Control the total length processed; if above the threshold, don't
        ## create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig", files_dict[repeat_id][0],
                                 os.path.join(HTML_DATA,
                                              configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None


def genome2coverage(GS, BLAST_DB):
    ''' Convert genome size to coverage '''
    num_of_reads = 0
    with open(BLAST_DB, "r") as reads_all:
        first_line = reads_all.readline()
        if first_line.startswith(">"):
            num_of_reads += 1
            first_seq = reads_all.readline().rstrip()
        for line in reads_all:
            if line.startswith(">"):
                num_of_reads += 1
    len_of_read = len(first_seq)
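    # CV = total sequenced bases / genome size; every read is assumed to be
    # as long as the first one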
    CV = (num_of_reads * len_of_read) / (GS * 1000000)  # GS in Mb
    return CV


def prepared_data(DB_ID):
    ''' Get prepared repeat annotation data from the datasets table based on
    the selected species ID '''
    tbl = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        configuration.PROFREP_DATA, configuration.PROFREP_TBL)
    with open(tbl, "r") as datasets:
        for line in datasets:
            if line.split("\t")[0] == DB_ID:
                DB_NAME = line.split("\t")[1]
                CV = float(line.split("\t")[5])
                REF = line.split("\t")[6]
                REF_LINK = line.split("\t")[7]
    return DB_NAME, CV, REF, REF_LINK


def seq_sizes_file(seq_ids, seq_lengths_all, HTML_DATA):
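    ''' Write a chrom.sizes file (sequence ID and length per line) needed by
    wigToBigWig '''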
    chrom_sizes = os.path.join(HTML_DATA, configuration.CHROM_SIZES_FILE)
    with open(chrom_sizes, "w") as chroms:
        for seq_id, seq_length in zip(seq_ids, seq_lengths_all):
            chroms.write("{}\t{}\n".format(seq_id, seq_length))


def main(args):
    ## Command line arguments
    QUERY = args.query
    BLAST_DB = args.reads
    CL_ANNOTATION_TBL = args.ann_tbl
    CLS = args.cls
    BITSCORE = args.bit_score
    E_VALUE = args.e_value
    WORD_SIZE = args.word_size
    WINDOW = args.window
    OVERLAP = args.overlap
    BLAST_TASK = args.task
    MAX_ALIGNMENTS = args.max_alignments
    NEW_DB = args.new_db
    THRESHOLD = args.threshold_repeat
    THRESHOLD_SEGMENT = args.threshold_segment
    TH_IDENTITY = args.th_identity
    TH_LENGTH = args.th_length
    TH_INTERRUPT = args.interruptions
    TH_SIMILARITY = args.th_similarity
    TH_LEN_RATIO = args.max_len_proportion
    OUTPUT_GFF = args.output_gff
    DOMAINS = args.protein_domains
    LAST_DB = args.protein_database
    CLASSIFICATION = args.classification
    OUT_DOMAIN_GFF = args.domain_gff
    HTML = args.html_file
    HTML_DATA = args.html_path
    N_GFF = args.n_gff
    CN = args.copy_numbers
    GS = args.genome_size
    DB_ID = args.db_id
    THRESHOLD_SCORE = args.threshold_score
    WIN_DOM = args.win_dom
    OVERLAP_DOM = args.overlap_dom
    JBROWSE_BIN = args.jbrowse_bin
    DUST_FILTER = args.dust_filter
    LOG_FILE = args.log_file
    #JBROWSE_BIN = os.environ['JBROWSE_SOURCE_DIR']+"/bin"
    #if not JBROWSE_BIN:
    #    try:
    #        JBROWSE_BIN = os.environ['JBROWSE_BIN']
    #    except KeyError:
    #        raise ValueError('No path to the JBrowse bin directory was found - set the environment variable JBROWSE_BIN or pass the argument explicitly')
    if CN and not DB_ID and not GS:
        raise ValueError(
            "Genome size missing - to convert hits to copy numbers, please "
            "enter the --genome_size parameter")

    ## Check if there are forbidden characters in the fasta IDs
    [forbidden_ids, headers] = check_fasta_id(QUERY)
    if forbidden_ids:
        ##################### USER ERROR ###############################
        raise UserWarning(
            "The following IDs contain forbidden characters ('/' or '\\') - PLEASE REPLACE OR DELETE THEM:\n{}".format(
                "\n".join(forbidden_ids)))

    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in the multifasta input are not named correctly:
            seq IDs (the part before the first space) must be unique''')

    ## Create a new blast database of reads
    if NEW_DB:
        subprocess.call("makeblastdb -in {} -dbtype nucl".format(BLAST_DB),
                        shell=True)

    ## Parse the prepared annotation data table
    if DB_ID:
        [DB_NAME, CV, REF, REF_LINK] = prepared_data(DB_ID)
    else:
        REF = None
        REF_LINK = None
        DB_NAME = "CUSTOM"

    ## Create a dir to store outputs for html and JBrowse
    if not os.path.exists(HTML_DATA):
        os.makedirs(HTML_DATA)

    if not os.path.isabs(HTML):
        HTML = os.path.join(HTML_DATA, HTML)

    if not os.path.isabs(OUT_DOMAIN_GFF):
        OUT_DOMAIN_GFF = os.path.join(HTML_DATA, OUT_DOMAIN_GFF)

    if not os.path.isabs(LOG_FILE):
        LOG_FILE = os.path.join(HTML_DATA, LOG_FILE)

    if not os.path.isabs(N_GFF):
        N_GFF = os.path.join(HTML_DATA, N_GFF)

    if not os.path.isabs(OUTPUT_GFF):
        OUTPUT_GFF = os.path.join(HTML_DATA, OUTPUT_GFF)

    path = os.path.dirname(os.path.realpath(__file__))
    version_string = get_version(path)

    log = os.open(LOG_FILE, os.O_RDWR | os.O_CREAT)

    os.write(log, version_string.encode("utf-8"))

    ## Define parameters for the parallel process
    STEP = WINDOW - OVERLAP
    NUM_CORES = multiprocessing.cpu_count()
    os.write(log, "NUM_OF_CORES = {}\n".format(NUM_CORES).encode("utf-8"))

    ## Convert genome size to coverage
    if CN and GS:
        CV = genome2coverage(GS, BLAST_DB)
        os.write(log, "COVERAGE = {}\n".format(CV).encode("utf-8"))

    parallel_pool = Pool(NUM_CORES)

    ## Assign clusters to repetitive classes
    [cl_annotations_items, annotation_keys
     ] = cluster_annotation(CL_ANNOTATION_TBL)

    ## Assign reads to repetitive classes
    reads_annotations = read_annotation(CLS, cl_annotations_items)

    ## Detect all fasta sequences from the input
    fasta_list = multifasta(QUERY)
    headers = []
    files_dict = {}
    wig_files = []
    seq_count = 1
    start = 1
    total_length = 0
    seq_lengths_all = []
    Ngff = open(N_GFF, "w")
    Ngff.write("{}\n".format(configuration.HEADER_GFF))

    ## Find hits for each fasta sequence separately
    t_blast = time.time()
    for subfasta in fasta_list:
        [header, sequence] = fasta_read(subfasta)
        os.write(log, "Sequence {} is being processed...\n".format(
            header).encode("utf-8"))
        os.fsync(log)
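        # 1-based positions of N/n bases; contiguous runs are written to the
        # N-regions gff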
        indices_N = [indices + 1
                     for indices, n in enumerate(sequence)
                     if n == "n" or n == "N"]
        if indices_N:
            gff.idx_ranges_N(indices_N, configuration.N_segment, header, Ngff,
                             configuration.N_NAME, configuration.N_FEATURE)
        seq_length = len(sequence)
        headers.append(header)
        ## Create the parallel process
        subset_index = list(range(0, seq_length, STEP))
        ## Handle the case when the penultimate window is incomplete but is
        ## followed by another one
        if len(subset_index) > 1 and subset_index[-2] + WINDOW >= seq_length:
            subset_index = subset_index[:-1]
        last_index = subset_index[-1]
        index_range = range(len(subset_index))
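        # Process the windows in chunks so that at most MAX_FILES_SUBPROFILES
        # temporary subprofile files exist at a time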
        for chunk_index in index_range[0::configuration.MAX_FILES_SUBPROFILES]:
            multiple_param = partial(
                parallel_process, WINDOW, OVERLAP, seq_length, annotation_keys,
                reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER, last_index,
                len(subset_index))
            subprofiles_all = parallel_pool.map(multiple_param, subset_index[
                chunk_index:chunk_index + configuration.MAX_FILES_SUBPROFILES])
            ## Join the partial profiles into the final profile of the sequence
            if CN:
                [files_dict, wig_files
                 ] = concatenate_prof_CN(CV, subprofiles_all, files_dict,
                                         header, HTML_DATA, wig_files)
            else:
                [files_dict, wig_files] = concatenate_prof(
                    subprofiles_all, files_dict, header, HTML_DATA, wig_files)
            for subprofile in subprofiles_all:
                os.unlink(subprofile)
        total_length += seq_length
        seq_lengths_all.append(seq_length)

    os.write(log, "ELAPSED_TIME_BLAST = {} s\n".format(
        time.time() - t_blast).encode("utf-8"))
    os.write(log, "TOTAL_LENGTH_ANALYZED = {} bp\n".format(
        total_length).encode("utf-8"))

    ## Close opened files
    for opened_file in wig_files:
        opened_file.close()
    Ngff.close()

    ## Create a file containing the sizes of sequences to convert wig to bigwig
    seq_sizes_file(headers, seq_lengths_all, HTML_DATA)

    ## Protein domains module
    t_domains = time.time()
    if DOMAINS:
        os.write(log, "Domains module has started...\n".encode("utf-8"))
        os.fsync(log)
        domains_primary = NamedTemporaryFile(delete=False)
        protein_domains.domain_search(QUERY, LAST_DB, CLASSIFICATION,
                                      domains_primary.name, THRESHOLD_SCORE,
                                      WIN_DOM, OVERLAP_DOM)
        domains_primary.close()
        [xminimal, xmaximal, domains, seq_ids_dom
         ] = domains_filtering.filter_qual_dom(
             domains_primary.name, OUT_DOMAIN_GFF, TH_IDENTITY, TH_SIMILARITY,
             TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, 'All', "")
        os.unlink(domains_primary.name)
        os.write(log, "ELAPSED_TIME_DOMAINS = {} s\n".format(
            time.time() - t_domains).encode("utf-8"))

        # Process individual sequences from the input file sequentially
        t_gff_vis = time.time()
        repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT,
                            HTML_DATA, xminimal, xmaximal, domains,
                            seq_ids_dom, CN, headers, seq_lengths_all,
                            files_dict)
        os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(
            time.time() - t_gff_vis).encode("utf-8"))
        os.fsync(log)
        # Prepare data for the html output
        t_jbrowse = time.time()
        os.write(log, "JBrowse tracks are being prepared...\n".encode("utf-8"))
        os.fsync(log)
        jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
                         total_length, JBROWSE_BIN, files_dict)
        os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(
            time.time() - t_jbrowse).encode("utf-8"))
    else:
        # Process individual sequences from the input file sequentially
        t_gff_vis = time.time()
        repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
                        CN, headers, seq_lengths_all, files_dict)
        os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(
            time.time() - t_gff_vis).encode("utf-8"))

        # Prepare data for the html output
        t_jbrowse = time.time()
        jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
                     JBROWSE_BIN, files_dict)
        os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(
            time.time() - t_jbrowse).encode("utf-8"))

    # Create the HTML output
    t_html = time.time()
    os.write(
        log,
        "HTML output and JBrowse data structure are being prepared...\n".encode(
            "utf-8"))
    os.fsync(log)
    html_output(total_length, seq_lengths_all, headers, HTML, DB_NAME, REF,
                REF_LINK)
    os.write(log, "ELAPSED_TIME_HTML = {} s\n".format(
        time.time() - t_html).encode("utf-8"))
    os.write(log, "ELAPSED_TIME_PROFREP = {} s\n".format(
        time.time() - t_profrep).encode("utf-8"))
    os.close(log)

    ## Clean up the temporary fasta files
    for subfasta in fasta_list:
        os.unlink(subfasta)


if __name__ == "__main__":
    from argparse import RawDescriptionHelpFormatter

    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        pass

    # Default paths (command line usage)
    HTML = configuration.HTML
    DOMAINS_GFF = configuration.DOMAINS_GFF
    REPEATS_GFF = configuration.REPEATS_GFF
    N_GFF = configuration.N_GFF
    LOG_FILE = configuration.LOG_FILE
    PROFREP_OUTPUT_DIR = configuration.PROFREP_OUTPUT_DIR

    # Command line arguments
    parser = argparse.ArgumentParser(
        description='''

    DEPENDENCIES:
        - python 3.4 or higher with packages:
            - numpy
            - matplotlib
            - biopython
        - BLAST 2.2.28+ or higher
        - LAST 744 or higher
        - wigToBigWig
        - cd-hit
        - JBrowse - ! Only the bin directory is needed; it does not have to be installed under a web server
        * ProfRep Modules:
            - gff.py
            - visualization.py
            - configuration.py
            - protein_domains.py
            - domains_filtering.py

    EXAMPLE OF USAGE:

        ./profrep.py --query PATH_TO_DNA_SEQ --reads PATH_TO_READS --ann_tbl PATH_TO_CLUSTERS_CLASSIFICATION --cls PATH_TO_hitsort.cls [--new_db True]
        ''',
        epilog="""take a look at README for more detailed information""",
        formatter_class=CustomFormatter)

    Required = parser.add_argument_group('required arguments')
    altRequired = parser.add_argument_group(
        'alternative required arguments - prepared datasets')
    blastOpt = parser.add_argument_group('optional arguments - BLAST Search')
    parallelOpt = parser.add_argument_group(
        'optional arguments - Parallel Processing')
    protOpt = parser.add_argument_group('optional arguments - Protein Domains')
    outOpt = parser.add_argument_group('optional arguments - Output Paths')
    cnOpt = parser.add_argument_group(
        'optional arguments - Copy Numbers/Hits')
    galaxyOpt = parser.add_argument_group(
        'optional arguments - Environment Variables')

    ################ INPUTS ############################################
    Required.add_argument('-q',
                          '--query',
                          type=str,
                          required=True,
                          help='input DNA sequence in (multi)fasta format')
    Required.add_argument('-rdb',
                          '--reads',
                          type=str,
                          required=True,
                          help='blast database of all sequencing reads')
    Required.add_argument(
        '-a',
        '--ann_tbl',
        type=str,
        required=True,
        help=
        'clusters annotation table: tab-separated cluster number and its classification')
    Required.add_argument(
        '-c',
        '--cls',
        type=str,
        required=True,
        help='cls file containing reads assigned to clusters (hitsort.cls)')
    altRequired.add_argument(
        '-id',
        '--db_id',
        type=str,
        help='annotation dataset ID (first column of the datasets table)')

    ################ BLAST parameters ##################################
    blastOpt.add_argument('-bs',
                          '--bit_score',
                          type=float,
                          default=50,
                          help='bitscore threshold')
    blastOpt.add_argument(
        '-m',
        '--max_alignments',
        type=int,
        default=10000000,
        help=
        'blast filtering option: maximal number of alignments in the output')
    blastOpt.add_argument('-e',
                          '--e_value',
                          type=str,
                          default=0.1,
                          help='blast setting option: e-value')
    blastOpt.add_argument(
        '-df',
        '--dust_filter',
        type=str,
        default="'20 64 1'",
        help='dust filter of low-complexity regions during the BLAST search')
    blastOpt.add_argument(
        '-ws',
        '--word_size',
        type=int,
        default=11,
        help='blast search option: initial word size for alignment')
    blastOpt.add_argument('-t',
                          '--task',
                          type=str,
                          default="blastn",
                          help='type of blast to be triggered')
    blastOpt.add_argument(
        '-n',
        '--new_db',
        type=str2bool,
        default=True,
        help=
        'create a new blast database, USE THIS OPTION IF YOU RUN PROFREP WITH A NEW DATABASE FOR THE FIRST TIME')

    ############### PARALLEL PROCESSING ARGUMENTS ######################
    parallelOpt.add_argument(
        '-w',
        '--window',
        type=int,
        default=5000,
        help='sliding window size for parallel processing')
    parallelOpt.add_argument(
        '-o',
        '--overlap',
        type=int,
        default=150,
        help=
        'overlap of regions processed in parallel; set it greater than the read size')

    ################ PROTEIN DOMAINS PARAMETERS ########################
    protOpt.add_argument('-pd',
                         '--protein_domains',
                         type=str2bool,
                         default=False,
                         help='use the protein domains module')
    protOpt.add_argument('-pdb',
                         '--protein_database',
                         type=str,
                         help='protein domains database')
    protOpt.add_argument('-cs',
                         '--classification',
                         type=str,
                         help='protein domains classification file')
    protOpt.add_argument(
        '-wd',
        '--win_dom',
        type=int,
        default=10000000,
        help=
        'protein domains module: sliding window to process large input sequences sequentially')
    protOpt.add_argument(
        '-od',
        '--overlap_dom',
        type=int,
        default=10000,
        help=
        'protein domains module: overlap of sequences in two consecutive windows')
    protOpt.add_argument(
        '-thsc',
        '--threshold_score',
        type=int,
        default=80,
        help=
        'protein domains module: percentage of the best score within the cluster required to report a domain as significant')
    protOpt.add_argument("-thl",
                         "--th_length",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.8,
                         help="proportion of alignment length threshold")
    protOpt.add_argument("-thi",
                         "--th_identity",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.35,
                         help="proportion of alignment identity threshold")
    protOpt.add_argument(
        "-ths",
        "--th_similarity",
        type=float,
        choices=[Range(0.0, 1.0)],
        default=0.45,
        help="threshold for proportional alignment similarity")
    protOpt.add_argument(
        "-ir",
        "--interruptions",
        type=int,
        default=3,
        help=
        "tolerance threshold for interruptions (frameshifts + stop codons) per 100 AA")
    protOpt.add_argument(
        "-mlen",
        "--max_len_proportion",
        type=float,
        default=1.2,
        help=
        "maximal proportion of the alignment length to the original length of the protein domain in the database")

    ################ OUTPUTS ###########################################
    outOpt.add_argument('-lg',
                        '--log_file',
                        type=str,
                        default=LOG_FILE,
                        help='path to the log file')
    outOpt.add_argument('-ouf',
                        '--output_gff',
                        type=str,
                        default=REPEATS_GFF,
                        help='path to the output gff of repetitive regions')
    outOpt.add_argument('-oug',
                        '--domain_gff',
                        type=str,
                        default=DOMAINS_GFF,
                        help='path to the output gff of protein domains')
    outOpt.add_argument('-oun',
                        '--n_gff',
                        type=str,
                        default=N_GFF,
                        help='path to the output gff of N regions')
    outOpt.add_argument('-hf',
                        '--html_file',
                        type=str,
                        default=HTML,
                        help='path to the output html file')
    outOpt.add_argument('-hp',
                        '--html_path',
                        type=str,
                        default=PROFREP_OUTPUT_DIR,
                        help='path to html extra files')

    ################ HITS/COPY NUMBERS ####################################
    cnOpt.add_argument('-cn',
                       '--copy_numbers',
                       type=str2bool,
                       default=False,
                       help='convert hits to copy numbers')
    cnOpt.add_argument(
        '-gs',
        '--genome_size',
        type=float,
        help=
        'genome size is required when converting hits to copy numbers with custom data')
    cnOpt.add_argument(
        '-thr',
        '--threshold_repeat',
        type=int,
        default=3,
        help=
        'threshold for hits/copy numbers per position to be considered repetitive')
    cnOpt.add_argument(
        '-thsg',
        '--threshold_segment',
        type=int,
        default=80,
        help='threshold for the length of a repetitive segment to be reported')

    ################ JBrowse ##########################
    galaxyOpt.add_argument('-jb',
                           '--jbrowse_bin',
                           type=str,
                           help='path to the JBrowse bin directory')

    args = parser.parse_args()
    main(args)