profrep.py @ 0:a5f1638b73be (draft), Mercurial repository petr-novak/profrep
author: petr-novak
date:   Wed, 26 Jun 2019 08:01:42 -0400

#!/usr/bin/env python3

import subprocess
import csv
import time
import sys
import matplotlib
# the non-interactive backend must be selected before pyplot is imported
# matplotlib.use("PDF")
matplotlib.use("pdf")
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import multiprocessing
import argparse
import os
from functools import partial
from multiprocessing import Pool
from tempfile import NamedTemporaryFile
from operator import itemgetter
from itertools import groupby
import gff
import configuration
import visualization
import distutils
from distutils import dir_util
import tempfile
import re
from Bio import SeqIO
import pickle
import shutil
import warnings
import random
import numpy as np
import dante_gff_output_filtering as domains_filtering
import dante as protein_domains

t_profrep = time.time()
np.set_printoptions(threshold=sys.maxsize)
warnings.filterwarnings("ignore", module="matplotlib")


class Range():
    '''
    This class is used to check float range in argparse
    '''

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __eq__(self, other):
        return self.start <= other <= self.end

    def __str__(self):
        return "float range {}..{}".format(self.start, self.end)

    def __repr__(self):
        return "float range {}..{}".format(self.start, self.end)


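# Illustrative sketch, not part of the original pipeline: Range works as an
# argparse `choices` entry because argparse tests membership with `==`, and
# Range.__eq__ accepts any value inside the interval. The parser and option
# names below are hypothetical.
def _range_example():
    demo = argparse.ArgumentParser()
    demo.add_argument("--fraction", type=float, choices=[Range(0.0, 1.0)])
    assert demo.parse_args(["--fraction", "0.35"]).fraction == 0.35
    # a value outside 0.0..1.0 would make argparse exit with an error

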
def get_version(path):
    branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
                                     shell=True,
                                     cwd=path).decode('ascii').strip()
    shorthash = subprocess.check_output("git log --pretty=format:'%h' -n 1 ",
                                        shell=True,
                                        cwd=path).decode('ascii').strip()
    # count commits, one per line of `git log --oneline`
    revcount = len(subprocess.check_output(
        "git log --oneline", shell=True,
        cwd=path).decode('ascii').splitlines())
    version_string = ("-------------------------------------"
                      "-------------------------------------\n"
                      "PIPELINE VERSION : "
                      "{branch}-rv-{revcount}({shorthash})\n"
                      "-------------------------------------"
                      "-------------------------------------\n").format(
                          branch=branch,
                          shorthash=shorthash,
                          revcount=revcount)
    return version_string


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected')


def check_fasta_id(QUERY):
    ''' Collect sequence IDs containing forbidden characters and all headers '''
    forbidden_ids = []
    headers = []
    for record in SeqIO.parse(QUERY, "fasta"):
        if any(x in record.id for x in configuration.FORBIDDEN_CHARS):
            forbidden_ids.append(record.id)
        headers.append(record.id)
    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in multifasta format are not named correctly:
            seq IDs(before the first space) are the same''')
    return forbidden_ids, headers


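# Illustrative sketch, not part of the original pipeline: how the ID check is
# meant to be used before any processing starts. The file name is hypothetical;
# configuration.FORBIDDEN_CHARS is assumed to list characters such as '/' and
# '\\' that would break downstream file naming (see the error raised in main()).
def _check_fasta_id_example():
    bad_ids, all_headers = check_fasta_id("example_genome.fasta")
    if bad_ids:
        print("IDs to rename:", ", ".join(bad_ids))
    print("{} sequences found".format(len(all_headers)))

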
def multifasta(QUERY):
    ''' Create single fasta temporary files to be processed sequentially '''
    PATTERN = ">"
    fasta_list = []
    with open(QUERY, "r") as fasta:
        reader = fasta.read()
        splitter = reader.split(PATTERN)[1:]
        for fasta_num, part in enumerate(splitter):
            ntf = NamedTemporaryFile(delete=False)
            ntf.write("{}{}".format(PATTERN, part).encode("utf-8"))
            fasta_list.append(ntf.name)
            ntf.close()
    return fasta_list


def fasta_read(subfasta):
    ''' Read fasta, gain header and sequence without gaps '''
    sequence_lines = []
    with open(subfasta, "r") as fasta:
        header = fasta.readline().strip().split(" ")[0][1:]
        for line in fasta:
            clean_line = line.strip()
            if clean_line:
                sequence_lines.append(clean_line)
    sequence = "".join(sequence_lines)
    return header, sequence


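# Illustrative sketch, not part of the original pipeline: multifasta() writes
# one temporary file per ">" record and returns their paths, so the caller is
# responsible for deleting them (main() does this with os.unlink at the end).
# The input file name is hypothetical.
def _multifasta_example():
    parts = multifasta("example_genome.fasta")
    try:
        for path in parts:
            header, sequence = fasta_read(path)
            print(header, len(sequence))
    finally:
        for path in parts:
            os.unlink(path)

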
def cluster_annotation(CL_ANNOTATION_TBL):
    ''' Create dictionary of known annotations classes and related clusters '''
    cl_annotations = {}
    annot_table = np.genfromtxt(CL_ANNOTATION_TBL, dtype=str)
    for line in annot_table:
        if line[1] in cl_annotations:
            cl_annotations[line[1]].append(line[0])
        else:
            cl_annotations[line[1]] = [line[0]]
    return list(cl_annotations.items()), list(cl_annotations.keys())


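# Illustrative sketch, not part of the original pipeline: the parsing above
# implies a whitespace-separated table with the cluster number in the first
# column and its classification in the second (matching the --ann_tbl help
# text). The rows and classification names below are hypothetical.
def _cluster_annotation_example():
    demo_rows = "1\tLTR/Ty1_copia\n2\tLTR/Ty3_gypsy\n3\tLTR/Ty1_copia\n"
    with tempfile.NamedTemporaryFile("w", suffix=".tbl", delete=False) as tmp:
        tmp.write(demo_rows)
    items, keys = cluster_annotation(tmp.name)
    # items -> [('LTR/Ty1_copia', ['1', '3']), ('LTR/Ty3_gypsy', ['2'])]
    # keys  -> ['LTR/Ty1_copia', 'LTR/Ty3_gypsy']
    os.unlink(tmp.name)
    return items, keys

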
def read_annotation(CLS, cl_annotations_items):
    ''' Dictionary of known repeat classes and related reads '''
    reads_annotations = {}
    with open(CLS, "r") as cls_file:
        count = 0
        for line in cls_file:
            line = line.rstrip()
            count += 1
            if count % 2 == 0:
                reads = re.split(r"\s+", line)
                for element in reads:
                    for key, value in cl_annotations_items:
                        if clust in value:
                            reads_annotations[element] = key
            else:
                clust = re.split(r"\s+", line)[0].split(">CL")[1]
    return reads_annotations


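# Illustrative sketch, not part of the original pipeline: the loop above reads
# the .cls file as alternating lines, a ">CL<number>" header on odd lines and
# the whitespace-separated read IDs of that cluster on even lines. The read
# names and cluster annotations below are hypothetical.
def _read_annotation_example():
    demo_cls = ">CL1 123\nread_0001 read_0002\n>CL2 45\nread_0003\n"
    with tempfile.NamedTemporaryFile("w", suffix=".cls", delete=False) as tmp:
        tmp.write(demo_cls)
    items = [("LTR/Ty1_copia", ["1"]), ("LTR/Ty3_gypsy", ["2"])]
    mapping = read_annotation(tmp.name, items)
    # mapping -> {'read_0001': 'LTR/Ty1_copia', 'read_0002': 'LTR/Ty1_copia',
    #             'read_0003': 'LTR/Ty3_gypsy'}
    os.unlink(tmp.name)
    return mapping

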
def annot_profile(annotation_keys, part):
    ''' Predefine dictionary of known annotations and partial sequence
    repetitive profiles defined by parallel process '''
    subprofile = {}
    for key in annotation_keys:
        subprofile[key] = [np.zeros(part, dtype=int),
                           np.zeros(part, dtype=int)]
    subprofile["ALL"] = [np.zeros(part, dtype=int), np.zeros(part, dtype=int)]
    return subprofile


def parallel_process(WINDOW, OVERLAP, seq_length, annotation_keys,
                     reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                     BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER,
                     last_index, subsets_num, subset_index):
    ''' Run parallel function to process the input sequence in windows
    Run blast for subsequence defined by the input index and window size
    Create and increment subprofile vector based on reads aligned within window '''
    loc_start = subset_index + 1
    loc_end = subset_index + WINDOW
    if loc_end >= seq_length:
        loc_end = seq_length
        subprofile = annot_profile(annotation_keys, seq_length - loc_start + 1)
    else:
        subprofile = annot_profile(annotation_keys, WINDOW + 1)

    # Find HSP records for every window defined by query location and parse the tabular stdout:
    # 1. query, 2. database read, 3. alignment start, 4. alignment end, 5. bitscore, 6. percent identity
    p = subprocess.Popen(
        "blastn -query {} -query_loc {}-{} -db {} -evalue {} -word_size {} -dust {} -task {} -num_alignments {} -outfmt '6 qseqid sseqid qstart qend bitscore pident'".format(
            subfasta, loc_start, loc_end, BLAST_DB, E_VALUE, WORD_SIZE,
            DUST_FILTER, BLAST_TASK, MAX_ALIGNMENTS),
        stdout=subprocess.PIPE,
        shell=True)
    count_hits = 0
    for line in p.stdout:
        column = line.decode("utf-8").rstrip().split("\t")
        if float(column[4]) >= BITSCORE:
            count_hits += 1
            read = column[1]  # ID of individual aligned read
            if "reduce" in read:
                reads_representation = int(read.split("reduce")[-1])
            else:
                reads_representation = 1
            qstart = int(column[2])  # starting position of alignment
            qend = int(column[3])  # ending position of alignment
            if read in reads_annotations:
                annotation = reads_annotations[read]
            else:
                annotation = "ALL"
            hit_slice = slice(qstart - subset_index - 1, qend - subset_index)
            subprofile[annotation][0][hit_slice] = (
                subprofile[annotation][0][hit_slice] + reads_representation)
            subprofile[annotation][1][hit_slice] = (
                subprofile[annotation][1][hit_slice] +
                float(column[5]) * reads_representation)
    subprofile["ALL"][0] = sum([item[0] for item in subprofile.values()])
    subprofile["ALL"][1] = sum([item[1] for item in subprofile.values()])
    for repeat in subprofile.keys():
        subprofile[repeat][1] = [
            int(round(quality / hits_num)) if hits_num != 0 else quality
            for hits_num, quality in zip(subprofile[repeat][0],
                                         subprofile[repeat][1])
        ]
    if subset_index == 0:
        if subsets_num == 1:
            subprf_name = subprofile_single(subprofile, subset_index)
        else:
            subprf_name = subprofile_first(subprofile, subset_index, WINDOW,
                                           OVERLAP)
    elif subset_index == last_index:
        subprf_name = subprofile_last(subprofile, subset_index, OVERLAP)
    else:
        subprf_name = subprofiles_middle(subprofile, subset_index, WINDOW,
                                         OVERLAP)
    return subprf_name


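# Illustrative sketch, not part of the original pipeline: how the 1-based
# blastn query_loc range of each parallel job is derived from its window start.
# The toy sequence length, window and overlap values are hypothetical; main()
# builds the list of starts from range(0, seq_length, WINDOW - OVERLAP) and may
# drop a redundant last start.
def _window_coordinates_example():
    WINDOW, OVERLAP, seq_length = 20, 6, 50
    starts = list(range(0, seq_length, WINDOW - OVERLAP))   # [0, 14, 28, 42]
    for subset_index in starts:
        loc_start = subset_index + 1
        loc_end = min(subset_index + WINDOW, seq_length)
        print("blastn -query_loc {}-{}".format(loc_start, loc_end))
    # -> 1-20, 15-34, 29-48, 43-50 ; consecutive windows overlap by OVERLAP bp

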
def subprofile_single(subprofile, subset_index):
    subprofile['idx'] = list(range(1, len(subprofile["ALL"][0]) + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofile_first(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][0:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][0:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + 1,
                                   subset_index + WINDOW - OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofiles_middle(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + WINDOW - OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofile_last(subprofile, subset_index, OVERLAP):
    len_subprofile = len(subprofile['ALL'][0])
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + len_subprofile + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


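# Illustrative sketch, not part of the original pipeline: the three trimming
# functions above cut OVERLAP // 2 positions from each edge shared with a
# neighbouring window, so the stitched windows tile the sequence without double
# counting. The toy window, overlap and length values are hypothetical and
# match the _window_coordinates_example() above.
def _window_stitching_example():
    WINDOW, OVERLAP, seq_length = 20, 6, 50
    half = OVERLAP // 2                                         # 3
    # window starts: range(0, 50, WINDOW - OVERLAP) -> 0, 14, 28, 42
    # first window  (start 0):  keeps idx 1 .. WINDOW - half          -> 1..17
    # middle window (start 14): keeps idx 14 + half + 1 .. 14 + WINDOW - half -> 18..31
    # middle window (start 28): keeps idx 32 .. 45
    # last window   (start 42): keeps idx 42 + half + 1 .. seq_length -> 46..50
    pieces = [(1, 17), (18, 31), (32, 45), (46, 50)]
    covered = [pos for lo, hi in pieces for pos in range(lo, hi + 1)]
    assert covered == list(range(1, seq_length + 1))

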
def concatenate_prof(subprofiles_all, files_dict, seq_id, HTML_DATA,
                     wig_files):
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
            exclude = set(["idx"])
            for key in set(individual_dict.keys()).difference(exclude):
                if any(individual_dict[key][0]):
                    indices = handle_zero_lines(individual_dict[key][0])
                    if key not in files_dict.keys():
                        prf_name = "{}/{}.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_qual_name = "{}/{}_qual.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_file = open(prf_name, "w")
                        prf_q_file = open(prf_qual_name, "w")
                        prf_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        prf_q_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                individual_dict[key][0][i]))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][1][i])))
                        files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                        wig_files.append(prf_file)
                        wig_files.append(prf_q_file)
                        prf_file.close()
                        prf_q_file.close()
                    else:
                        prf_name = files_dict[key][0]
                        prf_qual_name = files_dict[key][2]
                        with open(prf_name, "a") as prf_file, \
                                open(prf_qual_name, "a") as prf_q_file:
                            if seq_id not in files_dict[key][1]:
                                prf_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                prf_q_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                files_dict[key][1].append(seq_id)
                            for i in indices:
                                prf_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    individual_dict[key][0][i]))
                                prf_q_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][1][i])))
    return files_dict, wig_files


def concatenate_prof_CN(CV, subprofiles_all, files_dict, seq_id, HTML_DATA,
                        wig_files):
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
            exclude = set(["idx"])
            for key in set(individual_dict.keys()).difference(exclude):
                if any(individual_dict[key][0]):
                    indices = handle_zero_lines(individual_dict[key][0])
                    if key not in files_dict.keys():
                        prf_name = "{}/{}.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_qual_name = "{}/{}_qual.wig".format(
                            HTML_DATA, re.sub(r'[\/\|]', '_', key))
                        prf_file = open(prf_name, "w")
                        prf_q_file = open(prf_qual_name, "w")
                        prf_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        prf_q_file.write("{}{}\n".format(
                            configuration.HEADER_WIG, seq_id))
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][0][i] / CV)))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i],
                                int(individual_dict[key][1][i])))
                        files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                        wig_files.append(prf_file)
                        wig_files.append(prf_q_file)
                        prf_file.close()
                        prf_q_file.close()
                    else:
                        prf_name = files_dict[key][0]
                        prf_qual_name = files_dict[key][2]
                        with open(prf_name, "a") as prf_file, \
                                open(prf_qual_name, "a") as prf_q_file:
                            if seq_id not in files_dict[key][1]:
                                prf_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                prf_q_file.write("{}{}\n".format(
                                    configuration.HEADER_WIG, seq_id))
                                files_dict[key][1].append(seq_id)
                            for i in indices:
                                prf_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][0][i] / CV)))
                                prf_q_file.write("{}\t{}\n".format(
                                    individual_dict['idx'][i],
                                    int(individual_dict[key][1][i])))
    return files_dict, wig_files


def handle_zero_lines(repeat_subhits):
    ''' Drop positions that contain only zeros, i.e. positions without any hit.
    Border zero positions are preserved so that the graphs are plotted correctly '''
    zero_idx = [idx for idx, val in enumerate(repeat_subhits) if val == 0]
    indices = [idx for idx, val in enumerate(repeat_subhits) if val != 0]
    zero_breakpoints = []
    for key, group in groupby(
            enumerate(zero_idx),
            lambda index_item: index_item[0] - index_item[1]):
        group = list(map(itemgetter(1), group))
        zero_breakpoints.append(group[0])
        zero_breakpoints.append(group[-1])
    if indices:
        indices.extend(zero_breakpoints)
        indices = sorted(set(indices), key=int)
    else:
        indices = []
    return indices


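# Illustrative sketch, not part of the original pipeline: which indices survive
# the zero-line clean-up for a toy coverage vector. Only the first and last
# position of every zero run are kept, so the wig files stay sparse while the
# plotted profile still drops to zero at repeat borders.
def _handle_zero_lines_example():
    toy_hits = [0, 0, 3, 5, 0, 0, 0, 4, 0]
    kept = handle_zero_lines(toy_hits)
    # kept -> [0, 1, 2, 3, 4, 6, 7, 8]; index 5 (inside a zero run) is dropped
    return kept

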
def repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
                        xminimal, xmaximal, domains, seq_ids_dom, CN,
                        seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta, create gff file and profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        count_seq += 1
    return None


def repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA, CN,
                    seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta, create gff file and profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        plt.close()
        count_seq += 1
    return None


def html_output(total_length, seq_lengths_all, seq_names, HTML, DB_NAME, REF,
                REF_LINK):
    ''' Define html output with limited number of output pictures and link to JBrowse '''
    info = "\t\t".join(['<pre> {} [{} bp]</pre>'.format(seq_name, seq_length)
                        for seq_name, seq_length in zip(seq_names,
                                                        seq_lengths_all)])
    if REF:
        ref_part_1 = REF.split("-")[0]
        ref_part_2 = "-".join(REF.split("-")[1:]).split(". ")[0]
        ref_part_3 = ". ".join("-".join(REF.split("-")[1:]).split(". ")[1:])
        ref_string = '''<h6> {} - <a href="{}" target="_blank" >{}</a>. {}'''.format(
            ref_part_1, REF_LINK, ref_part_2, ref_part_3)
    else:
        ref_string = "Custom Data"
    pictures = "\n\t\t".join(
        ['<img src="{}.png" width=1800>'.format(pic)
         for pic in range(len(seq_names))[:configuration.MAX_PIC_NUM]])
    html_str = configuration.HTML_STR.format(info, total_length, DB_NAME,
                                             pictures, ref_string)
    with open(HTML, "w") as html_file:
        html_file.write(html_str)


def adjust_tracklist(jbrowse_data_path):
    starting_lines = []
    ending_lines = []
    end = False
    with open(os.path.join(jbrowse_data_path, "trackList.json"),
              "r") as track_list:
        for line in track_list:
            if "]" not in line and not end:
                starting_lines.append(line)
            else:
                end = True
                ending_lines.append(line)
    with open(os.path.join(jbrowse_data_path, "trackList.json"),
              "w") as track_list:
        for line in starting_lines:
            track_list.write(line)
    return ending_lines


def jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
                     total_length, JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert output data to be displayed as tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUT_DOMAIN_GFF, "--trackLabel",
                         "GFF_domains", "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        # Control the total length processed, if above threshold, don't create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig", files_dict[repeat_id][0],
                                 os.path.join(HTML_DATA,
                                              configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None


def jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
                 JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert output data to be displayed as tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(JBROWSE_BIN),
                         "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        ## Control the total length processed, if above threshold, don't create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig", files_dict[repeat_id][0],
                                 os.path.join(HTML_DATA,
                                              configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None


def genome2coverage(GS, BLAST_DB):
    ''' Convert genome size to coverage '''
    num_of_reads = 0
    with open(BLAST_DB, "r") as reads_all:
        first_line = reads_all.readline()
        if first_line.startswith(">"):
            num_of_reads += 1
            first_seq = reads_all.readline().rstrip()
        for line in reads_all:
            if line.startswith(">"):
                num_of_reads += 1
    len_of_read = len(first_seq)
    CV = (num_of_reads * len_of_read) / (GS * 1000000)  # GS in Mb
    return CV


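# Illustrative sketch, not part of the original pipeline: the coverage estimate
# above is (number of reads * read length) / genome size, with the genome size
# given in Mb and the read length taken from the first read in the database.
# The numbers below are hypothetical.
def _coverage_example():
    num_of_reads = 5000000      # reads in the BLAST database
    len_of_read = 100           # bp
    GS = 500                    # genome size in Mb
    CV = (num_of_reads * len_of_read) / (GS * 1000000)
    assert CV == 1.0            # 500 Mb of reads over a 500 Mb genome = 1x
    return CV

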
def prepared_data(DB_ID):
    ''' Get prepared rep. annotation data from the table based on the selected species ID '''
    tbl = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        configuration.PROFREP_DATA, configuration.PROFREP_TBL)
    with open(tbl, "r") as datasets:
        for line in datasets:
            if line.split("\t")[0] == DB_ID:
                DB_NAME = line.split("\t")[1]
                CV = float(line.split("\t")[5])
                REF = line.split("\t")[6]
                REF_LINK = line.split("\t")[7]
    return DB_NAME, CV, REF, REF_LINK


def seq_sizes_file(seq_ids, seq_lengths_all, HTML_DATA):
    chrom_sizes = os.path.join(HTML_DATA, configuration.CHROM_SIZES_FILE)
    with open(chrom_sizes, "w") as chroms:
        for seq_id, seq_length in zip(seq_ids, seq_lengths_all):
            chroms.write("{}\t{}\n".format(seq_id, seq_length))


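# Illustrative sketch, not part of the original pipeline: the file written
# above follows the two-column "chrom.sizes" layout expected by wigToBigWig
# (sequence ID, tab, length in bp). The sequence names below are hypothetical.
#
#   chr1_scaffold    1534201
#   chr2_scaffold     987654

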
def main(args):
    ## Command line arguments
    QUERY = args.query
    BLAST_DB = args.reads
    CL_ANNOTATION_TBL = args.ann_tbl
    CLS = args.cls
    BITSCORE = args.bit_score
    E_VALUE = args.e_value
    WORD_SIZE = args.word_size
    WINDOW = args.window
    OVERLAP = args.overlap
    BLAST_TASK = args.task
    MAX_ALIGNMENTS = args.max_alignments
    NEW_DB = args.new_db
    THRESHOLD = args.threshold_repeat
    THRESHOLD_SEGMENT = args.threshold_segment
    TH_IDENTITY = args.th_identity
    TH_LENGTH = args.th_length
    TH_INTERRUPT = args.interruptions
    TH_SIMILARITY = args.th_similarity
    TH_LEN_RATIO = args.max_len_proportion
    OUTPUT_GFF = args.output_gff
    DOMAINS = args.protein_domains
    LAST_DB = args.protein_database
    CLASSIFICATION = args.classification
    OUT_DOMAIN_GFF = args.domain_gff
    HTML = args.html_file
    HTML_DATA = args.html_path
    N_GFF = args.n_gff
    CN = args.copy_numbers
    GS = args.genome_size
    DB_ID = args.db_id
    THRESHOLD_SCORE = args.threshold_score
    WIN_DOM = args.win_dom
    OVERLAP_DOM = args.overlap_dom
    JBROWSE_BIN = args.jbrowse_bin
    DUST_FILTER = args.dust_filter
    LOG_FILE = args.log_file
    #JBROWSE_BIN = os.environ['JBROWSE_SOURCE_DIR']+"/bin"
    #if not JBROWSE_BIN:
    #    try:
    #        JBROWSE_BIN = os.environ['JBROWSE_BIN']
    #    except KeyError:
    #        raise ValueError('There was no path to JBrowse bin found - set the environment variable JBROWSE_BIN or pass the argument explicitly')
    if CN and not DB_ID and not GS:
        raise ValueError("Genome size missing - if you want to convert hits to copy numbers please enter the --genome_size parameter")

    ## Check if there are forbidden characters in fasta IDs
    [forbidden_ids, headers] = check_fasta_id(QUERY)
    if forbidden_ids:
        ##################### USER ERROR ###############################
        raise UserWarning(
            "The following IDs contain forbidden characters ('/' or '\\') - PLEASE REPLACE OR DELETE THEM:\n{}".format(
                "\n".join(forbidden_ids)))

    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in multifasta format are not named correctly:
            seq IDs(before the first space) are the same''')

    ## Create new blast database of reads
    if NEW_DB:
        subprocess.call("makeblastdb -in {} -dbtype nucl".format(BLAST_DB),
                        shell=True)

    ## Parse prepared annotation data table
    if DB_ID:
        [DB_NAME, CV, REF, REF_LINK] = prepared_data(DB_ID)
    else:
        REF = None
        REF_LINK = None
        DB_NAME = "CUSTOM"

    ## Create dir to store outputs for html and JBrowse
    if not os.path.exists(HTML_DATA):
        os.makedirs(HTML_DATA)

    if not os.path.isabs(HTML):
        HTML = os.path.join(HTML_DATA, HTML)

    if not os.path.isabs(OUT_DOMAIN_GFF):
        OUT_DOMAIN_GFF = os.path.join(HTML_DATA, OUT_DOMAIN_GFF)

    if not os.path.isabs(LOG_FILE):
        LOG_FILE = os.path.join(HTML_DATA, LOG_FILE)

    if not os.path.isabs(N_GFF):
        N_GFF = os.path.join(HTML_DATA, N_GFF)

    if not os.path.isabs(OUTPUT_GFF):
        OUTPUT_GFF = os.path.join(HTML_DATA, OUTPUT_GFF)

    path = os.path.dirname(os.path.realpath(__file__))
    version_string = get_version(path)

    log = os.open(LOG_FILE, os.O_RDWR | os.O_CREAT)

    os.write(log, version_string.encode("utf-8"))

    ## Define parameters for parallel process
    STEP = WINDOW - OVERLAP
    NUM_CORES = multiprocessing.cpu_count()
    os.write(log, "NUM_OF_CORES = {}\n".format(NUM_CORES).encode("utf-8"))

    ## Convert genome size to coverage
    if CN and GS:
        CV = genome2coverage(GS, BLAST_DB)
        os.write(log, "COVERAGE = {}\n".format(CV).encode("utf-8"))

    parallel_pool = Pool(NUM_CORES)

    ## Assign clusters to repetitive classes
    [cl_annotations_items, annotation_keys
     ] = cluster_annotation(CL_ANNOTATION_TBL)

    ## Assign reads to repetitive classes
    reads_annotations = read_annotation(CLS, cl_annotations_items)

    ## Detect all fasta sequences from input
    fasta_list = multifasta(QUERY)
    headers = []
    files_dict = {}
    wig_files = []
    seq_count = 1
    start = 1
    total_length = 0
    seq_lengths_all = []
    Ngff = open(N_GFF, "w")
    Ngff.write("{}\n".format(configuration.HEADER_GFF))

    ## Find hits for each fasta sequence separately
    t_blast = time.time()
    for subfasta in fasta_list:
        [header, sequence] = fasta_read(subfasta)
        os.write(log, "Sequence {} is being processed...\n".format(
            header).encode("utf-8"))
        os.fsync(log)
        indices_N = [indices + 1
                     for indices, n in enumerate(sequence)
                     if n == "n" or n == "N"]
        if indices_N:
            gff.idx_ranges_N(indices_N, configuration.N_segment, header, Ngff,
                             configuration.N_NAME, configuration.N_FEATURE)
        seq_length = len(sequence)
        headers.append(header)
        ## Create parallel process
        subset_index = list(range(0, seq_length, STEP))
        ## Situation when the penultimate window is incomplete but is followed by another one
        if len(subset_index) > 1 and subset_index[-2] + WINDOW >= seq_length:
            subset_index = subset_index[:-1]
        last_index = subset_index[-1]
        index_range = range(len(subset_index))
        for chunk_index in index_range[0::configuration.MAX_FILES_SUBPROFILES]:
            multiple_param = partial(
                parallel_process, WINDOW, OVERLAP, seq_length, annotation_keys,
                reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER, last_index,
                len(subset_index))
            subprofiles_all = parallel_pool.map(multiple_param, subset_index[
                chunk_index:chunk_index + configuration.MAX_FILES_SUBPROFILES])
            ## Join partial profiles to the final profile of the sequence
            if CN:
                [files_dict, wig_files
                 ] = concatenate_prof_CN(CV, subprofiles_all, files_dict,
                                         header, HTML_DATA, wig_files)
            else:
                [files_dict, wig_files] = concatenate_prof(
                    subprofiles_all, files_dict, header, HTML_DATA, wig_files)
            for subprofile in subprofiles_all:
                os.unlink(subprofile)
        total_length += seq_length
        seq_lengths_all.append(seq_length)

    os.write(log, "ELAPSED_TIME_BLAST = {} s\n".format(
        time.time() - t_blast).encode("utf-8"))
    os.write(
        log,
        "TOTAL_LENGTH_ANALYZED = {} bp\n".format(total_length).encode("utf-8"))

    ## Close opened files
    for opened_file in wig_files:
        opened_file.close()
    Ngff.close()

    ## Create file containing size of sequences to convert wig to bigwig
    seq_sizes_file(headers, seq_lengths_all, HTML_DATA)

    ## Protein domains module
    t_domains = time.time()
    if DOMAINS:
        os.write(log, "Domains module has started...\n".encode("utf-8"))
        os.fsync(log)
        domains_primary = NamedTemporaryFile(delete=False)
        protein_domains.domain_search(QUERY, LAST_DB, CLASSIFICATION,
                                      domains_primary.name, THRESHOLD_SCORE,
                                      WIN_DOM, OVERLAP_DOM)
        domains_primary.close()
        [xminimal, xmaximal, domains, seq_ids_dom
         ] = domains_filtering.filter_qual_dom(
             domains_primary.name, OUT_DOMAIN_GFF, TH_IDENTITY, TH_SIMILARITY,
             TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, 'All', "")
        os.unlink(domains_primary.name)
        os.write(log, "ELAPSED_TIME_DOMAINS = {} s\n".format(
            time.time() - t_domains).encode("utf-8"))

        # Process individual sequences from the input file sequentially
        t_gff_vis = time.time()
        repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT,
                            HTML_DATA, xminimal, xmaximal, domains,
                            seq_ids_dom, CN, headers, seq_lengths_all,
                            files_dict)
        os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(
            time.time() - t_gff_vis).encode("utf-8"))
        os.fsync(log)
        # Prepare data for html output
        t_jbrowse = time.time()
        os.write(log, "JBrowse tracks are being prepared...\n".encode("utf-8"))
        os.fsync(log)
        jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
                         total_length, JBROWSE_BIN, files_dict)
        os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(
            time.time() - t_jbrowse).encode("utf-8"))
    else:
        # Process individual sequences from the input file sequentially
        t_gff_vis = time.time()
        repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
                        CN, headers, seq_lengths_all, files_dict)
        os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(
            time.time() - t_gff_vis).encode("utf-8"))

        # Prepare data for html output
        t_jbrowse = time.time()
        jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
                     JBROWSE_BIN, files_dict)
        os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(
            time.time() - t_jbrowse).encode("utf-8"))

    # Create HTML output
    t_html = time.time()
    os.write(
        log,
        "HTML output and JBrowse data structure are being prepared...\n".encode(
            "utf-8"))
    os.fsync(log)
    html_output(total_length, seq_lengths_all, headers, HTML, DB_NAME, REF,
                REF_LINK)
    os.write(log, "ELAPSED_TIME_HTML = {} s\n".format(
        time.time() - t_html).encode("utf-8"))
    os.write(log, "ELAPSED_TIME_PROFREP = {} s\n".format(
        time.time() - t_profrep).encode("utf-8"))
    os.close(log)

    ## Clean up the temporary fasta files
    for subfasta in fasta_list:
        os.unlink(subfasta)


if __name__ == "__main__":
    import argparse
    from argparse import RawDescriptionHelpFormatter

    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        pass

    # Default paths (command line usage)
    HTML = configuration.HTML
    DOMAINS_GFF = configuration.DOMAINS_GFF
    REPEATS_GFF = configuration.REPEATS_GFF
    N_GFF = configuration.N_GFF
    LOG_FILE = configuration.LOG_FILE
    PROFREP_OUTPUT_DIR = configuration.PROFREP_OUTPUT_DIR

    # Command line arguments
    parser = argparse.ArgumentParser(
        description='''

    DEPENDENCIES:
        - python 3.4 or higher with packages:
            - numpy
            - matplotlib
            - biopython
        - BLAST 2.2.28+ or higher
        - LAST 744 or higher
        - wigToBigWig
        - cd-hit
        - JBrowse - ! Only bin needed, does not have to be installed under a web server
        * ProfRep Modules:
            - gff.py
            - visualization.py
            - configuration.py
            - protein_domains.py
            - domains_filtering.py

    EXAMPLE OF USAGE:

        ./profrep.py --query PATH_TO_DNA_SEQ --reads PATH_TO_READS --ann_tbl PATH_TO_CLUSTERS_CLASSIFICATION --cls PATH_TO_hitsort.cls [--new_db True]
        ''',
        epilog="""take a look at README for more detailed information""",
        formatter_class=CustomFormatter)

    Required = parser.add_argument_group('required arguments')
    altRequired = parser.add_argument_group(
        'alternative required arguments - prepared datasets')
    blastOpt = parser.add_argument_group('optional arguments - BLAST Search')
    parallelOpt = parser.add_argument_group(
        'optional arguments - Parallel Processing')
    protOpt = parser.add_argument_group('optional arguments - Protein Domains')
    outOpt = parser.add_argument_group('optional arguments - Output Paths')
    cnOpt = parser.add_argument_group(
        'optional arguments - Copy Numbers/Hits ')
    galaxyOpt = parser.add_argument_group(
        'optional arguments - Environment Variables')

    ################ INPUTS ############################################
    Required.add_argument('-q',
                          '--query',
                          type=str,
                          required=True,
                          help='input DNA sequence in (multi)fasta format')
    Required.add_argument('-rdb',
                          '--reads',
                          type=str,
                          required=True,
                          help='blast database of all sequencing reads')
    Required.add_argument(
        '-a',
        '--ann_tbl',
        type=str,
        required=True,
        help='clusters annotation table, tab-separated number of cluster and its classification')
    Required.add_argument(
        '-c',
        '--cls',
        type=str,
        required=True,
        help='cls file containing reads assigned to clusters (hitsort.cls)')
    altRequired.add_argument(
        '-id',
        '--db_id',
        type=str,
        help='annotation dataset ID (first column of datasets table)')

    ################ BLAST parameters ##################################
    blastOpt.add_argument('-bs',
                          '--bit_score',
                          type=float,
                          default=50,
                          help='bitscore threshold')
    blastOpt.add_argument(
        '-m',
        '--max_alignments',
        type=int,
        default=10000000,
        help='blast filtering option: maximal number of alignments in the output')
    blastOpt.add_argument('-e',
                          '--e_value',
                          type=str,
                          default=0.1,
                          help='blast setting option: e-value')
    blastOpt.add_argument(
        '-df',
        '--dust_filter',
        type=str,
        default="'20 64 1'",
        help='dust filters low-complexity regions during BLAST search')
    blastOpt.add_argument(
        '-ws',
        '--word_size',
        type=int,
        default=11,
        help='blast search option: initial word size for alignment')
    blastOpt.add_argument('-t',
                          '--task',
                          type=str,
                          default="blastn",
                          help='type of blast to be triggered')
    blastOpt.add_argument(
        '-n',
        '--new_db',
        type=str2bool,
        default=True,
        help='create a new blast database, USE THIS OPTION IF YOU RUN PROFREP WITH NEW DATABASE FOR THE FIRST TIME')

    ############### PARALLEL PROCESSING ARGUMENTS ######################
    parallelOpt.add_argument(
        '-w',
        '--window',
        type=int,
        default=5000,
        help='sliding window size for parallel processing')
    parallelOpt.add_argument(
        '-o',
        '--overlap',
        type=int,
        default=150,
        help='overlap of regions processed in parallel, set it greater than the read size')

    ################ PROTEIN DOMAINS PARAMETERS ########################
    protOpt.add_argument('-pd',
                         '--protein_domains',
                         type=str2bool,
                         default=False,
                         help='use module for protein domains')
    protOpt.add_argument('-pdb',
                         '--protein_database',
                         type=str,
                         help='protein domains database')
    protOpt.add_argument('-cs',
                         '--classification',
                         type=str,
                         help='protein domains classification file')
    protOpt.add_argument(
        '-wd',
        '--win_dom',
        type=int,
        default=10000000,
        help='protein domains module: sliding window to process large input sequences sequentially')
    protOpt.add_argument(
        '-od',
        '--overlap_dom',
        type=int,
        default=10000,
        help='protein domains module: overlap of sequences in two consecutive windows')
    protOpt.add_argument(
        '-thsc',
        '--threshold_score',
        type=int,
        default=80,
        help='protein domains module: percentage of the best score within the cluster to significant domains')
    protOpt.add_argument("-thl",
                         "--th_length",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.8,
                         help="proportion of alignment length threshold")
    protOpt.add_argument("-thi",
                         "--th_identity",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.35,
                         help="proportion of alignment identity threshold")
    protOpt.add_argument(
        "-ths",
        "--th_similarity",
        type=float,
        choices=[Range(0.0, 1.0)],
        default=0.45,
        help="threshold for alignment proportional similarity")
    protOpt.add_argument(
        "-ir",
        "--interruptions",
        type=int,
        default=3,
        help="interruptions (frameshifts + stop codons) tolerance threshold per 100 AA")
    protOpt.add_argument(
        "-mlen",
        "--max_len_proportion",
        type=float,
        default=1.2,
        help="maximal proportion of alignment length to the original length of protein domain from database")

    ################ OUTPUTS ###########################################
    outOpt.add_argument('-lg',
                        '--log_file',
                        type=str,
                        default=LOG_FILE,
                        help='path to log file')
    outOpt.add_argument('-ouf',
                        '--output_gff',
                        type=str,
                        default=REPEATS_GFF,
                        help='path to output gff of repetitive regions')
    outOpt.add_argument('-oug',
                        '--domain_gff',
                        type=str,
                        default=DOMAINS_GFF,
                        help='path to output gff of protein domains')
    outOpt.add_argument('-oun',
                        '--n_gff',
                        type=str,
                        default=N_GFF,
                        help='path to output gff of N regions')
    outOpt.add_argument('-hf',
                        '--html_file',
                        type=str,
                        default=HTML,
                        help='path to output html file')
    outOpt.add_argument('-hp',
                        '--html_path',
                        type=str,
                        default=PROFREP_OUTPUT_DIR,
                        help='path to html extra files')

    ################ HITS/COPY NUMBERS #################################
    cnOpt.add_argument('-cn',
                       '--copy_numbers',
                       type=str2bool,
                       default=False,
                       help='convert hits to copy numbers')
    cnOpt.add_argument(
        '-gs',
        '--genome_size',
        type=float,
        help='genome size is required when converting hits to copy numbers and you use custom data')
    cnOpt.add_argument(
        '-thr',
        '--threshold_repeat',
        type=int,
        default=3,
        help='threshold for hits/copy numbers per position to be considered repetitive')
    cnOpt.add_argument(
        '-thsg',
        '--threshold_segment',
        type=int,
        default=80,
        help='threshold for the length of repetitive segment to be reported')

    ################ JBrowse ###########################################
    galaxyOpt.add_argument('-jb',
                           '--jbrowse_bin',
                           type=str,
                           help='path to JBrowse bin directory')

    args = parser.parse_args()
    main(args)