annotate profrep.py @ 2:22919ea3463c draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 09:06:57 -0400
parents a5f1638b73be
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
2
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
3 import subprocess
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
4 import csv
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
5 import time
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
6 import sys
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
7 import matplotlib
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
8 # matplotlib.use("PDF")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
9 matplotlib.use("pdf")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
10 import matplotlib.pyplot as plt
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
11 import matplotlib.colors as colors
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
12 import matplotlib.cm as cmx
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
13 import multiprocessing
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
14 import argparse
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
15 import os
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
16 from functools import partial
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
17 from multiprocessing import Pool
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
18 from tempfile import NamedTemporaryFile
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
19 from operator import itemgetter
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
20 from itertools import groupby
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
21 import gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
22 import configuration
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
23 import visualization
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
24 import distutils
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
25 from distutils import dir_util
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
26 import tempfile
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
27 import re
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
28 from Bio import SeqIO
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
29 import sys
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
30 import pickle
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
31 import shutil
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
32 import warnings
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
33 import random
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
34 import numpy as np
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
35 import dante_gff_output_filtering as domains_filtering
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
36 import dante as protein_domains
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
37
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
38 t_profrep = time.time()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
39 np.set_printoptions(threshold=sys.maxsize)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
40 warnings.filterwarnings("ignore", module="matplotlib")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
41
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
42
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
43 class Range():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
44 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
45 This class is used to check float range in argparse
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
46 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
47
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
48 def __init__(self, start, end):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
49 self.start = start
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
50 self.end = end
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
51
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
52 def __eq__(self, other):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
53 return self.start <= other <= self.end
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
54
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
55 def __str__(self):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
56 return "float range {}..{}".format(self.start, self.end)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
57
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
58 def __repr__(self):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
59 return "float range {}..{}".format(self.start, self.end)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
60
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
61
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
62 def get_version(path):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
63 branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
64 shell=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
65 cwd=path).decode('ascii').strip()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
66 shorthash = subprocess.check_output("git log --pretty=format:'%h' -n 1 ",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
67 shell=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
68 cwd=path).decode('ascii').strip()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
69 revcount = len(subprocess.check_output("git log --oneline",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
70 shell=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
71 cwd=path).decode('ascii').split())
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
72 version_string = ("-------------------------------------"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
73 "-------------------------------------\n"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
74 "PIPELINE VERSION : "
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
75 "{branch}-rv-{revcount}({shorthash})\n"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
76 "-------------------------------------"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
77 "-------------------------------------\n").format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
78 branch=branch,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
79 shorthash=shorthash,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
80 revcount=revcount, )
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
81 return (version_string)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
82
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
83
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
84 def str2bool(v):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
85 if v.lower() in ('yes', 'true', 't', 'y', '1'):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
86 return True
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
87 elif v.lower() in ('no', 'false', 'f', 'n', '0'):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
88 return False
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
89 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
90 raise argparse.ArgumentTypeError('Boolean value expected')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
91
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
92
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
93 def check_fasta_id(QUERY):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
94 forbidden_ids = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
95 headers = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
96 for record in SeqIO.parse(QUERY, "fasta"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
97 if any(x in record.id for x in configuration.FORBIDDEN_CHARS):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
98 forbidden_ids.append(record.id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
99 headers.append(record.id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
100 if len(headers) > len(set([header.split(" ")[0] for header in headers])):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
101 raise NameError(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
102 '''Sequences in multifasta format are not named correctly:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
103 seq IDs(before the first space) are the same''')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
104 return forbidden_ids, headers
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
105
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
106
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
107 def multifasta(QUERY):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
108 ''' Create single fasta temporary files to be processed sequentially '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
109 PATTERN = ">"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
110 fasta_list = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
111 with open(QUERY, "r") as fasta:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
112 reader = fasta.read()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
113 splitter = reader.split(PATTERN)[1:]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
114 for fasta_num, part in enumerate(splitter):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
115 ntf = NamedTemporaryFile(delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
116 ntf.write("{}{}".format(PATTERN, part).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
117 fasta_list.append(ntf.name)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
118 ntf.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
119 return fasta_list
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
120
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
121
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
122 def fasta_read(subfasta):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
123 ''' Read fasta, gain header and sequence without gaps '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
124 sequence_lines = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
125 with open(subfasta, "r") as fasta:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
126 header = fasta.readline().strip().split(" ")[0][1:]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
127 for line in fasta:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
128 clean_line = line.strip()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
129 if clean_line:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
130 sequence_lines.append(clean_line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
131 sequence = "".join(sequence_lines)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
132 return header, sequence
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
133
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
134
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
135 def cluster_annotation(CL_ANNOTATION_TBL):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
136 ''' Create dictionary of known annotations classes and related clusters '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
137 cl_annotations = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
138 annot_table = np.genfromtxt(CL_ANNOTATION_TBL, dtype=str)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
139 for line in annot_table:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
140 if line[1] in cl_annotations:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
141 cl_annotations[line[1]].append(line[0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
142 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
143 cl_annotations[line[1]] = [line[0]]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
144 return list(cl_annotations.items()), list(cl_annotations.keys())
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
145
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
146
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
147 def read_annotation(CLS, cl_annotations_items):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
148 ''' Dictionary of known repeat classes and related reads '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
149 reads_annotations = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
150 with open(CLS, "r") as cls_file:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
151 count = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
152 for line in cls_file:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
153 line = line.rstrip()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
154 count += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
155 if count % 2 == 0:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
156 reads = re.split("\s+", line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
157 for element in reads:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
158 for key, value in cl_annotations_items:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
159 if clust in value:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
160 reads_annotations[element] = key
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
161 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
162 clust = re.split("\s+", line)[0].split(">CL")[1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
163 return reads_annotations
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
164
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
165
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
166 def annot_profile(annotation_keys, part):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
167 ''' Predefine dictionary of known annotations and partial sequence
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
168 repetitive profiles defined by parallel process '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
169 subprofile = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
170 for key in annotation_keys:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
171 subprofile[key] = [np.zeros(part, dtype=int), np.zeros(part,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
172 dtype=int)]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
173 subprofile["ALL"] = [np.zeros(part, dtype=int), np.zeros(part, dtype=int)]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
174 return subprofile
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
175
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
176
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
177 def parallel_process(WINDOW, OVERLAP, seq_length, annotation_keys,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
178 reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
179 BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
180 last_index, subsets_num, subset_index):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
181 ''' Run parallel function to process the input sequence in windows
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
182 Run blast for subsequence defined by the input index and window size
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
183 Create and increment subprofile vector based on reads aligned within window '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
184 loc_start = subset_index + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
185 loc_end = subset_index + WINDOW
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
186 if loc_end >= seq_length:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
187 loc_end = seq_length
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
188 subprofile = annot_profile(annotation_keys, seq_length - loc_start + 1)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
189 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
190 subprofile = annot_profile(annotation_keys, WINDOW + 1)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
191
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
192 # Find HSP records for every window defined by query location and parse the tabular stdout:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
193 # 1. query, 2. database read, 3. alignment start, 4. alignment end, 5. bitscore
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
194 p = subprocess.Popen(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
195 "blastn -query {} -query_loc {}-{} -db {} -evalue {} -word_size {} -dust {} -task {} -num_alignments {} -outfmt '6 qseqid sseqid qstart qend bitscore pident'".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
196 subfasta, loc_start, loc_end, BLAST_DB, E_VALUE, WORD_SIZE,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
197 DUST_FILTER, BLAST_TASK, MAX_ALIGNMENTS),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
198 stdout=subprocess.PIPE,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
199 shell=True)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
200 count_hits = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
201 for line in p.stdout:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
202 column = line.decode("utf-8").rstrip().split("\t")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
203 if float(column[4]) >= BITSCORE:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
204 count_hits += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
205 read = column[1] # ID of individual aligned read
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
206 if "reduce" in read:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
207 reads_representation = int(read.split("reduce")[-1])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
208 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
209 reads_representation = 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
210 qstart = int(column[2]) # starting position of alignment
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
211 qend = int(column[3]) # ending position of alignemnt
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
212 if read in reads_annotations:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
213 annotation = reads_annotations[read]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
214 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
215 annotation = "ALL"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
216 subprofile[annotation][0][
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
217 qstart - subset_index - 1:qend - subset_index] = subprofile[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
218 annotation][0][qstart - subset_index - 1:qend -
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
219 subset_index] + reads_representation
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
220 subprofile[annotation][1][qstart - subset_index - 1:qend -
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
221 subset_index] = subprofile[annotation][
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
222 1][qstart - subset_index - 1:qend -
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
223 subset_index] + float(column[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
224 5]) * reads_representation
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
225 subprofile["ALL"][0] = sum([item[0] for item in subprofile.values()])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
226 subprofile["ALL"][1] = sum([item[1] for item in subprofile.values()])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
227 for repeat in subprofile.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
228 subprofile[repeat][1] = [int(round(quality / hits_num))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
229 if hits_num != 0 else quality
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
230 for hits_num, quality in zip(subprofile[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
231 repeat][0], subprofile[repeat][1])]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
232 if subset_index == 0:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
233 if subsets_num == 1:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
234 subprf_name = subprofile_single(subprofile, subset_index)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
235 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
236 subprf_name = subprofile_first(subprofile, subset_index, WINDOW,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
237 OVERLAP)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
238 elif subset_index == last_index:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
239 subprf_name = subprofile_last(subprofile, subset_index, OVERLAP)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
240 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
241 subprf_name = subprofiles_middle(subprofile, subset_index, WINDOW,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
242 OVERLAP)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
243 return subprf_name
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
244
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
245
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
246 def subprofile_single(subprofile, subset_index):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
247 subprofile['idx'] = list(range(1, len(subprofile["ALL"][0]) + 1))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
248 subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
249 delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
250 with open(subprf_dict.name, 'wb') as handle:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
251 pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
252 subprf_dict.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
253 return subprf_dict.name
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
254
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
255
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
256 def subprofile_first(subprofile, subset_index, WINDOW, OVERLAP):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
257 for key in subprofile.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
258 subprofile[key][0] = subprofile[key][0][0:-OVERLAP // 2 - 1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
259 subprofile[key][1] = subprofile[key][1][0:-OVERLAP // 2 - 1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
260 subprofile['idx'] = list(range(subset_index + 1, subset_index + WINDOW -
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
261 OVERLAP // 2 + 1))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
262 subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
263 delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
264 with open(subprf_dict.name, 'wb') as handle:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
265 pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
266 subprf_dict.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
267 return subprf_dict.name
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
268
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
269
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
270 def subprofiles_middle(subprofile, subset_index, WINDOW, OVERLAP):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
271 for key in subprofile.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
272 subprofile[key][0] = subprofile[key][0][OVERLAP // 2:-OVERLAP // 2 - 1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
273 subprofile[key][1] = subprofile[key][1][OVERLAP // 2:-OVERLAP // 2 - 1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
274 subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
275 subset_index + WINDOW - OVERLAP // 2 + 1))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
276 subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
277 delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
278 with open(subprf_dict.name, 'wb') as handle:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
279 pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
280 subprf_dict.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
281 return subprf_dict.name
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
282
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
283
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
284 def subprofile_last(subprofile, subset_index, OVERLAP):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
285 len_subprofile = len(subprofile['ALL'][0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
286 for key in subprofile.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
287 subprofile[key][0] = subprofile[key][0][OVERLAP // 2:]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
288 subprofile[key][1] = subprofile[key][1][OVERLAP // 2:]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
289 subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
290 subset_index + len_subprofile + 1))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
291 subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
292 delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
293 with open(subprf_dict.name, 'wb') as handle:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
294 pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
295 subprf_dict.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
296 return subprf_dict.name
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
297
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
298
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
299 def concatenate_prof(subprofiles_all, files_dict, seq_id, HTML_DATA,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
300 wig_files):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
301 for subprofile in subprofiles_all:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
302 with open(subprofile, 'rb') as handle:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
303 individual_dict = pickle.load(handle)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
304 exclude = set(["idx"])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
305 for key in set(individual_dict.keys()).difference(exclude):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
306 if any(individual_dict[key][0]):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
307 indices = handle_zero_lines(individual_dict[key][0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
308 if key not in files_dict.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
309 prf_name = "{}/{}.wig".format(HTML_DATA, re.sub(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
310 '[\/\|]', '_', key))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
311 prf_qual_name = "{}/{}_qual.wig".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
312 HTML_DATA, re.sub('[\/\|]', '_', key))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
313 prf_file = open(prf_name, "w")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
314 prf_q_file = open(prf_qual_name, "w")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
315 prf_file.write("{}{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
316 configuration.HEADER_WIG, seq_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
317 prf_q_file.write("{}{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
318 configuration.HEADER_WIG, seq_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
319 for i in indices:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
320 prf_file.write("{}\t{}\n".format(individual_dict[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
321 'idx'][i], individual_dict[key][0][i]))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
322 prf_q_file.write("{}\t{}\n".format(individual_dict[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
323 'idx'][i], int(individual_dict[key][1][i])))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
324 files_dict[key] = [prf_name, [seq_id], prf_qual_name]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
325 wig_files.append(prf_file)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
326 wig_files.append(prf_q_file)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
327 prf_file.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
328 prf_q_file.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
329 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
330 prf_name = files_dict[key][0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
331 prf_qual_name = files_dict[key][2]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
332 with open(prf_name, "a") as prf_file, open(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
333 prf_qual_name, "a") as prf_q_file:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
334 if seq_id not in files_dict[key][1]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
335 prf_file.write("{}{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
336 configuration.HEADER_WIG, seq_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
337 prf_q_file.write("{}{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
338 configuration.HEADER_WIG, seq_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
339 files_dict[key][1].append(seq_id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
340 for i in indices:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
341 prf_file.write("{}\t{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
342 individual_dict['idx'][i], individual_dict[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
343 key][0][i]))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
344 prf_q_file.write("{}\t{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
345 individual_dict['idx'][i], int(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
346 individual_dict[key][1][i])))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
347 return files_dict, wig_files
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
348
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
349
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
350 def concatenate_prof_CN(CV, subprofiles_all, files_dict, seq_id, HTML_DATA,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
351 wig_files):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
352 for subprofile in subprofiles_all:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
353 with open(subprofile, 'rb') as handle:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
354 individual_dict = pickle.load(handle)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
355 exclude = set(["idx"])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
356 for key in set(individual_dict.keys()).difference(exclude):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
357 if any(individual_dict[key][0]):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
358 indices = handle_zero_lines(individual_dict[key][0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
359 if key not in files_dict.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
360 prf_name = "{}/{}.wig".format(HTML_DATA, re.sub(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
361 '[\/\|]', '_', key))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
362 prf_qual_name = "{}/{}_qual.wig".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
363 HTML_DATA, re.sub('[\/\|]', '_', key))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
364 prf_file = open(prf_name, "w")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
365 prf_q_file = open(prf_qual_name, "w")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
366 prf_file.write("{}{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
367 configuration.HEADER_WIG, seq_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
368 prf_q_file.write("{}{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
369 configuration.HEADER_WIG, seq_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
370 for i in indices:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
371 prf_file.write("{}\t{}\n".format(individual_dict[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
372 'idx'][i], int(individual_dict[key][0][i] /
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
373 CV)))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
374 prf_q_file.write("{}\t{}\n".format(individual_dict[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
375 'idx'][i], int(individual_dict[key][1][i])))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
376 files_dict[key] = [prf_name, [seq_id], prf_qual_name]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
377 wig_files.append(prf_file)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
378 wig_files.append(prf_q_file)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
379 prf_file.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
380 prf_q_file.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
381 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
382 prf_name = files_dict[key][0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
383 prf_qual_name = files_dict[key][2]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
384 with open(prf_name, "a") as prf_file, open(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
385 prf_qual_name, "a") as prf_q_file:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
386 if seq_id not in files_dict[key][1]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
387 prf_file.write("{}{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
388 configuration.HEADER_WIG, seq_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
389 prf_q_file.write("{}{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
390 configuration.HEADER_WIG, seq_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
391 files_dict[key][1].append(seq_id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
392 for i in indices:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
393 prf_file.write("{}\t{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
394 individual_dict['idx'][i], int(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
395 individual_dict[key][0][i] / CV)))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
396 prf_q_file.write("{}\t{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
397 individual_dict['idx'][i], int(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
398 individual_dict[key][1][i])))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
399 return files_dict, wig_files
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
400
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
401
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
402 def handle_zero_lines(repeat_subhits):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
403 ''' Clean lines which contains only zeros, i.e. positons which do not contain any hit. However border zero positions need to be preserved due to correct graphs plotting '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
404 zero_idx = [idx for idx, val in enumerate(repeat_subhits) if val == 0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
405 indices = [idx for idx, val in enumerate(repeat_subhits) if val != 0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
406 zero_breakpoints = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
407 for key, group in groupby(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
408 enumerate(zero_idx),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
409 lambda index_item: index_item[0] - index_item[1]):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
410 group = list(map(itemgetter(1), group))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
411 zero_breakpoints.append(group[0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
412 zero_breakpoints.append(group[-1])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
413 if indices:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
414 indices.extend(zero_breakpoints)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
415 indices = sorted(set(indices), key=int)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
416 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
417 indices = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
418 return indices
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
419
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
420
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
421 def repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
422 xminimal, xmaximal, domains, seq_ids_dom, CN,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
423 seq_ids_all, seq_lengths_all, files_dict):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
424 ''' Process the hits table separately for each fasta, create gff file and profile picture '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
425 if files_dict:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
426 gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
427 seq_ids_all)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
428 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
429 with open(OUTPUT_GFF, "w") as gff_file:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
430 gff_file.write("{}\n".format(configuration.HEADER_GFF))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
431
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
432 # TODO remove plotting, keep only basic report
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
433 return None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
434 seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
435 graphs_dict = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
436 seqs_long = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
437 if files_dict:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
438 [graphs_dict, seqs_long] = visualization.vis_profrep(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
439 seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
440 seqs_all_part)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
441 count_seq = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
442 for seq in seqs_all_part:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
443 if seq in graphs_dict.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
444 fig = graphs_dict[seq][0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
445 ax = graphs_dict[seq][1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
446 art = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
447 lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
448 art.append(lgd)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
449 if seq in seq_ids_dom:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
450 dom_idx = seq_ids_dom.index(seq)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
451 [fig, ax] = visualization.vis_domains(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
452 fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
453 domains[dom_idx])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
454 elif seq in seqs_long:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
455 [fig, ax] = visualization.plot_figure(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
456 seq, seq_lengths_all[count_seq], CN)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
457 ax.text(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
458 0.3,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
459 0.5,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
460 "Graphs are only displayed if sequence is not longer than {} bp".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
461 configuration.SEQ_LEN_VIZ),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
462 transform=ax.transAxes,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
463 fontsize=14,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
464 verticalalignment='center',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
465 color='blue')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
466 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
467 [fig, ax] = visualization.plot_figure(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
468 seq, seq_lengths_all[count_seq], CN)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
469 ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
470 if seq in seq_ids_dom:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
471 dom_idx = seq_ids_dom.index(seq)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
472 [fig, ax] = visualization.vis_domains(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
473 fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
474 domains[dom_idx])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
475 output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
476 fig.savefig(output_pic_png,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
477 bbox_inches="tight",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
478 format="png",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
479 dpi=configuration.IMAGE_RES)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
480 count_seq += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
481 return None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
482
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
483
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
484 def repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA, CN,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
485 seq_ids_all, seq_lengths_all, files_dict):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
486 ''' Process the hits table separately for each fasta, create gff file and profile picture '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
487 if files_dict:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
488 gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
489 seq_ids_all)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
490 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
491 with open(OUTPUT_GFF, "w") as gff_file:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
492 gff_file.write("{}\n".format(configuration.HEADER_GFF))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
493
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
494 # TODO remove plotting, keep only basic report
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
495 return None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
496 seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
497 graphs_dict = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
498 seqs_long = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
499 if files_dict:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
500 [graphs_dict, seqs_long] = visualization.vis_profrep(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
501 seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
502 seqs_all_part)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
503 count_seq = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
504 for seq in seqs_all_part:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
505 if seq in graphs_dict.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
506 fig = graphs_dict[seq][0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
507 ax = graphs_dict[seq][1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
508 art = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
509 lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
510 art.append(lgd)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
511 elif seq in seqs_long:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
512 [fig, ax] = visualization.plot_figure(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
513 seq, seq_lengths_all[count_seq], CN)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
514 ax.text(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
515 0.3,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
516 0.5,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
517 "Graphs are only displayed if sequence is not longer than {} bp".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
518 configuration.SEQ_LEN_VIZ),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
519 transform=ax.transAxes,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
520 fontsize=14,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
521 verticalalignment='center',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
522 color='blue')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
523 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
524 [fig, ax] = visualization.plot_figure(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
525 seq, seq_lengths_all[count_seq], CN)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
526 ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
527 output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
528 fig.savefig(output_pic_png,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
529 bbox_inches="tight",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
530 format="png",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
531 dpi=configuration.IMAGE_RES)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
532 plt.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
533 count_seq += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
534 return None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
535
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
536
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
537 def html_output(total_length, seq_lengths_all, seq_names, HTML, DB_NAME, REF,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
538 REF_LINK):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
539 ''' Define html output with limited number of output pictures and link to JBrowse '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
540 info = "\t\t".join(['<pre> {} [{} bp]</pre>'.format(seq_name, seq_length)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
541 for seq_name, seq_length in zip(seq_names,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
542 seq_lengths_all)])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
543 if REF:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
544 ref_part_1 = REF.split("-")[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
545 ref_part_2 = "-".join(REF.split("-")[1:]).split(". ")[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
546 ref_part_3 = ". ".join("-".join(REF.split("-")[1:]).split(". ")[1:])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
547 ref_string = '''<h6> {} - <a href="{}" target="_blank" >{}</a>. {}'''.format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
548 ref_part_1, REF_LINK, ref_part_2, ref_part_3)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
549 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
550 ref_string = "Custom Data"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
551 pictures = "\n\t\t".join(['<img src="{}.png" width=1800>'.format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
552 pic) for pic in range(len(seq_names))[:configuration.MAX_PIC_NUM]])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
553 html_str = configuration.HTML_STR.format(info, total_length, DB_NAME,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
554 pictures, ref_string)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
555 with open(HTML, "w") as html_file:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
556 html_file.write(html_str)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
557
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
558
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
559 def adjust_tracklist(jbrowse_data_path):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
560 starting_lines = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
561 ending_lines = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
562 end = False
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
563 with open(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
564 os.path.join(jbrowse_data_path,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
565 "trackList.json"), "r") as track_list:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
566 for line in track_list:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
567 if "]" not in line and not end:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
568 starting_lines.append(line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
569 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
570 end = True
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
571 ending_lines.append(line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
572 with open(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
573 os.path.join(jbrowse_data_path,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
574 "trackList.json"), "w") as track_list:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
575 for line in starting_lines:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
576 track_list.write(line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
577 return ending_lines
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
578
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
579
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
580 def jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
581 total_length, JBROWSE_BIN, files_dict):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
582 ''' Set up the paths, link and convert output data to be displayed as tracks in Jbrowse '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
583 jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
584 with tempfile.TemporaryDirectory() as dirpath:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
585 subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
586 "--fasta", QUERY, "--out", jbrowse_data_path])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
587 subprocess.call(["{}/flatfile-to-json.pl".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
588 JBROWSE_BIN), "--gff", OUT_DOMAIN_GFF, "--trackLabel",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
589 "GFF_domains", "--out", jbrowse_data_path])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
590 subprocess.call(["{}/flatfile-to-json.pl".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
591 JBROWSE_BIN), "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
592 "--config", configuration.JSON_CONF_R, "--out",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
593 jbrowse_data_path])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
594 subprocess.call(["{}/flatfile-to-json.pl".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
595 JBROWSE_BIN), "--gff", N_GFF, "--trackLabel", "N_regions",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
596 "--config", configuration.JSON_CONF_N, "--out",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
597 jbrowse_data_path])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
598 count = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
599 # Control the total length processed, if above threshold, dont create wig image tracks
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
600 if files_dict:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
601 exclude = set(['ALL'])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
602 sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
603 sorted_keys.insert(0, "ALL")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
604 ending_lines = adjust_tracklist(jbrowse_data_path)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
605 track_list = open(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
606 os.path.join(jbrowse_data_path, "trackList.json"), "a")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
607 color_avail = len(configuration.COLORS_HEX)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
608 for repeat_id in sorted_keys:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
609 if count <= color_avail - 1:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
610 color = configuration.COLORS_HEX[count]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
611 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
612 r = lambda: random.randint(0, 255)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
613 color = '#%02X%02X%02X' % (r(), r(), r())
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
614 count += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
615 bw_name = "{}.bw".format(re.sub('[\/\|]', '_', repeat_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
616 subprocess.call(["wigToBigWig", files_dict[repeat_id][
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
617 0], os.path.join(HTML_DATA,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
618 configuration.CHROM_SIZES_FILE),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
619 os.path.join(jbrowse_data_path, bw_name)])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
620 track_list.write(configuration.TRACK_LIST.format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
621 "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
622 for line in ending_lines:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
623 track_list.write(line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
624 distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
625 return None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
626
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
627
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
628 def jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
629 JBROWSE_BIN, files_dict):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
630 ''' Set up the paths, link and convert output data to be displayed as tracks in Jbrowse '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
631 jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
632 with tempfile.TemporaryDirectory() as dirpath:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
633 subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
634 "--fasta", QUERY, "--out", jbrowse_data_path])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
635 subprocess.call(["{}/flatfile-to-json.pl".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
636 JBROWSE_BIN), "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
637 "--config", configuration.JSON_CONF_R, "--out",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
638 jbrowse_data_path])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
639 subprocess.call(["{}/flatfile-to-json.pl".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
640 JBROWSE_BIN), "--gff", N_GFF, "--trackLabel", "N_regions",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
641 "--config", configuration.JSON_CONF_N, "--out",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
642 jbrowse_data_path])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
643 count = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
644 ## Control the total length processed, if above threshold, dont create wig image tracks
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
645 if files_dict:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
646 exclude = set(['ALL'])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
647 sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
648 sorted_keys.insert(0, "ALL")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
649 ending_lines = adjust_tracklist(jbrowse_data_path)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
650 track_list = open(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
651 os.path.join(jbrowse_data_path, "trackList.json"), "a")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
652 color_avail = len(configuration.COLORS_HEX)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
653 for repeat_id in sorted_keys:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
654 if count <= color_avail - 1:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
655 color = configuration.COLORS_HEX[count]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
656 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
657 r = lambda: random.randint(0, 255)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
658 color = '#%02X%02X%02X' % (r(), r(), r())
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
659 count += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
660 bw_name = "{}.bw".format(re.sub('[\/\|]', '_', repeat_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
661 subprocess.call(["wigToBigWig", files_dict[repeat_id][
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
662 0], os.path.join(HTML_DATA,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
663 configuration.CHROM_SIZES_FILE),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
664 os.path.join(jbrowse_data_path, bw_name)])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
665 track_list.write(configuration.TRACK_LIST.format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
666 "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
667 for line in ending_lines:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
668 track_list.write(line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
669 track_list.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
670 distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
671 return None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
672
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
673
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
674 def genome2coverage(GS, BLAST_DB):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
675 ''' Convert genome size to coverage '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
676 num_of_reads = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
677 with open(BLAST_DB, "r") as reads_all:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
678 first_line = reads_all.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
679 if first_line.startswith(">"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
680 num_of_reads += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
681 first_seq = reads_all.readline().rstrip()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
682 for line in reads_all:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
683 if line.startswith(">"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
684 num_of_reads += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
685 len_of_read = len(first_seq)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
686 CV = (num_of_reads * len_of_read) / (GS * 1000000) # GS in Mb
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
687 return CV
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
688
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
689
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
690 def prepared_data(DB_ID):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
691 ''' Get prepared rep. annotation data from the table based on the selected species ID '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
692 tbl = os.path.join(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
693 os.path.dirname(os.path.realpath(__file__)),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
694 configuration.PROFREP_DATA, configuration.PROFREP_TBL)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
695 with open(tbl, "r") as datasets:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
696 for line in datasets:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
697 if line.split("\t")[0] == DB_ID:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
698 DB_NAME = line.split("\t")[1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
699 CV = float(line.split("\t")[5])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
700 REF = line.split("\t")[6]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
701 REF_LINK = line.split("\t")[7]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
702 return DB_NAME, CV, REF, REF_LINK
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
703
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
704
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
705 def seq_sizes_file(seq_ids, seq_lengths_all, HTML_DATA):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
706 chrom_sizes = os.path.join(HTML_DATA, configuration.CHROM_SIZES_FILE)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
707 with open(chrom_sizes, "w") as chroms:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
708 for seq_id, seq_length in zip(seq_ids, seq_lengths_all):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
709 chroms.write("{}\t{}\n".format(seq_id, seq_length))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
710
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
711
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
712 def main(args):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
713 ## Command line arguments
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
714 QUERY = args.query
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
715 BLAST_DB = args.reads
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
716 CL_ANNOTATION_TBL = args.ann_tbl
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
717 CLS = args.cls
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
718 BITSCORE = args.bit_score
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
719 E_VALUE = args.e_value
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
720 WORD_SIZE = args.word_size
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
721 WINDOW = args.window
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
722 OVERLAP = args.overlap
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
723 BLAST_TASK = args.task
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
724 MAX_ALIGNMENTS = args.max_alignments
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
725 NEW_DB = args.new_db
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
726 THRESHOLD = args.threshold_repeat
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
727 THRESHOLD_SEGMENT = args.threshold_segment
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
728 TH_IDENTITY = args.th_identity
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
729 TH_LENGTH = args.th_length
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
730 TH_INTERRUPT = args.interruptions
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
731 TH_SIMILARITY = args.th_similarity
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
732 TH_LEN_RATIO = args.max_len_proportion
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
733 OUTPUT_GFF = args.output_gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
734 DOMAINS = args.protein_domains
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
735 LAST_DB = args.protein_database
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
736 CLASSIFICATION = args.classification
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
737 OUT_DOMAIN_GFF = args.domain_gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
738 HTML = args.html_file
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
739 HTML_DATA = args.html_path
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
740 N_GFF = args.n_gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
741 CN = args.copy_numbers
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
742 GS = args.genome_size
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
743 DB_ID = args.db_id
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
744 THRESHOLD_SCORE = args.threshold_score
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
745 WIN_DOM = args.win_dom
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
746 OVERLAP_DOM = args.overlap_dom
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
747 JBROWSE_BIN = args.jbrowse_bin
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
748 DUST_FILTER = args.dust_filter
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
749 LOG_FILE = args.log_file
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
750 #JBROWSE_BIN = os.environ['JBROWSE_SOURCE_DIR']+"/bin"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
751 #if not JBROWSE_BIN:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
752 # try:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
753 # JBROWSE_BIN = os.environ['JBROWSE_BIN']
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
754 # except KeyError:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
755 # raise ValueError('There was no path to JBrowse bin found - set the enviroment variable JBROWSE_BIN or pass the argument explicitly')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
756 if CN and not DB_ID and not GS:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
757 raise ValueError("Genome size missing - if you want to convert hits to copy numbers please enter --genome_size parameter")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
758
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
759
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
760 ## Check if there are forbidden characters in fasta IDs
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
761 [forbidden_ids, headers] = check_fasta_id(QUERY)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
762 if forbidden_ids:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
763 ##################### USER ERROR ###############################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
764 raise UserWarning(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
765 "The following IDs contain forbidden characters ('/' or '\\') - PLEASE REPLACE OR DELETE THEM:\n{}".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
766 "\n".join(forbidden_ids)))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
767
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
768 if len(headers) > len(set([header.split(" ")[0] for header in headers])):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
769 raise NameError(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
770 '''Sequences in multifasta format are not named correctly:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
771 seq IDs(before the first space) are the same''')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
772
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
773 ## Create new blast database of reads
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
774 if NEW_DB:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
775 subprocess.call("makeblastdb -in {} -dbtype nucl".format(BLAST_DB),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
776 shell=True)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
777
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
778 ## Parse prepared annotation data table
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
779 if DB_ID:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
780 [DB_NAME, CV, REF, REF_LINK] = prepared_data(DB_ID)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
781 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
782 REF = None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
783 REF_LINK = None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
784 DB_NAME = "CUSTOM"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
785
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
786 ## Create dir to store outputs for html and JBROWSE
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
787 if not os.path.exists(HTML_DATA):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
788 os.makedirs(HTML_DATA)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
789
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
790 if not os.path.isabs(HTML):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
791 HTML = os.path.join(HTML_DATA, HTML)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
792
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
793 if not os.path.isabs(OUT_DOMAIN_GFF):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
794 OUT_DOMAIN_GFF = os.path.join(HTML_DATA, OUT_DOMAIN_GFF)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
795
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
796 if not os.path.isabs(LOG_FILE):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
797 LOG_FILE = os.path.join(HTML_DATA, LOG_FILE)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
798
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
799 if not os.path.isabs(N_GFF):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
800 N_GFF = os.path.join(HTML_DATA, N_GFF)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
801
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
802 if not os.path.isabs(OUTPUT_GFF):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
803 OUTPUT_GFF = os.path.join(HTML_DATA, OUTPUT_GFF)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
804
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
805 path = os.path.dirname(os.path.realpath(__file__))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
806 version_string = get_version(path)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
807
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
808 log = os.open(LOG_FILE, os.O_RDWR | os.O_CREAT)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
809
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
810 os.write(log, version_string.encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
811
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
812 ## Define parameters for parallel process
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
813 STEP = WINDOW - OVERLAP
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
814 NUM_CORES = multiprocessing.cpu_count()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
815 os.write(log, "NUM_OF_CORES = {}\n".format(NUM_CORES).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
816
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
817 ## Convert genome size to coverage
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
818 if CN and GS:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
819 CV = genome2coverage(GS, BLAST_DB)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
820 os.write(log, "COVERAGE = {}\n".format(CV).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
821
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
822 parallel_pool = Pool(NUM_CORES)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
823
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
824 ## Assign clusters to repetitive classes
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
825 [cl_annotations_items, annotation_keys
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
826 ] = cluster_annotation(CL_ANNOTATION_TBL)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
827
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
828 ## Assign reads to repetitive classes
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
829 reads_annotations = read_annotation(CLS, cl_annotations_items)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
830
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
831 ## Detect all fasta sequences from input
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
832 fasta_list = multifasta(QUERY)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
833 headers = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
834 files_dict = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
835 wig_files = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
836 seq_count = 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
837 start = 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
838 total_length = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
839 seq_lengths_all = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
840 Ngff = open(N_GFF, "w")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
841 Ngff.write("{}\n".format(configuration.HEADER_GFF))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
842
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
843 ## Find hits for each fasta sequence separetely
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
844 t_blast = time.time()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
845 for subfasta in fasta_list:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
846 [header, sequence] = fasta_read(subfasta)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
847 os.write(log, "Sequence {} is being processed...\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
848 header).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
849 os.fsync(log)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
850 indices_N = [indices + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
851 for indices, n in enumerate(sequence)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
852 if n == "n" or n == "N"]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
853 if indices_N:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
854 gff.idx_ranges_N(indices_N, configuration.N_segment, header, Ngff,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
855 configuration.N_NAME, configuration.N_FEATURE)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
856 seq_length = len(sequence)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
857 headers.append(header)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
858 ## Create parallel process
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
859 subset_index = list(range(0, seq_length, STEP))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
860 ## Situation when penultimal window is not complete but it is following by another one
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
861 if len(subset_index) > 1 and subset_index[-2] + WINDOW >= seq_length:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
862 subset_index = subset_index[:-1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
863 last_index = subset_index[-1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
864 index_range = range(len(subset_index))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
865 for chunk_index in index_range[0::configuration.MAX_FILES_SUBPROFILES]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
866 multiple_param = partial(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
867 parallel_process, WINDOW, OVERLAP, seq_length, annotation_keys,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
868 reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
869 BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER, last_index,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
870 len(subset_index))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
871 subprofiles_all = parallel_pool.map(multiple_param, subset_index[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
872 chunk_index:chunk_index + configuration.MAX_FILES_SUBPROFILES])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
873 ## Join partial profiles to the final profile of the sequence
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
874 if CN:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
875 [files_dict, wig_files
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
876 ] = concatenate_prof_CN(CV, subprofiles_all, files_dict,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
877 header, HTML_DATA, wig_files)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
878 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
879 [files_dict, wig_files] = concatenate_prof(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
880 subprofiles_all, files_dict, header, HTML_DATA, wig_files)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
881 for subprofile in subprofiles_all:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
882 os.unlink(subprofile)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
883 total_length += seq_length
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
884 seq_lengths_all.append(seq_length)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
885
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
886 os.write(log, "ELAPSED_TIME_BLAST = {} s\n".format(time.time(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
887 ) - t_blast).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
888 os.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
889 log,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
890 "TOTAL_LENGHT_ANALYZED = {} bp\n".format(total_length).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
891
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
892 ## Close opened files
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
893 for opened_file in wig_files:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
894 opened_file.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
895 Ngff.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
896
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
897 ## Create file containing size of sequences to convert wig to bigwig
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
898 seq_sizes_file(headers, seq_lengths_all, HTML_DATA)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
899
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
900 ## Protein domains module
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
901 t_domains = time.time()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
902 if DOMAINS:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
903 os.write(log, "Domains module has started...\n".encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
904 os.fsync(log)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
905 domains_primary = NamedTemporaryFile(delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
906 protein_domains.domain_search(QUERY, LAST_DB, CLASSIFICATION,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
907 domains_primary.name, THRESHOLD_SCORE,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
908 WIN_DOM, OVERLAP_DOM)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
909 domains_primary.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
910 [xminimal, xmaximal, domains, seq_ids_dom
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
911 ] = domains_filtering.filter_qual_dom(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
912 domains_primary.name, OUT_DOMAIN_GFF, TH_IDENTITY, TH_SIMILARITY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
913 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, 'All', "")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
914 os.unlink(domains_primary.name)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
915 os.write(log, "ELAPSED_TIME_DOMAINS = {} s\n".format(time.time(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
916 ) - t_domains).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
917
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
918 # Process individual sequences from the input file sequentially
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
919 t_gff_vis = time.time()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
920 repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
921 HTML_DATA, xminimal, xmaximal, domains,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
922 seq_ids_dom, CN, headers, seq_lengths_all,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
923 files_dict)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
924 os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(time.time(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
925 ) - t_gff_vis).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
926 os.fsync(log)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
927 # Prepare data for html output
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
928 t_jbrowse = time.time()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
929 os.write(log, "JBrowse tracks are being prepared...\n".encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
930 os.fsync(log)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
931 jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
932 total_length, JBROWSE_BIN, files_dict)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
933 os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(time.time(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
934 ) - t_jbrowse).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
935 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
936 # Process individual sequences from the input file sequentially
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
937 t_gff_vis = time.time()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
938 repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
939 CN, headers, seq_lengths_all, files_dict)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
940 os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(time.time(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
941 ) - t_gff_vis).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
942
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
943 # Prepare data for html output
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
944 t_jbrowse = time.time()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
945 jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
946 JBROWSE_BIN, files_dict)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
947 os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(time.time(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
948 ) - t_jbrowse).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
949
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
950 # Create HTML output
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
951 t_html = time.time()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
952 os.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
953 log,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
954 "HTML output and JBrowse data structure are being prepared...\n".encode(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
955 "utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
956 os.fsync(log)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
957 html_output(total_length, seq_lengths_all, headers, HTML, DB_NAME, REF,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
958 REF_LINK)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
959 os.write(log, "ELAPSED_TIME_HTML = {} s\n".format(time.time() -
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
960 t_html).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
961 os.write(log, "ELAPSED_TIME_PROFREP = {} s\n".format(time.time(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
962 ) - t_profrep).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
963 os.close(log)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
964
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
965 ## Clean up the temporary fasta files
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
966 for subfasta in fasta_list:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
967 os.unlink(subfasta)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
968
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
969
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
970 if __name__ == "__main__":
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
971 import argparse
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
972 from argparse import RawDescriptionHelpFormatter
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
973
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
974 class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
975 argparse.RawDescriptionHelpFormatter):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
976 pass
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
977
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
978 # Default paths (command line usage)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
979 HTML = configuration.HTML
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
980 DOMAINS_GFF = configuration.DOMAINS_GFF
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
981 REPEATS_GFF = configuration.REPEATS_GFF
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
982 N_GFF = configuration.N_GFF
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
983 LOG_FILE = configuration.LOG_FILE
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
984 PROFREP_OUTPUT_DIR = configuration.PROFREP_OUTPUT_DIR
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
985
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
986 # Command line arguments
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
987 parser = argparse.ArgumentParser(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
988 description='''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
989
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
990 DEPENDENCIES:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
991 - python 3.4 or higher with packages:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
992 - numpy
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
993 - matplotlib
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
994 - biopython
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
995 - BLAST 2.2.28+ or higher
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
996 - LAST 744 or higher
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
997 - wigToBigWig
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
998 - cd-hit
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
999 - JBrowse - ! Only bin needed, does not have to be installed under a web server
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1000 * ProfRep Modules:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1001 - gff.py
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1002 - visualization.py
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1003 - configuration.py
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1004 - protein_domains.py
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1005 - domains_filtering.py
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1006
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1007 EXAMPLE OF USAGE:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1008
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1009 ./protein.py --query PATH_TO_DNA_SEQ --reads PATH_TO_READS --ann_tbl PATH_TO_CLUSTERS_CLASSIFICATION --cls PATH_TO_hitsort.cls [--new_db True]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1010 ''',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1011 epilog="""take a look at README for more detailed information""",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1012 formatter_class=CustomFormatter)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1013
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1014 Required = parser.add_argument_group('required arguments')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1015 altRequired = parser.add_argument_group(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1016 'alternative required arguments - prepared datasets')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1017 blastOpt = parser.add_argument_group('optional arguments - BLAST Search')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1018 parallelOpt = parser.add_argument_group(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1019 'optional arguments - Parallel Processing')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1020 protOpt = parser.add_argument_group('optional arguments - Protein Domains')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1021 outOpt = parser.add_argument_group('optional arguments - Output Paths')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1022 cnOpt = parser.add_argument_group(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1023 'optional arguments - Copy Numbers/Hits ')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1024 galaxyOpt = parser.add_argument_group(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1025 'optional arguments - Enviroment Variables')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1026
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1027 ################ INPUTS ############################################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1028 Required.add_argument('-q',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1029 '--query',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1030 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1031 required=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1032 help='input DNA sequence in (multi)fasta format')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1033 Required.add_argument('-rdb',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1034 '--reads',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1035 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1036 required=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1037 help='blast database of all sequencing reads')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1038 Required.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1039 '-a',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1040 '--ann_tbl',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1041 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1042 required=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1043 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1044 'clusters annotation table, tab-separated number of cluster and its classification')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1045 Required.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1046 '-c',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1047 '--cls',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1048 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1049 required=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1050 help='cls file containing reads assigned to clusters (hitsort.cls)')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1051 altRequired.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1052 '-id',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1053 '--db_id',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1054 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1055 help='annotation dataset ID (first column of datasets table)')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1056
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1057 ################ BLAST parameters ##################################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1058 blastOpt.add_argument('-bs',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1059 '--bit_score',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1060 type=float,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1061 default=50,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1062 help='bitscore threshold')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1063 blastOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1064 '-m',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1065 '--max_alignments',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1066 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1067 default=10000000,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1068 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1069 'blast filtering option: maximal number of alignments in the output')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1070 blastOpt.add_argument('-e',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1071 '--e_value',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1072 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1073 default=0.1,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1074 help='blast setting option: e-value')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1075 blastOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1076 '-df',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1077 '--dust_filter',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1078 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1079 default="'20 64 1'",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1080 help='dust filters low-complexity regions during BLAST search')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1081 blastOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1082 '-ws',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1083 '--word_size',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1084 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1085 default=11,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1086 help='blast search option: initial word size for alignment')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1087 blastOpt.add_argument('-t',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1088 '--task',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1089 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1090 default="blastn",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1091 help='type of blast to be triggered')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1092 blastOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1093 '-n',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1094 '--new_db',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1095 type=str2bool,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1096 default=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1097 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1098 'create a new blast database, USE THIS OPTION IF YOU RUN PROFREP WITH NEW DATABASE FOR THE FIRST TIME')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1099
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1100 ############### PARALLEL PROCESSING ARGUMENTS ######################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1101 parallelOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1102 '-w',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1103 '--window',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1104 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1105 default=5000,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1106 help='sliding window size for parallel processing')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1107 parallelOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1108 '-o',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1109 '--overlap',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1110 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1111 default=150,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1112 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1113 'overlap for parallely processed regions, set greater than a read size')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1114
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1115 ################ PROTEIN DOMAINS PARAMETERS ########################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1116 protOpt.add_argument('-pd',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1117 '--protein_domains',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1118 type=str2bool,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1119 default=False,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1120 help='use module for protein domains')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1121 protOpt.add_argument('-pdb',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1122 '--protein_database',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1123 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1124 help='protein domains database')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1125 protOpt.add_argument('-cs',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1126 '--classification',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1127 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1128 help='protein domains classification file')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1129 protOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1130 '-wd',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1131 '--win_dom',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1132 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1133 default=10000000,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1134 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1135 'protein domains module: sliding window to process large input sequences sequentially')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1136 protOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1137 '-od',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1138 '--overlap_dom',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1139 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1140 default=10000,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1141 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1142 'protein domains module: overlap of sequences in two consecutive windows')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1143 protOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1144 '-thsc',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1145 '--threshold_score',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1146 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1147 default=80,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1148 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1149 'protein domains module: percentage of the best score within the cluster to significant domains')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1150 protOpt.add_argument("-thl",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1151 "--th_length",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1152 type=float,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1153 choices=[Range(0.0, 1.0)],
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1154 default=0.8,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1155 help="proportion of alignment length threshold")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1156 protOpt.add_argument("-thi",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1157 "--th_identity",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1158 type=float,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1159 choices=[Range(0.0, 1.0)],
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1160 default=0.35,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1161 help="proportion of alignment identity threshold")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1162 protOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1163 "-ths",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1164 "--th_similarity",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1165 type=float,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1166 choices=[Range(0.0, 1.0)],
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1167 default=0.45,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1168 help="threshold for alignment proportional similarity")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1169 protOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1170 "-ir",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1171 "--interruptions",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1172 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1173 default=3,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1174 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1175 "interruptions (frameshifts + stop codons) tolerance threshold per 100 AA")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1176 protOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1177 "-mlen",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1178 "--max_len_proportion",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1179 type=float,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1180 default=1.2,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1181 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1182 "maximal proportion of alignment length to the original length of protein domain from database")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1183
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1184 ################ OUTPUTS ###########################################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1185 outOpt.add_argument('-lg',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1186 '--log_file',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1187 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1188 default=LOG_FILE,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1189 help='path to log file')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1190 outOpt.add_argument('-ouf',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1191 '--output_gff',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1192 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1193 default=REPEATS_GFF,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1194 help='path to output gff of repetitive regions')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1195 outOpt.add_argument('-oug',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1196 '--domain_gff',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1197 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1198 default=DOMAINS_GFF,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1199 help='path to output gff of protein domains')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1200 outOpt.add_argument('-oun',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1201 '--n_gff',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1202 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1203 default=N_GFF,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1204 help='path to output gff of N regions')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1205 outOpt.add_argument('-hf',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1206 '--html_file',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1207 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1208 default=HTML,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1209 help='path to output html file')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1210 outOpt.add_argument('-hp',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1211 '--html_path',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1212 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1213 default=PROFREP_OUTPUT_DIR,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1214 help='path to html extra files')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1215
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1216 ################ HITS/COPY NUMBERS ####################################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1217 cnOpt.add_argument('-cn',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1218 '--copy_numbers',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1219 type=str2bool,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1220 default=False,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1221 help='convert hits to copy numbers')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1222 cnOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1223 '-gs',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1224 '--genome_size',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1225 type=float,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1226 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1227 'genome size is required when converting hits to copy numbers and you use custom data')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1228 cnOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1229 '-thr',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1230 '--threshold_repeat',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1231 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1232 default=3,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1233 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1234 'threshold for hits/copy numbers per position to be considered repetitive')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1235 cnOpt.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1236 '-thsg',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1237 '--threshold_segment',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1238 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1239 default=80,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1240 help='threshold for the length of repetitive segment to be reported')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1241
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1242 ################ JBrowse ##########################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1243 galaxyOpt.add_argument('-jb',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1244 '--jbrowse_bin',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1245 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1246 help='path to JBrowse bin directory')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1247
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1248 args = parser.parse_args()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1249 main(args)