gplib.py @ 1:20429f4c1b95 (draft)
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit f3fb925b83a4982e0cf9a0c11ff93ecbb8e4e6d5"
author:   bgruening
date:     Wed, 22 Jan 2020 10:14:41 -0500
parents:  (none)
children: ace92c9a4653

from distutils.spawn import find_executable
import subprocess
import statistics
import random
import gzip
import uuid
import sys
import re
import os

"""

Run doctests:

python3 -m doctest gplib.py


"""


################################################################################

def graphprot_predictions_get_median(predictions_file):
    """
    Given a GraphProt .predictions file, read in site scores and return
    the median value.

    >>> test_file = "test-data/test.predictions"
    >>> graphprot_predictions_get_median(test_file)
    0.571673

    """
    # Site scores list.
    sc_list = []
    with open(predictions_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            score = float(cols[2])
            sc_list.append(score)
    # Return the median.
    return statistics.median(sc_list)


################################################################################

def graphprot_profile_get_top_scores_median(profile_file,
                                            profile_type="profile",
                                            avg_profile_extlr=5):
    """
    Given a GraphProt .profile file, extract for each site (identified by
    column 1 ID) the top (= highest) score. Then return the median of these
    top scores.

    profile_type can be either "profile" or "avg_profile".
    "avg_profile" means that the position-wise scores first get smoothed:
    for each position, take the sequence window from -avg_profile_extlr to
    +avg_profile_extlr around it and assign the mean score over this window
    to the position. After that, the maximum score of each site is chosen,
    and the median over all maximum scores is returned.
    "profile" leaves the position-wise scores as they are, directly extracting
    the maximum for each site and then reporting the median.

    >>> test_file = "test-data/test.profile"
    >>> graphprot_profile_get_top_scores_median(test_file)
    3.2

    """
    # Dictionary of lists, with list of scores (value) for each site (key).
    lists_dic = {}
    with open(profile_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            seq_id = cols[0]
            score = float(cols[2])
            if seq_id in lists_dic:
                lists_dic[seq_id].append(score)
            else:
                lists_dic[seq_id] = [score]
    # For each site, extract maximum and store in new list.
    max_list = []
    for seq_id in lists_dic:
        if profile_type == "profile":
            max_sc = max(lists_dic[seq_id])
            max_list.append(max_sc)
        elif profile_type == "avg_profile":
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(lists_dic[seq_id],
                                                         win_extlr=avg_profile_extlr)
            max_sc = max(aps_list)
            max_list.append(max_sc)
        else:
            assert 0, "invalid profile_type argument given: \"%s\"" % (profile_type)
    # Return the median.
    return statistics.median(max_list)
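
# Usage sketch for the smoothed variant (hypothetical input path, not part of
# the shipped test data): with profile_type="avg_profile", position-wise
# scores are smoothed over +/- avg_profile_extlr positions before taking
# site maxima.
#
#   med = graphprot_profile_get_top_scores_median("my_sites.profile",
#                                                 profile_type="avg_profile",
#                                                 avg_profile_extlr=5)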


################################################################################

def list_moving_window_average_values(in_list,
                                      win_extlr=5,
                                      method=1):
    """
    Take a list of numeric values, and calculate for each position a new
    value by taking the mean value of the window of positions -win_extlr to
    +win_extlr. If full extension is not possible (at list ends), use the
    positions that are there.
    Two implementations of the task are given, chosen via method=1 or
    method=2.

    >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1]
    >>> list_moving_window_average_values(test_list, win_extlr=2, method=1)
    [3.3333333333333335, 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.6666666666666665]
    >>> list_moving_window_average_values(test_list, win_extlr=2, method=2)
    [3.3333333333333335, 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.6666666666666665]

    """
    l_list = len(in_list)
    assert l_list, "Given list is empty"
    new_list = [0] * l_list
    if win_extlr == 0:
        # No smoothing: scores stay unchanged.
        return in_list
    if method == 1:
        for i in range(l_list):
            s = i - win_extlr
            e = i + win_extlr + 1
            if s < 0:
                s = 0
            if e > l_list:
                e = l_list
            # Extract portion and assign value to new list.
            new_list[i] = statistics.mean(in_list[s:e])
    elif method == 2:
        for i in range(l_list):
            s = i - win_extlr
            e = i + win_extlr + 1
            if s < 0:
                s = 0
            if e > l_list:
                e = l_list
            win_len = e - s
            sc_sum = 0
            for j in range(win_len):
                sc_sum += in_list[s + j]
            new_list[i] = sc_sum / win_len
    else:
        assert 0, "invalid method ID given (%i)" % (method)
    return new_list


################################################################################

def echo_add_to_file(echo_string, out_file):
    """
    Append a string to a file, using the shell echo command.

    """
    check_cmd = 'echo "%s" >> %s' % (echo_string, out_file)
    output = subprocess.getoutput(check_cmd)
    error = False
    if output:
        error = True
    assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)
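
# Usage sketch (hypothetical string and output path): append one line to
# a report file via the shell echo command.
#
#   echo_add_to_file("GraphProt postprocessing done.", "report.txt")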


################################################################################

def is_tool(name):
    """Check whether tool "name" is in PATH."""
    return find_executable(name) is not None
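
# Usage sketch: check for an executable before calling it. The tool name
# ("GraphProt.pl" here) is an assumed example of a program expected in PATH.
#
#   assert is_tool("GraphProt.pl"), "GraphProt.pl not found in PATH"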


################################################################################

def count_fasta_headers(fasta_file):
    """
    Count number of FASTA headers in fasta_file using grep.

    >>> test_file = "test-data/test.fa"
    >>> count_fasta_headers(test_file)
    2
    >>> test_file = "test-data/empty_file"
    >>> count_fasta_headers(test_file)
    0

    """
    check_cmd = 'grep -c ">" ' + fasta_file
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count


################################################################################

def make_file_copy(in_file, out_file):
    """
    Make a file copy by copying in_file to out_file.

    """
    check_cmd = "cat " + in_file + " > " + out_file
    assert in_file != out_file, \
        "cat does not like to cat file into same file (%s)" % (check_cmd)
    output = subprocess.getoutput(check_cmd)
    error = False
    if output:
        error = True
    assert not error, \
        "cat did not like your input (in_file: %s, out_file: %s):\n%s" \
        % (in_file, out_file, output)
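
# Usage sketch (hypothetical paths): back up a FASTA file before editing it.
#
#   make_file_copy("input.fa", "input_backup.fa")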


################################################################################

def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa,
                                      test_size=500):
    """
    Split in_fasta .fa file into two files (e.g. test, train): the first
    test_size sequences (in random order) go to test_out_fa, the remaining
    sequences to train_out_fa.

    """
    # Read in in_fasta.
    seqs_dic = read_fasta_into_dic(in_fasta)
    # Shuffle IDs.
    rand_ids_list = random_order_dic_keys_into_list(seqs_dic)
    c_out = 0
    TESTOUT = open(test_out_fa, "w")
    TRAINOUT = open(train_out_fa, "w")
    for seq_id in rand_ids_list:
        seq = seqs_dic[seq_id]
        if c_out >= test_size:
            TRAINOUT.write(">%s\n%s\n" % (seq_id, seq))
        else:
            TESTOUT.write(">%s\n%s\n" % (seq_id, seq))
        c_out += 1
    TESTOUT.close()
    TRAINOUT.close()
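
# Usage sketch (hypothetical paths): put 100 random sequences into the test
# set and write the remaining sequences to the training set.
#
#   split_fasta_into_test_train_files("positives.fa", "test.fa", "train.fa",
#                                     test_size=100)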


################################################################################

def read_fasta_into_dic(fasta_file,
                        seqs_dic=False,
                        ids_dic=False,
                        read_dna=False,
                        reject_lc=False,
                        convert_to_uc=False,
                        skip_n_seqs=True):
    """
    Read in FASTA sequences, convert to RNA (unless read_dna=True, which
    keeps them as DNA), store in dictionary and return dictionary.

    >>> test_fasta = "test-data/test.fa"
    >>> read_fasta_into_dic(test_fasta)
    {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}
    >>> test_fasta = "test-data/test2.fa"
    >>> read_fasta_into_dic(test_fasta)
    {}
    >>> test_fasta = "test-data/test.ensembl.fa"
    >>> read_fasta_into_dic(test_fasta, read_dna=True)
    {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'}

    """
    if not seqs_dic:
        seqs_dic = {}
    seq_id = ""
    seq = ""

    # Go through FASTA file, extract sequences.
    if re.search(r".+\.gz$", fasta_file):
        f = gzip.open(fasta_file, "rt")
    else:
        f = open(fasta_file, "r")
    for line in f:
        if re.search(">.+", line):
            m = re.search(">(.+)", line)
            seq_id = m.group(1)
            # If there is a ".", take only first part of header.
            # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
            if re.search(r".+\..+", seq_id):
                m = re.search(r"(.+?)\..+", seq_id)
                seq_id = m.group(1)
            assert seq_id not in seqs_dic, "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file)
            if ids_dic:
                if seq_id in ids_dic:
                    seqs_dic[seq_id] = ""
            else:
                seqs_dic[seq_id] = ""
        elif re.search("[ACGTUN]+", line, re.I):
            if seq_id in seqs_dic:
                m = re.search("([ACGTUN]+)", line, re.I)
                seq = m.group(1)
                if reject_lc:
                    assert not re.search("[a-z]", seq), "lowercase characters detected in sequence \"%s\" (reject_lc=True)" % (seq_id)
                if convert_to_uc:
                    seq = seq.upper()
                # If sequences with N nucleotides should be skipped.
                if skip_n_seqs:
                    if "n" in seq or "N" in seq:
                        print("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id))
                        del seqs_dic[seq_id]
                        continue
                # Convert to RNA (or DNA), concatenate sequence.
                if read_dna:
                    seqs_dic[seq_id] += seq.replace("U", "T").replace("u", "t")
                else:
                    seqs_dic[seq_id] += seq.replace("T", "U").replace("t", "u")
    f.close()
    return seqs_dic
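
# Usage sketches for the optional arguments (hypothetical paths):
#
#   # Keep sequences as DNA and uppercase them:
#   seqs_dic = read_fasta_into_dic("sites.fa", read_dna=True,
#                                  convert_to_uc=True)
#   # Only read in sequences whose IDs appear in a given ids_dic:
#   seqs_dic = read_fasta_into_dic("sites.fa", ids_dic={"seq1": 1})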


################################################################################

def random_order_dic_keys_into_list(in_dic):
    """
    Read in dictionary keys, and return random order list of IDs.

    """
    id_list = []
    for key in in_dic:
        id_list.append(key)
    random.shuffle(id_list)
    return id_list
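
# Usage sketch: for a reproducible shuffle (e.g. in tests), seed the random
# module first.
#
#   random.seed(42)
#   rand_ids_list = random_order_dic_keys_into_list(seqs_dic)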


################################################################################

def graphprot_get_param_string(params_file):
    """
    Get parameter string from GraphProt .params file.

    >>> test_params = "test-data/test.params"
    >>> graphprot_get_param_string(test_params)
    '-epochs 20 -lambda 0.01 -R 1 -D 3 -bitsize 14 -onlyseq '

    """
    param_string = ""
    with open(params_file) as f:
        for line in f:
            cols = line.strip().split(" ")
            param = cols[0]
            setting = cols[1]
            if re.search(".+:", param):
                m = re.search("(.+):", line)
                par = m.group(1)
                if re.search("pos_train.+", line):
                    continue
                if par == "model_type":
                    if setting == "sequence":
                        param_string += "-onlyseq "
                else:
                    param_string += "-%s %s " % (par, setting)
            else:
                assert 0, "pattern matching failed for string \"%s\"" % (param)
    return param_string


################################################################################

def seqs_dic_count_uc_nts(seqs_dic):
    """
    Count number of uppercase nucleotides in sequences stored in sequence
    dictionary.

    >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'}
    >>> seqs_dic_count_uc_nts(seqs_dic)
    6
    >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'}
    >>> seqs_dic_count_uc_nts(seqs_dic)
    0

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_uc = 0
    for seq_id in seqs_dic:
        c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id]))
    return c_uc


################################################################################

def seqs_dic_count_lc_nts(seqs_dic):
    """
    Count number of lowercase nucleotides in sequences stored in sequence
    dictionary.

    >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'}
    >>> seqs_dic_count_lc_nts(seqs_dic)
    10
    >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'}
    >>> seqs_dic_count_lc_nts(seqs_dic)
    0

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_lc = 0
    for seq_id in seqs_dic:
        c_lc += len(re.findall(r'[a-z]', seqs_dic[seq_id]))
    return c_lc


################################################################################

def count_file_rows(in_file):
    """
    Count number of file rows for given input file.

    >>> test_file = "test-data/test1.bed"
    >>> count_file_rows(test_file)
    7
    >>> test_file = "test-data/empty_file"
    >>> count_file_rows(test_file)
    0

    """
    check_cmd = "cat " + in_file + " | wc -l"
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count


################################################################################

def bed_check_six_col_format(bed_file):
    """
    Check whether given .bed file has 6 columns.

    >>> test_bed = "test-data/test1.bed"
    >>> bed_check_six_col_format(test_bed)
    True
    >>> test_bed = "test-data/empty_file"
    >>> bed_check_six_col_format(test_bed)
    False

    """

    six_col_format = False
    with open(bed_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            if len(cols) == 6:
                six_col_format = True
                break
    return six_col_format


################################################################################

def bed_check_unique_ids(bed_file):
    """
    Check whether .bed file (6 column format with IDs in column 4)
    has unique column 4 IDs.

    >>> test_bed = "test-data/test1.bed"
    >>> bed_check_unique_ids(test_bed)
    True
    >>> test_bed = "test-data/test2.bed"
    >>> bed_check_unique_ids(test_bed)
    False

    """

    check_cmd = "cut -f 4 " + bed_file + " | sort | uniq -d"
    output = subprocess.getoutput(check_cmd)
    if output:
        return False
    else:
        return True


################################################################################

def get_seq_lengths_from_seqs_dic(seqs_dic):
    """
    Given a dictionary of sequences, return dictionary of sequence lengths.
    Mapping is sequence ID -> sequence length.
    """
    seq_len_dic = {}
    assert seqs_dic, "sequence dictionary seems to be empty"
    for seq_id in seqs_dic:
        seq_l = len(seqs_dic[seq_id])
        seq_len_dic[seq_id] = seq_l
    return seq_len_dic
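
# Usage sketch:
#
#   get_seq_lengths_from_seqs_dic({'s1': 'ACGU', 's2': 'ACG'})
#   # -> {'s1': 4, 's2': 3}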


################################################################################

def bed_get_region_lengths(bed_file):
    """
    Read in .bed file, store and return region lengths in dictionary.
    key   : region ID (.bed col4)
    value : region length (.bed col3-col2)

    >>> test_file = "test-data/test4.bed"
    >>> bed_get_region_lengths(test_file)
    {'CLIP1': 10, 'CLIP2': 10}

    """
    id2len_dic = {}
    with open(bed_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id = cols[3]
            site_l = site_e - site_s
            assert site_id not in id2len_dic, "column 4 IDs not unique in given .bed file \"%s\"" % (bed_file)
            id2len_dic[site_id] = site_l
    assert id2len_dic, "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (bed_file)
    return id2len_dic


################################################################################

def graphprot_get_param_dic(params_file):
    """
    Read in GraphProt .params file and store in dictionary.
    key = parameter
    value = parameter value

    >>> params_file = "test-data/test.params"
    >>> graphprot_get_param_dic(params_file)
    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'}

    """
    param_dic = {}
    with open(params_file) as f:
        for line in f:
            cols = line.strip().split(" ")
            param = cols[0]
            setting = cols[1]
            if re.search(".+:", param):
                m = re.search("(.+):", line)
                par = m.group(1)
                param_dic[par] = setting
    return param_dic


################################################################################

def graphprot_filter_predictions_file(in_file, out_file,
                                      sc_thr=0):
    """
    Filter GraphProt .predictions file by given score threshold sc_thr,
    keeping only sites with score >= sc_thr.
    """
    OUTPRED = open(out_file, "w")
    with open(in_file) as f:
        for line in f:
            row = line.strip()
            cols = line.strip().split("\t")
            score = float(cols[2])
            if score < sc_thr:
                continue
            OUTPRED.write("%s\n" % (row))
    OUTPRED.close()
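
# Usage sketch (hypothetical paths): keep only sites with score >= 0.5.
#
#   graphprot_filter_predictions_file("all.predictions",
#                                     "filtered.predictions", sc_thr=0.5)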


################################################################################

def fasta_read_in_ids(fasta_file):
    """
    Given a .fa file, read in header IDs in order appearing in file,
    and store in list.

    >>> test_file = "test-data/test3.fa"
    >>> fasta_read_in_ids(test_file)
    ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316']

    """
    ids_list = []
    with open(fasta_file) as f:
        for line in f:
            if re.search(">.+", line):
                m = re.search(">(.+)", line)
                seq_id = m.group(1)
                ids_list.append(seq_id)
    return ids_list


################################################################################

def graphprot_profile_calculate_avg_profile(in_file, out_file,
                                            ap_extlr=5,
                                            seq_ids_list=False,
                                            method=1):
    """
    Given a GraphProt .profile file, calculate average profiles and output
    average profile file.
    Average profile means that the position-wise scores get smoothed out
    by calculating for each position a new score, taking a sequence window
    -ap_extlr to +ap_extlr relative to the position and calculating the
    mean score over this window. The mean score then becomes the new
    average profile score at this position.
    Two different implementations of the task are given:
    method=1 (new python implementation, slower + more memory but easy to read)
    method=2 (old perl implementation, faster and less memory but more code)

    >>> in_file = "test-data/test2.profile"
    >>> out_file1 = "test-data/test2_1.avg_profile"
    >>> out_file2 = "test-data/test2_2.avg_profile"
    >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1)
    >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2)
    >>> diff_two_files_identical(out_file1, out_file2)
    True
    >>> test_list = ["s1", "s2", "s3", "s4"]
    >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile"
    >>> out_file3 = "test-data/test3_added_ids_out.avg_profile"
    >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list)
    >>> diff_two_files_identical(out_file3_exp, out_file3)
    True

    """
    if method == 1:
        # Dictionary of lists, with list of scores (value) for each site (key).
        lists_dic = {}
        site_starts_dic = {}
        with open(in_file) as f:
            for line in f:
                cols = line.strip().split("\t")
                site_id = int(cols[0])
                pos = int(cols[1])  # 0-based.
                score = float(cols[2])
                # Store first position of site.
                if site_id not in site_starts_dic:
                    site_starts_dic[site_id] = pos
                if site_id in lists_dic:
                    lists_dic[site_id].append(score)
                else:
                    lists_dic[site_id] = [score]
        # Check number of IDs (# FASTA sequence IDs has to be same as # site IDs).
        if seq_ids_list:
            c_seq_ids = len(seq_ids_list)
            c_site_ids = len(site_starts_dic)
            assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" % (c_seq_ids, c_site_ids)
        OUTPROF = open(out_file, "w")
        # For each site, calculate average profile scores list.
        for site_id in lists_dic:
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(lists_dic[site_id],
                                                         win_extlr=ap_extlr)
            start_pos = site_starts_dic[site_id]
            # Get original FASTA sequence ID.
            if seq_ids_list:
                site_id = seq_ids_list[site_id]
            for i, sc in enumerate(aps_list):
                pos = i + start_pos + 1  # make 1-based.
                OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc))
        OUTPROF.close()
    elif method == 2:
        OUTPROF = open(out_file, "w")
        # Old site ID.
        old_id = ""
        # Current site ID.
        cur_id = ""
        # Scores list.
        scores_list = []
        site_starts_dic = {}
        with open(in_file) as f:
            for line in f:
                cols = line.strip().split("\t")
                cur_id = int(cols[0])
                pos = int(cols[1])  # 0-based.
                score = float(cols[2])
                # Store first position of site.
                if cur_id not in site_starts_dic:
                    site_starts_dic[cur_id] = pos
                # Case: new site (new column 1 ID).
                if cur_id != old_id:
                    # Process old id scores.
                    if scores_list:
                        aps_list = list_moving_window_average_values(scores_list,
                                                                     win_extlr=ap_extlr)
                        start_pos = site_starts_dic[old_id]
                        seq_id = old_id
                        # Get original FASTA sequence ID.
                        if seq_ids_list:
                            seq_id = seq_ids_list[old_id]
                        for i, sc in enumerate(aps_list):
                            pos = i + start_pos + 1  # make 1-based.
                            OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
                        # Reset list.
                        scores_list = []
                    old_id = cur_id
                    scores_list.append(score)
                else:
                    # Add to scores_list.
                    scores_list.append(score)
        # Process last block.
        if scores_list:
            aps_list = list_moving_window_average_values(scores_list,
                                                         win_extlr=ap_extlr)
            start_pos = site_starts_dic[old_id]
            seq_id = old_id
            # Get original FASTA sequence ID.
            if seq_ids_list:
                seq_id = seq_ids_list[old_id]
            for i, sc in enumerate(aps_list):
                pos = i + start_pos + 1  # make 1-based.
                OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
        OUTPROF.close()


################################################################################

def graphprot_profile_extract_peak_regions(in_file, out_file,
                                           max_merge_dist=0,
                                           sc_thr=0):
    """
    Extract peak regions from GraphProt .profile file.
    Store the peak regions (defined as regions with scores >= sc_thr)
    in out_file in 6-column .bed format.

    TODO:
    Add option for genomic coordinates input (+ - polarity support).
    Output genomic regions instead of sequence regions.

    >>> in_file = "test-data/test4.avg_profile"
    >>> out_file = "test-data/test4_out.peaks.bed"
    >>> exp_file = "test-data/test4_out_exp.peaks.bed"
    >>> exp2_file = "test-data/test4_out_exp2.peaks.bed"
    >>> empty_file = "test-data/empty_file"
    >>> graphprot_profile_extract_peak_regions(in_file, out_file)
    >>> diff_two_files_identical(out_file, exp_file)
    True
    >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10)
    >>> diff_two_files_identical(out_file, empty_file)
    True
    >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2)
    >>> diff_two_files_identical(out_file, exp2_file)
    True

    """

    OUTPEAKS = open(out_file, "w")
    # Old site ID.
    old_id = ""
    # Current site ID.
    cur_id = ""
    # Scores list.
    scores_list = []
    site_starts_dic = {}
    with open(in_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            cur_id = cols[0]
            pos = int(cols[1])  # 0-based.
            score = float(cols[2])
            # Store first position of site.
            if cur_id not in site_starts_dic:
                # If first position != zero, we assume positions are 1-based.
                if pos != 0:
                    # Make index 0-based.
                    site_starts_dic[cur_id] = pos - 1
                else:
                    site_starts_dic[cur_id] = pos
            # Case: new site (new column 1 ID).
            if cur_id != old_id:
                # Process old id scores.
                if scores_list:
                    # Extract peaks from region.
                    peak_list = list_extract_peaks(scores_list,
                                                   max_merge_dist=max_merge_dist,
                                                   coords="bed",
                                                   sc_thr=sc_thr)
                    start_pos = site_starts_dic[old_id]
                    # Print out peaks in .bed format.
                    for peak in peak_list:
                        peak_s = start_pos + peak[0]
                        peak_e = start_pos + peak[1]
                        site_id = "%s,%i" % (old_id, peak[2])
                        OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, peak[3]))
                    # Reset list.
                    scores_list = []
                old_id = cur_id
                scores_list.append(score)
            else:
                # Add to scores_list.
                scores_list.append(score)
    # Process last block.
    if scores_list:
        # Extract peaks from region.
        peak_list = list_extract_peaks(scores_list,
                                       max_merge_dist=max_merge_dist,
                                       coords="bed",
                                       sc_thr=sc_thr)
        start_pos = site_starts_dic[old_id]
        # Print out peaks in .bed format.
        for peak in peak_list:
            peak_s = start_pos + peak[0]
            peak_e = start_pos + peak[1]
            site_id = "%s,%i" % (old_id, peak[2])  # best score position also 1-based.
            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, peak[3]))
    OUTPEAKS.close()


################################################################################

def list_extract_peaks(in_list,
                       max_merge_dist=0,
                       coords="list",
                       sc_thr=0):
    """
    Extract peak regions from list.
    Peak region is defined as region >= score threshold.

    coords=bed  : peak start 0-based, peak end 1-based.
    coords=list : peak start 0-based, peak end 0-based.

    >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5]
    >>> list_extract_peaks(test_list)
    [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]
    >>> list_extract_peaks(test_list, sc_thr=2)
    [[2, 3, 3, 4.5], [6, 7, 7, 6.5]]
    >>> list_extract_peaks(test_list, sc_thr=2, coords="bed")
    [[2, 4, 4, 4.5], [6, 8, 8, 6.5]]
    >>> list_extract_peaks(test_list, sc_thr=10)
    []
    >>> test_list = [2, -1, 3, -1, 4, -1, -1, 6, 9]
    >>> list_extract_peaks(test_list, max_merge_dist=2)
    [[0, 4, 4, 4], [7, 8, 8, 9]]
    >>> list_extract_peaks(test_list, max_merge_dist=3)
    [[0, 8, 8, 9]]

    """
    # Check.
    assert len(in_list), "Given list is empty"
    # Peak regions list.
    peak_list = []
    # Peak region state: start, end, top position, top score.
    inside = False
    pr_s = 0
    pr_e = 0
    pr_top_pos = 0
    pr_top_sc = -100000
    for i, sc in enumerate(in_list):
        # Part of peak region?
        if sc >= sc_thr:
            # At peak start.
            if not inside:
                pr_s = i
                pr_e = i
                inside = True
            else:
                # Inside peak region.
                pr_e = i
            # Store top position.
            if sc > pr_top_sc:
                pr_top_sc = sc
                pr_top_pos = i
        else:
            # Was inside a peak region before?
            if inside:
                # Store peak region.
                peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc]
                peak_list.append(peak_infos)
                inside = False
                pr_top_pos = 0
                pr_top_sc = -100000
    # If peak at the end, also report.
    if inside:
        # Store peak region.
        peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc]
        peak_list.append(peak_infos)
    # Merge peaks that are at most max_merge_dist apart.
    if max_merge_dist and len(peak_list) > 1:
        iterate = True
        while iterate:
            merged_peak_list = []
            added_peaks_dic = {}
            peaks_merged = False
            for i in range(len(peak_list)):
                if i in added_peaks_dic:
                    continue
                j = i + 1
                # Last element.
                if j == len(peak_list):
                    if i not in added_peaks_dic:
                        merged_peak_list.append(peak_list[i])
                    break
                # Compare two elements.
                new_peak = []
                if (peak_list[j][0] - peak_list[i][1]) <= max_merge_dist:
                    peaks_merged = True
                    new_top_pos = peak_list[i][2]
                    new_top_sc = peak_list[i][3]
                    if peak_list[i][3] < peak_list[j][3]:
                        new_top_pos = peak_list[j][2]
                        new_top_sc = peak_list[j][3]
                    new_peak = [peak_list[i][0], peak_list[j][1], new_top_pos, new_top_sc]
                # If two peaks were merged.
                if new_peak:
                    merged_peak_list.append(new_peak)
                    added_peaks_dic[i] = 1
                    added_peaks_dic[j] = 1
                else:
                    merged_peak_list.append(peak_list[i])
                    added_peaks_dic[i] = 1
            if not peaks_merged:
                iterate = False
            peak_list = merged_peak_list
    # If peak coordinates should be in .bed format, make peak ends 1-based.
    if coords == "bed":
        for i in range(len(peak_list)):
            peak_list[i][1] += 1
            peak_list[i][2] += 1  # 1-base best score position too.
    return peak_list


################################################################################

def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False):
    """
    Given a .bed file of sequence peak regions (possible coordinates from
    0 to length of s), convert peak coordinates to genomic coordinates.
    Do this by taking genomic regions of sequences as input.

    >>> test_in = "test-data/test.peaks.bed"
    >>> test_exp = "test-data/test_exp.peaks.bed"
    >>> test_out = "test-data/test_out.peaks.bed"
    >>> gen_in = "test-data/test.peaks_genomic.bed"
    >>> bed_peaks_to_genomic_peaks(test_in, test_out, gen_in)
    >>> diff_two_files_identical(test_out, test_exp)
    True

    """
    # Read in genomic region info.
    id2row_dic = {}

    with open(genomic_sites_bed) as f:
        for line in f:
            row = line.strip()
            cols = line.strip().split("\t")
            site_id = cols[3]
            assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \"%s\"" % (genomic_sites_bed)
            id2row_dic[site_id] = row

    # Read in peaks file and convert coordinates.
    OUTPEAKS = open(genomic_peak_file, "w")
    with open(peak_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            site_id = cols[0]
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id2 = cols[3]
            site_sc = float(cols[4])
            assert re.search(".+,.+", site_id2), "regular expression failed for ID \"%s\"" % (site_id2)
            m = re.search(r".+,(\d+)", site_id2)
            sc_pos = int(m.group(1))  # 1-based.
            assert site_id in id2row_dic, "site ID \"%s\" not found in genomic sites dictionary" % (site_id)
            row = id2row_dic[site_id]
            rowl = row.split("\t")
            gen_chr = rowl[0]
            gen_s = int(rowl[1])
            gen_e = int(rowl[2])
            gen_pol = rowl[5]
            new_s = site_s + gen_s
            new_e = site_e + gen_s
            new_sc_pos = sc_pos + gen_s
            if gen_pol == "-":
                new_s = gen_e - site_e
                new_e = gen_e - site_s
                new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" % (gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol)
            OUTPEAKS.write("%s\n" % (new_row))
            if print_rows:
                print(new_row)
    OUTPEAKS.close()
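
# Usage sketch (hypothetical paths): typical postprocessing chain from an
# average profile to genomic peak regions. genomic_sites.bed is assumed to
# hold the genomic coordinates of the predicted-on sequences, with column 4
# IDs matching the profile column 1 IDs.
#
#   graphprot_profile_extract_peak_regions("sites.avg_profile", "peaks.bed",
#                                          max_merge_dist=0, sc_thr=0)
#   bed_peaks_to_genomic_peaks("peaks.bed", "genomic_peaks.bed",
#                              "genomic_sites.bed")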


################################################################################

def diff_two_files_identical(file1, file2):
    """
    Check whether two files are identical. Return True if diff reports no
    differences.

    >>> file1 = "test-data/file1"
    >>> file2 = "test-data/file2"
    >>> diff_two_files_identical(file1, file2)
    True
    >>> file1 = "test-data/test1.bed"
    >>> diff_two_files_identical(file1, file2)
    False

    """
    same = True
    check_cmd = "diff " + file1 + " " + file2
    output = subprocess.getoutput(check_cmd)
    if output:
        same = False
    return same


################################################################################