comparison gplib.py @ 1:20429f4c1b95 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit f3fb925b83a4982e0cf9a0c11ff93ecbb8e4e6d5"
author bgruening
date Wed, 22 Jan 2020 10:14:41 -0500
parents
children ace92c9a4653

from distutils.spawn import find_executable
import subprocess
import statistics
import random
import gzip
import uuid
import sys
import re
import os

"""

Run doctests:

python3 -m doctest gplib.py


"""

################################################################################

def graphprot_predictions_get_median(predictions_file):
    """
    Given a GraphProt .predictions file, read in site scores and return
    the median value.

    >>> test_file = "test-data/test.predictions"
    >>> graphprot_predictions_get_median(test_file)
    0.571673

    """
    # Site scores list.
    sc_list = []
    with open(predictions_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            score = float(cols[2])
            sc_list.append(score)
    # Return the median.
    return statistics.median(sc_list)

################################################################################

def graphprot_profile_get_top_scores_median(profile_file,
                                            profile_type="profile",
                                            avg_profile_extlr=5):
    """
    Given a GraphProt .profile file, extract for each site (identified by
    column 1 ID) the top (= highest) score. Then return the median of these
    top scores.

    profile_type can be either "profile" or "avg_profile".
    "avg_profile" means that the position-wise scores will first get
    smoothed out: for each position, take a sequence window from
    -avg_profile_extlr to +avg_profile_extlr around the position, calculate
    the mean score over this window, and assign it to the position.
    After that, the maximum score of each site is chosen, and the median
    over all maximum scores is returned.
    "profile" leaves the position-wise scores as they are, directly
    extracting the maximum for each site and then reporting the median.

    >>> test_file = "test-data/test.profile"
    >>> graphprot_profile_get_top_scores_median(test_file)
    3.2

    """
    # Dictionary of lists, with list of scores (value) for each site (key).
    lists_dic = {}
    with open(profile_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            seq_id = cols[0]
            score = float(cols[2])
            if seq_id in lists_dic:
                lists_dic[seq_id].append(score)
            else:
                lists_dic[seq_id] = [score]
    # For each site, extract maximum and store in new list.
    max_list = []
    for seq_id in lists_dic:
        if profile_type == "profile":
            max_sc = max(lists_dic[seq_id])
            max_list.append(max_sc)
        elif profile_type == "avg_profile":
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(
                lists_dic[seq_id],
                win_extlr=avg_profile_extlr)
            max_sc = max(aps_list)
            max_list.append(max_sc)
        else:
            assert 0, "invalid profile_type argument given: \"%s\"" % (profile_type)
    # Return the median.
    return statistics.median(max_list)

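# Context note (an inference from the .params doctests further below, not a
# statement from the original author): the medians computed here appear to
# correspond to the pos_train_ws_pred_median, pos_train_profile_median and
# pos_train_avg_profile_median_* entries of a GraphProt .params file, with
# avg_profile_extlr matching the numeric suffix (e.g. avg_profile_extlr=5
# and pos_train_avg_profile_median_5).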
################################################################################

def list_moving_window_average_values(in_list,
                                      win_extlr=5,
                                      method=1):
    """
    Take a list of numeric values, and calculate for each position a new
    value: the mean over the window of positions from -win_extlr to
    +win_extlr around the position. If full extension is not possible
    (at the list ends), the window is truncated to what is available.
    Two implementations of the task are given, chosen by method=1 or
    method=2.

    >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1]
    >>> list_moving_window_average_values(test_list, win_extlr=2, method=1)
    [3.3333333333333335, 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.6666666666666665]
    >>> list_moving_window_average_values(test_list, win_extlr=2, method=2)
    [3.3333333333333335, 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.6666666666666665]

    """
    l_list = len(in_list)
    assert l_list, "Given list is empty"
    new_list = [0] * l_list
    if win_extlr == 0:
        # Nothing to smooth, return the input scores unchanged.
        return in_list
    if method == 1:
        for i in range(l_list):
            s = i - win_extlr
            e = i + win_extlr + 1
            if s < 0:
                s = 0
            if e > l_list:
                e = l_list
            # Extract window and assign mean value to new list.
            new_list[i] = statistics.mean(in_list[s:e])
    elif method == 2:
        for i in range(l_list):
            s = i - win_extlr
            e = i + win_extlr + 1
            if s < 0:
                s = 0
            if e > l_list:
                e = l_list
            win_len = e - s
            sc_sum = 0
            for j in range(win_len):
                sc_sum += in_list[s + j]
            new_list[i] = sc_sum / win_len
    else:
        assert 0, "invalid method ID given (%i)" % (method)
    return new_list

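# Edge-behavior sketch for the smoothing above (comments only, mirroring the
# doctest data): with win_extlr=2, position 0 of [2, 3, 5, 8, 4, 3, 7, 1]
# can only extend to the right, so its window is [2, 3, 5] and the smoothed
# value is statistics.mean([2, 3, 5]) = 3.3333..., matching the first value
# in the doctest output.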
################################################################################

def echo_add_to_file(echo_string, out_file):
    """
    Append a string to a file, using the echo shell command.

    """
    check_cmd = 'echo "%s" >> %s' % (echo_string, out_file)
    output = subprocess.getoutput(check_cmd)
    error = False
    if output:
        error = True
    assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)

################################################################################

def is_tool(name):
    """Check whether tool "name" is in PATH."""
    return find_executable(name) is not None

################################################################################

def count_fasta_headers(fasta_file):
    """
    Count number of FASTA headers in fasta_file using grep.

    >>> test_file = "test-data/test.fa"
    >>> count_fasta_headers(test_file)
    2
    >>> test_file = "test-data/empty_file"
    >>> count_fasta_headers(test_file)
    0

    """
    check_cmd = 'grep -c ">" ' + fasta_file
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count

################################################################################

def make_file_copy(in_file, out_file):
    """
    Make a file copy by copying in_file to out_file.

    """
    check_cmd = "cat " + in_file + " > " + out_file
    assert in_file != out_file, "cat does not like to cat file into same file (%s)" % (check_cmd)
    output = subprocess.getoutput(check_cmd)
    error = False
    if output:
        error = True
    assert not error, "cat did not like your input (in_file: %s, out_file: %s):\n%s" % (in_file, out_file, output)

################################################################################

def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa,
                                      test_size=500):
    """
    Split in_fasta .fa file into two files (e.g. test, train).
    The first test_size sequences (in random order) go to test_out_fa,
    the rest to train_out_fa.

    """
    # Read in in_fasta.
    seqs_dic = read_fasta_into_dic(in_fasta)
    # Shuffle IDs.
    rand_ids_list = random_order_dic_keys_into_list(seqs_dic)
    c_out = 0
    TESTOUT = open(test_out_fa, "w")
    TRAINOUT = open(train_out_fa, "w")
    for seq_id in rand_ids_list:
        seq = seqs_dic[seq_id]
        if c_out >= test_size:
            TRAINOUT.write(">%s\n%s\n" % (seq_id, seq))
        else:
            TESTOUT.write(">%s\n%s\n" % (seq_id, seq))
        c_out += 1
    TESTOUT.close()
    TRAINOUT.close()

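# Usage sketch (hypothetical file names): put 500 randomly chosen sequences
# into a test set and the remainder into a training set.
#
#     split_fasta_into_test_train_files("positives.fa", "test.fa",
#                                       "train.fa", test_size=500)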
################################################################################

def read_fasta_into_dic(fasta_file,
                        seqs_dic=False,
                        ids_dic=False,
                        read_dna=False,
                        reject_lc=False,
                        convert_to_uc=False,
                        skip_n_seqs=True):
    """
    Read in FASTA sequences, convert to RNA (or to DNA with read_dna=True),
    store in dictionary and return dictionary.

    >>> test_fasta = "test-data/test.fa"
    >>> read_fasta_into_dic(test_fasta)
    {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}
    >>> test_fasta = "test-data/test2.fa"
    >>> read_fasta_into_dic(test_fasta)
    {}
    >>> test_fasta = "test-data/test.ensembl.fa"
    >>> read_fasta_into_dic(test_fasta, read_dna=True)
    {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'}

    """
    if not seqs_dic:
        seqs_dic = {}
    seq_id = ""
    seq = ""

    # Go through FASTA file, extract sequences.
    if re.search(r".+\.gz$", fasta_file):
        f = gzip.open(fasta_file, "rt")
    else:
        f = open(fasta_file, "r")
    for line in f:
        if re.search(">.+", line):
            m = re.search(">(.+)", line)
            seq_id = m.group(1)
            # If there is a ".", take only first part of header.
            # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
            if re.search(r".+\..+", seq_id):
                m = re.search(r"(.+?)\..+", seq_id)
                seq_id = m.group(1)
            assert seq_id not in seqs_dic, "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file)
            # If an ids_dic is given, only read in sequences with IDs in it.
            if ids_dic:
                if seq_id in ids_dic:
                    seqs_dic[seq_id] = ""
            else:
                seqs_dic[seq_id] = ""
        elif re.search("[ACGTUN]+", line, re.I):
            if seq_id in seqs_dic:
                m = re.search("([ACGTUN]+)", line, re.I)
                seq = m.group(1)
                if reject_lc:
                    assert not re.search("[a-z]", seq), "lowercase characters detected in sequence \"%s\" (reject_lc=True)" % (seq_id)
                if convert_to_uc:
                    seq = seq.upper()
                # If sequences with N nucleotides should be skipped.
                if skip_n_seqs:
                    if "n" in seq or "N" in seq:
                        print("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id))
                        del seqs_dic[seq_id]
                        continue
                # Convert to RNA (or DNA), concatenate sequence.
                if read_dna:
                    seqs_dic[seq_id] += seq.replace("U", "T").replace("u", "t")
                else:
                    seqs_dic[seq_id] += seq.replace("T", "U").replace("t", "u")
    f.close()
    return seqs_dic

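# Usage note (a sketch): gzipped FASTA files are handled transparently via
# the .gz suffix check above, e.g.
#
#     seqs_dic = read_fasta_into_dic("sites.fa.gz")  # hypothetical file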
################################################################################

def random_order_dic_keys_into_list(in_dic):
    """
    Read in dictionary keys, and return random order list of IDs.

    """
    id_list = []
    for key in in_dic:
        id_list.append(key)
    random.shuffle(id_list)
    return id_list

################################################################################

def graphprot_get_param_string(params_file):
    """
    Get parameter string from GraphProt .params file.

    >>> test_params = "test-data/test.params"
    >>> graphprot_get_param_string(test_params)
    '-epochs 20 -lambda 0.01 -R 1 -D 3 -bitsize 14 -onlyseq '

    """
    param_string = ""
    with open(params_file) as f:
        for line in f:
            cols = line.strip().split(" ")
            param = cols[0]
            setting = cols[1]
            if re.search(".+:", param):
                m = re.search("(.+):", line)
                par = m.group(1)
                # Skip the stored pos_train_* statistics entries.
                if re.search("pos_train.+", line):
                    continue
                if par == "model_type":
                    if setting == "sequence":
                        param_string += "-onlyseq "
                else:
                    param_string += "-%s %s " % (par, setting)
            else:
                assert 0, "pattern matching failed for string \"%s\"" % (param)
    return param_string

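# Usage sketch: the returned string is meant to be appended to a GraphProt
# call (hypothetical invocation; exact flags depend on the installed
# GraphProt version):
#
#     param_string = graphprot_get_param_string("test-data/test.params")
#     gp_cmd = "GraphProt.pl --action predict %s..." % (param_string)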
################################################################################

def seqs_dic_count_uc_nts(seqs_dic):
    """
    Count number of uppercase nucleotides in sequences stored in sequence
    dictionary.

    >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'}
    >>> seqs_dic_count_uc_nts(seqs_dic)
    6
    >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'}
    >>> seqs_dic_count_uc_nts(seqs_dic)
    0

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_uc = 0
    for seq_id in seqs_dic:
        c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id]))
    return c_uc

################################################################################

def seqs_dic_count_lc_nts(seqs_dic):
    """
    Count number of lowercase nucleotides in sequences stored in sequence
    dictionary.

    >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'}
    >>> seqs_dic_count_lc_nts(seqs_dic)
    10
    >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'}
    >>> seqs_dic_count_lc_nts(seqs_dic)
    0

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_lc = 0
    for seq_id in seqs_dic:
        c_lc += len(re.findall(r'[a-z]', seqs_dic[seq_id]))
    return c_lc

################################################################################

def count_file_rows(in_file):
    """
    Count number of file rows for given input file.

    >>> test_file = "test-data/test1.bed"
    >>> count_file_rows(test_file)
    7
    >>> test_file = "test-data/empty_file"
    >>> count_file_rows(test_file)
    0

    """
    check_cmd = "cat " + in_file + " | wc -l"
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count

################################################################################

def bed_check_six_col_format(bed_file):
    """
    Check whether given .bed file has 6 columns.

    >>> test_bed = "test-data/test1.bed"
    >>> bed_check_six_col_format(test_bed)
    True
    >>> test_bed = "test-data/empty_file"
    >>> bed_check_six_col_format(test_bed)
    False

    """
    six_col_format = False
    with open(bed_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            if len(cols) == 6:
                six_col_format = True
                break
    return six_col_format

################################################################################

def bed_check_unique_ids(bed_file):
    """
    Check whether .bed file (6 column format with IDs in column 4)
    has unique column 4 IDs.

    >>> test_bed = "test-data/test1.bed"
    >>> bed_check_unique_ids(test_bed)
    True
    >>> test_bed = "test-data/test2.bed"
    >>> bed_check_unique_ids(test_bed)
    False

    """
    check_cmd = "cut -f 4 " + bed_file + " | sort | uniq -d"
    output = subprocess.getoutput(check_cmd)
    # uniq -d only prints duplicated IDs, so any output means non-unique IDs.
    if output:
        return False
    else:
        return True

################################################################################

def get_seq_lengths_from_seqs_dic(seqs_dic):
    """
    Given a dictionary of sequences, return dictionary of sequence lengths.
    Mapping is sequence ID -> sequence length.

    """
    seq_len_dic = {}
    assert seqs_dic, "sequence dictionary seems to be empty"
    for seq_id in seqs_dic:
        seq_l = len(seqs_dic[seq_id])
        seq_len_dic[seq_id] = seq_l
    return seq_len_dic

################################################################################

def bed_get_region_lengths(bed_file):
    """
    Read in .bed file, store and return region lengths in dictionary.
    key   : region ID (.bed col4)
    value : region length (.bed col3-col2)

    >>> test_file = "test-data/test4.bed"
    >>> bed_get_region_lengths(test_file)
    {'CLIP1': 10, 'CLIP2': 10}

    """
    id2len_dic = {}
    with open(bed_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id = cols[3]
            site_l = site_e - site_s
            assert site_id not in id2len_dic, "column 4 IDs not unique in given .bed file \"%s\"" % (bed_file)
            id2len_dic[site_id] = site_l
    assert id2len_dic, "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (bed_file)
    return id2len_dic

################################################################################

def graphprot_get_param_dic(params_file):
    """
    Read in GraphProt .params file and store in dictionary.
    key = parameter
    value = parameter value

    >>> params_file = "test-data/test.params"
    >>> graphprot_get_param_dic(params_file)
    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'}

    """
    param_dic = {}
    with open(params_file) as f:
        for line in f:
            cols = line.strip().split(" ")
            param = cols[0]
            setting = cols[1]
            if re.search(".+:", param):
                m = re.search("(.+):", line)
                par = m.group(1)
                param_dic[par] = setting
    return param_dic

################################################################################

def graphprot_filter_predictions_file(in_file, out_file,
                                      sc_thr=0):
    """
    Filter GraphProt .predictions file by given score threshold sc_thr,
    keeping only sites with score >= sc_thr.

    """
    OUTPRED = open(out_file, "w")
    with open(in_file) as f:
        for line in f:
            row = line.strip()
            cols = line.strip().split("\t")
            score = float(cols[2])
            if score < sc_thr:
                continue
            OUTPRED.write("%s\n" % (row))
    OUTPRED.close()

################################################################################

def fasta_read_in_ids(fasta_file):
    """
    Given a .fa file, read in header IDs in order appearing in file,
    and store in list.

    >>> test_file = "test-data/test3.fa"
    >>> fasta_read_in_ids(test_file)
    ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316']

    """
    ids_list = []
    with open(fasta_file) as f:
        for line in f:
            if re.search(">.+", line):
                m = re.search(">(.+)", line)
                seq_id = m.group(1)
                ids_list.append(seq_id)
    return ids_list

################################################################################

def graphprot_profile_calculate_avg_profile(in_file, out_file,
                                            ap_extlr=5,
                                            seq_ids_list=False,
                                            method=1):
    """
    Given a GraphProt .profile file, calculate average profiles and output
    average profile file.
    Average profile means that the position-wise scores get smoothed out:
    for each position, take a sequence window from -ap_extlr to +ap_extlr
    relative to the position and calculate the mean score over this window.
    The mean score then becomes the new average profile score at this
    position.
    Two different implementations of the task are given:
    method=1 (new python implementation, slower + more memory, but easy to
    read)
    method=2 (old perl implementation, faster and less memory, but more code)

    >>> in_file = "test-data/test2.profile"
    >>> out_file1 = "test-data/test2_1.avg_profile"
    >>> out_file2 = "test-data/test2_2.avg_profile"
    >>> out_file4 = "test-data/test2_3.avg_profile"
    >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1)
    >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2)
    >>> diff_two_files_identical(out_file1, out_file2)
    True
    >>> test_list = ["s1", "s2", "s3", "s4"]
    >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile"
    >>> out_file3 = "test-data/test3_added_ids_out.avg_profile"
    >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list)
    >>> diff_two_files_identical(out_file3_exp, out_file3)
    True

    """
    if method == 1:
        # Dictionary of lists, with list of scores (value) for each site (key).
        lists_dic = {}
        site_starts_dic = {}
        with open(in_file) as f:
            for line in f:
                cols = line.strip().split("\t")
                site_id = int(cols[0])
                pos = int(cols[1])  # 0-based.
                score = float(cols[2])
                # Store first position of site.
                if site_id not in site_starts_dic:
                    site_starts_dic[site_id] = pos
                if site_id in lists_dic:
                    lists_dic[site_id].append(score)
                else:
                    lists_dic[site_id] = [score]
        # Check number of IDs (# FASTA sequence IDs has to be same as # site IDs).
        if seq_ids_list:
            c_seq_ids = len(seq_ids_list)
            c_site_ids = len(site_starts_dic)
            assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" % (c_seq_ids, c_site_ids)
        OUTPROF = open(out_file, "w")
        # For each site, calculate average profile scores list.
        for site_id in lists_dic:
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(lists_dic[site_id],
                                                         win_extlr=ap_extlr)
            start_pos = site_starts_dic[site_id]
            # Get original FASTA sequence ID.
            if seq_ids_list:
                site_id = seq_ids_list[site_id]
            for i, sc in enumerate(aps_list):
                pos = i + start_pos + 1  # make 1-based.
                OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc))
        OUTPROF.close()
    elif method == 2:
        OUTPROF = open(out_file, "w")
        # Old site ID.
        old_id = ""
        # Current site ID.
        cur_id = ""
        # Scores list.
        scores_list = []
        site_starts_dic = {}
        with open(in_file) as f:
            for line in f:
                cols = line.strip().split("\t")
                cur_id = int(cols[0])
                pos = int(cols[1])  # 0-based.
                score = float(cols[2])
                # Store first position of site.
                if cur_id not in site_starts_dic:
                    site_starts_dic[cur_id] = pos
                # Case: new site (new column 1 ID).
                if cur_id != old_id:
                    # Process old ID scores.
                    if scores_list:
                        aps_list = list_moving_window_average_values(scores_list,
                                                                     win_extlr=ap_extlr)
                        start_pos = site_starts_dic[old_id]
                        seq_id = old_id
                        # Get original FASTA sequence ID.
                        if seq_ids_list:
                            seq_id = seq_ids_list[old_id]
                        for i, sc in enumerate(aps_list):
                            pos = i + start_pos + 1  # make 1-based.
                            OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
                        # Reset list.
                        scores_list = []
                    old_id = cur_id
                    scores_list.append(score)
                else:
                    # Add to scores_list.
                    scores_list.append(score)
        # Process last block.
        if scores_list:
            aps_list = list_moving_window_average_values(scores_list,
                                                         win_extlr=ap_extlr)
            start_pos = site_starts_dic[old_id]
            seq_id = old_id
            # Get original FASTA sequence ID.
            if seq_ids_list:
                seq_id = seq_ids_list[old_id]
            for i, sc in enumerate(aps_list):
                pos = i + start_pos + 1  # make 1-based.
                OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
        OUTPROF.close()

################################################################################

def graphprot_profile_extract_peak_regions(in_file, out_file,
                                           max_merge_dist=0,
                                           sc_thr=0):
    """
    Extract peak regions from GraphProt .profile file.
    Store the peak regions (defined as regions with scores >= sc_thr)
    in out_file in 6-column .bed format.

    TODO:
    Add option for genomic coordinates input (+ - polarity support).
    Output genomic regions instead of sequence regions.

    >>> in_file = "test-data/test4.avg_profile"
    >>> out_file = "test-data/test4_out.peaks.bed"
    >>> exp_file = "test-data/test4_out_exp.peaks.bed"
    >>> exp2_file = "test-data/test4_out_exp2.peaks.bed"
    >>> empty_file = "test-data/empty_file"
    >>> graphprot_profile_extract_peak_regions(in_file, out_file)
    >>> diff_two_files_identical(out_file, exp_file)
    True
    >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10)
    >>> diff_two_files_identical(out_file, empty_file)
    True
    >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2)
    >>> diff_two_files_identical(out_file, exp2_file)
    True

    """
    OUTPEAKS = open(out_file, "w")
    # Old site ID.
    old_id = ""
    # Current site ID.
    cur_id = ""
    # Scores list.
    scores_list = []
    site_starts_dic = {}
    with open(in_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            cur_id = cols[0]
            pos = int(cols[1])  # 0-based.
            score = float(cols[2])
            # Store first position of site.
            if cur_id not in site_starts_dic:
                # If first position != zero, we assume positions are 1-based.
                if pos != 0:
                    # Make index 0-based.
                    site_starts_dic[cur_id] = pos - 1
                else:
                    site_starts_dic[cur_id] = pos
            # Case: new site (new column 1 ID).
            if cur_id != old_id:
                # Process old ID scores.
                if scores_list:
                    # Extract peaks from region.
                    peak_list = list_extract_peaks(scores_list,
                                                   max_merge_dist=max_merge_dist,
                                                   coords="bed",
                                                   sc_thr=sc_thr)
                    start_pos = site_starts_dic[old_id]
                    # Print out peaks in .bed format.
                    for peak in peak_list:
                        peak_s = start_pos + peak[0]
                        peak_e = start_pos + peak[1]
                        site_id = "%s,%i" % (old_id, peak[2])
                        OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, peak[3]))
                    # Reset list.
                    scores_list = []
                old_id = cur_id
                scores_list.append(score)
            else:
                # Add to scores_list.
                scores_list.append(score)
    # Process last block.
    if scores_list:
        # Extract peaks from region.
        peak_list = list_extract_peaks(scores_list,
                                       max_merge_dist=max_merge_dist,
                                       coords="bed",
                                       sc_thr=sc_thr)
        start_pos = site_starts_dic[old_id]
        # Print out peaks in .bed format.
        for peak in peak_list:
            peak_s = start_pos + peak[0]
            peak_e = start_pos + peak[1]
            site_id = "%s,%i" % (old_id, peak[2])  # best score position also 1-based.
            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, peak[3]))
    OUTPEAKS.close()

################################################################################

def list_extract_peaks(in_list,
                       max_merge_dist=0,
                       coords="list",
                       sc_thr=0):
    """
    Extract peak regions from list.
    Peak region is defined as region >= score threshold.

    coords=bed  : peak start 0-based, peak end 1-based.
    coords=list : peak start 0-based, peak end 0-based.

    Returned peaks have the format:
    [peak_start, peak_end, top_score_position, top_score]

    >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5]
    >>> list_extract_peaks(test_list)
    [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]
    >>> list_extract_peaks(test_list, sc_thr=2)
    [[2, 3, 3, 4.5], [6, 7, 7, 6.5]]
    >>> list_extract_peaks(test_list, sc_thr=2, coords="bed")
    [[2, 4, 4, 4.5], [6, 8, 8, 6.5]]
    >>> list_extract_peaks(test_list, sc_thr=10)
    []
    >>> test_list = [2, -1, 3, -1, 4, -1, -1, 6, 9]
    >>> list_extract_peaks(test_list, max_merge_dist=2)
    [[0, 4, 4, 4], [7, 8, 8, 9]]
    >>> list_extract_peaks(test_list, max_merge_dist=3)
    [[0, 8, 8, 9]]

    """
    # Check.
    assert len(in_list), "Given list is empty"
    # Peak regions list.
    peak_list = []
    # State variables for the scan.
    inside = False
    pr_s = 0
    pr_e = 0
    pr_top_pos = 0
    pr_top_sc = -100000
    for i, sc in enumerate(in_list):
        # Part of peak region?
        if sc >= sc_thr:
            # At peak start.
            if not inside:
                pr_s = i
                pr_e = i
                inside = True
            else:
                # Inside peak region.
                pr_e = i
            # Store top position.
            if sc > pr_top_sc:
                pr_top_sc = sc
                pr_top_pos = i
        else:
            # Was inside a peak region before?
            if inside:
                # Store peak region.
                peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc]
                peak_list.append(peak_infos)
                inside = False
                pr_top_pos = 0
                pr_top_sc = -100000
    # If peak at the end, also report.
    if inside:
        # Store peak region.
        peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc]
        peak_list.append(peak_infos)
    # Merge peaks that are at most max_merge_dist apart.
    if max_merge_dist and len(peak_list) > 1:
        iterate = True
        while iterate:
            merged_peak_list = []
            added_peaks_dic = {}
            peaks_merged = False
            for i in range(len(peak_list)):
                if i in added_peaks_dic:
                    continue
                j = i + 1
                # Last element.
                if j == len(peak_list):
                    if i not in added_peaks_dic:
                        merged_peak_list.append(peak_list[i])
                    break
                # Compare two elements.
                new_peak = []
                if (peak_list[j][0] - peak_list[i][1]) <= max_merge_dist:
                    peaks_merged = True
                    new_top_pos = peak_list[i][2]
                    new_top_sc = peak_list[i][3]
                    if peak_list[i][3] < peak_list[j][3]:
                        new_top_pos = peak_list[j][2]
                        new_top_sc = peak_list[j][3]
                    new_peak = [peak_list[i][0], peak_list[j][1], new_top_pos, new_top_sc]
                # If two peaks were merged.
                if new_peak:
                    merged_peak_list.append(new_peak)
                    added_peaks_dic[i] = 1
                    added_peaks_dic[j] = 1
                else:
                    merged_peak_list.append(peak_list[i])
                    added_peaks_dic[i] = 1
            if not peaks_merged:
                iterate = False
            peak_list = merged_peak_list
    # If peak coordinates should be in .bed format, make peak ends 1-based.
    if coords == "bed":
        for i in range(len(peak_list)):
            peak_list[i][1] += 1
            peak_list[i][2] += 1  # 1-base best score position too.
    return peak_list

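# Worked merge example (comments only, mirroring the doctests above): for
# [2, -1, 3, -1, 4, -1, -1, 6, 9] with sc_thr=0, the initial scan yields
# sub-peaks at indices 0, 2, 4 and 7-8. The merge condition compares the
# next peak's start with the current peak's end, giving gaps of 2, 2 and 3.
# With max_merge_dist=2 the first three sub-peaks collapse into [0, 4],
# while max_merge_dist=3 also bridges the last gap, leaving a single peak
# [0, 8] with top score 9.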
################################################################################

def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False):
    """
    Given a .bed file of sequence peak regions (possible coordinates from
    0 to length of s), convert peak coordinates to genomic coordinates.
    Do this by taking genomic regions of sequences as input.

    >>> test_in = "test-data/test.peaks.bed"
    >>> test_exp = "test-data/test_exp.peaks.bed"
    >>> test_out = "test-data/test_out.peaks.bed"
    >>> gen_in = "test-data/test.peaks_genomic.bed"
    >>> bed_peaks_to_genomic_peaks(test_in, test_out, gen_in)
    >>> diff_two_files_identical(test_out, test_exp)
    True

    """
    # Read in genomic region info.
    id2row_dic = {}

    with open(genomic_sites_bed) as f:
        for line in f:
            row = line.strip()
            cols = line.strip().split("\t")
            site_id = cols[3]
            assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \"%s\"" % (genomic_sites_bed)
            id2row_dic[site_id] = row

    # Read in peaks file and convert coordinates.
    OUTPEAKS = open(genomic_peak_file, "w")
    with open(peak_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            site_id = cols[0]
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id2 = cols[3]
            site_sc = float(cols[4])
            assert re.search(r".+,.+", site_id2), "regular expression failed for ID \"%s\"" % (site_id2)
            m = re.search(r".+,(\d+)", site_id2)
            sc_pos = int(m.group(1))  # 1-based.
            assert site_id in id2row_dic, "site ID \"%s\" not found in genomic sites dictionary" % (site_id)
            row = id2row_dic[site_id]
            rowl = row.split("\t")
            gen_chr = rowl[0]
            gen_s = int(rowl[1])
            gen_e = int(rowl[2])
            gen_pol = rowl[5]
            new_s = site_s + gen_s
            new_e = site_e + gen_s
            new_sc_pos = sc_pos + gen_s
            if gen_pol == "-":
                new_s = gen_e - site_e
                new_e = gen_e - site_s
                new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" % (gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol)
            OUTPEAKS.write("%s\n" % (new_row))
            if print_rows:
                print(new_row)
    OUTPEAKS.close()

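# Worked coordinate example for the minus-strand branch above (comments
# only, values hypothetical): for a genomic region chr1:100-200(-) and a
# sequence peak spanning 10-20, the genomic peak becomes
# new_s = 200 - 20 = 180 and new_e = 200 - 10 = 190, i.e. the peak is
# mirrored from the 3' end of the genomic region; on the plus strand the
# offsets are simply added to the region start.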
################################################################################

def diff_two_files_identical(file1, file2):
    """
    Check whether two files are identical. Return True if diff reports no
    differences.

    >>> file1 = "test-data/file1"
    >>> file2 = "test-data/file2"
    >>> diff_two_files_identical(file1, file2)
    True
    >>> file1 = "test-data/test1.bed"
    >>> diff_two_files_identical(file1, file2)
    False

    """
    same = True
    check_cmd = "diff " + file1 + " " + file2
    output = subprocess.getoutput(check_cmd)
    if output:
        same = False
    return same


################################################################################

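# End-to-end sketch of the intended peak calling flow (comments only;
# hypothetical file names): raw .profile -> smoothed .avg_profile ->
# sequence peak regions -> genomic peak regions.
#
#     graphprot_profile_calculate_avg_profile("sites.profile",
#                                             "sites.avg_profile",
#                                             ap_extlr=5)
#     graphprot_profile_extract_peak_regions("sites.avg_profile",
#                                            "sites.peaks.bed",
#                                            max_merge_dist=0, sc_thr=0)
#     bed_peaks_to_genomic_peaks("sites.peaks.bed",
#                                "sites.genomic_peaks.bed",
#                                "sites.bed")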