comparison gplib.py @ 5:ddcf35a868b8 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
author bgruening
date Wed, 05 Jun 2024 16:40:51 +0000
parents ace92c9a4653
children
comparison: 4:4ad83aed5c3c -> 5:ddcf35a868b8
import gzip
import random
import re
import statistics
import subprocess
from distutils.spawn import find_executable

"""

Run doctests:

python3 -m doctest gplib.py


"""


#######################################################################


def graphprot_predictions_get_median(predictions_file):
    """
    Given a GraphProt .predictions file, read in site scores and return
    the median value.
    ...
    f.close()
    # Return the median.
    return statistics.median(sc_list)
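
# Usage sketch (hypothetical file name; the .predictions parsing code is
# elided in this comparison view):
#
#   med_score = graphprot_predictions_get_median("RBP1.predictions")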


#######################################################################


def graphprot_profile_get_tsm(
    profile_file, profile_type="profile", avg_profile_extlr=5
):

    """
    Given a GraphProt .profile file, extract for each site (identified by
    column 1 ID) the top (= highest) score. Then return the median of these
    top scores.
    ...
        if profile_type == "profile":
            max_sc = max(lists_dic[seq_id])
            max_list.append(max_sc)
        elif profile_type == "avg_profile":
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(
                lists_dic[seq_id], win_extlr=avg_profile_extlr
            )
            max_sc = max(aps_list)
            max_list.append(max_sc)
        else:
            assert 0, 'invalid profile_type argument given: "%s"' % (profile_type)
    # Return the median.
    return statistics.median(max_list)
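
# Usage sketch (hypothetical file name; "tsm" = top score median, i.e. the
# median over the per-site maximum scores computed above):
#
#   tsm = graphprot_profile_get_tsm("RBP1.profile",
#                                   profile_type="avg_profile",
#                                   avg_profile_extlr=5)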


#######################################################################


def list_moving_window_average_values(in_list, win_extlr=5, method=1):
    """
    Take a list of numeric values, and calculate for each position a new
    value by taking the mean value of the window spanning positions
    -win_extlr to +win_extlr around it. If full extension is not possible
    (at list ends), it just takes what it gets.
    ...
    else:
        assert 0, "invalid method ID given (%i)" % (method)
    return new_list
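
# Illustration (made-up scores): with win_extlr=1 each position is averaged
# with its direct neighbors, and the first and last positions fall back to
# the truncated windows [0.0, 0.3] and [0.3, 0.9]:
#
#   smoothed = list_moving_window_average_values([0.0, 0.3, 0.9], win_extlr=1)
#   # middle position -> mean(0.0, 0.3, 0.9) = 0.4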


#######################################################################


def echo_add_to_file(echo_string, out_file):
    """
    Add a string to file, using echo command.

    ...
    if output:
        error = True
    assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)


#######################################################################


def is_tool(name):
    """Check whether tool "name" is in PATH."""
    return find_executable(name) is not None
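
# Example: is_tool("GraphProt.pl") checks that the GraphProt script is in
# PATH. Note that distutils is deprecated (removed in Python 3.12); on newer
# Pythons, shutil.which(name) is not None performs the same check.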


#######################################################################


def count_fasta_headers(fasta_file):
    """
    Count number of FASTA headers in fasta_file using grep.

    ...
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count


#######################################################################


def make_file_copy(in_file, out_file):
    """
    Make a file copy by copying in_file to out_file.

    """
    check_cmd = "cat " + in_file + " > " + out_file
    assert in_file != out_file, "cat does not like to cat file into same file (%s)" % (
        check_cmd
    )
    output = subprocess.getoutput(check_cmd)
    error = False
    if output:
        error = True
    assert not error, "cat did not like your input (in_file: %s, out_file: %s):\n%s" % (
        in_file,
        out_file,
        output,
    )


#######################################################################


def split_fasta_into_test_train_files(
    in_fasta, test_out_fa, train_out_fa, test_size=500
):
220 """ 227 """
221 Split in_fasta .fa file into two files (e.g. test, train). 228 Split in_fasta .fa file into two files (e.g. test, train).
222 229
223 """ 230 """
224 # Read in in_fasta. 231 # Read in in_fasta.
    c_out = 0
    TESTOUT = open(test_out_fa, "w")
    TRAINOUT = open(train_out_fa, "w")
    for seq_id in rand_ids_list:
        seq = seqs_dic[seq_id]
        if c_out >= test_size:
            TRAINOUT.write(">%s\n%s\n" % (seq_id, seq))
        else:
            TESTOUT.write(">%s\n%s\n" % (seq_id, seq))
            c_out += 1
    TESTOUT.close()
    TRAINOUT.close()
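
# Usage sketch (hypothetical file names): write 500 randomly chosen
# sequences to test.fa and the remainder to train.fa:
#
#   split_fasta_into_test_train_files("positives.fa", "test.fa", "train.fa",
#                                     test_size=500)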


#######################################################################


def check_seqs_dic_format(seqs_dic):
    """
    Check sequence dictionary for lowercase-only sequences or sequences
    which have lowercase nts in between uppercase nts.
    ...
        if re.search("[ACGTUN][acgtun]+[ACGTUN]", seq):
            bad_seq_ids.append(seq_id)
    return bad_seq_ids
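
# Illustration (made-up sequences): "ACGUacguACGU" has lowercase nts between
# uppercase nts and gets flagged; an all-uppercase "ACGUACGU" passes.
#
#   bad_ids = check_seqs_dic_format({"s1": "ACGUacguACGU", "s2": "ACGUACGU"})
#   # bad_ids should be ["s1"]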


#######################################################################


def read_fasta_into_dic(
    fasta_file,
    seqs_dic=False,
    ids_dic=False,
    read_dna=False,
    short_ensembl=False,
    reject_lc=False,
    convert_to_uc=False,
    skip_n_seqs=True,
):
    """
    Read in FASTA sequences, convert to RNA, store in dictionary
    and return dictionary.

    >>> test_fasta = "test-data/test.fa"
    ...
300 seq_id = "" 311 seq_id = ""
301 seq = "" 312 seq = ""
302 313
303 # Go through FASTA file, extract sequences. 314 # Go through FASTA file, extract sequences.
304 if re.search(r".+\.gz$", fasta_file): 315 if re.search(r".+\.gz$", fasta_file):
305 f = gzip.open(fasta_file, 'rt') 316 f = gzip.open(fasta_file, "rt")
306 else: 317 else:
307 f = open(fasta_file, "r") 318 f = open(fasta_file, "r")
308 for line in f: 319 for line in f:
309 if re.search(">.+", line): 320 if re.search(">.+", line):
310 m = re.search(">(.+)", line) 321 m = re.search(">(.+)", line)
313 # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..." 324 # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
314 if short_ensembl: 325 if short_ensembl:
315 if re.search(r".+\..+", seq_id): 326 if re.search(r".+\..+", seq_id):
316 m = re.search(r"(.+?)\..+", seq_id) 327 m = re.search(r"(.+?)\..+", seq_id)
317 seq_id = m.group(1) 328 seq_id = m.group(1)
318 assert seq_id not in seqs_dic, \ 329 assert seq_id not in seqs_dic, 'non-unique FASTA header "%s" in "%s"' % (
319 "non-unique FASTA header \"%s\" in \"%s\"" \ 330 seq_id,
320 % (seq_id, fasta_file) 331 fasta_file,
332 )
321 if ids_dic: 333 if ids_dic:
322 if seq_id in ids_dic: 334 if seq_id in ids_dic:
323 seqs_dic[seq_id] = "" 335 seqs_dic[seq_id] = ""
324 else: 336 else:
325 seqs_dic[seq_id] = "" 337 seqs_dic[seq_id] = ""
326 elif re.search("[ACGTUN]+", line, re.I): 338 elif re.search("[ACGTUN]+", line, re.I):
327 if seq_id in seqs_dic: 339 if seq_id in seqs_dic:
328 m = re.search("([ACGTUN]+)", line, re.I) 340 m = re.search("([ACGTUN]+)", line, re.I)
329 seq = m.group(1) 341 seq = m.group(1)
330 if reject_lc: 342 if reject_lc:
331 assert \ 343 assert not re.search(
332 not re.search("[a-z]", seq), \ 344 "[a-z]", seq
333 "lc char detected in seq \"%i\" (reject_lc=True)" \ 345 ), 'lc char detected in seq "%i" (reject_lc=True)' % (seq_id)
334 % (seq_id)
335 if convert_to_uc: 346 if convert_to_uc:
336 seq = seq.upper() 347 seq = seq.upper()
337 # If sequences with N nucleotides should be skipped. 348 # If sequences with N nucleotides should be skipped.
338 if skip_n_seqs: 349 if skip_n_seqs:
339 if "n" in m.group(1) or "N" in m.group(1): 350 if "n" in m.group(1) or "N" in m.group(1):
340 print("WARNING: \"%s\" contains N. Discarding " 351 print(
341 "sequence ... " % (seq_id)) 352 'WARNING: "%s" contains N. Discarding '
353 "sequence ... " % (seq_id)
354 )
342 del seqs_dic[seq_id] 355 del seqs_dic[seq_id]
343 continue 356 continue
344 # Convert to RNA, concatenate sequence. 357 # Convert to RNA, concatenate sequence.
345 if read_dna: 358 if read_dna:
346 seqs_dic[seq_id] += \ 359 seqs_dic[seq_id] += m.group(1).replace("U", "T").replace("u", "t")
347 m.group(1).replace("U", "T").replace("u", "t")
348 else: 360 else:
349 seqs_dic[seq_id] += \ 361 seqs_dic[seq_id] += m.group(1).replace("T", "U").replace("t", "u")
350 m.group(1).replace("T", "U").replace("t", "u")
351 f.close() 362 f.close()
352 return seqs_dic 363 return seqs_dic
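
# Usage sketch (using the doctest file referenced above): read sequences,
# uppercase them, and convert DNA to RNA (T/t -> U/u) as done by default:
#
#   seqs_dic = read_fasta_into_dic("test-data/test.fa", convert_to_uc=True)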


#######################################################################


def random_order_dic_keys_into_list(in_dic):
    """
    Read in dictionary keys, and return random order list of IDs.

    ...
        id_list.append(key)
    random.shuffle(id_list)
    return id_list


#######################################################################


def graphprot_get_param_string(params_file):
    """
    Get parameter string from GraphProt .params file.

    ...
            if setting == "sequence":
                param_string += "-onlyseq "
            else:
                param_string += "-%s %s " % (par, setting)
        else:
            assert 0, 'pattern matching failed for string "%s"' % (param)
    return param_string
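
# Illustration (assumed .params line format "parameter: setting"): a line
# like "epochs: 20" would add "-epochs 20 " to the string, while a setting
# of "sequence" adds "-onlyseq " instead, so the result can be passed on to
# the GraphProt command line.
#
#   param_string = graphprot_get_param_string("RBP1.params")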


#######################################################################


def seqs_dic_count_uc_nts(seqs_dic):
    """
    Count number of uppercase nucleotides in sequences stored in sequence
    dictionary.
    ...

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_uc = 0
    for seq_id in seqs_dic:
        c_uc += len(re.findall(r"[A-Z]", seqs_dic[seq_id]))
    return c_uc


#######################################################################


def seqs_dic_count_lc_nts(seqs_dic):
    """
    Count number of lowercase nucleotides in sequences stored in sequence
    dictionary.
    ...

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_uc = 0
    for seq_id in seqs_dic:
        c_uc += len(re.findall(r"[a-z]", seqs_dic[seq_id]))
    return c_uc


#######################################################################


def count_file_rows(in_file):
    """
    Count number of file rows for given input file.

    ...
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count


#######################################################################


def bed_check_six_col_format(bed_file):
    """
    Check whether given .bed file has 6 columns.

    ...
            break
    f.close()
    return six_col_format


#######################################################################


def bed_check_unique_ids(bed_file):
    """
    Check whether .bed file (6 column format with IDs in column 4)
    has unique column 4 IDs.
    ...
        return False
    else:
        return True


#######################################################################


def get_seq_lengths_from_seqs_dic(seqs_dic):
    """
    Given a dictionary of sequences, return dictionary of sequence lengths.
    Mapping is sequence ID -> sequence length.
    ...
        seq_l = len(seqs_dic[seq_id])
        seq_len_dic[seq_id] = seq_l
    return seq_len_dic


#######################################################################


def bed_get_region_lengths(bed_file):
    """
    Read in .bed file, store and return region lengths in dictionary.
    key : region ID (.bed col4)
    ...
            cols = line.strip().split("\t")
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id = cols[3]
            site_l = site_e - site_s
            assert (
                site_id not in id2len_dic
            ), 'column 4 IDs not unique in given .bed file "%s"' % (bed_file)
            id2len_dic[site_id] = site_l
    f.close()
    assert (
        id2len_dic
    ), 'No IDs read into dic (input file "%s" empty or malformatted?)' % (bed_file)
    return id2len_dic
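
# Illustration (made-up BED row): for the 6-column line
# "chr1  100  150  site1  2.5  +", the stored mapping is {"site1": 50},
# since BED starts are 0-based and ends 1-based (length = end - start).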


#######################################################################


def graphprot_get_param_dic(params_file):
    """
    Read in GraphProt .params file and store in dictionary.
    key = parameter
    ...
            param_dic[par] = setting
    f.close()
    return param_dic


#######################################################################


def graphprot_filter_predictions_file(in_file, out_file, sc_thr=0):
    """
    Filter GraphProt .predictions file by given score sc_thr.
    """
    OUTPRED = open(out_file, "w")
    with open(in_file) as f:
        ...
            OUTPRED.write("%s\n" % (row))
    f.close()
    OUTPRED.close()


#######################################################################


def fasta_read_in_ids(fasta_file):
    """
    Given a .fa file, read in header IDs in the order they appear in the
    file, and store in list.
    ...
            ids_list.append(seq_id)
    f.close()
    return ids_list


#######################################################################


def graphprot_profile_calc_avg_profile(
    in_file, out_file, ap_extlr=5, seq_ids_list=False, method=1
):
    """
    Given a GraphProt .profile file, calculate average profiles and output
    an average profile file.
    Average profile means that the position-wise scores will get smoothed
    out by calculating for each position a new score, taking a sequence
    ...
        f.close()
        # Check number of IDs (# FASTA IDs has to be same as # site IDs).
        if seq_ids_list:
            c_seq_ids = len(seq_ids_list)
            c_site_ids = len(site_starts_dic)
            assert (
                c_seq_ids == c_site_ids
            ), "# sequence IDs != # site IDs (%i != %i)" % (c_seq_ids, c_site_ids)
        OUTPROF = open(out_file, "w")
        # For each site, calculate average profile scores list.
        for site_id in lists_dic:
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(
                lists_dic[site_id], win_extlr=ap_extlr
            )
            start_pos = site_starts_dic[site_id]
            # Get original FASTA sequence ID.
            if seq_ids_list:
                site_id = seq_ids_list[site_id]
            for i, sc in enumerate(aps_list):
                ...
                    site_starts_dic[cur_id] = pos
                # Case: new site (new column 1 ID).
                if cur_id != old_id:
                    # Process old id scores.
                    if scores_list:
                        aps_list = list_moving_window_average_values(
                            scores_list, win_extlr=ap_extlr
                        )
                        start_pos = site_starts_dic[old_id]
                        seq_id = old_id
                        # Get original FASTA sequence ID.
                        if seq_ids_list:
                            seq_id = seq_ids_list[old_id]
                        ...
                # Add to scores_list.
                scores_list.append(score)
        f.close()
        # Process last block.
        if scores_list:
            aps_list = list_moving_window_average_values(
                scores_list, win_extlr=ap_extlr
            )
            start_pos = site_starts_dic[old_id]
            seq_id = old_id
            # Get original FASTA sequence ID.
            if seq_ids_list:
                seq_id = seq_ids_list[old_id]
            ...
                pos = i + start_pos + 1  # make 1-based.
                OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
        OUTPROF.close()
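
# Usage sketch (hypothetical file names): smooth the position-wise .profile
# scores with a +/- 5 nt window and write seq_id<tab>pos<tab>score rows:
#
#   graphprot_profile_calc_avg_profile("RBP1.profile", "RBP1.avg_profile",
#                                      ap_extlr=5)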


#######################################################################


def graphprot_profile_extract_peak_regions(
    in_file, out_file, max_merge_dist=0, sc_thr=0
):
    """
    Extract peak regions from GraphProt .profile file.
    Store the peak regions (defined as regions with scores >= sc_thr)
    to out_file in 6-column .bed.

    ...
            # Case: new site (new column 1 ID).
            if cur_id != old_id:
                # Process old id scores.
                if scores_list:
                    # Extract peaks from region.
                    peak_list = list_extract_peaks(
                        scores_list,
                        max_merge_dist=max_merge_dist,
                        coords="bed",
                        sc_thr=sc_thr,
                    )
                    start_pos = site_starts_dic[old_id]
                    # Print out peaks in .bed format.
                    for ln in peak_list:
                        peak_s = start_pos + ln[0]
                        peak_e = start_pos + ln[1]
                        site_id = "%s,%i" % (old_id, ln[2])
                        OUTPEAKS.write(
                            "%s\t%i\t%i"
                            "\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3])
                        )
                    # Reset list.
                    scores_list = []
                old_id = cur_id
                scores_list.append(score)
            else:
                ...
                scores_list.append(score)
    f.close()
    # Process last block.
    if scores_list:
        # Extract peaks from region.
        peak_list = list_extract_peaks(
            scores_list, max_merge_dist=max_merge_dist, coords="bed", sc_thr=sc_thr
        )
        start_pos = site_starts_dic[old_id]
        # Print out peaks in .bed format.
        for ln in peak_list:
            peak_s = start_pos + ln[0]
            peak_e = start_pos + ln[1]
            site_id = "%s,%i" % (old_id, ln[2])  # best score also 1-based.
            OUTPEAKS.write(
                "%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3])
            )
    OUTPEAKS.close()
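
# Usage sketch (hypothetical file names and cutoffs): report regions with
# scores >= 0.5 as 6-column .bed rows, merging peaks up to 10 nt apart:
#
#   graphprot_profile_extract_peak_regions("RBP1.avg_profile", "peaks.bed",
#                                          max_merge_dist=10, sc_thr=0.5)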


#######################################################################


def list_extract_peaks(in_list, max_merge_dist=0, coords="list", sc_thr=0):
    """
    Extract peak regions from list.
    Peak region is defined as region >= score threshold.

    coords=bed : peak start 0-based, peak end 1-based.
    ...
                    new_top_pos = peak_list[i][2]
                    new_top_sc = peak_list[i][3]
                    if peak_list[i][3] < peak_list[j][3]:
                        new_top_pos = peak_list[j][2]
                        new_top_sc = peak_list[j][3]
                    new_peak = [
                        peak_list[i][0],
                        peak_list[j][1],
                        new_top_pos,
                        new_top_sc,
                    ]
                # If two peaks were merged.
                if new_peak:
                    merged_peak_list.append(new_peak)
                    added_peaks_dic[i] = 1
                    added_peaks_dic[j] = 1
                    ...
            peak_list[i][1] += 1
            peak_list[i][2] += 1  # 1-base best score position too.
    return peak_list
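
# Illustration (made-up scores): for [0.2, 1.5, 0.8, -0.5, 2.0] with the
# default sc_thr=0, two peaks result (positions 0-2 with top score 1.5, and
# position 4 with top score 2.0). Each peak is stored as
# [start, end, top position, top score]; with a sufficiently large
# max_merge_dist the two peaks merge, keeping the higher top score (2.0).
#
#   peaks = list_extract_peaks([0.2, 1.5, 0.8, -0.5, 2.0])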


#######################################################################


def bed_peaks_to_genomic_peaks(
    peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False
):
    """
    Given a .bed file of sequence peak regions (possible coordinates from
    0 to the sequence length), convert peak coordinates to genomic
    coordinates. Do this by taking genomic regions of sequences as input.

    ...
    with open(genomic_sites_bed) as f:
        for line in f:
            row = line.strip()
            cols = line.strip().split("\t")
            site_id = cols[3]
            assert (
                site_id not in id2row_dic
            ), 'column 4 IDs not unique in given .bed file "%s"' % (genomic_sites_bed)
            id2row_dic[site_id] = row
    f.close()

    # Read in peaks file and convert coordinates.
    OUTPEAKS = open(genomic_peak_file, "w")
    ...
            site_id = cols[0]
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id2 = cols[3]
            site_sc = float(cols[4])
            assert re.search(
                ".+,.+", site_id2
            ), 'regular expression failed for ID "%s"' % (site_id2)
            m = re.search(r".+,(\d+)", site_id2)
            sc_pos = int(m.group(1))  # 1-based.
            assert (
                site_id in id2row_dic
            ), 'site ID "%s" not found in genomic sites dictionary' % (site_id)
            row = id2row_dic[site_id]
            rowl = row.split("\t")
            gen_chr = rowl[0]
            gen_s = int(rowl[1])
            gen_e = int(rowl[2])
            ...
            new_sc_pos = sc_pos + gen_s
            if gen_pol == "-":
                new_s = gen_e - site_e
                new_e = gen_e - site_s
                new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" % (
                gen_chr,
                new_s,
                new_e,
                site_id,
                new_sc_pos,
                site_sc,
                gen_pol,
            )
            OUTPEAKS.write("%s\n" % (new_row))
            if print_rows:
                print(new_row)
    OUTPEAKS.close()
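
# Worked example (made-up coordinates): a peak at 10-20 of a sequence taken
# from plus-strand chr1:1000-1100 presumably maps to chr1:1010-1020 (shifted
# by the genomic start, matching the new_sc_pos shift shown above); on the
# minus strand the peak maps to gen_e - site_e .. gen_e - site_s, i.e.
# chr1:1080-1090.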


#######################################################################


def diff_two_files_identical(file1, file2):
    """
    Check whether two files are identical. Return True if diff reports no
    differences.
    ...
    if output:
        same = False
    return same


#######################################################################