Mercurial > repos > rnateam > graphprot_predict_profile
comparison gplib.py @ 5:ddcf35a868b8 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419
| author   | bgruening                       |
|----------|---------------------------------|
| date     | Wed, 05 Jun 2024 16:40:51 +0000 |
| parents  | ace92c9a4653                    |
| children |                                 |
comparing 4:4ad83aed5c3c with 5:ddcf35a868b8 (the updated revision is shown below; [...] marks runs of lines elided from the comparison view)
import gzip
import random
import re
import statistics
import subprocess
from distutils.spawn import find_executable


"""

Run doctests:

python3 -m doctest gplib.py


"""


#######################################################################


def graphprot_predictions_get_median(predictions_file):
    """
    Given a GraphProt .predictions file, read in site scores and return
    the median value.

[...]

    f.close()
    # Return the median.
    return statistics.median(sc_list)
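For illustration, a sketch of the expected input (the exact column layout is not shown in this hunk, and the values are made up): each .predictions row pairs a site ID with its score in tab-separated columns, and the function returns the median of the scores, here 0.42:

    site_1    -0.27
    site_2     1.53
    site_3     0.42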


#######################################################################


def graphprot_profile_get_tsm(
    profile_file, profile_type="profile", avg_profile_extlr=5
):
    """
    Given a GraphProt .profile file, extract for each site (identified by
    column 1 ID) the top (= highest) score. Then return the median of these
    top scores.

[...]
        if profile_type == "profile":
            max_sc = max(lists_dic[seq_id])
            max_list.append(max_sc)
        elif profile_type == "avg_profile":
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(
                lists_dic[seq_id], win_extlr=avg_profile_extlr
            )
            max_sc = max(aps_list)
            max_list.append(max_sc)
        else:
            assert 0, 'invalid profile_type argument given: "%s"' % (profile_type)
    # Return the median.
    return statistics.median(max_list)


#######################################################################


def list_moving_window_average_values(in_list, win_extlr=5, method=1):
    """
    Take a list of numeric values and calculate for each position a new
    value: the mean over the window from -win_extlr to +win_extlr around
    that position. If full extension is not possible (at list ends), it
    just takes what it gets.

[...]
    else:
        assert 0, "invalid method ID given (%i)" % (method)
    return new_list
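A minimal reimplementation sketch of the described smoothing, for illustration only (the library's method=1 / method=2 variants may differ in edge handling):

    def moving_window_average(in_list, win_extlr=5):
        n = len(in_list)
        new_list = []
        for i in range(n):
            s = max(i - win_extlr, 0)      # clip window at list start
            e = min(i + win_extlr + 1, n)  # clip window at list end
            new_list.append(sum(in_list[s:e]) / (e - s))
        return new_list

    moving_window_average([2.0, 3.0, 4.0], win_extlr=1)  # -> [2.5, 3.0, 3.5]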


#######################################################################


def echo_add_to_file(echo_string, out_file):
    """
    Append a string to a file, using the echo command.

[...]
    if output:
        error = True
    assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)


#######################################################################


def is_tool(name):
    """Check whether tool "name" is in PATH."""
    return find_executable(name) is not None
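Note that distutils was removed in Python 3.12, so on newer interpreters the same check would need the standard library's shutil.which instead (an equivalent sketch, not what this file ships):

    import shutil

    def is_tool(name):
        """Check whether tool "name" is in PATH."""
        return shutil.which(name) is not None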


#######################################################################


def count_fasta_headers(fasta_file):
    """
    Count number of FASTA headers in fasta_file using grep.

[...]
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count
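The actual counting command is elided from this hunk; presumably check_cmd holds a grep call of the form below, counting lines that contain the header prefix, whose output is then parsed with int():

    grep -c ">" test.fa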


#######################################################################


def make_file_copy(in_file, out_file):
    """
    Make a file copy by copying in_file to out_file.

    """
    check_cmd = "cat " + in_file + " > " + out_file
    assert in_file != out_file, "cat does not like to cat file into same file (%s)" % (
        check_cmd
    )
    output = subprocess.getoutput(check_cmd)
    error = False
    if output:
        error = True
    assert not error, "cat did not like your input (in_file: %s, out_file: %s):\n%s" % (
        in_file,
        out_file,
        output,
    )


#######################################################################


def split_fasta_into_test_train_files(
    in_fasta, test_out_fa, train_out_fa, test_size=500
):
    """
    Split in_fasta .fa file into two files (e.g. test, train).

    """
    # Read in in_fasta.

[...]
    c_out = 0
    TESTOUT = open(test_out_fa, "w")
    TRAINOUT = open(train_out_fa, "w")
    for seq_id in rand_ids_list:
        seq = seqs_dic[seq_id]
        if c_out >= test_size:
            TRAINOUT.write(">%s\n%s\n" % (seq_id, seq))
        else:
            TESTOUT.write(">%s\n%s\n" % (seq_id, seq))
        c_out += 1
    TESTOUT.close()
    TRAINOUT.close()
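A hypothetical usage sketch (file names made up): the first test_size randomly ordered records go to the test file, the remainder to the training file.

    split_fasta_into_test_train_files("all.fa", "test.fa", "train.fa",
                                      test_size=500)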


#######################################################################


def check_seqs_dic_format(seqs_dic):
    """
    Check sequence dictionary for lowercase-only sequences or sequences
    which have lowercase nts in between uppercase nts.

[...]
        if re.search("[ACGTUN][acgtun]+[ACGTUN]", seq):
            bad_seq_ids.append(seq_id)
    return bad_seq_ids


#######################################################################


def read_fasta_into_dic(
    fasta_file,
    seqs_dic=False,
    ids_dic=False,
    read_dna=False,
    short_ensembl=False,
    reject_lc=False,
    convert_to_uc=False,
    skip_n_seqs=True,
):
    """
    Read in FASTA sequences, convert to RNA, store in dictionary
    and return dictionary.

    >>> test_fasta = "test-data/test.fa"

[...]
    seq_id = ""
    seq = ""

    # Go through FASTA file, extract sequences.
    if re.search(r".+\.gz$", fasta_file):
        f = gzip.open(fasta_file, "rt")
    else:
        f = open(fasta_file, "r")
    for line in f:
        if re.search(">.+", line):
            m = re.search(">(.+)", line)

[...]

            # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
            if short_ensembl:
                if re.search(r".+\..+", seq_id):
                    m = re.search(r"(.+?)\..+", seq_id)
                    seq_id = m.group(1)
            assert seq_id not in seqs_dic, 'non-unique FASTA header "%s" in "%s"' % (
                seq_id,
                fasta_file,
            )
            if ids_dic:
                if seq_id in ids_dic:
                    seqs_dic[seq_id] = ""
            else:
                seqs_dic[seq_id] = ""
        elif re.search("[ACGTUN]+", line, re.I):
            if seq_id in seqs_dic:
                m = re.search("([ACGTUN]+)", line, re.I)
                seq = m.group(1)
                if reject_lc:
                    assert not re.search(
                        "[a-z]", seq
                    ), 'lc char detected in seq "%s" (reject_lc=True)' % (seq_id)
                if convert_to_uc:
                    seq = seq.upper()
                # If sequences with N nucleotides should be skipped.
                if skip_n_seqs:
                    if "n" in m.group(1) or "N" in m.group(1):
                        print(
                            'WARNING: "%s" contains N. Discarding '
                            "sequence ... " % (seq_id)
                        )
                        del seqs_dic[seq_id]
                        continue
                # Convert to RNA, concatenate sequence.
                if read_dna:
                    seqs_dic[seq_id] += m.group(1).replace("U", "T").replace("u", "t")
                else:
                    seqs_dic[seq_id] += m.group(1).replace("T", "U").replace("t", "u")
    f.close()
    return seqs_dic
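The RNA conversion applied to each sequence chunk is a plain character replacement, e.g.:

    >>> "ACGTacgt".replace("T", "U").replace("t", "u")
    'ACGUacgu'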


#######################################################################


def random_order_dic_keys_into_list(in_dic):
    """
    Read in dictionary keys and return a list of IDs in random order.

[...]
        id_list.append(key)
    random.shuffle(id_list)
    return id_list


#######################################################################


def graphprot_get_param_string(params_file):
    """
    Get parameter string from GraphProt .params file.

[...]
            if setting == "sequence":
                param_string += "-onlyseq "
            else:
                param_string += "-%s %s " % (par, setting)
        else:
            assert 0, 'pattern matching failed for string "%s"' % (param)
    return param_string
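For illustration (parameter names made up): given a .params file with one "name: value" pair per line, such as

    R: 1
    D: 3
    model_type: sequence

the function would return a CLI string along the lines of "-R 1 -D 3 -onlyseq ".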


#######################################################################


def seqs_dic_count_uc_nts(seqs_dic):
    """
    Count number of uppercase nucleotides in sequences stored in sequence
    dictionary.

[...]

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_uc = 0
    for seq_id in seqs_dic:
        c_uc += len(re.findall(r"[A-Z]", seqs_dic[seq_id]))
    return c_uc


#######################################################################


def seqs_dic_count_lc_nts(seqs_dic):
    """
    Count number of lowercase nucleotides in sequences stored in sequence
    dictionary.

[...]

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_uc = 0
    for seq_id in seqs_dic:
        c_uc += len(re.findall(r"[a-z]", seqs_dic[seq_id]))
    return c_uc


#######################################################################


def count_file_rows(in_file):
    """
    Count number of file rows for given input file.

[...]
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count


#######################################################################


def bed_check_six_col_format(bed_file):
    """
    Check whether given .bed file has 6 columns.

[...]
            break
    f.close()
    return six_col_format


#######################################################################


def bed_check_unique_ids(bed_file):
    """
    Check whether .bed file (6 column format with IDs in column 4)
    has unique column 4 IDs.

[...]
        return False
    else:
        return True


#######################################################################


def get_seq_lengths_from_seqs_dic(seqs_dic):
    """
    Given a dictionary of sequences, return dictionary of sequence lengths.
    Mapping is sequence ID -> sequence length.

[...]
        seq_l = len(seqs_dic[seq_id])
        seq_len_dic[seq_id] = seq_l
    return seq_len_dic


#######################################################################


def bed_get_region_lengths(bed_file):
    """
    Read in .bed file, store and return region lengths in dictionary.
    key : region ID (.bed col4)

[...]
            cols = line.strip().split("\t")
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id = cols[3]
            site_l = site_e - site_s
            assert (
                site_id not in id2len_dic
            ), 'column 4 IDs not unique in given .bed file "%s"' % (bed_file)
            id2len_dic[site_id] = site_l
    f.close()
    assert (
        id2len_dic
    ), 'No IDs read into dic (input file "%s" empty or malformatted?)' % (bed_file)
    return id2len_dic
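BED intervals are half-open (0-based start, 1-based end), so the region length is simply end - start; e.g. the row below describes a site of length 10:

    chr1    1000    1010    site_1    0    +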


#######################################################################


def graphprot_get_param_dic(params_file):
    """
    Read in GraphProt .params file and store in dictionary.
    key = parameter

[...]
            param_dic[par] = setting
    f.close()
    return param_dic


#######################################################################


def graphprot_filter_predictions_file(in_file, out_file, sc_thr=0):
    """
    Filter GraphProt .predictions file by given score threshold sc_thr.
    """
    OUTPRED = open(out_file, "w")
    with open(in_file) as f:

[...]

                OUTPRED.write("%s\n" % (row))
    f.close()
    OUTPRED.close()


#######################################################################


def fasta_read_in_ids(fasta_file):
    """
    Given a .fa file, read in header IDs in order appearing in file,
    and store in list.

[...]
            ids_list.append(seq_id)
    f.close()
    return ids_list


#######################################################################


def graphprot_profile_calc_avg_profile(
    in_file, out_file, ap_extlr=5, seq_ids_list=False, method=1
):
    """
    Given a GraphProt .profile file, calculate average profiles and output
    average profile file.
    Average profile means that the position-wise scores will get smoothed
    out by calculating for each position a new score, taking a sequence

[...]
    f.close()
    # Check number of IDs (# FASTA IDs has to be same as # site IDs).
    if seq_ids_list:
        c_seq_ids = len(seq_ids_list)
        c_site_ids = len(site_starts_dic)
        assert (
            c_seq_ids == c_site_ids
        ), "# sequence IDs != # site IDs (%i != %i)" % (c_seq_ids, c_site_ids)
    OUTPROF = open(out_file, "w")
    # For each site, calculate average profile scores list.
    for site_id in lists_dic:
        # Convert profile score list to average profile scores list.
        aps_list = list_moving_window_average_values(
            lists_dic[site_id], win_extlr=ap_extlr
        )
        start_pos = site_starts_dic[site_id]
        # Get original FASTA sequence ID.
        if seq_ids_list:
            site_id = seq_ids_list[site_id]
        for i, sc in enumerate(aps_list):

[...]
                site_starts_dic[cur_id] = pos
            # Case: new site (new column 1 ID).
            if cur_id != old_id:
                # Process old id scores.
                if scores_list:
                    aps_list = list_moving_window_average_values(
                        scores_list, win_extlr=ap_extlr
                    )
                    start_pos = site_starts_dic[old_id]
                    seq_id = old_id
                    # Get original FASTA sequence ID.
                    if seq_ids_list:
                        seq_id = seq_ids_list[old_id]

[...]

            # Add to scores_list.
            scores_list.append(score)
    f.close()
    # Process last block.
    if scores_list:
        aps_list = list_moving_window_average_values(
            scores_list, win_extlr=ap_extlr
        )
        start_pos = site_starts_dic[old_id]
        seq_id = old_id
        # Get original FASTA sequence ID.
        if seq_ids_list:
            seq_id = seq_ids_list[old_id]

[...]

            pos = i + start_pos + 1  # make 1-based.
            OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
    OUTPROF.close()
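The output rows are tab-separated "sequence ID, 1-based position, smoothed score" triples, e.g. (values made up):

    site_1    1    0.231400
    site_1    2    0.274100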


#######################################################################


def graphprot_profile_extract_peak_regions(
    in_file, out_file, max_merge_dist=0, sc_thr=0
):
    """
    Extract peak regions from GraphProt .profile file.
    Store the peak regions (defined as regions with scores >= sc_thr)
    to out_file in 6-column .bed format.

[...]
            # Case: new site (new column 1 ID).
            if cur_id != old_id:
                # Process old id scores.
                if scores_list:
                    # Extract peaks from region.
                    peak_list = list_extract_peaks(
                        scores_list,
                        max_merge_dist=max_merge_dist,
                        coords="bed",
                        sc_thr=sc_thr,
                    )
                    start_pos = site_starts_dic[old_id]
                    # Print out peaks in .bed format.
                    for ln in peak_list:
                        peak_s = start_pos + ln[0]
                        peak_e = start_pos + ln[1]
                        site_id = "%s,%i" % (old_id, ln[2])
                        OUTPEAKS.write(
                            "%s\t%i\t%i"
                            "\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3])
                        )
                    # Reset list.
                    scores_list = []
                old_id = cur_id
                scores_list.append(score)
            else:

[...]

                scores_list.append(score)
    f.close()
    # Process last block.
    if scores_list:
        # Extract peaks from region.
        peak_list = list_extract_peaks(
            scores_list, max_merge_dist=max_merge_dist, coords="bed", sc_thr=sc_thr
        )
        start_pos = site_starts_dic[old_id]
        # Print out peaks in .bed format.
        for ln in peak_list:
            peak_s = start_pos + ln[0]
            peak_e = start_pos + ln[1]
            site_id = "%s,%i" % (old_id, ln[2])  # best score also 1-based.
            OUTPEAKS.write(
                "%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3])
            )
    OUTPEAKS.close()


#######################################################################


def list_extract_peaks(in_list, max_merge_dist=0, coords="list", sc_thr=0):
    """
    Extract peak regions from list.
    Peak region is defined as region >= score threshold.

    coords=bed : peak start 0-based, peak end 1-based.

[...]
                new_top_pos = peak_list[i][2]
                new_top_sc = peak_list[i][3]
                if peak_list[i][3] < peak_list[j][3]:
                    new_top_pos = peak_list[j][2]
                    new_top_sc = peak_list[j][3]
                new_peak = [
                    peak_list[i][0],
                    peak_list[j][1],
                    new_top_pos,
                    new_top_sc,
                ]
            # If two peaks were merged.
            if new_peak:
                merged_peak_list.append(new_peak)
                added_peaks_dic[i] = 1
                added_peaks_dic[j] = 1

[...]

            peak_list[i][1] += 1
            peak_list[i][2] += 1  # 1-base best score position too.
    return peak_list
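The thresholding step can be sketched in a few lines; this toy version (not the library code, which additionally tracks top scoring positions and merges nearby peaks) just reports the (start, end) index pairs of runs with score >= sc_thr:

    from itertools import groupby

    def threshold_regions(scores, sc_thr=0):
        regions = []
        i = 0
        for above, grp in groupby(scores, key=lambda sc: sc >= sc_thr):
            ln = len(list(grp))
            if above:
                regions.append((i, i + ln - 1))
            i += ln
        return regions

    threshold_regions([0.2, -0.5, 0.8, 0.9, -0.1])  # -> [(0, 0), (2, 3)]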


#######################################################################


def bed_peaks_to_genomic_peaks(
    peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False
):
    """
    Given a .bed file of sequence peak regions (with coordinates ranging
    from 0 to the sequence length), convert peak coordinates to genomic
    coordinates. Do this by taking genomic regions of sequences as input.

[...]
    with open(genomic_sites_bed) as f:
        for line in f:
            row = line.strip()
            cols = line.strip().split("\t")
            site_id = cols[3]
            assert (
                site_id not in id2row_dic
            ), 'column 4 IDs not unique in given .bed file "%s"' % (genomic_sites_bed)
            id2row_dic[site_id] = row
    f.close()

    # Read in peaks file and convert coordinates.
    OUTPEAKS = open(genomic_peak_file, "w")

[...]

            site_id = cols[0]
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id2 = cols[3]
            site_sc = float(cols[4])
            assert re.search(
                ".+,.+", site_id2
            ), 'regular expression failed for ID "%s"' % (site_id2)
            m = re.search(r".+,(\d+)", site_id2)
            sc_pos = int(m.group(1))  # 1-based.
            assert (
                site_id in id2row_dic
            ), 'site ID "%s" not found in genomic sites dictionary' % (site_id)
            row = id2row_dic[site_id]
            rowl = row.split("\t")
            gen_chr = rowl[0]
            gen_s = int(rowl[1])
            gen_e = int(rowl[2])

[...]

            new_sc_pos = sc_pos + gen_s
            if gen_pol == "-":
                new_s = gen_e - site_e
                new_e = gen_e - site_s
                new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" % (
                gen_chr,
                new_s,
                new_e,
                site_id,
                new_sc_pos,
                site_sc,
                gen_pol,
            )
            OUTPEAKS.write("%s\n" % (new_row))
            if print_rows:
                print(new_row)
    OUTPEAKS.close()
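A worked example of the minus-strand transform above (numbers made up): peaks near the sequence start land near the genomic end of the region.

    gen_s, gen_e = 1000, 1100   # genomic region of the sequence
    site_s, site_e = 10, 20     # peak within the sequence
    new_s = gen_e - site_e      # -> 1080
    new_e = gen_e - site_s      # -> 1090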


#######################################################################


def diff_two_files_identical(file1, file2):
    """
    Check whether two files are identical. Return true if diff reports no
    differences.

[...]

    if output:
        same = False
    return same


#######################################################################
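Usage sketch: the function shells out to the diff command and treats empty output as "files identical", e.g.

    diff_two_files_identical("a.txt", "a.txt")  # -> True (assuming a.txt exists)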