Mercurial > repos > rnateam > graphprot_predict_profile
comparison gplib.py @ 1:20429f4c1b95 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit f3fb925b83a4982e0cf9a0c11ff93ecbb8e4e6d5"
| author | bgruening |
|---|---|
| date | Wed, 22 Jan 2020 10:14:41 -0500 |
| parents | |
| children | ace92c9a4653 |
Compared revisions: 0:215925e588c4 and 1:20429f4c1b95.
```python
from distutils.spawn import find_executable
import subprocess
import statistics
import random
import gzip
import uuid
import sys
import re
import os

"""

Run doctests:

python3 -m doctest gplib.py

"""


################################################################################

def graphprot_predictions_get_median(predictions_file):
    """
    Given a GraphProt .predictions file, read in the site scores and
    return their median value.

    >>> test_file = "test-data/test.predictions"
    >>> graphprot_predictions_get_median(test_file)
    0.571673

    """
    # Site scores list.
    sc_list = []
    with open(predictions_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            score = float(cols[2])
            sc_list.append(score)
    # Return the median.
    return statistics.median(sc_list)


################################################################################

def graphprot_profile_get_top_scores_median(profile_file,
                                            profile_type="profile",
                                            avg_profile_extlr=5):
    """
    Given a GraphProt .profile file, extract for each site (identified by
    column 1 ID) the top (= highest) score, and return the median of these
    top scores.

    profile_type can be either "profile" or "avg_profile".
    "avg_profile" means that the position-wise scores are first smoothed:
    for each position, a new score is calculated as the mean score over a
    sequence window from -avg_profile_extlr to +avg_profile_extlr around
    the position. After that, the maximum score of each site is chosen,
    and the median over all maximum scores is returned.
    "profile" leaves the position-wise scores as they are, directly
    extracting the maximum for each site and then reporting the median.

    >>> test_file = "test-data/test.profile"
    >>> graphprot_profile_get_top_scores_median(test_file)
    3.2

    """
    # Dictionary of lists, with list of scores (value) for each site (key).
    lists_dic = {}
    with open(profile_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            seq_id = cols[0]
            score = float(cols[2])
            if seq_id in lists_dic:
                lists_dic[seq_id].append(score)
            else:
                lists_dic[seq_id] = [score]
    # For each site, extract maximum and store in new list.
    max_list = []
    for seq_id in lists_dic:
        if profile_type == "profile":
            max_list.append(max(lists_dic[seq_id]))
        elif profile_type == "avg_profile":
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(lists_dic[seq_id],
                                                         win_extlr=avg_profile_extlr)
            max_list.append(max(aps_list))
        else:
            assert 0, "invalid profile_type argument given: \"%s\"" % (profile_type)
    # Return the median.
    return statistics.median(max_list)


################################################################################

def list_moving_window_average_values(in_list,
                                      win_extlr=5,
                                      method=1):
    """
    Take a list of numeric values, and calculate for each position a new
    value: the mean value over the window of positions from -win_extlr to
    +win_extlr around it. If full extension is not possible (at the list
    ends), the window is simply truncated.
    Two implementations of the task are given, chosen by method=1 or
    method=2.

    >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1]
    >>> list_moving_window_average_values(test_list, win_extlr=2, method=1)
    [3.3333333333333335, 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.6666666666666665]
    >>> list_moving_window_average_values(test_list, win_extlr=2, method=2)
    [3.3333333333333335, 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.6666666666666665]

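    With win_extlr=0, no smoothing is done and the input scores are
    returned unchanged (an added example of this edge case):

    >>> list_moving_window_average_values(test_list, win_extlr=0)
    [2, 3, 5, 8, 4, 3, 7, 1]
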
| 121 """ | |
| 122 l_list = len(in_list) | |
| 123 assert l_list, "Given list is empty" | |
| 124 new_list = [0] * l_list | |
| 125 if win_extlr == 0: | |
| 126 return l_list | |
| 127 if method == 1: | |
| 128 for i in range(l_list): | |
| 129 s = i - win_extlr | |
| 130 e = i + win_extlr + 1 | |
| 131 if s < 0: | |
| 132 s = 0 | |
| 133 if e > l_list: | |
| 134 e = l_list | |
| 135 # Extract portion and assign value to new list. | |
| 136 new_list[i] = statistics.mean(in_list[s:e]) | |
| 137 elif method == 2: | |
| 138 for i in range(l_list): | |
| 139 s = i - win_extlr | |
| 140 e = i + win_extlr + 1 | |
| 141 if s < 0: | |
| 142 s = 0 | |
| 143 if e > l_list: | |
| 144 e = l_list | |
| 145 l = e-s | |
| 146 sc_sum = 0 | |
| 147 for j in range(l): | |
| 148 sc_sum += in_list[s+j] | |
| 149 new_list[i] = sc_sum / l | |
| 150 else: | |
| 151 assert 0, "invalid method ID given (%i)" %(method) | |
| 152 return new_list | |
| 153 | |
| 154 | |
| 155 ################################################################################ | |
| 156 | |
def echo_add_to_file(echo_string, out_file):
    """
    Append a string (plus newline) to a file, using the shell echo command.

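    A minimal usage sketch (an added example; assumes a POSIX shell with
    echo available, and uses a throwaway temporary file):

    >>> import tempfile, os
    >>> tmp_file = tempfile.NamedTemporaryFile(delete=False).name
    >>> echo_add_to_file("GraphProt", tmp_file)
    >>> print(open(tmp_file).read(), end="")
    GraphProt
    >>> os.remove(tmp_file)
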
| 161 """ | |
| 162 check_cmd = 'echo "%s" >> %s' % (echo_string, out_file) | |
| 163 output = subprocess.getoutput(check_cmd) | |
| 164 error = False | |
| 165 if output: | |
| 166 error = True | |
| 167 assert error == False, "echo is complaining:\n%s\n%s" %(check_cmd, output) | |
| 168 | |
| 169 | |
| 170 ################################################################################ | |
| 171 | |
def is_tool(name):
    """
    Check whether tool "name" is in PATH.

    """
    return find_executable(name) is not None


################################################################################

def count_fasta_headers(fasta_file):
    """
    Count the number of FASTA headers in fasta_file using grep.

    >>> test_file = "test-data/test.fa"
    >>> count_fasta_headers(test_file)
    2
    >>> test_file = "test-data/empty_file"
    >>> count_fasta_headers(test_file)
    0

    """
    check_cmd = 'grep -c ">" ' + fasta_file
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count


################################################################################

def make_file_copy(in_file, out_file):
    """
    Make a file copy by copying in_file to out_file (via shell cat).

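    A minimal usage sketch (an added example; assumes a POSIX shell with
    cat available, and uses throwaway temporary files):

    >>> import tempfile, os
    >>> src_file = tempfile.NamedTemporaryFile(delete=False).name
    >>> dst_file = src_file + ".copy"
    >>> with open(src_file, "w") as f:
    ...     _ = f.write("test row")
    >>> make_file_copy(src_file, dst_file)
    >>> open(dst_file).read()
    'test row'
    >>> os.remove(src_file)
    >>> os.remove(dst_file)
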
| 203 """ | |
| 204 check_cmd = "cat " + in_file + " > " + out_file | |
| 205 assert in_file != out_file, "cat does not like to cat file into same file (%s)" %(check_cmd) | |
| 206 output = subprocess.getoutput(check_cmd) | |
| 207 error = False | |
| 208 if output: | |
| 209 error = True | |
| 210 assert error == False, "cat did not like your input (in_file: %s, out_file: %s):\n%s" %(in_file, out_file, output) | |
| 211 | |
| 212 | |
| 213 ################################################################################ | |
| 214 | |
def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa,
                                      test_size=500):
    """
    Split the in_fasta .fa file into two files (e.g. test, train): the
    first test_size sequences (after shuffling the IDs) go to test_out_fa,
    the remaining sequences go to train_out_fa.

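    A minimal usage sketch (an added example; writes a throwaway
    3-sequence FASTA file and counts headers via grep through
    count_fasta_headers):

    >>> import tempfile, os
    >>> in_fa = tempfile.NamedTemporaryFile(delete=False, suffix=".fa").name
    >>> with open(in_fa, "w") as f:
    ...     _ = f.write(">s1\\nACGU\\n>s2\\nGGCC\\n>s3\\nUUCC\\n")
    >>> test_fa = in_fa + ".test.fa"
    >>> train_fa = in_fa + ".train.fa"
    >>> split_fasta_into_test_train_files(in_fa, test_fa, train_fa, test_size=1)
    >>> count_fasta_headers(test_fa)
    1
    >>> count_fasta_headers(train_fa)
    2
    >>> for fa in [in_fa, test_fa, train_fa]:
    ...     os.remove(fa)
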
| 220 """ | |
| 221 # Read in in_fasta. | |
| 222 seqs_dic = read_fasta_into_dic(in_fasta) | |
| 223 # Shuffle IDs. | |
| 224 rand_ids_list = random_order_dic_keys_into_list(seqs_dic) | |
| 225 c_out = 0 | |
| 226 TESTOUT = open(test_out_fa, "w") | |
| 227 TRAINOUT = open(train_out_fa, "w") | |
| 228 for seq_id in rand_ids_list: | |
| 229 seq = seqs_dic[seq_id] | |
| 230 if (c_out >= test_size): | |
| 231 TRAINOUT.write(">%s\n%s\n" % (seq_id, seq)) | |
| 232 else: | |
| 233 TESTOUT.write(">%s\n%s\n" % (seq_id, seq)) | |
| 234 c_out += 1 | |
| 235 TESTOUT.close() | |
| 236 TRAINOUT.close() | |
| 237 | |
| 238 | |
| 239 ################################################################################ | |
| 240 | |
def read_fasta_into_dic(fasta_file,
                        seqs_dic=False,
                        ids_dic=False,
                        read_dna=False,
                        reject_lc=False,
                        convert_to_uc=False,
                        skip_n_seqs=True):
    """
    Read in FASTA sequences, convert them to RNA (unless read_dna=True,
    in which case they are converted to DNA), store them in a dictionary
    and return the dictionary.

    >>> test_fasta = "test-data/test.fa"
    >>> read_fasta_into_dic(test_fasta)
    {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}
    >>> test_fasta = "test-data/test2.fa"
    >>> read_fasta_into_dic(test_fasta)
    {}
    >>> test_fasta = "test-data/test.ensembl.fa"
    >>> read_fasta_into_dic(test_fasta, read_dna=True)
    {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'}

    """
    if not seqs_dic:
        seqs_dic = {}
    seq_id = ""
    seq = ""

    # Go through FASTA file, extract sequences.
    if re.search(r".+\.gz$", fasta_file):
        f = gzip.open(fasta_file, 'rt')
    else:
        f = open(fasta_file, "r")
    for line in f:
        if re.search(">.+", line):
            m = re.search(">(.+)", line)
            seq_id = m.group(1)
            # If there is a ".", take only the first part of the header.
            # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
            if re.search(r".+\..+", seq_id):
                m = re.search(r"(.+?)\..+", seq_id)
                seq_id = m.group(1)
            assert seq_id not in seqs_dic, "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file)
            if ids_dic:
                if seq_id in ids_dic:
                    seqs_dic[seq_id] = ""
            else:
                seqs_dic[seq_id] = ""
        elif re.search("[ACGTUN]+", line, re.I):
            if seq_id in seqs_dic:
                m = re.search("([ACGTUN]+)", line, re.I)
                seq = m.group(1)
                if reject_lc:
                    assert not re.search("[a-z]", seq), "lowercase characters detected in sequence \"%s\" (reject_lc=True)" % (seq_id)
                if convert_to_uc:
                    seq = seq.upper()
                # If sequences with N nucleotides should be skipped.
                if skip_n_seqs:
                    if "n" in seq or "N" in seq:
                        print("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id))
                        del seqs_dic[seq_id]
                        continue
                # Convert to RNA (or DNA), concatenate sequence.
                if read_dna:
                    seqs_dic[seq_id] += seq.replace("U", "T").replace("u", "t")
                else:
                    seqs_dic[seq_id] += seq.replace("T", "U").replace("t", "u")
    f.close()
    return seqs_dic


################################################################################

def random_order_dic_keys_into_list(in_dic):
    """
    Read in the dictionary keys, and return a list of the IDs in random
    order.

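    Since the output order is random, the added example below only checks
    that all keys are returned:

    >>> sorted(random_order_dic_keys_into_list({'id1': 1, 'id2': 2}))
    ['id1', 'id2']
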
| 317 """ | |
| 318 id_list = [] | |
| 319 for key in in_dic: | |
| 320 id_list.append(key) | |
| 321 random.shuffle(id_list) | |
| 322 return id_list | |
| 323 | |
| 324 | |
| 325 ################################################################################ | |
| 326 | |
def graphprot_get_param_string(params_file):
    """
    Get the parameter string from a GraphProt .params file.

    >>> test_params = "test-data/test.params"
    >>> graphprot_get_param_string(test_params)
    '-epochs 20 -lambda 0.01 -R 1 -D 3 -bitsize 14 -onlyseq '

    """
    param_string = ""
    with open(params_file) as f:
        for line in f:
            cols = line.strip().split(" ")
            param = cols[0]
            setting = cols[1]
            if re.search(".+:", param):
                m = re.search("(.+):", line)
                par = m.group(1)
                if re.search("pos_train.+", line):
                    continue
                if par == "model_type":
                    if setting == "sequence":
                        param_string += "-onlyseq "
                else:
                    param_string += "-%s %s " % (par, setting)
            else:
                assert 0, "pattern matching failed for string \"%s\"" % (param)
    return param_string


################################################################################

def seqs_dic_count_uc_nts(seqs_dic):
    """
    Count the number of uppercase nucleotides in the sequences stored in
    the sequence dictionary.

    >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'}
    >>> seqs_dic_count_uc_nts(seqs_dic)
    6
    >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'}
    >>> seqs_dic_count_uc_nts(seqs_dic)
    0

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_uc = 0
    for seq_id in seqs_dic:
        c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id]))
    return c_uc


################################################################################

def seqs_dic_count_lc_nts(seqs_dic):
    """
    Count the number of lowercase nucleotides in the sequences stored in
    the sequence dictionary.

    >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'}
    >>> seqs_dic_count_lc_nts(seqs_dic)
    10
    >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'}
    >>> seqs_dic_count_lc_nts(seqs_dic)
    0

    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_lc = 0
    for seq_id in seqs_dic:
        c_lc += len(re.findall(r'[a-z]', seqs_dic[seq_id]))
    return c_lc


################################################################################

def count_file_rows(in_file):
    """
    Count the number of rows in the given input file.

    >>> test_file = "test-data/test1.bed"
    >>> count_file_rows(test_file)
    7
    >>> test_file = "test-data/empty_file"
    >>> count_file_rows(test_file)
    0

    """
    check_cmd = "cat " + in_file + " | wc -l"
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count


################################################################################

def bed_check_six_col_format(bed_file):
    """
    Check whether the given .bed file has 6 columns (only the first row
    is checked).

    >>> test_bed = "test-data/test1.bed"
    >>> bed_check_six_col_format(test_bed)
    True
    >>> test_bed = "test-data/empty_file"
    >>> bed_check_six_col_format(test_bed)
    False

    """
    six_col_format = False
    with open(bed_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            if len(cols) == 6:
                six_col_format = True
            break
    return six_col_format


################################################################################

def bed_check_unique_ids(bed_file):
    """
    Check whether the .bed file (6-column format with IDs in column 4)
    has unique column 4 IDs.

    >>> test_bed = "test-data/test1.bed"
    >>> bed_check_unique_ids(test_bed)
    True
    >>> test_bed = "test-data/test2.bed"
    >>> bed_check_unique_ids(test_bed)
    False

    """
    check_cmd = "cut -f 4 " + bed_file + " | sort | uniq -d"
    output = subprocess.getoutput(check_cmd)
    if output:
        return False
    else:
        return True


################################################################################

def get_seq_lengths_from_seqs_dic(seqs_dic):
    """
    Given a dictionary of sequences, return a dictionary of sequence
    lengths. Mapping is sequence ID -> sequence length.
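
    An added example with an inline dictionary (the printed order assumes
    the insertion-ordered dictionaries of Python 3.7+):

    >>> get_seq_lengths_from_seqs_dic({'s1': 'ACGU', 's2': 'AC'})
    {'s1': 4, 's2': 2}
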
| 477 """ | |
| 478 seq_len_dic = {} | |
| 479 assert seqs_dic, "sequence dictionary seems to be empty" | |
| 480 for seq_id in seqs_dic: | |
| 481 seq_l = len(seqs_dic[seq_id]) | |
| 482 seq_len_dic[seq_id] = seq_l | |
| 483 return seq_len_dic | |
| 484 | |
| 485 | |
| 486 ################################################################################ | |
| 487 | |
def bed_get_region_lengths(bed_file):
    """
    Read in .bed file, store and return region lengths in dictionary.
    key   : region ID (.bed col4)
    value : region length (.bed col3-col2)

    >>> test_file = "test-data/test4.bed"
    >>> bed_get_region_lengths(test_file)
    {'CLIP1': 10, 'CLIP2': 10}

    """
    id2len_dic = {}
    with open(bed_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id = cols[3]
            site_l = site_e - site_s
            assert site_id not in id2len_dic, "column 4 IDs not unique in given .bed file \"%s\"" % (bed_file)
            id2len_dic[site_id] = site_l
    assert id2len_dic, "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (bed_file)
    return id2len_dic


################################################################################

def graphprot_get_param_dic(params_file):
    """
    Read in GraphProt .params file and store in dictionary.
    key = parameter
    value = parameter value

    >>> params_file = "test-data/test.params"
    >>> graphprot_get_param_dic(params_file)
    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'}

    """
    param_dic = {}
    with open(params_file) as f:
        for line in f:
            cols = line.strip().split(" ")
            param = cols[0]
            setting = cols[1]
            if re.search(".+:", param):
                m = re.search("(.+):", line)
                par = m.group(1)
                param_dic[par] = setting
    return param_dic


################################################################################

def graphprot_filter_predictions_file(in_file, out_file,
                                      sc_thr=0):
    """
    Filter a GraphProt .predictions file by the given score threshold
    sc_thr, keeping only rows with scores >= sc_thr.

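    A minimal usage sketch (an added example; writes a throwaway two-row
    .predictions file, then keeps only the row with score >= 0):

    >>> import tempfile, os
    >>> pred_in = tempfile.NamedTemporaryFile(delete=False).name
    >>> pred_out = pred_in + ".filtered"
    >>> with open(pred_in, "w") as f:
    ...     _ = f.write("s1\\t1\\t2.5\\ns2\\t1\\t-1.5\\n")
    >>> graphprot_filter_predictions_file(pred_in, pred_out, sc_thr=0)
    >>> open(pred_out).read()
    's1\\t1\\t2.5\\n'
    >>> os.remove(pred_in)
    >>> os.remove(pred_out)
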
| 548 """ | |
| 549 OUTPRED = open(out_file, "w") | |
| 550 with open(in_file) as f: | |
| 551 for line in f: | |
| 552 row = line.strip() | |
| 553 cols = line.strip().split("\t") | |
| 554 score = float(cols[2]) | |
| 555 if score < sc_thr: | |
| 556 continue | |
| 557 OUTPRED.write("%s\n" %(row)) | |
| 558 f.close() | |
| 559 OUTPRED.close() | |
| 560 | |
| 561 | |
| 562 ################################################################################ | |
| 563 | |
def fasta_read_in_ids(fasta_file):
    """
    Given a .fa file, read in the header IDs in the order they appear in
    the file, and store them in a list.

    >>> test_file = "test-data/test3.fa"
    >>> fasta_read_in_ids(test_file)
    ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316']

    """
    ids_list = []
    with open(fasta_file) as f:
        for line in f:
            if re.search(">.+", line):
                m = re.search(">(.+)", line)
                ids_list.append(m.group(1))
    return ids_list


################################################################################

def graphprot_profile_calculate_avg_profile(in_file, out_file,
                                            ap_extlr=5,
                                            seq_ids_list=False,
                                            method=1):
    """
    Given a GraphProt .profile file, calculate average profiles and output
    an average profile file.
    Average profile means that the position-wise scores get smoothed out
    by calculating for each position a new score: the mean score over a
    sequence window from -ap_extlr to +ap_extlr relative to the position.
    This mean score then becomes the new average profile score at the
    position.
    Two different implementations of the task are given:
    method=1 (new Python implementation; slower + more memory, but easy to read)
    method=2 (old Perl implementation; faster and less memory, but more code)

    >>> in_file = "test-data/test2.profile"
    >>> out_file1 = "test-data/test2_1.avg_profile"
    >>> out_file2 = "test-data/test2_2.avg_profile"
    >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1)
    >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2)
    >>> diff_two_files_identical(out_file1, out_file2)
    True
    >>> test_list = ["s1", "s2", "s3", "s4"]
    >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile"
    >>> out_file3 = "test-data/test3_added_ids_out.avg_profile"
    >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list)
    >>> diff_two_files_identical(out_file3_exp, out_file3)
    True

    """
    if method == 1:
        # Dictionary of lists, with list of scores (value) for each site (key).
        lists_dic = {}
        site_starts_dic = {}
        with open(in_file) as f:
            for line in f:
                cols = line.strip().split("\t")
                site_id = int(cols[0])
                pos = int(cols[1])  # 0-based.
                score = float(cols[2])
                # Store first position of site.
                if site_id not in site_starts_dic:
                    site_starts_dic[site_id] = pos
                if site_id in lists_dic:
                    lists_dic[site_id].append(score)
                else:
                    lists_dic[site_id] = [score]
        # Check number of IDs (# FASTA sequence IDs has to equal # site IDs).
        if seq_ids_list:
            c_seq_ids = len(seq_ids_list)
            c_site_ids = len(site_starts_dic)
            assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" % (c_seq_ids, c_site_ids)
        OUTPROF = open(out_file, "w")
        # For each site, calculate the average profile scores list.
        for site_id in lists_dic:
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(lists_dic[site_id],
                                                         win_extlr=ap_extlr)
            start_pos = site_starts_dic[site_id]
            # Get original FASTA sequence ID.
            if seq_ids_list:
                site_id = seq_ids_list[site_id]
            for i, sc in enumerate(aps_list):
                pos = i + start_pos + 1  # make 1-based.
                OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc))
        OUTPROF.close()
    elif method == 2:
        OUTPROF = open(out_file, "w")
        # Old site ID.
        old_id = ""
        # Current site ID.
        cur_id = ""
        # Scores list.
        scores_list = []
        site_starts_dic = {}
        with open(in_file) as f:
            for line in f:
                cols = line.strip().split("\t")
                cur_id = int(cols[0])
                pos = int(cols[1])  # 0-based.
                score = float(cols[2])
                # Store first position of site.
                if cur_id not in site_starts_dic:
                    site_starts_dic[cur_id] = pos
                # Case: new site (new column 1 ID).
                if cur_id != old_id:
                    # Process old ID scores.
                    if scores_list:
                        aps_list = list_moving_window_average_values(scores_list,
                                                                     win_extlr=ap_extlr)
                        start_pos = site_starts_dic[old_id]
                        seq_id = old_id
                        # Get original FASTA sequence ID.
                        if seq_ids_list:
                            seq_id = seq_ids_list[old_id]
                        for i, sc in enumerate(aps_list):
                            pos = i + start_pos + 1  # make 1-based.
                            OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
                        # Reset list.
                        scores_list = []
                    old_id = cur_id
                    scores_list.append(score)
                else:
                    # Add to scores_list.
                    scores_list.append(score)
        # Process last block.
        if scores_list:
            aps_list = list_moving_window_average_values(scores_list,
                                                         win_extlr=ap_extlr)
            start_pos = site_starts_dic[old_id]
            seq_id = old_id
            # Get original FASTA sequence ID.
            if seq_ids_list:
                seq_id = seq_ids_list[old_id]
            for i, sc in enumerate(aps_list):
                pos = i + start_pos + 1  # make 1-based.
                OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
        OUTPROF.close()


################################################################################

def graphprot_profile_extract_peak_regions(in_file, out_file,
                                           max_merge_dist=0,
                                           sc_thr=0):
    """
    Extract peak regions from a GraphProt .profile file.
    Store the peak regions (defined as regions with scores >= sc_thr)
    in out_file in 6-column .bed format.

    TODO:
    Add option for genomic coordinates input (+ - polarity support).
    Output genomic regions instead of sequence regions.

    >>> in_file = "test-data/test4.avg_profile"
    >>> out_file = "test-data/test4_out.peaks.bed"
    >>> exp_file = "test-data/test4_out_exp.peaks.bed"
    >>> exp2_file = "test-data/test4_out_exp2.peaks.bed"
    >>> empty_file = "test-data/empty_file"
    >>> graphprot_profile_extract_peak_regions(in_file, out_file)
    >>> diff_two_files_identical(out_file, exp_file)
    True
    >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10)
    >>> diff_two_files_identical(out_file, empty_file)
    True
    >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2)
    >>> diff_two_files_identical(out_file, exp2_file)
    True

    """
    OUTPEAKS = open(out_file, "w")
    # Old site ID.
    old_id = ""
    # Current site ID.
    cur_id = ""
    # Scores list.
    scores_list = []
    site_starts_dic = {}
    with open(in_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            cur_id = cols[0]
            pos = int(cols[1])  # 0-based.
            score = float(cols[2])
            # Store first position of site.
            if cur_id not in site_starts_dic:
                # If first position != zero, we assume positions are 1-based.
                if pos != 0:
                    # Make index 0-based.
                    site_starts_dic[cur_id] = pos - 1
                else:
                    site_starts_dic[cur_id] = pos
            # Case: new site (new column 1 ID).
            if cur_id != old_id:
                # Process old ID scores.
                if scores_list:
                    # Extract peaks from region.
                    peak_list = list_extract_peaks(scores_list,
                                                   max_merge_dist=max_merge_dist,
                                                   coords="bed",
                                                   sc_thr=sc_thr)
                    start_pos = site_starts_dic[old_id]
                    # Print out peaks in .bed format.
                    for peak in peak_list:
                        peak_s = start_pos + peak[0]
                        peak_e = start_pos + peak[1]
                        site_id = "%s,%i" % (old_id, peak[2])  # best score position, 1-based.
                        OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, peak[3]))
                    # Reset list.
                    scores_list = []
                old_id = cur_id
                scores_list.append(score)
            else:
                # Add to scores_list.
                scores_list.append(score)
    # Process last block.
    if scores_list:
        # Extract peaks from region.
        peak_list = list_extract_peaks(scores_list,
                                       max_merge_dist=max_merge_dist,
                                       coords="bed",
                                       sc_thr=sc_thr)
        start_pos = site_starts_dic[old_id]
        # Print out peaks in .bed format.
        for peak in peak_list:
            peak_s = start_pos + peak[0]
            peak_e = start_pos + peak[1]
            site_id = "%s,%i" % (old_id, peak[2])  # best score position, 1-based.
            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, peak[3]))
    OUTPEAKS.close()


################################################################################

def list_extract_peaks(in_list,
                       max_merge_dist=0,
                       coords="list",
                       sc_thr=0):
    """
    Extract peak regions from a list of scores.
    A peak region is defined as a region with scores >= sc_thr.

    coords=bed  : peak start 0-based, peak end 1-based.
    coords=list : peak start 0-based, peak end 0-based.

    >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5]
    >>> list_extract_peaks(test_list)
    [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]
    >>> list_extract_peaks(test_list, sc_thr=2)
    [[2, 3, 3, 4.5], [6, 7, 7, 6.5]]
    >>> list_extract_peaks(test_list, sc_thr=2, coords="bed")
    [[2, 4, 4, 4.5], [6, 8, 8, 6.5]]
    >>> list_extract_peaks(test_list, sc_thr=10)
    []
    >>> test_list = [2, -1, 3, -1, 4, -1, -1, 6, 9]
    >>> list_extract_peaks(test_list, max_merge_dist=2)
    [[0, 4, 4, 4], [7, 8, 8, 9]]
    >>> list_extract_peaks(test_list, max_merge_dist=3)
    [[0, 8, 8, 9]]

    """
    # Check.
    assert len(in_list), "Given list is empty"
    # Peak regions list.
    peak_list = []
    # Peak region state variables (start, end, top position, top score).
    inside = False
    pr_s = 0
    pr_e = 0
    pr_top_pos = 0
    pr_top_sc = -100000
    for i, sc in enumerate(in_list):
        # Part of peak region?
        if sc >= sc_thr:
            # At peak start.
            if not inside:
                pr_s = i
                pr_e = i
                inside = True
            else:
                # Inside peak region.
                pr_e = i
            # Store top position.
            if sc > pr_top_sc:
                pr_top_sc = sc
                pr_top_pos = i
        else:
            # Was the previous position inside a peak region?
            if inside:
                # Store peak region.
                peak_list.append([pr_s, pr_e, pr_top_pos, pr_top_sc])
                inside = False
                pr_top_pos = 0
                pr_top_sc = -100000
    # If there is a peak at the list end, also report it.
    if inside:
        # Store peak region.
        peak_list.append([pr_s, pr_e, pr_top_pos, pr_top_sc])
    # Merge peaks.
    if max_merge_dist and len(peak_list) > 1:
        iterate = True
        while iterate:
            merged_peak_list = []
            added_peaks_dic = {}
            peaks_merged = False
            for i in range(len(peak_list)):
                if i in added_peaks_dic:
                    continue
                j = i + 1
                # Last element.
                if j == len(peak_list):
                    if i not in added_peaks_dic:
                        merged_peak_list.append(peak_list[i])
                    break
                # Compare two elements.
                new_peak = []
                if (peak_list[j][0] - peak_list[i][1]) <= max_merge_dist:
                    peaks_merged = True
                    new_top_pos = peak_list[i][2]
                    new_top_sc = peak_list[i][3]
                    if peak_list[i][3] < peak_list[j][3]:
                        new_top_pos = peak_list[j][2]
                        new_top_sc = peak_list[j][3]
                    new_peak = [peak_list[i][0], peak_list[j][1],
                                new_top_pos, new_top_sc]
                # If two peaks were merged.
                if new_peak:
                    merged_peak_list.append(new_peak)
                    added_peaks_dic[i] = 1
                    added_peaks_dic[j] = 1
                else:
                    merged_peak_list.append(peak_list[i])
                    added_peaks_dic[i] = 1
            if not peaks_merged:
                iterate = False
            peak_list = merged_peak_list
    # If peak coordinates should be in .bed format, make peak ends 1-based.
    if coords == "bed":
        for i in range(len(peak_list)):
            peak_list[i][1] += 1
            peak_list[i][2] += 1  # 1-base best score position too.
    return peak_list


################################################################################

def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False):
    """
    Given a .bed file of sequence peak regions (with possible coordinates
    from 0 to the length of the sequence), convert the peak coordinates to
    genomic coordinates. Do this by taking the genomic regions of the
    sequences as additional input.

    >>> test_in = "test-data/test.peaks.bed"
    >>> test_exp = "test-data/test_exp.peaks.bed"
    >>> test_out = "test-data/test_out.peaks.bed"
    >>> gen_in = "test-data/test.peaks_genomic.bed"
    >>> bed_peaks_to_genomic_peaks(test_in, test_out, gen_in)
    >>> diff_two_files_identical(test_out, test_exp)
    True

    """
    # Read in genomic region info.
    id2row_dic = {}
    with open(genomic_sites_bed) as f:
        for line in f:
            row = line.strip()
            cols = line.strip().split("\t")
            site_id = cols[3]
            assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \"%s\"" % (genomic_sites_bed)
            id2row_dic[site_id] = row

    # Read in peaks file and convert coordinates.
    OUTPEAKS = open(genomic_peak_file, "w")
    with open(peak_file) as f:
        for line in f:
            cols = line.strip().split("\t")
            site_id = cols[0]
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id2 = cols[3]
            site_sc = float(cols[4])
            assert re.search(".+,.+", site_id2), "regular expression failed for ID \"%s\"" % (site_id2)
            m = re.search(r".+,(\d+)", site_id2)
            sc_pos = int(m.group(1))  # 1-based.
            assert site_id in id2row_dic, "site ID \"%s\" not found in genomic sites dictionary" % (site_id)
            row = id2row_dic[site_id]
            rowl = row.split("\t")
            gen_chr = rowl[0]
            gen_s = int(rowl[1])
            gen_e = int(rowl[2])
            gen_pol = rowl[5]
            new_s = site_s + gen_s
            new_e = site_e + gen_s
            new_sc_pos = sc_pos + gen_s
            if gen_pol == "-":
                new_s = gen_e - site_e
                new_e = gen_e - site_s
                new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" % (gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol)
            OUTPEAKS.write("%s\n" % (new_row))
            if print_rows:
                print(new_row)
    OUTPEAKS.close()


################################################################################

def diff_two_files_identical(file1, file2):
    """
    Check whether two files are identical. Return True if diff reports no
    differences.

    >>> file1 = "test-data/file1"
    >>> file2 = "test-data/file2"
    >>> diff_two_files_identical(file1, file2)
    True
    >>> file1 = "test-data/test1.bed"
    >>> diff_two_files_identical(file1, file2)
    False

    """
    same = True
    check_cmd = "diff " + file1 + " " + file2
    output = subprocess.getoutput(check_cmd)
    if output:
        same = False
    return same


################################################################################
```
