comparison gplib.py @ 3:ace92c9a4653 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
| author | bgruening |
|---|---|
| date | Wed, 27 Jan 2021 19:27:47 +0000 |
| parents | 20429f4c1b95 |
| children | ddcf35a868b8 |
| 2:7bbb7bf6304f | 3:ace92c9a4653 |
|---|---|
| 1 | 1 |
| | 2 import gzip |
| | 3 import random |
| | 4 import re |
| | 5 import statistics |
| | 6 import subprocess |
| 2 from distutils.spawn import find_executable | 7 from distutils.spawn import find_executable |
| 3 import subprocess | 8 |
| 4 import statistics | |
| 5 import random | |
| 6 import gzip | |
| 7 import uuid | |
| 8 import sys | |
| 9 import re | |
| 10 import os | |
| 11 | 9 |
| 12 """ | 10 """ |
| 13 | 11 |
| 14 Run doctests: | 12 Run doctests: |
| 15 | 13 |
| 17 | 15 |
| 18 | 16 |
| 19 """ | 17 """ |
| 20 | 18 |
| 21 | 19 |
| 22 ################################################################################ | 20 ############################################################################### |
| 23 | 21 |
| 24 def graphprot_predictions_get_median(predictions_file): | 22 def graphprot_predictions_get_median(predictions_file): |
| 25 """ | 23 """ |
| 26 Given a GraphProt .predictions file, read in site scores and return | 24 Given a GraphProt .predictions file, read in site scores and return |
| 27 the median value. | 25 the median value. |
| 28 | 26 |
| 29 >>> test_file = "test-data/test.predictions" | 27 >>> test_file = "test-data/test.predictions" |
| 30 >>> graphprot_predictions_get_median(test_file) | 28 >>> graphprot_predictions_get_median(test_file) |
| 31 0.571673 | 29 0.571673 |
| 41 f.close() | 39 f.close() |
| 42 # Return the median. | 40 # Return the median. |
| 43 return statistics.median(sc_list) | 41 return statistics.median(sc_list) |
| 44 | 42 |
| 45 | 43 |
| 46 ################################################################################ | 44 ############################################################################### |
| 47 | 45 |
| 48 def graphprot_profile_get_top_scores_median(profile_file, | 46 def graphprot_profile_get_tsm(profile_file, |
| 49 profile_type="profile", | 47 profile_type="profile", |
| 50 avg_profile_extlr=5): | 48 avg_profile_extlr=5): |
| 51 | 49 |
| 52 """ | 50 """ |
| 53 Given a GraphProt .profile file, extract for each site (identified by | 51 Given a GraphProt .profile file, extract for each site (identified by |
| 54 column 1 ID) the top (= highest) score. Then return the median of these | 52 column 1 ID) the top (= highest) score. Then return the median of these |
| 55 top scores. | 53 top scores. |
| 56 | 54 |
| 57 profile_type can be either "profile" or "avg_profile". | 55 profile_type can be either "profile" or "avg_profile". |
| 58 "avg_profile means that the position-wise scores will first get smoothed | 56 "avg_profile means that the position-wise scores will first get smoothed |
| 59 out by calculating for each position a new score through taking a | 57 out by calculating for each position a new score through taking a |
| 60 sequence window -avg_profile_extlr to +avg_profile_extlr of the position | 58 sequence window -avg_profile_extlr to +avg_profile_extlr of the position |
| 61 and calculating the mean score over this window and assigning it to the position. | 59 and calculating the mean score over this window and assigning it to the |
| 62 After that, the maximum score of each site is chosen, and the median over | 60 position. After that, the maximum score of each site is chosen, and the |
| 63 all maximum scores is returned. | 61 median over all maximum scores is returned. |
| 64 "profile" leaves the position-wise scores as they are, directly extracting | 62 "profile" leaves the position-wise scores as they are, directly extracting |
| 65 the maximum for each site and then reporting the median. | 63 the maximum for each site and then reporting the median. |
| 66 | 64 |
| 67 >>> test_file = "test-data/test.profile" | 65 >>> test_file = "test-data/test.profile" |
| 68 >>> graphprot_profile_get_top_scores_median(test_file) | 66 >>> graphprot_profile_get_tsm(test_file) |
| 69 3.2 | 67 3.2 |
| 70 | 68 |
| 71 """ | 69 """ |
| 72 # Dictionary of lists, with list of scores (value) for each site (key). | 70 # Dictionary of lists, with list of scores (value) for each site (key). |
| 73 lists_dic = {} | 71 lists_dic = {} |
| 88 if profile_type == "profile": | 86 if profile_type == "profile": |
| 89 max_sc = max(lists_dic[seq_id]) | 87 max_sc = max(lists_dic[seq_id]) |
| 90 max_list.append(max_sc) | 88 max_list.append(max_sc) |
| 91 elif profile_type == "avg_profile": | 89 elif profile_type == "avg_profile": |
| 92 # Convert profile score list to average profile scores list. | 90 # Convert profile score list to average profile scores list. |
| 93 aps_list = list_moving_window_average_values(lists_dic[seq_id], | 91 aps_list = \ |
| 94 win_extlr=avg_profile_extlr) | 92 list_moving_window_average_values(lists_dic[seq_id], |
| | 93 win_extlr=avg_profile_extlr) |
| 95 max_sc = max(aps_list) | 94 max_sc = max(aps_list) |
| 96 max_list.append(max_sc) | 95 max_list.append(max_sc) |
| 97 else: | 96 else: |
| 98 assert 0, "invalid profile_type argument given: \"%s\"" %(profile_type) | 97 assert 0, "invalid profile_type argument given: \"%s\"" \ |
| | 98 % (profile_type) |
| 99 # Return the median. | 99 # Return the median. |
| 100 return statistics.median(max_list) | 100 return statistics.median(max_list) |
| 101 | 101 |
| 102 | 102 |
| 103 ################################################################################ | 103 ############################################################################### |
| 104 | 104 |
| 105 def list_moving_window_average_values(in_list, | 105 def list_moving_window_average_values(in_list, |
| 106 win_extlr=5, | 106 win_extlr=5, |
| 107 method=1): | 107 method=1): |
| 108 """ | 108 """ |
| 109 Take a list of numeric values, and calculate for each position a new value, | 109 Take a list of numeric values, and calculate for each position a new value, |
| 110 by taking the mean value of the window of positions -win_extlr and | 110 by taking the mean value of the window of positions -win_extlr and |
| 111 +win_extlr. If full extension is not possible (at list ends), it just | 111 +win_extlr. If full extension is not possible (at list ends), it just |
| 112 takes what it gets. | 112 takes what it gets. |
| 113 Two implementations of the task are given, chosen via method=1 or method=2. | 113 Two implementations of the task are given, chosen via method=1 or method=2. |
| 114 | 114 |
| 115 >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1] | 115 >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1] |
| 116 >>> list_moving_window_average_values(test_list, win_extlr=2, method=1) | 116 >>> list_moving_window_average_values(test_list, win_extlr=2, method=1) |
| 140 e = i + win_extlr + 1 | 140 e = i + win_extlr + 1 |
| 141 if s < 0: | 141 if s < 0: |
| 142 s = 0 | 142 s = 0 |
| 143 if e > l_list: | 143 if e > l_list: |
| 144 e = l_list | 144 e = l_list |
| 145 l = e-s | 145 ln = e - s |
| 146 sc_sum = 0 | 146 sc_sum = 0 |
| 147 for j in range(l): | 147 for j in range(ln): |
| 148 sc_sum += in_list[s+j] | 148 sc_sum += in_list[s + j] |
| 149 new_list[i] = sc_sum / l | 149 new_list[i] = sc_sum / ln |
| 150 else: | 150 else: |
| 151 assert 0, "invalid method ID given (%i)" %(method) | 151 assert 0, "invalid method ID given (%i)" % (method) |
| 152 return new_list | 152 return new_list |
| 153 | 153 |
| 154 | 154 |
| 155 ################################################################################ | 155 ############################################################################### |
| 156 | 156 |
| 157 def echo_add_to_file(echo_string, out_file): | 157 def echo_add_to_file(echo_string, out_file): |
| 158 """ | 158 """ |
| 159 Add a string to file, using echo command. | 159 Add a string to file, using echo command. |
| 160 | 160 |
| 162 check_cmd = 'echo "%s" >> %s' % (echo_string, out_file) | 162 check_cmd = 'echo "%s" >> %s' % (echo_string, out_file) |
| 163 output = subprocess.getoutput(check_cmd) | 163 output = subprocess.getoutput(check_cmd) |
| 164 error = False | 164 error = False |
| 165 if output: | 165 if output: |
| 166 error = True | 166 error = True |
| 167 assert error == False, "echo is complaining:\n%s\n%s" %(check_cmd, output) | 167 assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output) |
| 168 | 168 |
| 169 | 169 |
| 170 ################################################################################ | 170 ############################################################################### |
| 171 | 171 |
| 172 def is_tool(name): | 172 def is_tool(name): |
| 173 """Check whether tool "name" is in PATH.""" | 173 """Check whether tool "name" is in PATH.""" |
| 174 return find_executable(name) is not None | 174 return find_executable(name) is not None |
| 175 | 175 |
| 176 | 176 |
| 177 ################################################################################ | 177 ############################################################################### |
| 178 | 178 |
| 179 def count_fasta_headers(fasta_file): | 179 def count_fasta_headers(fasta_file): |
| 180 """ | 180 """ |
| 181 Count number of FASTA headers in fasta_file using grep. | 181 Count number of FASTA headers in fasta_file using grep. |
| 182 | 182 |
| 192 output = subprocess.getoutput(check_cmd) | 192 output = subprocess.getoutput(check_cmd) |
| 193 row_count = int(output.strip()) | 193 row_count = int(output.strip()) |
| 194 return row_count | 194 return row_count |
| 195 | 195 |
| 196 | 196 |
| 197 ################################################################################ | 197 ############################################################################### |
| 198 | 198 |
| 199 def make_file_copy(in_file, out_file): | 199 def make_file_copy(in_file, out_file): |
| 200 """ | 200 """ |
| 201 Make a file copy by copying in_file to out_file. | 201 Make a file copy by copying in_file to out_file. |
| 202 | 202 |
| 203 """ | 203 """ |
| 204 check_cmd = "cat " + in_file + " > " + out_file | 204 check_cmd = "cat " + in_file + " > " + out_file |
| 205 assert in_file != out_file, "cat does not like to cat file into same file (%s)" %(check_cmd) | 205 assert in_file != out_file, \ |
| 206 "cat does not like to cat file into same file (%s)" % (check_cmd) | |
| 206 output = subprocess.getoutput(check_cmd) | 207 output = subprocess.getoutput(check_cmd) |
| 207 error = False | 208 error = False |
| 208 if output: | 209 if output: |
| 209 error = True | 210 error = True |
| 210 assert error == False, "cat did not like your input (in_file: %s, out_file: %s):\n%s" %(in_file, out_file, output) | 211 assert not error, \ |
| 211 | 212 "cat did not like your input (in_file: %s, out_file: %s):\n%s" \ |
| 212 | 213 % (in_file, out_file, output) |
| 213 ################################################################################ | 214 |
| 214 | 215 |
| 215 def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa, | 216 ############################################################################### |
| | 217 |
| | 218 def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa, |
| 216 test_size=500): | 219 test_size=500): |
| 217 """ | 220 """ |
| 218 Split in_fasta .fa file into two files (e.g. test, train). | 221 Split in_fasta .fa file into two files (e.g. test, train). |
| 219 | 222 |
| 220 """ | 223 """ |
| 234 c_out += 1 | 237 c_out += 1 |
| 235 TESTOUT.close() | 238 TESTOUT.close() |
| 236 TRAINOUT.close() | 239 TRAINOUT.close() |
| 237 | 240 |
| 238 | 241 |
| 239 ################################################################################ | 242 ############################################################################### |
| | 243 |
| | 244 def check_seqs_dic_format(seqs_dic): |
| | 245 """ |
| | 246 Check sequence dictionary for lowercase-only sequences or sequences |
| | 247 which have lowercase nts in between uppercase nts. |
| | 248 Return suspicious IDs as list, or empty list if no hits |
| | 249 (lowercase-only or mixed-case sequences count as suspicious). |
| | 250 |
| | 251 >>> seqs_dic = {"id1" : "acguACGU", "id2" : "acgua", "id3" : "acgUUaUcc"} |
| | 252 >>> check_seqs_dic_format(seqs_dic) |
| | 253 ['id2', 'id3'] |
| | 254 >>> seqs_dic = {"id1" : "acgAUaa", "id2" : "ACGUACUA"} |
| | 255 >>> check_seqs_dic_format(seqs_dic) |
| | 256 [] |
| | 257 |
| | 258 """ |
| | 259 assert seqs_dic, "given seqs_dic empty" |
| | 260 bad_seq_ids = [] |
| | 261 for seq_id in seqs_dic: |
| | 262 seq = seqs_dic[seq_id] |
| | 263 if re.search("^[acgtun]+$", seq): |
| | 264 bad_seq_ids.append(seq_id) |
| | 265 if re.search("[ACGTUN][acgtun]+[ACGTUN]", seq): |
| | 266 bad_seq_ids.append(seq_id) |
| | 267 return bad_seq_ids |
| | 268 |
| | 269 |
| | 270 ############################################################################### |
| 240 | 271 |
| 241 def read_fasta_into_dic(fasta_file, | 272 def read_fasta_into_dic(fasta_file, |
| 242 seqs_dic=False, | 273 seqs_dic=False, |
| 243 ids_dic=False, | 274 ids_dic=False, |
| 244 read_dna=False, | 275 read_dna=False, |
| | 276 short_ensembl=False, |
| 245 reject_lc=False, | 277 reject_lc=False, |
| 246 convert_to_uc=False, | 278 convert_to_uc=False, |
| 247 skip_n_seqs=True): | 279 skip_n_seqs=True): |
| 248 """ | 280 """ |
| 249 Read in FASTA sequences, convert to RNA, store in dictionary | 281 Read in FASTA sequences, convert to RNA, store in dictionary |
| 250 and return dictionary. | 282 and return dictionary. |
| 251 | 283 |
| 252 >>> test_fasta = "test-data/test.fa" | 284 >>> test_fasta = "test-data/test.fa" |
| 253 >>> read_fasta_into_dic(test_fasta) | 285 >>> read_fasta_into_dic(test_fasta) |
| 254 {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'} | 286 {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'} |
| 255 >>> test_fasta = "test-data/test2.fa" | 287 >>> test_fasta = "test-data/test2.fa" |
| 256 >>> read_fasta_into_dic(test_fasta) | 288 >>> read_fasta_into_dic(test_fasta) |
| 257 {} | 289 {} |
| 258 >>> test_fasta = "test-data/test.ensembl.fa" | 290 >>> test_fasta = "test-data/test.ensembl.fa" |
| 259 >>> read_fasta_into_dic(test_fasta, read_dna=True) | 291 >>> read_fasta_into_dic(test_fasta, read_dna=True, short_ensembl=True) |
| 260 {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'} | 292 {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'} |
| | 293 >>> test_fasta = "test-data/test4.fa" |
| | 294 >>> read_fasta_into_dic(test_fasta) |
| | 295 {'1': 'gccuAUGUuuua', '2': 'cugaAACUaugu'} |
| 261 | 296 |
| 262 """ | 297 """ |
| 263 if not seqs_dic: | 298 if not seqs_dic: |
| 264 seqs_dic = {} | 299 seqs_dic = {} |
| 265 seq_id = "" | 300 seq_id = "" |
| 266 seq = "" | 301 seq = "" |
| 267 | 302 |
| 268 # Go through FASTA file, extract sequences. | 303 # Go through FASTA file, extract sequences. |
| 269 if re.search(".+\.gz$", fasta_file): | 304 if re.search(r".+\.gz$", fasta_file): |
| 270 f = gzip.open(fasta_file, 'rt') | 305 f = gzip.open(fasta_file, 'rt') |
| 271 else: | 306 else: |
| 272 f = open(fasta_file, "r") | 307 f = open(fasta_file, "r") |
| 273 for line in f: | 308 for line in f: |
| 274 if re.search(">.+", line): | 309 if re.search(">.+", line): |
| 275 m = re.search(">(.+)", line) | 310 m = re.search(">(.+)", line) |
| 276 seq_id = m.group(1) | 311 seq_id = m.group(1) |
| 277 # If there is a ".", take only first part of header. | 312 # If there is a ".", take only first part of header. |
| 278 # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..." | 313 # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..." |
| 279 if re.search(".+\..+", seq_id): | 314 if short_ensembl: |
| 280 m = re.search("(.+?)\..+", seq_id) | 315 if re.search(r".+\..+", seq_id): |
| 281 seq_id = m.group(1) | 316 m = re.search(r"(.+?)\..+", seq_id) |
| 282 assert seq_id not in seqs_dic, "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file) | 317 seq_id = m.group(1) |
| | 318 assert seq_id not in seqs_dic, \ |
| | 319 "non-unique FASTA header \"%s\" in \"%s\"" \ |
| | 320 % (seq_id, fasta_file) |
| 283 if ids_dic: | 321 if ids_dic: |
| 284 if seq_id in ids_dic: | 322 if seq_id in ids_dic: |
| 285 seqs_dic[seq_id] = "" | 323 seqs_dic[seq_id] = "" |
| 286 else: | 324 else: |
| 287 seqs_dic[seq_id] = "" | 325 seqs_dic[seq_id] = "" |
| 288 elif re.search("[ACGTUN]+", line, re.I): | 326 elif re.search("[ACGTUN]+", line, re.I): |
| 289 if seq_id in seqs_dic: | 327 if seq_id in seqs_dic: |
| 290 m = re.search("([ACGTUN]+)", line, re.I) | 328 m = re.search("([ACGTUN]+)", line, re.I) |
| 291 seq = m.group(1) | 329 seq = m.group(1) |
| 292 if reject_lc: | 330 if reject_lc: |
| 293 assert not re.search("[a-z]", seq), "lowercase characters detected in sequence \"%i\" (reject_lc=True)" %(seq_id) | 331 assert \ |
| | 332 not re.search("[a-z]", seq), \ |
| | 333 "lc char detected in seq \"%i\" (reject_lc=True)" \ |
| | 334 % (seq_id) |
| 294 if convert_to_uc: | 335 if convert_to_uc: |
| 295 seq = seq.upper() | 336 seq = seq.upper() |
| 296 # If sequences with N nucleotides should be skipped. | 337 # If sequences with N nucleotides should be skipped. |
| 297 if skip_n_seqs: | 338 if skip_n_seqs: |
| 298 if "n" in m.group(1) or "N" in m.group(1): | 339 if "n" in m.group(1) or "N" in m.group(1): |
| 299 print ("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id)) | 340 print("WARNING: \"%s\" contains N. Discarding " |
| 341 "sequence ... " % (seq_id)) | |
| 300 del seqs_dic[seq_id] | 342 del seqs_dic[seq_id] |
| 301 continue | 343 continue |
| 302 # Convert to RNA, concatenate sequence. | 344 # Convert to RNA, concatenate sequence. |
| 303 if read_dna: | 345 if read_dna: |
| 304 seqs_dic[seq_id] += m.group(1).replace("U","T").replace("u","t") | 346 seqs_dic[seq_id] += \ |
| | 347 m.group(1).replace("U", "T").replace("u", "t") |
| 305 else: | 348 else: |
| 306 seqs_dic[seq_id] += m.group(1).replace("T","U").replace("t","u") | 349 seqs_dic[seq_id] += \ |
| | 350 m.group(1).replace("T", "U").replace("t", "u") |
| 307 f.close() | 351 f.close() |
| 308 return seqs_dic | 352 return seqs_dic |
| 309 | 353 |
| 310 | 354 |
| 311 ################################################################################ | 355 ############################################################################### |
| 312 | 356 |
| 313 def random_order_dic_keys_into_list(in_dic): | 357 def random_order_dic_keys_into_list(in_dic): |
| 314 """ | 358 """ |
| 315 Read in dictionary keys, and return random order list of IDs. | 359 Read in dictionary keys, and return random order list of IDs. |
| 316 | 360 |
| 320 id_list.append(key) | 364 id_list.append(key) |
| 321 random.shuffle(id_list) | 365 random.shuffle(id_list) |
| 322 return id_list | 366 return id_list |
| 323 | 367 |
| 324 | 368 |
| 325 ################################################################################ | 369 ############################################################################### |
| 326 | 370 |
| 327 def graphprot_get_param_string(params_file): | 371 def graphprot_get_param_string(params_file): |
| 328 """ | 372 """ |
| 329 Get parameter string from GraphProt .params file. | 373 Get parameter string from GraphProt .params file. |
| 330 | 374 |
| 346 continue | 390 continue |
| 347 if par == "model_type": | 391 if par == "model_type": |
| 348 if setting == "sequence": | 392 if setting == "sequence": |
| 349 param_string += "-onlyseq " | 393 param_string += "-onlyseq " |
| 350 else: | 394 else: |
| 351 param_string += "-%s %s " %(par, setting) | 395 param_string += "-%s %s " % (par, setting) |
| 352 else: | 396 else: |
| 353 assert 0, "pattern matching failed for string \"%s\"" %(param) | 397 assert 0, "pattern matching failed for string \"%s\"" % (param) |
| 354 return param_string | 398 return param_string |
| 355 | 399 |
| 356 | 400 |
| 357 ################################################################################ | 401 ############################################################################### |
| 358 | 402 |
| 359 def seqs_dic_count_uc_nts(seqs_dic): | 403 def seqs_dic_count_uc_nts(seqs_dic): |
| 360 """ | 404 """ |
| 361 Count number of uppercase nucleotides in sequences stored in sequence | 405 Count number of uppercase nucleotides in sequences stored in sequence |
| 362 dictionary. | 406 dictionary. |
| 363 | 407 |
| 364 >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'} | 408 >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'} |
| 365 >>> seqs_dic_count_uc_nts(seqs_dic) | 409 >>> seqs_dic_count_uc_nts(seqs_dic) |
| 366 6 | 410 6 |
| 367 >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'} | 411 >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'} |
| 368 >>> seqs_dic_count_uc_nts(seqs_dic) | 412 >>> seqs_dic_count_uc_nts(seqs_dic) |
| 374 for seq_id in seqs_dic: | 418 for seq_id in seqs_dic: |
| 375 c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id])) | 419 c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id])) |
| 376 return c_uc | 420 return c_uc |
| 377 | 421 |
| 378 | 422 |
| 379 ################################################################################ | 423 ############################################################################### |
| 380 | 424 |
| 381 def seqs_dic_count_lc_nts(seqs_dic): | 425 def seqs_dic_count_lc_nts(seqs_dic): |
| 382 """ | 426 """ |
| 383 Count number of lowercase nucleotides in sequences stored in sequence | 427 Count number of lowercase nucleotides in sequences stored in sequence |
| 384 dictionary. | 428 dictionary. |
| 385 | 429 |
| 386 >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'} | 430 >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'} |
| 387 >>> seqs_dic_count_lc_nts(seqs_dic) | 431 >>> seqs_dic_count_lc_nts(seqs_dic) |
| 388 10 | 432 10 |
| 389 >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'} | 433 >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'} |
| 390 >>> seqs_dic_count_lc_nts(seqs_dic) | 434 >>> seqs_dic_count_lc_nts(seqs_dic) |
| 396 for seq_id in seqs_dic: | 440 for seq_id in seqs_dic: |
| 397 c_uc += len(re.findall(r'[a-z]', seqs_dic[seq_id])) | 441 c_uc += len(re.findall(r'[a-z]', seqs_dic[seq_id])) |
| 398 return c_uc | 442 return c_uc |
| 399 | 443 |
| 400 | 444 |
| 401 ################################################################################ | 445 ############################################################################### |
| 402 | 446 |
| 403 def count_file_rows(in_file): | 447 def count_file_rows(in_file): |
| 404 """ | 448 """ |
| 405 Count number of file rows for given input file. | 449 Count number of file rows for given input file. |
| 406 | 450 |
| 407 >>> test_file = "test-data/test1.bed" | 451 >>> test_file = "test-data/test1.bed" |
| 408 >>> count_file_rows(test_file) | 452 >>> count_file_rows(test_file) |
| 409 7 | 453 7 |
| 410 >>> test_file = "test-data/empty_file" | 454 >>> test_file = "test-data/empty_file" |
| 411 >>> count_file_rows(test_file) | 455 >>> count_file_rows(test_file) |
| 416 output = subprocess.getoutput(check_cmd) | 460 output = subprocess.getoutput(check_cmd) |
| 417 row_count = int(output.strip()) | 461 row_count = int(output.strip()) |
| 418 return row_count | 462 return row_count |
| 419 | 463 |
| 420 | 464 |
| 421 ################################################################################ | 465 ############################################################################### |
| 422 | 466 |
| 423 def bed_check_six_col_format(bed_file): | 467 def bed_check_six_col_format(bed_file): |
| 424 """ | 468 """ |
| 425 Check whether given .bed file has 6 columns. | 469 Check whether given .bed file has 6 columns. |
| 426 | 470 |
| 442 break | 486 break |
| 443 f.closed | 487 f.closed |
| 444 return six_col_format | 488 return six_col_format |
| 445 | 489 |
| 446 | 490 |
| 447 ################################################################################ | 491 ############################################################################### |
| 448 | 492 |
| 449 def bed_check_unique_ids(bed_file): | 493 def bed_check_unique_ids(bed_file): |
| 450 """ | 494 """ |
| 451 Check whether .bed file (6 column format with IDs in column 4) | 495 Check whether .bed file (6 column format with IDs in column 4) |
| 452 has unique column 4 IDs. | 496 has unique column 4 IDs. |
| 453 | 497 |
| 454 >>> test_bed = "test-data/test1.bed" | 498 >>> test_bed = "test-data/test1.bed" |
| 455 >>> bed_check_unique_ids(test_bed) | 499 >>> bed_check_unique_ids(test_bed) |
| 456 True | 500 True |
| 457 >>> test_bed = "test-data/test2.bed" | 501 >>> test_bed = "test-data/test2.bed" |
| 458 >>> bed_check_unique_ids(test_bed) | 502 >>> bed_check_unique_ids(test_bed) |
| 466 return False | 510 return False |
| 467 else: | 511 else: |
| 468 return True | 512 return True |
| 469 | 513 |
| 470 | 514 |
| 471 ################################################################################ | 515 ############################################################################### |
| 472 | 516 |
| 473 def get_seq_lengths_from_seqs_dic(seqs_dic): | 517 def get_seq_lengths_from_seqs_dic(seqs_dic): |
| 474 """ | 518 """ |
| 475 Given a dictionary of sequences, return dictionary of sequence lengths. | 519 Given a dictionary of sequences, return dictionary of sequence lengths. |
| 476 Mapping is sequence ID -> sequence length. | 520 Mapping is sequence ID -> sequence length. |
| 481 seq_l = len(seqs_dic[seq_id]) | 525 seq_l = len(seqs_dic[seq_id]) |
| 482 seq_len_dic[seq_id] = seq_l | 526 seq_len_dic[seq_id] = seq_l |
| 483 return seq_len_dic | 527 return seq_len_dic |
| 484 | 528 |
| 485 | 529 |
| 486 ################################################################################ | 530 ############################################################################### |
| 487 | 531 |
| 488 def bed_get_region_lengths(bed_file): | 532 def bed_get_region_lengths(bed_file): |
| 489 """ | 533 """ |
| 490 Read in .bed file, store and return region lengths in dictionary. | 534 Read in .bed file, store and return region lengths in dictionary. |
| 491 key : region ID (.bed col4) | 535 key : region ID (.bed col4) |
| 497 | 541 |
| 498 """ | 542 """ |
| 499 id2len_dic = {} | 543 id2len_dic = {} |
| 500 with open(bed_file) as f: | 544 with open(bed_file) as f: |
| 501 for line in f: | 545 for line in f: |
| 502 row = line.strip() | |
| 503 cols = line.strip().split("\t") | 546 cols = line.strip().split("\t") |
| 504 site_s = int(cols[1]) | 547 site_s = int(cols[1]) |
| 505 site_e = int(cols[2]) | 548 site_e = int(cols[2]) |
| 506 site_id = cols[3] | 549 site_id = cols[3] |
| 507 site_l = site_e - site_s | 550 site_l = site_e - site_s |
| 508 assert site_id not in id2len_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(bed_file) | 551 assert site_id \ |
| | 552 not in id2len_dic, \ |
| | 553 "column 4 IDs not unique in given .bed file \"%s\"" \ |
| | 554 % (bed_file) |
| 509 id2len_dic[site_id] = site_l | 555 id2len_dic[site_id] = site_l |
| 510 f.closed | 556 f.closed |
| 511 assert id2len_dic, "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (in_bed) | 557 assert id2len_dic, \ |
| 558 "No IDs read into dic (input file \"%s\" empty or malformatted?)" \ | |
| 559 % (bed_file) | |
| 512 return id2len_dic | 560 return id2len_dic |
| 513 | 561 |
| 514 | 562 |
| 515 ################################################################################ | 563 ############################################################################### |
| 516 | 564 |
| 517 def graphprot_get_param_dic(params_file): | 565 def graphprot_get_param_dic(params_file): |
| 518 """ | 566 """ |
| 519 Read in GraphProt .params file and store in dictionary. | 567 Read in GraphProt .params file and store in dictionary. |
| 520 key = parameter | 568 key = parameter |
| 521 value = parameter value | 569 value = parameter value |
| 522 | 570 |
| 523 >>> params_file = "test-data/test.params" | 571 >>> params_file = "test-data/test.params" |
| 524 >>> graphprot_get_param_dic(params_file) | 572 >>> graphprot_get_param_dic(params_file) |
| 525 {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'} | 573 {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', \ |
| | 574 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', \ |
| | 575 'pos_train_profile_median': '5.039610', \ |
| | 576 'pos_train_avg_profile_median_1': '4.236340', \ |
| | 577 'pos_train_avg_profile_median_2': '3.868431', \ |
| | 578 'pos_train_avg_profile_median_3': '3.331277', \ |
| | 579 'pos_train_avg_profile_median_4': '2.998667', \ |
| | 580 'pos_train_avg_profile_median_5': '2.829782', \ |
| | 581 'pos_train_avg_profile_median_6': '2.626623', \ |
| | 582 'pos_train_avg_profile_median_7': '2.447083', \ |
| | 583 'pos_train_avg_profile_median_8': '2.349919', \ |
| | 584 'pos_train_avg_profile_median_9': '2.239829', \ |
| | 585 'pos_train_avg_profile_median_10': '2.161676'} |
| 526 | 586 |
| 527 """ | 587 """ |
| 528 param_dic = {} | 588 param_dic = {} |
| 529 with open(params_file) as f: | 589 with open(params_file) as f: |
| 530 for line in f: | 590 for line in f: |
| 537 param_dic[par] = setting | 597 param_dic[par] = setting |
| 538 f.close() | 598 f.close() |
| 539 return param_dic | 599 return param_dic |
| 540 | 600 |
| 541 | 601 |
| 542 ################################################################################ | 602 ############################################################################### |
| 543 | 603 |
| 544 def graphprot_filter_predictions_file(in_file, out_file, | 604 def graphprot_filter_predictions_file(in_file, out_file, |
| 545 sc_thr=0): | 605 sc_thr=0): |
| 546 """ | 606 """ |
| 547 Filter GraphProt .predictions file by given score thr_sc. | 607 Filter GraphProt .predictions file by given score thr_sc. |
| 552 row = line.strip() | 612 row = line.strip() |
| 553 cols = line.strip().split("\t") | 613 cols = line.strip().split("\t") |
| 554 score = float(cols[2]) | 614 score = float(cols[2]) |
| 555 if score < sc_thr: | 615 if score < sc_thr: |
| 556 continue | 616 continue |
| 557 OUTPRED.write("%s\n" %(row)) | 617 OUTPRED.write("%s\n" % (row)) |
| 558 f.close() | 618 f.close() |
| 559 OUTPRED.close() | 619 OUTPRED.close() |
| 560 | 620 |
| 561 | 621 |
| 562 ################################################################################ | 622 ############################################################################### |
| 563 | 623 |
| 564 def fasta_read_in_ids(fasta_file): | 624 def fasta_read_in_ids(fasta_file): |
| 565 """ | 625 """ |
| 566 Given a .fa file, read in header IDs in order appearing in file, | 626 Given a .fa file, read in header IDs in order appearing in file, |
| 567 and store in list. | 627 and store in list. |
| 568 | 628 |
| 569 >>> test_file = "test-data/test3.fa" | 629 >>> test_file = "test-data/test3.fa" |
| 570 >>> fasta_read_in_ids(test_file) | 630 >>> fasta_read_in_ids(test_file) |
| 571 ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316'] | 631 ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316'] |
| 580 ids_list.append(seq_id) | 640 ids_list.append(seq_id) |
| 581 f.close() | 641 f.close() |
| 582 return ids_list | 642 return ids_list |
| 583 | 643 |
| 584 | 644 |
| 585 ################################################################################ | 645 ############################################################################### |
| 586 | 646 |
| 587 def graphprot_profile_calculate_avg_profile(in_file, out_file, | 647 def graphprot_profile_calc_avg_profile(in_file, out_file, |
| 588 ap_extlr=5, | 648 ap_extlr=5, |
| 589 seq_ids_list=False, | 649 seq_ids_list=False, |
| 590 method=1): | 650 method=1): |
| 591 """ | 651 """ |
| 592 Given a GraphProt .profile file, calculate average profiles and output | 652 Given a GraphProt .profile file, calculate average profiles and output |
| 593 average profile file. | 653 average profile file. |
| 594 Average profile means that the position-wise scores will get smoothed | 654 Average profile means that the position-wise scores will get smoothed |
| 595 out by calculating for each position a new score, taking a sequence | 655 out by calculating for each position a new score, taking a sequence |
| 596 window -ap_extlr to +ap_extlr relative to the position | 656 window -ap_extlr to +ap_extlr relative to the position |
| 597 and calculate the mean score over this window. The mean score then | 657 and calculate the mean score over this window. The mean score then |
| 598 becomes the new average profile score at this position. | 658 becomes the new average profile score at this position. |
| 599 Two different implementations of the task are given: | 659 Two different implementations of the task are given: |
| 600 method=1 (new python implementation, slower + more memory but easy to read) | 660 method=1 (new python implementation, slower + more memory but easy to read) |
| 601 method=2 (old perl implementation, faster and less memory but more code) | 661 method=2 (old perl implementation, faster and less memory but more code) |
| 602 | 662 |
| 603 >>> in_file = "test-data/test2.profile" | 663 >>> in_file = "test-data/test2.profile" |
| 604 >>> out_file1 = "test-data/test2_1.avg_profile" | 664 >>> out_file1 = "test-data/test2_1.avg_profile" |
| 605 >>> out_file2 = "test-data/test2_2.avg_profile" | 665 >>> out_file2 = "test-data/test2_2.avg_profile" |
| 606 >>> out_file4 = "test-data/test2_3.avg_profile" | 666 >>> out_file4 = "test-data/test2_3.avg_profile" |
| 607 >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1) | 667 >>> graphprot_profile_calc_avg_profile(in_file, \ |
| 608 >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2) | 668 out_file1, ap_extlr=2, method=1) |
| | 669 >>> graphprot_profile_calc_avg_profile(in_file, \ |
| | 670 out_file2, ap_extlr=2, method=2) |
| 609 >>> diff_two_files_identical(out_file1, out_file2) | 671 >>> diff_two_files_identical(out_file1, out_file2) |
| 610 True | 672 True |
| 611 >>> test_list = ["s1", "s2", "s3", "s4"] | 673 >>> test_list = ["s1", "s2", "s3", "s4"] |
| 612 >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile" | 674 >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile" |
| 613 >>> out_file3 = "test-data/test3_added_ids_out.avg_profile" | 675 >>> out_file3 = "test-data/test3_added_ids_out.avg_profile" |
| 614 >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list) | 676 >>> graphprot_profile_calc_avg_profile(in_file, out_file3, \ |
| | 677 ap_extlr=2, method=1, seq_ids_list=test_list) |
| 615 >>> diff_two_files_identical(out_file3_exp, out_file3) | 678 >>> diff_two_files_identical(out_file3_exp, out_file3) |
| 616 True | 679 True |
| 617 | 680 |
| 618 """ | 681 """ |
| 619 if method == 1: | 682 if method == 1: |
| 622 site_starts_dic = {} | 685 site_starts_dic = {} |
| 623 with open(in_file) as f: | 686 with open(in_file) as f: |
| 624 for line in f: | 687 for line in f: |
| 625 cols = line.strip().split("\t") | 688 cols = line.strip().split("\t") |
| 626 site_id = int(cols[0]) | 689 site_id = int(cols[0]) |
| 627 pos = int(cols[1]) # 0-based. | 690 pos = int(cols[1]) # 0-based. |
| 628 score = float(cols[2]) | 691 score = float(cols[2]) |
| 629 # Store first position of site. | 692 # Store first position of site. |
| 630 if site_id not in site_starts_dic: | 693 if site_id not in site_starts_dic: |
| 631 site_starts_dic[site_id] = pos | 694 site_starts_dic[site_id] = pos |
| 632 if site_id in lists_dic: | 695 if site_id in lists_dic: |
| 633 lists_dic[site_id].append(score) | 696 lists_dic[site_id].append(score) |
| 634 else: | 697 else: |
| 635 lists_dic[site_id] = [] | 698 lists_dic[site_id] = [] |
| 636 lists_dic[site_id].append(score) | 699 lists_dic[site_id].append(score) |
| 637 f.close() | 700 f.close() |
| 638 # Check number of IDs (# FASTA sequence IDs has to be same as # site IDs). | 701 # Check number of IDs (# FASTA IDs has to be same as # site IDs). |
| 639 if seq_ids_list: | 702 if seq_ids_list: |
| 640 c_seq_ids = len(seq_ids_list) | 703 c_seq_ids = len(seq_ids_list) |
| 641 c_site_ids = len(site_starts_dic) | 704 c_site_ids = len(site_starts_dic) |
| 642 assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" %(c_seq_ids, c_site_ids) | 705 assert c_seq_ids == c_site_ids, \ |
| 706 "# sequence IDs != # site IDs (%i != %i)" \ | |
| 707 % (c_seq_ids, c_site_ids) | |
| 643 OUTPROF = open(out_file, "w") | 708 OUTPROF = open(out_file, "w") |
| 644 # For each site, calculate average profile scores list. | 709 # For each site, calculate average profile scores list. |
| 645 max_list = [] | |
| 646 for site_id in lists_dic: | 710 for site_id in lists_dic: |
| 647 # Convert profile score list to average profile scores list. | 711 # Convert profile score list to average profile scores list. |
| 648 aps_list = list_moving_window_average_values(lists_dic[site_id], | 712 aps_list = list_moving_window_average_values(lists_dic[site_id], |
| 649 win_extlr=ap_extlr) | 713 win_extlr=ap_extlr) |
| 650 start_pos = site_starts_dic[site_id] | 714 start_pos = site_starts_dic[site_id] |
| 651 # Get original FASTA sequence ID. | 715 # Get original FASTA sequence ID. |
| 652 if seq_ids_list: | 716 if seq_ids_list: |
| 653 site_id = seq_ids_list[site_id] | 717 site_id = seq_ids_list[site_id] |
| 654 for i, sc in enumerate(aps_list): | 718 for i, sc in enumerate(aps_list): |
| 655 pos = i + start_pos + 1 # make 1-based. | 719 pos = i + start_pos + 1 # make 1-based. |
| 656 OUTPROF.write("%s\t%i\t%f\n" %(site_id, pos, sc)) | 720 OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc)) |
| 657 OUTPROF.close() | 721 OUTPROF.close() |
| 658 elif method == 2: | 722 elif method == 2: |
| 659 OUTPROF = open(out_file, "w") | 723 OUTPROF = open(out_file, "w") |
| 660 # Old site ID. | 724 # Old site ID. |
| 661 old_id = "" | 725 old_id = "" |
| 666 site_starts_dic = {} | 730 site_starts_dic = {} |
| 667 with open(in_file) as f: | 731 with open(in_file) as f: |
| 668 for line in f: | 732 for line in f: |
| 669 cols = line.strip().split("\t") | 733 cols = line.strip().split("\t") |
| 670 cur_id = int(cols[0]) | 734 cur_id = int(cols[0]) |
| 671 pos = int(cols[1]) # 0-based. | 735 pos = int(cols[1]) # 0-based. |
| 672 score = float(cols[2]) | 736 score = float(cols[2]) |
| 673 # Store first position of site. | 737 # Store first position of site. |
| 674 if cur_id not in site_starts_dic: | 738 if cur_id not in site_starts_dic: |
| 675 site_starts_dic[cur_id] = pos | 739 site_starts_dic[cur_id] = pos |
| 676 # Case: new site (new column 1 ID). | 740 # Case: new site (new column 1 ID). |
| 677 if cur_id != old_id: | 741 if cur_id != old_id: |
| 678 # Process old id scores. | 742 # Process old id scores. |
| 679 if scores_list: | 743 if scores_list: |
| 680 aps_list = list_moving_window_average_values(scores_list, | 744 aps_list = \ |
| 681 win_extlr=ap_extlr) | 745 list_moving_window_average_values( |
| | 746 scores_list, |
| | 747 win_extlr=ap_extlr) |
| 682 start_pos = site_starts_dic[old_id] | 748 start_pos = site_starts_dic[old_id] |
| 683 seq_id = old_id | 749 seq_id = old_id |
| 684 # Get original FASTA sequence ID. | 750 # Get original FASTA sequence ID. |
| 685 if seq_ids_list: | 751 if seq_ids_list: |
| 686 seq_id = seq_ids_list[old_id] | 752 seq_id = seq_ids_list[old_id] |
| 687 for i, sc in enumerate(aps_list): | 753 for i, sc in enumerate(aps_list): |
| 688 pos = i + start_pos + 1 # make 1-based. | 754 pos = i + start_pos + 1 # make 1-based. |
| 689 OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc)) | 755 OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc)) |
| 690 # Reset list. | 756 # Reset list. |
| 691 scores_list = [] | 757 scores_list = [] |
| 692 old_id = cur_id | 758 old_id = cur_id |
| 693 scores_list.append(score) | 759 scores_list.append(score) |
| 694 else: | 760 else: |
| 703 seq_id = old_id | 769 seq_id = old_id |
| 704 # Get original FASTA sequence ID. | 770 # Get original FASTA sequence ID. |
| 705 if seq_ids_list: | 771 if seq_ids_list: |
| 706 seq_id = seq_ids_list[old_id] | 772 seq_id = seq_ids_list[old_id] |
| 707 for i, sc in enumerate(aps_list): | 773 for i, sc in enumerate(aps_list): |
| 708 pos = i + start_pos + 1 # make 1-based. | 774 pos = i + start_pos + 1 # make 1-based. |
| 709 OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc)) | 775 OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc)) |
| 710 OUTPROF.close() | 776 OUTPROF.close() |
| 711 | 777 |
| 712 | 778 |
| 713 ################################################################################ | 779 ############################################################################### |
| 714 | 780 |
| 715 def graphprot_profile_extract_peak_regions(in_file, out_file, | 781 def graphprot_profile_extract_peak_regions(in_file, out_file, |
| 716 max_merge_dist=0, | 782 max_merge_dist=0, |
| 717 sc_thr=0): | 783 sc_thr=0): |
| 718 """ | 784 """ |
| 719 Extract peak regions from GraphProt .profile file. | 785 Extract peak regions from GraphProt .profile file. |
| 720 Store the peak regions (defined as regions with scores >= sc_thr) | 786 Store the peak regions (defined as regions with scores >= sc_thr) |
| 721 as to out_file in 6-column .bed. | 787 as to out_file in 6-column .bed. |
| 722 | 788 |
| 723 TODO: | 789 TODO: |
| 724 Add option for genomic coordinates input (+ - polarity support). | 790 Add option for genomic coordinates input (+ - polarity support). |
| 725 Output genomic regions instead of sequence regions. | 791 Output genomic regions instead of sequence regions. |
| 733 >>> diff_two_files_identical(out_file, exp_file) | 799 >>> diff_two_files_identical(out_file, exp_file) |
| 734 True | 800 True |
| 735 >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10) | 801 >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10) |
| 736 >>> diff_two_files_identical(out_file, empty_file) | 802 >>> diff_two_files_identical(out_file, empty_file) |
| 737 True | 803 True |
| 738 >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2) | 804 >>> graphprot_profile_extract_peak_regions(in_file, out_file, \ |
| | 805 max_merge_dist=2) |
| 739 >>> diff_two_files_identical(out_file, exp2_file) | 806 >>> diff_two_files_identical(out_file, exp2_file) |
| 740 True | 807 True |
| 741 | 808 |
| 742 """ | 809 """ |
| 743 | 810 |
| 751 site_starts_dic = {} | 818 site_starts_dic = {} |
| 752 with open(in_file) as f: | 819 with open(in_file) as f: |
| 753 for line in f: | 820 for line in f: |
| 754 cols = line.strip().split("\t") | 821 cols = line.strip().split("\t") |
| 755 cur_id = cols[0] | 822 cur_id = cols[0] |
| 756 pos = int(cols[1]) # 0-based. | 823 pos = int(cols[1]) # 0-based. |
| 757 score = float(cols[2]) | 824 score = float(cols[2]) |
| 758 # Store first position of site. | 825 # Store first position of site. |
| 759 if cur_id not in site_starts_dic: | 826 if cur_id not in site_starts_dic: |
| 760 # If first position != zero, we assume positions are 1-based. | 827 # If first position != zero, we assume positions are 1-based. |
| 761 if pos != 0: | 828 if pos != 0: |
| 766 # Case: new site (new column 1 ID). | 833 # Case: new site (new column 1 ID). |
| 767 if cur_id != old_id: | 834 if cur_id != old_id: |
| 768 # Process old id scores. | 835 # Process old id scores. |
| 769 if scores_list: | 836 if scores_list: |
| 770 # Extract peaks from region. | 837 # Extract peaks from region. |
| 771 peak_list = list_extract_peaks(scores_list, | 838 peak_list = \ |
| 772 max_merge_dist=max_merge_dist, | 839 list_extract_peaks(scores_list, |
| 773 coords="bed", | 840 max_merge_dist=max_merge_dist, |
| 774 sc_thr=sc_thr) | 841 coords="bed", |
| | 842 sc_thr=sc_thr) |
| 775 start_pos = site_starts_dic[old_id] | 843 start_pos = site_starts_dic[old_id] |
| 776 # Print out peaks in .bed format. | 844 # Print out peaks in .bed format. |
| 777 for l in peak_list: | 845 for ln in peak_list: |
| 778 peak_s = start_pos + l[0] | 846 peak_s = start_pos + ln[0] |
| 779 peak_e = start_pos + l[1] | 847 peak_e = start_pos + ln[1] |
| 780 site_id = "%s,%i" %(old_id, l[2]) | 848 site_id = "%s,%i" % (old_id, ln[2]) |
| 781 OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3])) | 849 OUTPEAKS.write("%s\t%i\t%i" |
| 850 "\t%s\t%f\t+\n" | |
| 851 % (old_id, peak_s, | |
| 852 peak_e, site_id, ln[3])) | |
| 782 # Reset list. | 853 # Reset list. |
| 783 scores_list = [] | 854 scores_list = [] |
| 784 old_id = cur_id | 855 old_id = cur_id |
| 785 scores_list.append(score) | 856 scores_list.append(score) |
| 786 else: | 857 else: |
| 788 scores_list.append(score) | 859 scores_list.append(score) |
| 789 f.close() | 860 f.close() |
| 790 # Process last block. | 861 # Process last block. |
| 791 if scores_list: | 862 if scores_list: |
| 792 # Extract peaks from region. | 863 # Extract peaks from region. |
| 793 peak_list = list_extract_peaks(scores_list, | 864 peak_list = list_extract_peaks(scores_list, |
| 794 max_merge_dist=max_merge_dist, | 865 max_merge_dist=max_merge_dist, |
| 795 coords="bed", | 866 coords="bed", |
| 796 sc_thr=sc_thr) | 867 sc_thr=sc_thr) |
| 797 start_pos = site_starts_dic[old_id] | 868 start_pos = site_starts_dic[old_id] |
| 798 # Print out peaks in .bed format. | 869 # Print out peaks in .bed format. |
| 799 for l in peak_list: | 870 for ln in peak_list: |
| 800 peak_s = start_pos + l[0] | 871 peak_s = start_pos + ln[0] |
| 801 peak_e = start_pos + l[1] | 872 peak_e = start_pos + ln[1] |
| 802 site_id = "%s,%i" %(old_id, l[2]) # best score also 1-based. | 873 site_id = "%s,%i" % (old_id, ln[2]) # best score also 1-based. |
| 803 OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3])) | 874 OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" |
| | 875 % (old_id, peak_s, peak_e, site_id, ln[3])) |
| 804 OUTPEAKS.close() | 876 OUTPEAKS.close() |
| 805 | 877 |
| 806 | 878 |
| 807 ################################################################################ | 879 ############################################################################### |
| 808 | 880 |
| 809 def list_extract_peaks(in_list, | 881 def list_extract_peaks(in_list, |
| 810 max_merge_dist=0, | 882 max_merge_dist=0, |
| 811 coords="list", | 883 coords="list", |
| 812 sc_thr=0): | 884 sc_thr=0): |
| 813 """ | 885 """ |
| 814 Extract peak regions from list. | 886 Extract peak regions from list. |
| 815 Peak region is defined as region >= score threshold. | 887 Peak region is defined as region >= score threshold. |
| 816 | 888 |
| 817 coords=bed : peak start 0-based, peak end 1-based. | 889 coords=bed : peak start 0-based, peak end 1-based. |
| 818 coords=list : peak start 0-based, peak end 0-based. | 890 coords=list : peak start 0-based, peak end 0-based. |
| 819 | 891 |
| 820 >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5] | 892 >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5] |
| 821 >>> list_extract_peaks(test_list) | 893 >>> list_extract_peaks(test_list) |
| 822 [[1, 4, 3, 4.5], [6, 7, 7, 6.5]] | 894 [[1, 4, 3, 4.5], [6, 7, 7, 6.5]] |
| 823 >>> list_extract_peaks(test_list, sc_thr=2) | 895 >>> list_extract_peaks(test_list, sc_thr=2) |
| 824 [[2, 3, 3, 4.5], [6, 7, 7, 6.5]] | 896 [[2, 3, 3, 4.5], [6, 7, 7, 6.5]] |
| 860 pr_top_pos = i | 932 pr_top_pos = i |
| 861 else: | 933 else: |
| 862 # Before was peak region? | 934 # Before was peak region? |
| 863 if inside: | 935 if inside: |
| 864 # Store peak region. | 936 # Store peak region. |
| 865 #peak_infos = "%i,%i,%i,%f" %(pr_s, pr_e, pr_top_pos, pr_top_sc) | |
| 866 peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc] | 937 peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc] |
| 867 peak_list.append(peak_infos) | 938 peak_list.append(peak_infos) |
| 868 inside = False | 939 inside = False |
| 869 pr_top_pos = 0 | 940 pr_top_pos = 0 |
| 870 pr_top_sc = -100000 | 941 pr_top_sc = -100000 |
| 896 new_top_pos = peak_list[i][2] | 967 new_top_pos = peak_list[i][2] |
| 897 new_top_sc = peak_list[i][3] | 968 new_top_sc = peak_list[i][3] |
| 898 if peak_list[i][3] < peak_list[j][3]: | 969 if peak_list[i][3] < peak_list[j][3]: |
| 899 new_top_pos = peak_list[j][2] | 970 new_top_pos = peak_list[j][2] |
| 900 new_top_sc = peak_list[j][3] | 971 new_top_sc = peak_list[j][3] |
| 901 new_peak = [peak_list[i][0], peak_list[j][1], new_top_pos, new_top_sc] | 972 new_peak = [peak_list[i][0], peak_list[j][1], |
| | 973 new_top_pos, new_top_sc] |
| 902 # If two peaks were merged. | 974 # If two peaks were merged. |
| 903 if new_peak: | 975 if new_peak: |
| 904 merged_peak_list.append(new_peak) | 976 merged_peak_list.append(new_peak) |
| 905 added_peaks_dic[i] = 1 | 977 added_peaks_dic[i] = 1 |
| 906 added_peaks_dic[j] = 1 | 978 added_peaks_dic[j] = 1 |
| 913 peaks_merged = False | 985 peaks_merged = False |
| 914 # If peak coordinates should be in .bed format, make peak ends 1-based. | 986 # If peak coordinates should be in .bed format, make peak ends 1-based. |
| 915 if coords == "bed": | 987 if coords == "bed": |
| 916 for i in range(len(peak_list)): | 988 for i in range(len(peak_list)): |
| 917 peak_list[i][1] += 1 | 989 peak_list[i][1] += 1 |
| 918 peak_list[i][2] += 1 # 1-base best score position too. | 990 peak_list[i][2] += 1 # 1-base best score position too. |
| 919 return peak_list | 991 return peak_list |
| 920 | 992 |
| 921 | 993 |
| 922 ################################################################################ | 994 ############################################################################### |
| 923 | 995 |
| 924 def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False): | 996 def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, |
| 925 """ | 997 print_rows=False): |
| 926 Given a .bed file of sequence peak regions (possible coordinates from | 998 """ |
| | 999 Given a .bed file of sequence peak regions (possible coordinates from |
| 927 0 to length of s), convert peak coordinates to genomic coordinates. | 1000 0 to length of s), convert peak coordinates to genomic coordinates. |
| 928 Do this by taking genomic regions of sequences as input. | 1001 Do this by taking genomic regions of sequences as input. |
| 929 | 1002 |
| 930 >>> test_in = "test-data/test.peaks.bed" | 1003 >>> test_in = "test-data/test.peaks.bed" |
| 931 >>> test_exp = "test-data/test_exp.peaks.bed" | 1004 >>> test_exp = "test-data/test_exp.peaks.bed" |
| 942 with open(genomic_sites_bed) as f: | 1015 with open(genomic_sites_bed) as f: |
| 943 for line in f: | 1016 for line in f: |
| 944 row = line.strip() | 1017 row = line.strip() |
| 945 cols = line.strip().split("\t") | 1018 cols = line.strip().split("\t") |
| 946 site_id = cols[3] | 1019 site_id = cols[3] |
| 947 assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(args.genomic_sites_bed) | 1020 assert site_id \ |
| | 1021 not in id2row_dic, \ |
| | 1022 "column 4 IDs not unique in given .bed file \"%s\"" \ |
| | 1023 % (genomic_sites_bed) |
| 948 id2row_dic[site_id] = row | 1024 id2row_dic[site_id] = row |
| 949 f.close() | 1025 f.close() |
| 950 | 1026 |
| 951 # Read in peaks file and convert coordinates. | 1027 # Read in peaks file and convert coordinates. |
| 952 OUTPEAKS = open(genomic_peak_file, "w") | 1028 OUTPEAKS = open(genomic_peak_file, "w") |
| 956 site_id = cols[0] | 1032 site_id = cols[0] |
| 957 site_s = int(cols[1]) | 1033 site_s = int(cols[1]) |
| 958 site_e = int(cols[2]) | 1034 site_e = int(cols[2]) |
| 959 site_id2 = cols[3] | 1035 site_id2 = cols[3] |
| 960 site_sc = float(cols[4]) | 1036 site_sc = float(cols[4]) |
| 961 assert re.search(".+,.+", site_id2), "regular expression failed for ID \"%s\"" %(site_id2) | 1037 assert re.search(".+,.+", site_id2), \ |
| 962 m = re.search(".+,(\d+)", site_id2) | 1038 "regular expression failed for ID \"%s\"" % (site_id2) |
| 963 sc_pos = int(m.group(1)) # 1-based. | 1039 m = re.search(r".+,(\d+)", site_id2) |
| 964 assert site_id in id2row_dic, "site ID \"%s\" not found in genomic sites dictionary" %(site_id) | 1040 sc_pos = int(m.group(1)) # 1-based. |
| 1041 assert site_id in id2row_dic, \ | |
| 1042 "site ID \"%s\" not found in genomic sites dictionary" \ | |
| 1043 % (site_id) | |
| 965 row = id2row_dic[site_id] | 1044 row = id2row_dic[site_id] |
| 966 rowl = row.split("\t") | 1045 rowl = row.split("\t") |
| 967 gen_chr = rowl[0] | 1046 gen_chr = rowl[0] |
| 968 gen_s = int(rowl[1]) | 1047 gen_s = int(rowl[1]) |
| 969 gen_e = int(rowl[2]) | 1048 gen_e = int(rowl[2]) |
| 972 new_e = site_e + gen_s | 1051 new_e = site_e + gen_s |
| 973 new_sc_pos = sc_pos + gen_s | 1052 new_sc_pos = sc_pos + gen_s |
| 974 if gen_pol == "-": | 1053 if gen_pol == "-": |
| 975 new_s = gen_e - site_e | 1054 new_s = gen_e - site_e |
| 976 new_e = gen_e - site_s | 1055 new_e = gen_e - site_s |
| 977 new_sc_pos = gen_e - sc_pos + 1 # keep 1-based. | 1056 new_sc_pos = gen_e - sc_pos + 1 # keep 1-based. |
| 978 new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" %(gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol) | 1057 new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" \ |
| 979 OUTPEAKS.write("%s\n" %(new_row)) | 1058 % (gen_chr, new_s, new_e, |
| | 1059 site_id, new_sc_pos, site_sc, gen_pol) |
| | 1060 OUTPEAKS.write("%s\n" % (new_row)) |
| 980 if print_rows: | 1061 if print_rows: |
| 981 print(new_row) | 1062 print(new_row) |
| 982 OUTPEAKS.close() | 1063 OUTPEAKS.close() |
| 983 | 1064 |
| 984 | 1065 |
| 985 ################################################################################ | 1066 ############################################################################### |
| 986 | 1067 |
| 987 def diff_two_files_identical(file1, file2): | 1068 def diff_two_files_identical(file1, file2): |
| 988 """ | 1069 """ |
| 989 Check whether two files are identical. Return true if diff reports no | 1070 Check whether two files are identical. Return true if diff reports no |
| 990 differences. | 1071 differences. |
| 991 | 1072 |
| 992 >>> file1 = "test-data/file1" | 1073 >>> file1 = "test-data/file1" |
| 993 >>> file2 = "test-data/file2" | 1074 >>> file2 = "test-data/file2" |
| 994 >>> diff_two_files_identical(file1, file2) | 1075 >>> diff_two_files_identical(file1, file2) |
| 995 True | 1076 True |
| 996 >>> file1 = "test-data/test1.bed" | 1077 >>> file1 = "test-data/test1.bed" |
| 1004 if output: | 1085 if output: |
| 1005 same = False | 1086 same = False |
| 1006 return same | 1087 return same |
| 1007 | 1088 |
| 1008 | 1089 |
| 1009 ################################################################################ | 1090 ############################################################################### |
| 1010 | |
| 1011 | |
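
A few usage notes on the routines touched by this diff follow; the sketches are illustrative only and belong to neither revision. The smoothing that `list_moving_window_average_values` applies (and that `graphprot_profile_get_tsm` and `graphprot_profile_calc_avg_profile` call into) reduces to a clipped windowed mean. A minimal standalone sketch of the method=1 variant, run on the doctest input from above:

```python
# Sketch of the windowed mean behind method=1: for each position i, average
# the values from i - win_extlr to i + win_extlr, truncating the window at
# the list ends (so edge positions average over fewer values).
def moving_window_average(values, win_extlr=5):
    n = len(values)
    out = []
    for i in range(n):
        s = max(i - win_extlr, 0)      # window start, clipped at 0
        e = min(i + win_extlr + 1, n)  # window end (exclusive), clipped at n
        out.append(sum(values[s:e]) / (e - s))
    return out

print(moving_window_average([2, 3, 5, 8, 4, 3, 7, 1], win_extlr=2))
# [3.3333..., 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.6666...]
```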
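`graphprot_profile_get_tsm` then reports the median over per-site top scores; with profile_type="avg_profile" each score list is first smoothed as above before the maximum is taken. The reduction itself is small; the score lists below are hypothetical, not from any test file:

```python
import statistics

# Hypothetical per-site score lists, keyed by .profile column 1 site IDs.
site_scores = {0: [0.2, 3.2, 1.0], 1: [1.5, 2.0, 0.3]}

# profile_type="profile": take each site's raw maximum ...
top_scores = [max(scores) for scores in site_scores.values()]

# ... and report the median over all sites.
print(statistics.median(top_scores))  # median of [3.2, 2.0] -> 2.6
```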
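For `list_extract_peaks`, the return layout is worth spelling out: each peak is `[start, end, best-score position, best score]`, where a peak region is a run of scores >= sc_thr. Restating the doctests shown above, plus the coords="bed" shift from the function body:

```python
scores = [-1, 0, 2, 4.5, 1, -1, 5, 6.5]

list_extract_peaks(scores)            # [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]
# A higher threshold shrinks the peak regions.
list_extract_peaks(scores, sc_thr=2)  # [[2, 3, 3, 4.5], [6, 7, 7, 6.5]]
# coords="bed" makes peak ends and best-score positions 1-based, so per
# the coords handling above the first peak becomes [1, 5, 4, 4.5].
```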
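The strand handling in `bed_peaks_to_genomic_peaks` is where off-by-one errors are easiest: plus-strand peaks are shifted by the genomic start, minus-strand peaks are mirrored from the genomic end. A worked example of that arithmetic with hypothetical coordinates:

```python
# A peak at sequence positions [10, 20) inside a site whose genomic
# region is chr1:1000-1100 (values hypothetical).
gen_s, gen_e = 1000, 1100
site_s, site_e = 10, 20

# Plus strand: add the sequence offsets to the genomic start.
plus_peak = (site_s + gen_s, site_e + gen_s)   # (1010, 1020)

# Minus strand: mirror the offsets from the genomic end.
minus_peak = (gen_e - site_e, gen_e - site_s)  # (1080, 1090)
```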
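Finally, a usage sketch for the extended `read_fasta_into_dic`, grounded in its doctests above: by default T/t is converted to U/u, read_dna=True converts the other way, and the new short_ensembl flag truncates versioned ENSEMBL headers at the first dot.

```python
# Default: sequences stored as id -> RNA sequence (T/t -> U/u).
seqs_dic = read_fasta_into_dic("test-data/test.fa")
# {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}

# DNA output plus ENSEMBL header shortening ("ENST00000415118.1" -> id
# "ENST00000415118").
seqs_dic = read_fasta_into_dic("test-data/test.ensembl.fa",
                               read_dna=True, short_ensembl=True)
# {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'}
```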
