comparison gplib.py @ 3:ace92c9a4653 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
author bgruening
date Wed, 27 Jan 2021 19:27:47 +0000
parents 20429f4c1b95
children ddcf35a868b8
comparison
2:7bbb7bf6304f → 3:ace92c9a4653
--- gplib.py (2:7bbb7bf6304f)
+++ gplib.py (3:ace92c9a4653)
@@ -1 +1 @@
 
+import gzip
+import random
+import re
+import statistics
+import subprocess
 from distutils.spawn import find_executable
-import subprocess
-import statistics
-import random
-import gzip
-import uuid
-import sys
-import re
-import os
+
 
12 """ 10 """
13 11
14 Run doctests: 12 Run doctests:
15 13
17 15
18 16
19 """ 17 """
20 18
21 19
22 ################################################################################ 20 ###############################################################################
23 21
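Note: the docstring's "Run doctests:" hint leaves the invocation implicit. A minimal sketch for running them (it assumes the test-data/ directory used by the doctests is reachable from the working directory):

    import doctest

    import gplib  # the module shown in this comparison

    # Returns TestResults(failed, attempted); failures are printed as they occur.
    print(doctest.testmod(gplib))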
 def graphprot_predictions_get_median(predictions_file):
     """
     Given a GraphProt .predictions file, read in site scores and return
     the median value.
 
     >>> test_file = "test-data/test.predictions"
     >>> graphprot_predictions_get_median(test_file)
     0.571673
@@ -41 +39 @@
     f.close()
     # Return the median.
     return statistics.median(sc_list)
 
 
-################################################################################
+###############################################################################
 
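Note: the loop that fills sc_list is elided in this comparison; a rough pure-Python equivalent of the documented behavior (a sketch, not the committed code — the score is assumed to sit in column 3, as in graphprot_filter_predictions_file further down):

    import statistics

    def predictions_median_sketch(predictions_file):
        # Collect one score per row (tab-separated, score assumed in column 3).
        sc_list = []
        with open(predictions_file) as f:
            for line in f:
                cols = line.strip().split("\t")
                sc_list.append(float(cols[2]))
        return statistics.median(sc_list)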
-def graphprot_profile_get_top_scores_median(profile_file,
-                                            profile_type="profile",
-                                            avg_profile_extlr=5):
+def graphprot_profile_get_tsm(profile_file,
+                              profile_type="profile",
+                              avg_profile_extlr=5):
 
     """
     Given a GraphProt .profile file, extract for each site (identified by
     column 1 ID) the top (= highest) score. Then return the median of these
     top scores.
 
     profile_type can be either "profile" or "avg_profile".
     "avg_profile" means that the position-wise scores will first get smoothed
     out by calculating for each position a new score through taking a
     sequence window -avg_profile_extlr to +avg_profile_extlr of the position
-    and calculate the mean score over this window and assign it to the position.
-    After that, the maximum score of each site is chosen, and the median over
-    all maximum scores is returned.
+    and calculate the mean score over this window and assign it to the
+    position. After that, the maximum score of each site is chosen, and the
+    median over all maximum scores is returned.
     "profile" leaves the position-wise scores as they are, directly extracting
     the maximum for each site and then reporting the median.
 
     >>> test_file = "test-data/test.profile"
-    >>> graphprot_profile_get_top_scores_median(test_file)
+    >>> graphprot_profile_get_tsm(test_file)
     3.2
 
     """
     # Dictionary of lists, with list of scores (value) for each site (key).
     lists_dic = {}
@@ -88 +86 @@
         if profile_type == "profile":
             max_sc = max(lists_dic[seq_id])
             max_list.append(max_sc)
         elif profile_type == "avg_profile":
             # Convert profile score list to average profile scores list.
-            aps_list = list_moving_window_average_values(lists_dic[seq_id],
-                                                         win_extlr=avg_profile_extlr)
+            aps_list = \
+                list_moving_window_average_values(lists_dic[seq_id],
+                                                  win_extlr=avg_profile_extlr)
             max_sc = max(aps_list)
             max_list.append(max_sc)
         else:
-            assert 0, "invalid profile_type argument given: \"%s\"" %(profile_type)
+            assert 0, "invalid profile_type argument given: \"%s\"" \
+                % (profile_type)
     # Return the median.
     return statistics.median(max_list)
 
 
-################################################################################
+###############################################################################
 
 def list_moving_window_average_values(in_list,
                                       win_extlr=5,
                                       method=1):
     """
     Take a list of numeric values, and calculate for each position a new value,
     by taking the mean value of the window of positions -win_extlr and
     +win_extlr. If full extension is not possible (at list ends), it just
     takes what it gets.
     Two implementations of the task are given, chosen by method=1 or method=2.
 
     >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1]
     >>> list_moving_window_average_values(test_list, win_extlr=2, method=1)
@@ -140 +140 @@
             e = i + win_extlr + 1
             if s < 0:
                 s = 0
             if e > l_list:
                 e = l_list
-            l = e-s
+            ln = e - s
             sc_sum = 0
-            for j in range(l):
-                sc_sum += in_list[s+j]
-            new_list[i] = sc_sum / l
+            for j in range(ln):
+                sc_sum += in_list[s + j]
+            new_list[i] = sc_sum / ln
     else:
-        assert 0, "invalid method ID given (%i)" %(method)
+        assert 0, "invalid method ID given (%i)" % (method)
     return new_list
 
 
-################################################################################
+###############################################################################
 
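Note: for intuition, a self-contained sketch of the same clipped windowed mean (illustrative names, not from the module), checked against the doctest list above:

    def window_mean(vals, win_extlr=2):
        # Average the window [i - win_extlr, i + win_extlr], clipped at the
        # list ends ("takes what it gets").
        out = []
        for i in range(len(vals)):
            s = max(i - win_extlr, 0)
            e = min(i + win_extlr + 1, len(vals))
            out.append(sum(vals[s:e]) / (e - s))
        return out

    print(window_mean([2, 3, 5, 8, 4, 3, 7, 1]))
    # [3.33, 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.67] (rounded)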
 def echo_add_to_file(echo_string, out_file):
     """
     Add a string to file, using echo command.
 
@@ -162 +162 @@
     check_cmd = 'echo "%s" >> %s' % (echo_string, out_file)
     output = subprocess.getoutput(check_cmd)
     error = False
     if output:
         error = True
-    assert error == False, "echo is complaining:\n%s\n%s" %(check_cmd, output)
+    assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)
 
 
-################################################################################
+###############################################################################
 
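Note: shelling out to echo also interprets any shell metacharacters in echo_string; appending directly from Python avoids that (a sketch of the equivalent append):

    def add_line_to_file(text, out_file):
        # Mirrors `echo "..." >> file`: append text plus a trailing newline.
        with open(out_file, "a") as f:
            f.write(text + "\n")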
 def is_tool(name):
     """Check whether tool "name" is in PATH."""
     return find_executable(name) is not None
 
 
-################################################################################
+###############################################################################
 
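Note: distutils is deprecated (and removed from the standard library in Python 3.12); shutil.which performs the same PATH lookup, so a forward-compatible sketch of is_tool would be:

    import shutil

    def is_tool(name):
        # shutil.which returns the executable's full path, or None if absent.
        return shutil.which(name) is not None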
 def count_fasta_headers(fasta_file):
     """
     Count number of FASTA headers in fasta_file using grep.
 
@@ -192 +192 @@
     output = subprocess.getoutput(check_cmd)
     row_count = int(output.strip())
     return row_count
 
 
-################################################################################
+###############################################################################
 
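Note: the same count without grep (a sketch; FASTA header lines are the ones starting with ">"):

    def count_fasta_headers_py(fasta_file):
        with open(fasta_file) as f:
            return sum(1 for line in f if line.startswith(">"))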
 def make_file_copy(in_file, out_file):
     """
     Make a file copy by copying in_file to out_file.
 
     """
     check_cmd = "cat " + in_file + " > " + out_file
-    assert in_file != out_file, "cat does not like to cat file into same file (%s)" %(check_cmd)
+    assert in_file != out_file, \
+        "cat does not like to cat file into same file (%s)" % (check_cmd)
     output = subprocess.getoutput(check_cmd)
     error = False
     if output:
         error = True
-    assert error == False, "cat did not like your input (in_file: %s, out_file: %s):\n%s" %(in_file, out_file, output)
+    assert not error, \
+        "cat did not like your input (in_file: %s, out_file: %s):\n%s" \
+        % (in_file, out_file, output)
 
 
-################################################################################
+###############################################################################
 
 def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa,
                                       test_size=500):
     """
     Split in_fasta .fa file into two files (e.g. test, train).
 
     """
@@ -234 +237 @@
             c_out += 1
     TESTOUT.close()
     TRAINOUT.close()
 
 
-################################################################################
+###############################################################################
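Note: copying through cat forks a shell per file; the standard library covers both the copy and the same-file guard (a sketch):

    import shutil

    def make_file_copy_py(in_file, out_file):
        # Raises shutil.SameFileError instead of the assert above.
        shutil.copyfile(in_file, out_file)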
+
+def check_seqs_dic_format(seqs_dic):
+    """
+    Check sequence dictionary for lowercase-only sequences, or sequences
+    which have lowercase nts in between uppercase nts. Return the suspicious
+    IDs as a list, or an empty list if there are no hits.
+
+    >>> seqs_dic = {"id1" : "acguACGU", "id2" : "acgua", "id3" : "acgUUaUcc"}
+    >>> check_seqs_dic_format(seqs_dic)
+    ['id2', 'id3']
+    >>> seqs_dic = {"id1" : "acgAUaa", "id2" : "ACGUACUA"}
+    >>> check_seqs_dic_format(seqs_dic)
+    []
+
+    """
+    assert seqs_dic, "given seqs_dic empty"
+    bad_seq_ids = []
+    for seq_id in seqs_dic:
+        seq = seqs_dic[seq_id]
+        if re.search("^[acgtun]+$", seq):
+            bad_seq_ids.append(seq_id)
+        if re.search("[ACGTUN][acgtun]+[ACGTUN]", seq):
+            bad_seq_ids.append(seq_id)
+    return bad_seq_ids
+
+
+###############################################################################
 
 def read_fasta_into_dic(fasta_file,
                         seqs_dic=False,
                         ids_dic=False,
                         read_dna=False,
+                        short_ensembl=False,
                         reject_lc=False,
                         convert_to_uc=False,
                         skip_n_seqs=True):
     """
     Read in FASTA sequences, convert to RNA, store in dictionary
     and return dictionary.
 
     >>> test_fasta = "test-data/test.fa"
     >>> read_fasta_into_dic(test_fasta)
     {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}
     >>> test_fasta = "test-data/test2.fa"
     >>> read_fasta_into_dic(test_fasta)
     {}
     >>> test_fasta = "test-data/test.ensembl.fa"
-    >>> read_fasta_into_dic(test_fasta, read_dna=True)
+    >>> read_fasta_into_dic(test_fasta, read_dna=True, short_ensembl=True)
     {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'}
+    >>> test_fasta = "test-data/test4.fa"
+    >>> read_fasta_into_dic(test_fasta)
+    {'1': 'gccuAUGUuuua', '2': 'cugaAACUaugu'}
 
     """
     if not seqs_dic:
         seqs_dic = {}
     seq_id = ""
     seq = ""
 
     # Go through FASTA file, extract sequences.
-    if re.search(".+\.gz$", fasta_file):
+    if re.search(r".+\.gz$", fasta_file):
         f = gzip.open(fasta_file, 'rt')
     else:
         f = open(fasta_file, "r")
     for line in f:
         if re.search(">.+", line):
             m = re.search(">(.+)", line)
             seq_id = m.group(1)
             # If there is a ".", take only first part of header.
             # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
-            if re.search(".+\..+", seq_id):
-                m = re.search("(.+?)\..+", seq_id)
-                seq_id = m.group(1)
-            assert seq_id not in seqs_dic, "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file)
+            if short_ensembl:
+                if re.search(r".+\..+", seq_id):
+                    m = re.search(r"(.+?)\..+", seq_id)
+                    seq_id = m.group(1)
+            assert seq_id not in seqs_dic, \
+                "non-unique FASTA header \"%s\" in \"%s\"" \
+                % (seq_id, fasta_file)
             if ids_dic:
                 if seq_id in ids_dic:
                     seqs_dic[seq_id] = ""
             else:
                 seqs_dic[seq_id] = ""
         elif re.search("[ACGTUN]+", line, re.I):
             if seq_id in seqs_dic:
                 m = re.search("([ACGTUN]+)", line, re.I)
                 seq = m.group(1)
                 if reject_lc:
-                    assert not re.search("[a-z]", seq), "lowercase characters detected in sequence \"%i\" (reject_lc=True)" %(seq_id)
+                    assert \
+                        not re.search("[a-z]", seq), \
+                        "lc char detected in seq \"%i\" (reject_lc=True)" \
+                        % (seq_id)
                 if convert_to_uc:
                     seq = seq.upper()
                 # If sequences with N nucleotides should be skipped.
                 if skip_n_seqs:
                     if "n" in m.group(1) or "N" in m.group(1):
-                        print ("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id))
+                        print("WARNING: \"%s\" contains N. Discarding "
+                              "sequence ... " % (seq_id))
                         del seqs_dic[seq_id]
                         continue
                 # Convert to RNA, concatenate sequence.
                 if read_dna:
-                    seqs_dic[seq_id] += m.group(1).replace("U","T").replace("u","t")
+                    seqs_dic[seq_id] += \
+                        m.group(1).replace("U", "T").replace("u", "t")
                 else:
-                    seqs_dic[seq_id] += m.group(1).replace("T","U").replace("t","u")
+                    seqs_dic[seq_id] += \
+                        m.group(1).replace("T", "U").replace("t", "u")
     f.close()
     return seqs_dic
 
 
-################################################################################
+###############################################################################
 
 def random_order_dic_keys_into_list(in_dic):
     """
     Read in dictionary keys, and return random order list of IDs.
 
@@ -320 +364 @@
         id_list.append(key)
     random.shuffle(id_list)
     return id_list
 
 
-################################################################################
+###############################################################################
 
 def graphprot_get_param_string(params_file):
     """
     Get parameter string from GraphProt .params file.
 
@@ -346 +390 @@
                 continue
             if par == "model_type":
                 if setting == "sequence":
                     param_string += "-onlyseq "
                 else:
-                    param_string += "-%s %s " %(par, setting)
+                    param_string += "-%s %s " % (par, setting)
             else:
-                assert 0, "pattern matching failed for string \"%s\"" %(param)
+                assert 0, "pattern matching failed for string \"%s\"" % (param)
     return param_string
 
 
-################################################################################
+###############################################################################
 
 def seqs_dic_count_uc_nts(seqs_dic):
     """
     Count number of uppercase nucleotides in sequences stored in sequence
     dictionary.
 
     >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'}
     >>> seqs_dic_count_uc_nts(seqs_dic)
     6
     >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'}
     >>> seqs_dic_count_uc_nts(seqs_dic)
@@ -374 +418 @@
     for seq_id in seqs_dic:
         c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id]))
     return c_uc
 
 
-################################################################################
+###############################################################################
 
 def seqs_dic_count_lc_nts(seqs_dic):
     """
     Count number of lowercase nucleotides in sequences stored in sequence
     dictionary.
 
     >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'}
     >>> seqs_dic_count_lc_nts(seqs_dic)
     10
     >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'}
     >>> seqs_dic_count_lc_nts(seqs_dic)
@@ -396 +440 @@
     for seq_id in seqs_dic:
         c_uc += len(re.findall(r'[a-z]', seqs_dic[seq_id]))
     return c_uc
 
 
-################################################################################
+###############################################################################
 
 def count_file_rows(in_file):
     """
     Count number of file rows for given input file.
 
     >>> test_file = "test-data/test1.bed"
     >>> count_file_rows(test_file)
     7
     >>> test_file = "test-data/empty_file"
     >>> count_file_rows(test_file)
@@ -416 +460 @@
     output = subprocess.getoutput(check_cmd)
     row_count = int(output.strip())
     return row_count
 
 
-################################################################################
+###############################################################################
 
 def bed_check_six_col_format(bed_file):
     """
     Check whether given .bed file has 6 columns.
 
@@ -442 +486 @@
                 break
     f.closed
     return six_col_format
 
 
-################################################################################
+###############################################################################
 
 def bed_check_unique_ids(bed_file):
     """
     Check whether .bed file (6 column format with IDs in column 4)
     has unique column 4 IDs.
 
     >>> test_bed = "test-data/test1.bed"
     >>> bed_check_unique_ids(test_bed)
     True
     >>> test_bed = "test-data/test2.bed"
     >>> bed_check_unique_ids(test_bed)
@@ -466 +510 @@
         return False
     else:
         return True
 
 
-################################################################################
+###############################################################################
 
 def get_seq_lengths_from_seqs_dic(seqs_dic):
     """
     Given a dictionary of sequences, return dictionary of sequence lengths.
     Mapping is sequence ID -> sequence length.
@@ -481 +525 @@
         seq_l = len(seqs_dic[seq_id])
         seq_len_dic[seq_id] = seq_l
     return seq_len_dic
 
 
-################################################################################
+###############################################################################
 
 def bed_get_region_lengths(bed_file):
     """
     Read in .bed file, store and return region lengths in dictionary.
     key : region ID (.bed col4)
@@ -497 +541 @@
 
     """
     id2len_dic = {}
     with open(bed_file) as f:
         for line in f:
-            row = line.strip()
             cols = line.strip().split("\t")
             site_s = int(cols[1])
             site_e = int(cols[2])
             site_id = cols[3]
             site_l = site_e - site_s
-            assert site_id not in id2len_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(bed_file)
+            assert site_id \
+                not in id2len_dic, \
+                "column 4 IDs not unique in given .bed file \"%s\"" \
+                % (bed_file)
             id2len_dic[site_id] = site_l
     f.closed
-    assert id2len_dic, "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (in_bed)
+    assert id2len_dic, \
+        "No IDs read into dic (input file \"%s\" empty or malformatted?)" \
+        % (bed_file)
     return id2len_dic
 
 
-################################################################################
+###############################################################################
 
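Note: a quick check of the column arithmetic: BED intervals are half-open, so the region length is simply end minus start.

    # "chr1  10  20  site1  0  +"  ->  length 20 - 10 = 10
    cols = "chr1\t10\t20\tsite1\t0\t+".split("\t")
    print(int(cols[2]) - int(cols[1]))  # 10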
 def graphprot_get_param_dic(params_file):
     """
     Read in GraphProt .params file and store in dictionary.
     key = parameter
     value = parameter value
 
     >>> params_file = "test-data/test.params"
     >>> graphprot_get_param_dic(params_file)
-    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'}
+    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', \
+    'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', \
+    'pos_train_profile_median': '5.039610', \
+    'pos_train_avg_profile_median_1': '4.236340', \
+    'pos_train_avg_profile_median_2': '3.868431', \
+    'pos_train_avg_profile_median_3': '3.331277', \
+    'pos_train_avg_profile_median_4': '2.998667', \
+    'pos_train_avg_profile_median_5': '2.829782', \
+    'pos_train_avg_profile_median_6': '2.626623', \
+    'pos_train_avg_profile_median_7': '2.447083', \
+    'pos_train_avg_profile_median_8': '2.349919', \
+    'pos_train_avg_profile_median_9': '2.239829', \
+    'pos_train_avg_profile_median_10': '2.161676'}
 
     """
     param_dic = {}
     with open(params_file) as f:
         for line in f:
@@ -537 +597 @@
             param_dic[par] = setting
     f.close()
     return param_dic
 
 
-################################################################################
+###############################################################################
 
 def graphprot_filter_predictions_file(in_file, out_file,
                                       sc_thr=0):
     """
     Filter GraphProt .predictions file by given score threshold sc_thr.
@@ -552 +612 @@
             row = line.strip()
             cols = line.strip().split("\t")
             score = float(cols[2])
             if score < sc_thr:
                 continue
-            OUTPRED.write("%s\n" %(row))
+            OUTPRED.write("%s\n" % (row))
     f.close()
     OUTPRED.close()
 
 
-################################################################################
+###############################################################################
 
 def fasta_read_in_ids(fasta_file):
     """
     Given a .fa file, read in header IDs in order appearing in file,
     and store in list.
 
     >>> test_file = "test-data/test3.fa"
     >>> fasta_read_in_ids(test_file)
     ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316']
@@ -580 +640 @@
             ids_list.append(seq_id)
     f.close()
     return ids_list
 
 
-################################################################################
+###############################################################################
 
-def graphprot_profile_calculate_avg_profile(in_file, out_file,
-                                            ap_extlr=5,
-                                            seq_ids_list=False,
-                                            method=1):
+def graphprot_profile_calc_avg_profile(in_file, out_file,
+                                       ap_extlr=5,
+                                       seq_ids_list=False,
+                                       method=1):
     """
     Given a GraphProt .profile file, calculate average profiles and output
     average profile file.
     Average profile means that the position-wise scores will get smoothed
     out by calculating for each position a new score, taking a sequence
     window -ap_extlr to +ap_extlr relative to the position
     and calculate the mean score over this window. The mean score then
     becomes the new average profile score at this position.
     Two different implementations of the task are given:
     method=1 (new python implementation, slower + more memory but easy to read)
     method=2 (old perl implementation, faster and less memory but more code)
 
     >>> in_file = "test-data/test2.profile"
     >>> out_file1 = "test-data/test2_1.avg_profile"
     >>> out_file2 = "test-data/test2_2.avg_profile"
     >>> out_file4 = "test-data/test2_3.avg_profile"
-    >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1)
-    >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2)
+    >>> graphprot_profile_calc_avg_profile(in_file, \
+    out_file1, ap_extlr=2, method=1)
+    >>> graphprot_profile_calc_avg_profile(in_file, \
+    out_file2, ap_extlr=2, method=2)
     >>> diff_two_files_identical(out_file1, out_file2)
     True
     >>> test_list = ["s1", "s2", "s3", "s4"]
     >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile"
     >>> out_file3 = "test-data/test3_added_ids_out.avg_profile"
-    >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list)
+    >>> graphprot_profile_calc_avg_profile(in_file, out_file3, \
+    ap_extlr=2, method=1, seq_ids_list=test_list)
     >>> diff_two_files_identical(out_file3_exp, out_file3)
     True
 
     """
     if method == 1:
@@ -622 +685 @@
         site_starts_dic = {}
         with open(in_file) as f:
             for line in f:
                 cols = line.strip().split("\t")
                 site_id = int(cols[0])
                 pos = int(cols[1])  # 0-based.
                 score = float(cols[2])
                 # Store first position of site.
                 if site_id not in site_starts_dic:
                     site_starts_dic[site_id] = pos
                 if site_id in lists_dic:
                     lists_dic[site_id].append(score)
                 else:
                     lists_dic[site_id] = []
                     lists_dic[site_id].append(score)
         f.close()
-        # Check number of IDs (# FASTA sequence IDs has to be same as # site IDs).
+        # Check number of IDs (# FASTA IDs has to be same as # site IDs).
         if seq_ids_list:
             c_seq_ids = len(seq_ids_list)
             c_site_ids = len(site_starts_dic)
-            assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" %(c_seq_ids, c_site_ids)
+            assert c_seq_ids == c_site_ids, \
+                "# sequence IDs != # site IDs (%i != %i)" \
+                % (c_seq_ids, c_site_ids)
         OUTPROF = open(out_file, "w")
         # For each site, calculate average profile scores list.
-        max_list = []
         for site_id in lists_dic:
             # Convert profile score list to average profile scores list.
             aps_list = list_moving_window_average_values(lists_dic[site_id],
                                                          win_extlr=ap_extlr)
             start_pos = site_starts_dic[site_id]
             # Get original FASTA sequence ID.
             if seq_ids_list:
                 site_id = seq_ids_list[site_id]
             for i, sc in enumerate(aps_list):
                 pos = i + start_pos + 1  # make 1-based.
-                OUTPROF.write("%s\t%i\t%f\n" %(site_id, pos, sc))
+                OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc))
         OUTPROF.close()
     elif method == 2:
         OUTPROF = open(out_file, "w")
         # Old site ID.
         old_id = ""
@@ -666 +730 @@
         site_starts_dic = {}
         with open(in_file) as f:
             for line in f:
                 cols = line.strip().split("\t")
                 cur_id = int(cols[0])
                 pos = int(cols[1])  # 0-based.
                 score = float(cols[2])
                 # Store first position of site.
                 if cur_id not in site_starts_dic:
                     site_starts_dic[cur_id] = pos
                 # Case: new site (new column 1 ID).
                 if cur_id != old_id:
                     # Process old id scores.
                     if scores_list:
-                        aps_list = list_moving_window_average_values(scores_list,
-                                                                     win_extlr=ap_extlr)
+                        aps_list = \
+                            list_moving_window_average_values(
+                                scores_list,
+                                win_extlr=ap_extlr)
                         start_pos = site_starts_dic[old_id]
                         seq_id = old_id
                         # Get original FASTA sequence ID.
                         if seq_ids_list:
                             seq_id = seq_ids_list[old_id]
                         for i, sc in enumerate(aps_list):
                             pos = i + start_pos + 1  # make 1-based.
-                            OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc))
+                            OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
                         # Reset list.
                         scores_list = []
                     old_id = cur_id
                     scores_list.append(score)
                 else:
@@ -703 +769 @@
             seq_id = old_id
             # Get original FASTA sequence ID.
             if seq_ids_list:
                 seq_id = seq_ids_list[old_id]
             for i, sc in enumerate(aps_list):
                 pos = i + start_pos + 1  # make 1-based.
-                OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc))
+                OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
         OUTPROF.close()
 
 
-################################################################################
+###############################################################################
 
 def graphprot_profile_extract_peak_regions(in_file, out_file,
                                            max_merge_dist=0,
                                            sc_thr=0):
     """
     Extract peak regions from GraphProt .profile file.
     Store the peak regions (defined as regions with scores >= sc_thr)
     to out_file in 6-column .bed format.
 
     TODO:
     Add option for genomic coordinates input (+ - polarity support).
     Output genomic regions instead of sequence regions.
@@ -733 +799 @@
     >>> diff_two_files_identical(out_file, exp_file)
     True
     >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10)
     >>> diff_two_files_identical(out_file, empty_file)
     True
-    >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2)
+    >>> graphprot_profile_extract_peak_regions(in_file, out_file, \
+    max_merge_dist=2)
     >>> diff_two_files_identical(out_file, exp2_file)
     True
 
     """
 
@@ -751 +818 @@
     site_starts_dic = {}
     with open(in_file) as f:
         for line in f:
             cols = line.strip().split("\t")
             cur_id = cols[0]
             pos = int(cols[1])  # 0-based.
             score = float(cols[2])
             # Store first position of site.
             if cur_id not in site_starts_dic:
                 # If first position != zero, we assume positions are 1-based.
                 if pos != 0:
@@ -766 +833 @@
             # Case: new site (new column 1 ID).
             if cur_id != old_id:
                 # Process old id scores.
                 if scores_list:
                     # Extract peaks from region.
-                    peak_list = list_extract_peaks(scores_list,
-                                                   max_merge_dist=max_merge_dist,
-                                                   coords="bed",
-                                                   sc_thr=sc_thr)
+                    peak_list = \
+                        list_extract_peaks(scores_list,
+                                           max_merge_dist=max_merge_dist,
+                                           coords="bed",
+                                           sc_thr=sc_thr)
                     start_pos = site_starts_dic[old_id]
                     # Print out peaks in .bed format.
-                    for l in peak_list:
-                        peak_s = start_pos + l[0]
-                        peak_e = start_pos + l[1]
-                        site_id = "%s,%i" %(old_id, l[2])
-                        OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3]))
+                    for ln in peak_list:
+                        peak_s = start_pos + ln[0]
+                        peak_e = start_pos + ln[1]
+                        site_id = "%s,%i" % (old_id, ln[2])
+                        OUTPEAKS.write("%s\t%i\t%i"
+                                       "\t%s\t%f\t+\n"
+                                       % (old_id, peak_s,
+                                          peak_e, site_id, ln[3]))
                     # Reset list.
                     scores_list = []
                 old_id = cur_id
                 scores_list.append(score)
             else:
@@ -788 +859 @@
                 scores_list.append(score)
     f.close()
     # Process last block.
     if scores_list:
         # Extract peaks from region.
         peak_list = list_extract_peaks(scores_list,
                                        max_merge_dist=max_merge_dist,
                                        coords="bed",
                                        sc_thr=sc_thr)
         start_pos = site_starts_dic[old_id]
         # Print out peaks in .bed format.
-        for l in peak_list:
-            peak_s = start_pos + l[0]
-            peak_e = start_pos + l[1]
-            site_id = "%s,%i" %(old_id, l[2]) # best score also 1-based.
-            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3]))
+        for ln in peak_list:
+            peak_s = start_pos + ln[0]
+            peak_e = start_pos + ln[1]
+            site_id = "%s,%i" % (old_id, ln[2])  # best score also 1-based.
+            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n"
+                           % (old_id, peak_s, peak_e, site_id, ln[3]))
     OUTPEAKS.close()
 
 
-################################################################################
+###############################################################################
 
 def list_extract_peaks(in_list,
                        max_merge_dist=0,
                        coords="list",
                        sc_thr=0):
     """
     Extract peak regions from list.
     Peak region is defined as region >= score threshold.
 
     coords=bed : peak start 0-based, peak end 1-based.
     coords=list : peak start 0-based, peak end 0-based.
 
     >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5]
     >>> list_extract_peaks(test_list)
     [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]
     >>> list_extract_peaks(test_list, sc_thr=2)
     [[2, 3, 3, 4.5], [6, 7, 7, 6.5]]
@@ -860 +932 @@
                 pr_top_pos = i
         else:
             # Before was peak region?
             if inside:
                 # Store peak region.
-                #peak_infos = "%i,%i,%i,%f" %(pr_s, pr_e, pr_top_pos, pr_top_sc)
                 peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc]
                 peak_list.append(peak_infos)
                 inside = False
                 pr_top_pos = 0
                 pr_top_sc = -100000
@@ -896 +967 @@
                 new_top_pos = peak_list[i][2]
                 new_top_sc = peak_list[i][3]
                 if peak_list[i][3] < peak_list[j][3]:
                     new_top_pos = peak_list[j][2]
                     new_top_sc = peak_list[j][3]
-                new_peak = [peak_list[i][0], peak_list[j][1], new_top_pos, new_top_sc]
+                new_peak = [peak_list[i][0], peak_list[j][1],
+                            new_top_pos, new_top_sc]
                 # If two peaks were merged.
                 if new_peak:
                     merged_peak_list.append(new_peak)
                     added_peaks_dic[i] = 1
                     added_peaks_dic[j] = 1
@@ -913 +985 @@
             peaks_merged = False
     # If peak coordinates should be in .bed format, make peak ends 1-based.
     if coords == "bed":
         for i in range(len(peak_list)):
             peak_list[i][1] += 1
             peak_list[i][2] += 1  # 1-base best score position too.
     return peak_list
 
 
-################################################################################
+###############################################################################
 
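Note: each returned peak is [start, end, top_position, top_score]. A compact sketch of the core scan (no merging, list coordinates only; illustrative code, not the committed implementation), reproducing the first doctest:

    def extract_peaks_sketch(vals, sc_thr=0):
        # Collect runs of scores >= sc_thr as [start, end, top_pos, top_sc].
        peaks, inside = [], False
        for i, sc in enumerate(vals):
            if sc >= sc_thr:
                if not inside:
                    peaks.append([i, i, i, sc])
                    inside = True
                else:
                    peaks[-1][1] = i
                    if sc > peaks[-1][3]:
                        peaks[-1][2], peaks[-1][3] = i, sc
            else:
                inside = False
        return peaks

    print(extract_peaks_sketch([-1, 0, 2, 4.5, 1, -1, 5, 6.5]))
    # [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]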
-def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False):
+def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed,
+                               print_rows=False):
     """
     Given a .bed file of sequence peak regions (possible coordinates from
     0 to length of s), convert peak coordinates to genomic coordinates.
     Do this by taking genomic regions of sequences as input.
 
     >>> test_in = "test-data/test.peaks.bed"
     >>> test_exp = "test-data/test_exp.peaks.bed"
@@ -942 +1015 @@
     with open(genomic_sites_bed) as f:
         for line in f:
             row = line.strip()
             cols = line.strip().split("\t")
             site_id = cols[3]
-            assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(args.genomic_sites_bed)
+            assert site_id \
+                not in id2row_dic, \
+                "column 4 IDs not unique in given .bed file \"%s\"" \
+                % (genomic_sites_bed)
             id2row_dic[site_id] = row
     f.close()
 
     # Read in peaks file and convert coordinates.
     OUTPEAKS = open(genomic_peak_file, "w")
@@ -956 +1032 @@
             site_id = cols[0]
             site_s = int(cols[1])
             site_e = int(cols[2])
             site_id2 = cols[3]
             site_sc = float(cols[4])
-            assert re.search(".+,.+", site_id2), "regular expression failed for ID \"%s\"" %(site_id2)
-            m = re.search(".+,(\d+)", site_id2)
-            sc_pos = int(m.group(1)) # 1-based.
-            assert site_id in id2row_dic, "site ID \"%s\" not found in genomic sites dictionary" %(site_id)
+            assert re.search(".+,.+", site_id2), \
+                "regular expression failed for ID \"%s\"" % (site_id2)
+            m = re.search(r".+,(\d+)", site_id2)
+            sc_pos = int(m.group(1))  # 1-based.
+            assert site_id in id2row_dic, \
+                "site ID \"%s\" not found in genomic sites dictionary" \
+                % (site_id)
             row = id2row_dic[site_id]
             rowl = row.split("\t")
             gen_chr = rowl[0]
             gen_s = int(rowl[1])
             gen_e = int(rowl[2])
@@ -972 +1051 @@
             new_e = site_e + gen_s
             new_sc_pos = sc_pos + gen_s
             if gen_pol == "-":
                 new_s = gen_e - site_e
                 new_e = gen_e - site_s
                 new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
-            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" %(gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol)
-            OUTPEAKS.write("%s\n" %(new_row))
+            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" \
+                % (gen_chr, new_s, new_e,
+                   site_id, new_sc_pos, site_sc, gen_pol)
+            OUTPEAKS.write("%s\n" % (new_row))
             if print_rows:
                 print(new_row)
     OUTPEAKS.close()
 
 
-################################################################################
+###############################################################################
 
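Note: a worked example of the minus-strand arithmetic above (hypothetical numbers): a peak spanning sequence positions 2-5 on a site at chr1:100-120(-) lands at genomic 115-118.

    gen_s, gen_e = 100, 120    # genomic site region (hypothetical)
    site_s, site_e = 2, 5      # peak within the sequence
    new_s = gen_e - site_e     # 120 - 5 = 115
    new_e = gen_e - site_s     # 120 - 2 = 118
    print(new_s, new_e)        # 115 118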
 def diff_two_files_identical(file1, file2):
     """
     Check whether two files are identical. Return True if diff reports no
     differences.
 
     >>> file1 = "test-data/file1"
     >>> file2 = "test-data/file2"
     >>> diff_two_files_identical(file1, file2)
     True
     >>> file1 = "test-data/test1.bed"
@@ -1004 +1085 @@
     if output:
         same = False
     return same
 
 
-################################################################################
-
-
+###############################################################################
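Note: calling diff in a subshell can be replaced with the standard library (a sketch; compares contents byte for byte):

    import filecmp

    def files_identical(file1, file2):
        # shallow=False compares file contents, not just os.stat() metadata.
        return filecmp.cmp(file1, file2, shallow=False)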