diff gplib.py @ 1:20429f4c1b95 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit f3fb925b83a4982e0cf9a0c11ff93ecbb8e4e6d5"
| field | value |
| --- | --- |
| author | bgruening |
| date | Wed, 22 Jan 2020 10:14:41 -0500 |
| parents | |
| children | ace92c9a4653 |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gplib.py	Wed Jan 22 10:14:41 2020 -0500
@@ -0,0 +1,1011 @@
+
+from distutils.spawn import find_executable
+import subprocess
+import statistics
+import random
+import gzip
+import uuid
+import sys
+import re
+import os
+
+"""
+Run doctests:
+
+python3 -m doctest gplib.py
+
+"""
+
+
+################################################################################
+
+def graphprot_predictions_get_median(predictions_file):
+    """
+    Given a GraphProt .predictions file, read in site scores and return
+    the median value.
+
+    >>> test_file = "test-data/test.predictions"
+    >>> graphprot_predictions_get_median(test_file)
+    0.571673
+
+    """
+    # Site scores list.
+    sc_list = []
+    with open(predictions_file) as f:
+        for line in f:
+            cols = line.strip().split("\t")
+            score = float(cols[2])
+            sc_list.append(score)
+    # Return the median.
+    return statistics.median(sc_list)
+
+
+################################################################################
+
+def graphprot_profile_get_top_scores_median(profile_file,
+                                            profile_type="profile",
+                                            avg_profile_extlr=5):
+    """
+    Given a GraphProt .profile file, extract for each site (identified by
+    column 1 ID) the top (= highest) score. Then return the median of these
+    top scores.
+
+    profile_type can be either "profile" or "avg_profile".
+    "avg_profile" means that the position-wise scores are first smoothed
+    out: for each position, a new score is calculated as the mean over the
+    sequence window from -avg_profile_extlr to +avg_profile_extlr around
+    the position. After that, the maximum score of each site is chosen,
+    and the median over all maximum scores is returned.
+    "profile" leaves the position-wise scores as they are, directly
+    extracting the maximum for each site and then reporting the median.
+
+    >>> test_file = "test-data/test.profile"
+    >>> graphprot_profile_get_top_scores_median(test_file)
+    3.2
+
+    """
+    # Dictionary of lists, with list of scores (value) for each site (key).
+    lists_dic = {}
+    with open(profile_file) as f:
+        for line in f:
+            cols = line.strip().split("\t")
+            seq_id = cols[0]
+            score = float(cols[2])
+            if seq_id in lists_dic:
+                lists_dic[seq_id].append(score)
+            else:
+                lists_dic[seq_id] = [score]
+    # For each site, extract maximum and store in new list.
+    max_list = []
+    for seq_id in lists_dic:
+        if profile_type == "profile":
+            max_sc = max(lists_dic[seq_id])
+            max_list.append(max_sc)
+        elif profile_type == "avg_profile":
+            # Convert profile score list to average profile scores list.
+            aps_list = list_moving_window_average_values(lists_dic[seq_id],
+                                                         win_extlr=avg_profile_extlr)
+            max_sc = max(aps_list)
+            max_list.append(max_sc)
+        else:
+            assert 0, "invalid profile_type argument given: \"%s\"" % (profile_type)
+    # Return the median.
+    return statistics.median(max_list)
+
+
+################################################################################
+
+def list_moving_window_average_values(in_list,
+                                      win_extlr=5,
+                                      method=1):
+    """
+    Take a list of numeric values and calculate for each position a new
+    value: the mean over the window from -win_extlr to +win_extlr around
+    the position. If the full window does not fit (at the list ends), the
+    window is truncated accordingly.
+    Two implementations of the task are given, selected via method=1 or
+    method=2.
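+
+    For example, with win_extlr=1 each window covers a position plus one
+    neighbor on either side (truncated at the list ends):
+
+    >>> list_moving_window_average_values([1.0, 2.0, 3.0], win_extlr=1)
+    [1.5, 2.0, 2.5]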
+
+    >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1]
+    >>> list_moving_window_average_values(test_list, win_extlr=2, method=1)
+    [3.3333333333333335, 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.6666666666666665]
+    >>> list_moving_window_average_values(test_list, win_extlr=2, method=2)
+    [3.3333333333333335, 4.5, 4.4, 4.6, 5.4, 4.6, 3.75, 3.6666666666666665]
+
+    """
+    l_list = len(in_list)
+    assert l_list, "Given list is empty"
+    new_list = [0] * l_list
+    if win_extlr == 0:
+        # Window covers only the position itself, so nothing to average.
+        return in_list
+    if method == 1:
+        for i in range(l_list):
+            s = i - win_extlr
+            e = i + win_extlr + 1
+            if s < 0:
+                s = 0
+            if e > l_list:
+                e = l_list
+            # Extract window and assign mean value to new list.
+            new_list[i] = statistics.mean(in_list[s:e])
+    elif method == 2:
+        for i in range(l_list):
+            s = i - win_extlr
+            e = i + win_extlr + 1
+            if s < 0:
+                s = 0
+            if e > l_list:
+                e = l_list
+            win_len = e - s
+            sc_sum = 0
+            for j in range(win_len):
+                sc_sum += in_list[s + j]
+            new_list[i] = sc_sum / win_len
+    else:
+        assert 0, "invalid method ID given (%i)" % (method)
+    return new_list
+
+
+################################################################################
+
+def echo_add_to_file(echo_string, out_file):
+    """
+    Append a string to a file, using the echo command.
+
+    """
+    check_cmd = 'echo "%s" >> %s' % (echo_string, out_file)
+    output = subprocess.getoutput(check_cmd)
+    error = False
+    if output:
+        error = True
+    assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)
+
+
+################################################################################
+
+def is_tool(name):
+    """Check whether tool "name" is in PATH."""
+    return find_executable(name) is not None
+
+
+################################################################################
+
+def count_fasta_headers(fasta_file):
+    """
+    Count number of FASTA headers in fasta_file using grep.
+
+    >>> test_file = "test-data/test.fa"
+    >>> count_fasta_headers(test_file)
+    2
+    >>> test_file = "test-data/empty_file"
+    >>> count_fasta_headers(test_file)
+    0
+
+    """
+    check_cmd = 'grep -c ">" ' + fasta_file
+    output = subprocess.getoutput(check_cmd)
+    row_count = int(output.strip())
+    return row_count
+
+
+################################################################################
+
+def make_file_copy(in_file, out_file):
+    """
+    Copy in_file to out_file (via cat).
+
+    """
+    check_cmd = "cat " + in_file + " > " + out_file
+    assert in_file != out_file, \
+        "cat does not like to cat file into same file (%s)" % (check_cmd)
+    output = subprocess.getoutput(check_cmd)
+    error = False
+    if output:
+        error = True
+    assert not error, \
+        "cat did not like your input (in_file: %s, out_file: %s):\n%s" \
+        % (in_file, out_file, output)
+
+
+################################################################################
+
+def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa,
+                                      test_size=500):
+    """
+    Split in_fasta .fa file into two files (e.g. test, train).
+
+    """
+    # Read in in_fasta.
+    seqs_dic = read_fasta_into_dic(in_fasta)
+    # Shuffle IDs.
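+    # The first test_size sequences in the shuffled order go to the test
+    # set file; all remaining sequences go to the training set file.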
+    rand_ids_list = random_order_dic_keys_into_list(seqs_dic)
+    c_out = 0
+    TESTOUT = open(test_out_fa, "w")
+    TRAINOUT = open(train_out_fa, "w")
+    for seq_id in rand_ids_list:
+        seq = seqs_dic[seq_id]
+        if c_out >= test_size:
+            TRAINOUT.write(">%s\n%s\n" % (seq_id, seq))
+        else:
+            TESTOUT.write(">%s\n%s\n" % (seq_id, seq))
+        c_out += 1
+    TESTOUT.close()
+    TRAINOUT.close()
+
+
+################################################################################
+
+def read_fasta_into_dic(fasta_file,
+                        seqs_dic=False,
+                        ids_dic=False,
+                        read_dna=False,
+                        reject_lc=False,
+                        convert_to_uc=False,
+                        skip_n_seqs=True):
+    """
+    Read in FASTA sequences, convert them to RNA (or DNA if read_dna=True),
+    store them in a dictionary (sequence ID -> sequence), and return the
+    dictionary.
+
+    >>> test_fasta = "test-data/test.fa"
+    >>> read_fasta_into_dic(test_fasta)
+    {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}
+    >>> test_fasta = "test-data/test2.fa"
+    >>> read_fasta_into_dic(test_fasta)
+    {}
+    >>> test_fasta = "test-data/test.ensembl.fa"
+    >>> read_fasta_into_dic(test_fasta, read_dna=True)
+    {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'}
+
+    """
+    if not seqs_dic:
+        seqs_dic = {}
+    seq_id = ""
+    seq = ""
+
+    # Go through FASTA file, extract sequences.
+    if re.search(r".+\.gz$", fasta_file):
+        f = gzip.open(fasta_file, 'rt')
+    else:
+        f = open(fasta_file, "r")
+    for line in f:
+        if re.search(r">.+", line):
+            m = re.search(r">(.+)", line)
+            seq_id = m.group(1)
+            # If there is a ".", take only first part of header.
+            # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
+            if re.search(r".+\..+", seq_id):
+                m = re.search(r"(.+?)\..+", seq_id)
+                seq_id = m.group(1)
+            assert seq_id not in seqs_dic, \
+                "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file)
+            if ids_dic:
+                if seq_id in ids_dic:
+                    seqs_dic[seq_id] = ""
+            else:
+                seqs_dic[seq_id] = ""
+        elif re.search(r"[ACGTUN]+", line, re.I):
+            if seq_id in seqs_dic:
+                m = re.search(r"([ACGTUN]+)", line, re.I)
+                seq = m.group(1)
+                if reject_lc:
+                    assert not re.search(r"[a-z]", seq), \
+                        "lowercase characters detected in sequence \"%s\" (reject_lc=True)" % (seq_id)
+                if convert_to_uc:
+                    seq = seq.upper()
+                # If sequences with N nucleotides should be skipped.
+                if skip_n_seqs:
+                    if "n" in seq or "N" in seq:
+                        print("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id))
+                        del seqs_dic[seq_id]
+                        continue
+                # Convert to RNA (or DNA), concatenate sequence.
+                if read_dna:
+                    seqs_dic[seq_id] += seq.replace("U", "T").replace("u", "t")
+                else:
+                    seqs_dic[seq_id] += seq.replace("T", "U").replace("t", "u")
+    f.close()
+    return seqs_dic
+
+
+################################################################################
+
+def random_order_dic_keys_into_list(in_dic):
+    """
+    Read in dictionary keys, and return random order list of IDs.
+
+    """
+    id_list = list(in_dic.keys())
+    random.shuffle(id_list)
+    return id_list
+
+
+################################################################################
+
+def graphprot_get_param_string(params_file):
+    """
+    Get parameter string from GraphProt .params file.
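+    Each "parameter: value" line is turned into a "-parameter value"
+    command line argument; "model_type: sequence" becomes "-onlyseq", and
+    "pos_train" statistics lines are skipped.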
+
+    >>> test_params = "test-data/test.params"
+    >>> graphprot_get_param_string(test_params)
+    '-epochs 20 -lambda 0.01 -R 1 -D 3 -bitsize 14 -onlyseq '
+
+    """
+    param_string = ""
+    with open(params_file) as f:
+        for line in f:
+            cols = line.strip().split(" ")
+            param = cols[0]
+            setting = cols[1]
+            if re.search(r".+:", param):
+                m = re.search(r"(.+):", line)
+                par = m.group(1)
+                if re.search(r"pos_train.+", line):
+                    continue
+                if par == "model_type":
+                    if setting == "sequence":
+                        param_string += "-onlyseq "
+                else:
+                    param_string += "-%s %s " % (par, setting)
+            else:
+                assert 0, "pattern matching failed for string \"%s\"" % (param)
+    return param_string
+
+
+################################################################################
+
+def seqs_dic_count_uc_nts(seqs_dic):
+    """
+    Count number of uppercase nucleotides in sequences stored in sequence
+    dictionary.
+
+    >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'}
+    >>> seqs_dic_count_uc_nts(seqs_dic)
+    6
+    >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'}
+    >>> seqs_dic_count_uc_nts(seqs_dic)
+    0
+
+    """
+    assert seqs_dic, "Given sequence dictionary empty"
+    c_uc = 0
+    for seq_id in seqs_dic:
+        c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id]))
+    return c_uc
+
+
+################################################################################
+
+def seqs_dic_count_lc_nts(seqs_dic):
+    """
+    Count number of lowercase nucleotides in sequences stored in sequence
+    dictionary.
+
+    >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'}
+    >>> seqs_dic_count_lc_nts(seqs_dic)
+    10
+    >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'}
+    >>> seqs_dic_count_lc_nts(seqs_dic)
+    0
+
+    """
+    assert seqs_dic, "Given sequence dictionary empty"
+    c_lc = 0
+    for seq_id in seqs_dic:
+        c_lc += len(re.findall(r'[a-z]', seqs_dic[seq_id]))
+    return c_lc
+
+
+################################################################################
+
+def count_file_rows(in_file):
+    """
+    Count number of file rows for given input file.
+
+    >>> test_file = "test-data/test1.bed"
+    >>> count_file_rows(test_file)
+    7
+    >>> test_file = "test-data/empty_file"
+    >>> count_file_rows(test_file)
+    0
+
+    """
+    check_cmd = "cat " + in_file + " | wc -l"
+    output = subprocess.getoutput(check_cmd)
+    row_count = int(output.strip())
+    return row_count
+
+
+################################################################################
+
+def bed_check_six_col_format(bed_file):
+    """
+    Check whether given .bed file has 6 columns.
+
+    >>> test_bed = "test-data/test1.bed"
+    >>> bed_check_six_col_format(test_bed)
+    True
+    >>> test_bed = "test-data/empty_file"
+    >>> bed_check_six_col_format(test_bed)
+    False
+
+    """
+    six_col_format = False
+    with open(bed_file) as f:
+        for line in f:
+            cols = line.strip().split("\t")
+            if len(cols) == 6:
+                six_col_format = True
+                break
+    return six_col_format
+
+
+################################################################################
+
+def bed_check_unique_ids(bed_file):
+    """
+    Check whether .bed file (6 column format with IDs in column 4)
+    has unique column 4 IDs.
+
+    >>> test_bed = "test-data/test1.bed"
+    >>> bed_check_unique_ids(test_bed)
+    True
+    >>> test_bed = "test-data/test2.bed"
+    >>> bed_check_unique_ids(test_bed)
+    False
+
+    """
+    check_cmd = "cut -f 4 " + bed_file + " | sort | uniq -d"
+    output = subprocess.getoutput(check_cmd)
+    if output:
+        return False
+    else:
+        return True
+
+
+################################################################################
+
+def get_seq_lengths_from_seqs_dic(seqs_dic):
+    """
+    Given a dictionary of sequences, return dictionary of sequence lengths.
+    Mapping is sequence ID -> sequence length.
+    """
+    seq_len_dic = {}
+    assert seqs_dic, "sequence dictionary seems to be empty"
+    for seq_id in seqs_dic:
+        seq_l = len(seqs_dic[seq_id])
+        seq_len_dic[seq_id] = seq_l
+    return seq_len_dic
+
+
+################################################################################
+
+def bed_get_region_lengths(bed_file):
+    """
+    Read in .bed file, store and return region lengths in dictionary.
+    key   : region ID (.bed col4)
+    value : region length (.bed col3-col2)
+
+    >>> test_file = "test-data/test4.bed"
+    >>> bed_get_region_lengths(test_file)
+    {'CLIP1': 10, 'CLIP2': 10}
+
+    """
+    id2len_dic = {}
+    with open(bed_file) as f:
+        for line in f:
+            cols = line.strip().split("\t")
+            site_s = int(cols[1])
+            site_e = int(cols[2])
+            site_id = cols[3]
+            site_l = site_e - site_s
+            assert site_id not in id2len_dic, \
+                "column 4 IDs not unique in given .bed file \"%s\"" % (bed_file)
+            id2len_dic[site_id] = site_l
+    assert id2len_dic, \
+        "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (bed_file)
+    return id2len_dic
+
+
+################################################################################
+
+def graphprot_get_param_dic(params_file):
+    """
+    Read in GraphProt .params file and store in dictionary.
+    key = parameter
+    value = parameter value
+
+    >>> params_file = "test-data/test.params"
+    >>> graphprot_get_param_dic(params_file)
+    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'}
+
+    """
+    param_dic = {}
+    with open(params_file) as f:
+        for line in f:
+            cols = line.strip().split(" ")
+            param = cols[0]
+            setting = cols[1]
+            if re.search(r".+:", param):
+                m = re.search(r"(.+):", line)
+                par = m.group(1)
+                param_dic[par] = setting
+    return param_dic
+
+
+################################################################################
+
+def graphprot_filter_predictions_file(in_file, out_file,
+                                      sc_thr=0):
+    """
+    Filter GraphProt .predictions file by a given score threshold sc_thr.
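+    Rows with a column 3 score below sc_thr are dropped; all other rows
+    are written unchanged to out_file.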
+ """ + OUTPRED = open(out_file, "w") + with open(in_file) as f: + for line in f: + row = line.strip() + cols = line.strip().split("\t") + score = float(cols[2]) + if score < sc_thr: + continue + OUTPRED.write("%s\n" %(row)) + f.close() + OUTPRED.close() + + +################################################################################ + +def fasta_read_in_ids(fasta_file): + """ + Given a .fa file, read in header IDs in order appearing in file, + and store in list. + + >>> test_file = "test-data/test3.fa" + >>> fasta_read_in_ids(test_file) + ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316'] + + """ + ids_list = [] + with open(fasta_file) as f: + for line in f: + if re.search(">.+", line): + m = re.search(">(.+)", line) + seq_id = m.group(1) + ids_list.append(seq_id) + f.close() + return ids_list + + +################################################################################ + +def graphprot_profile_calculate_avg_profile(in_file, out_file, + ap_extlr=5, + seq_ids_list=False, + method=1): + """ + Given a GraphProt .profile file, calculate average profiles and output + average profile file. + Average profile means that the position-wise scores will get smoothed + out by calculating for each position a new score, taking a sequence + window -ap_extlr to +ap_extlr relative to the position + and calculate the mean score over this window. The mean score then + becomes the new average profile score at this position. + Two different implementations of the task are given: + method=1 (new python implementation, slower + more memory but easy to read) + method=2 (old perl implementation, faster and less memory but more code) + + >>> in_file = "test-data/test2.profile" + >>> out_file1 = "test-data/test2_1.avg_profile" + >>> out_file2 = "test-data/test2_2.avg_profile" + >>> out_file4 = "test-data/test2_3.avg_profile" + >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1) + >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2) + >>> diff_two_files_identical(out_file1, out_file2) + True + >>> test_list = ["s1", "s2", "s3", "s4"] + >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile" + >>> out_file3 = "test-data/test3_added_ids_out.avg_profile" + >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list) + >>> diff_two_files_identical(out_file3_exp, out_file3) + True + + """ + if method == 1: + # Dictionary of lists, with list of scores (value) for each site (key). + lists_dic = {} + site_starts_dic = {} + with open(in_file) as f: + for line in f: + cols = line.strip().split("\t") + site_id = int(cols[0]) + pos = int(cols[1]) # 0-based. + score = float(cols[2]) + # Store first position of site. + if site_id not in site_starts_dic: + site_starts_dic[site_id] = pos + if site_id in lists_dic: + lists_dic[site_id].append(score) + else: + lists_dic[site_id] = [] + lists_dic[site_id].append(score) + f.close() + # Check number of IDs (# FASTA sequence IDs has to be same as # site IDs). + if seq_ids_list: + c_seq_ids = len(seq_ids_list) + c_site_ids = len(site_starts_dic) + assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" %(c_seq_ids, c_site_ids) + OUTPROF = open(out_file, "w") + # For each site, calculate average profile scores list. + max_list = [] + for site_id in lists_dic: + # Convert profile score list to average profile scores list. 
+            aps_list = list_moving_window_average_values(lists_dic[site_id],
+                                                         win_extlr=ap_extlr)
+            start_pos = site_starts_dic[site_id]
+            # Get original FASTA sequence ID.
+            if seq_ids_list:
+                site_id = seq_ids_list[site_id]
+            for i, sc in enumerate(aps_list):
+                pos = i + start_pos + 1  # make 1-based.
+                OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc))
+        OUTPROF.close()
+    elif method == 2:
+        OUTPROF = open(out_file, "w")
+        # Old site ID.
+        old_id = ""
+        # Current site ID.
+        cur_id = ""
+        # Scores list.
+        scores_list = []
+        site_starts_dic = {}
+        with open(in_file) as f:
+            for line in f:
+                cols = line.strip().split("\t")
+                cur_id = int(cols[0])
+                pos = int(cols[1])  # 0-based.
+                score = float(cols[2])
+                # Store first position of site.
+                if cur_id not in site_starts_dic:
+                    site_starts_dic[cur_id] = pos
+                # Case: new site (new column 1 ID).
+                if cur_id != old_id:
+                    # Process old ID scores.
+                    if scores_list:
+                        aps_list = list_moving_window_average_values(scores_list,
+                                                                     win_extlr=ap_extlr)
+                        start_pos = site_starts_dic[old_id]
+                        seq_id = old_id
+                        # Get original FASTA sequence ID.
+                        if seq_ids_list:
+                            seq_id = seq_ids_list[old_id]
+                        for i, sc in enumerate(aps_list):
+                            pos = i + start_pos + 1  # make 1-based.
+                            OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
+                        # Reset list.
+                        scores_list = []
+                    old_id = cur_id
+                    scores_list.append(score)
+                else:
+                    # Add to scores_list.
+                    scores_list.append(score)
+        # Process last block.
+        if scores_list:
+            aps_list = list_moving_window_average_values(scores_list,
+                                                         win_extlr=ap_extlr)
+            start_pos = site_starts_dic[old_id]
+            seq_id = old_id
+            # Get original FASTA sequence ID.
+            if seq_ids_list:
+                seq_id = seq_ids_list[old_id]
+            for i, sc in enumerate(aps_list):
+                pos = i + start_pos + 1  # make 1-based.
+                OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
+        OUTPROF.close()
+
+
+################################################################################
+
+def graphprot_profile_extract_peak_regions(in_file, out_file,
+                                           max_merge_dist=0,
+                                           sc_thr=0):
+    """
+    Extract peak regions from GraphProt .profile file.
+    Write the peak regions (defined as regions with scores >= sc_thr)
+    to out_file in 6-column .bed format.
+
+    TODO:
+    Add option for genomic coordinates input (+ - polarity support).
+    Output genomic regions instead of sequence regions.
+
+    >>> in_file = "test-data/test4.avg_profile"
+    >>> out_file = "test-data/test4_out.peaks.bed"
+    >>> exp_file = "test-data/test4_out_exp.peaks.bed"
+    >>> exp2_file = "test-data/test4_out_exp2.peaks.bed"
+    >>> empty_file = "test-data/empty_file"
+    >>> graphprot_profile_extract_peak_regions(in_file, out_file)
+    >>> diff_two_files_identical(out_file, exp_file)
+    True
+    >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10)
+    >>> diff_two_files_identical(out_file, empty_file)
+    True
+    >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2)
+    >>> diff_two_files_identical(out_file, exp2_file)
+    True
+
+    """
+    OUTPEAKS = open(out_file, "w")
+    # Old site ID.
+    old_id = ""
+    # Current site ID.
+    cur_id = ""
+    # Scores list.
+    scores_list = []
+    site_starts_dic = {}
+    with open(in_file) as f:
+        for line in f:
+            cols = line.strip().split("\t")
+            cur_id = cols[0]
+            pos = int(cols[1])  # 0-based.
+            score = float(cols[2])
+            # Store first position of site.
+            if cur_id not in site_starts_dic:
+                # If first position != zero, we assume positions are 1-based.
+                if pos != 0:
+                    # Make index 0-based.
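+                    # (GraphProt avg_profile positions are 1-based, while
+                    # .bed start coordinates are 0-based.)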
+                    site_starts_dic[cur_id] = pos - 1
+                else:
+                    site_starts_dic[cur_id] = pos
+            # Case: new site (new column 1 ID).
+            if cur_id != old_id:
+                # Process old ID scores.
+                if scores_list:
+                    # Extract peaks from region.
+                    peak_list = list_extract_peaks(scores_list,
+                                                   max_merge_dist=max_merge_dist,
+                                                   coords="bed",
+                                                   sc_thr=sc_thr)
+                    start_pos = site_starts_dic[old_id]
+                    # Print out peaks in .bed format.
+                    for pk in peak_list:
+                        peak_s = start_pos + pk[0]
+                        peak_e = start_pos + pk[1]
+                        site_id = "%s,%i" % (old_id, pk[2])
+                        OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n"
+                                       % (old_id, peak_s, peak_e, site_id, pk[3]))
+                    # Reset list.
+                    scores_list = []
+                old_id = cur_id
+                scores_list.append(score)
+            else:
+                # Add to scores_list.
+                scores_list.append(score)
+    # Process last block.
+    if scores_list:
+        # Extract peaks from region.
+        peak_list = list_extract_peaks(scores_list,
+                                       max_merge_dist=max_merge_dist,
+                                       coords="bed",
+                                       sc_thr=sc_thr)
+        start_pos = site_starts_dic[old_id]
+        # Print out peaks in .bed format.
+        for pk in peak_list:
+            peak_s = start_pos + pk[0]
+            peak_e = start_pos + pk[1]
+            site_id = "%s,%i" % (old_id, pk[2])  # best score also 1-based.
+            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n"
+                           % (old_id, peak_s, peak_e, site_id, pk[3]))
+    OUTPEAKS.close()
+
+
+################################################################################
+
+def list_extract_peaks(in_list,
+                       max_merge_dist=0,
+                       coords="list",
+                       sc_thr=0):
+    """
+    Extract peak regions from list.
+    A peak region is defined as a run of scores >= score threshold.
+
+    coords=bed : peak start 0-based, peak end 1-based.
+    coords=list : peak start 0-based, peak end 0-based.
+
+    >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5]
+    >>> list_extract_peaks(test_list)
+    [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]
+    >>> list_extract_peaks(test_list, sc_thr=2)
+    [[2, 3, 3, 4.5], [6, 7, 7, 6.5]]
+    >>> list_extract_peaks(test_list, sc_thr=2, coords="bed")
+    [[2, 4, 4, 4.5], [6, 8, 8, 6.5]]
+    >>> list_extract_peaks(test_list, sc_thr=10)
+    []
+    >>> test_list = [2, -1, 3, -1, 4, -1, -1, 6, 9]
+    >>> list_extract_peaks(test_list, max_merge_dist=2)
+    [[0, 4, 4, 4], [7, 8, 8, 9]]
+    >>> list_extract_peaks(test_list, max_merge_dist=3)
+    [[0, 8, 8, 9]]
+
+    """
+    # Check.
+    assert len(in_list), "Given list is empty"
+    # Peak regions list.
+    peak_list = []
+    # Peak region state variables.
+    inside = False
+    pr_s = 0
+    pr_e = 0
+    pr_top_pos = 0
+    pr_top_sc = -100000
+    for i, sc in enumerate(in_list):
+        # Part of peak region?
+        if sc >= sc_thr:
+            # At peak start.
+            if not inside:
+                pr_s = i
+                pr_e = i
+                inside = True
+            else:
+                # Inside peak region.
+                pr_e = i
+            # Store top position.
+            if sc > pr_top_sc:
+                pr_top_sc = sc
+                pr_top_pos = i
+        else:
+            # Before was peak region?
+            if inside:
+                # Store peak region.
+                peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc]
+                peak_list.append(peak_infos)
+                inside = False
+                pr_top_pos = 0
+                pr_top_sc = -100000
+    # If peak at the end, also report.
+    if inside:
+        # Store peak region.
+        peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc]
+        peak_list.append(peak_infos)
+    # Merge peaks.
+    if max_merge_dist and len(peak_list) > 1:
+        iterate = True
+        while iterate:
+            merged_peak_list = []
+            added_peaks_dic = {}
+            peaks_merged = False
+            for i, _ in enumerate(peak_list):
+                if i in added_peaks_dic:
+                    continue
+                j = i + 1
+                # Last element.
+                if j == len(peak_list):
+                    if i not in added_peaks_dic:
+                        merged_peak_list.append(peak_list[i])
+                    break
+                # Compare two elements.
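+                # Peaks i and j get merged if the gap between peak i's end
+                # and peak j's start is <= max_merge_dist; the merged peak
+                # keeps the higher of the two top scores.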
+                new_peak = []
+                if (peak_list[j][0] - peak_list[i][1]) <= max_merge_dist:
+                    peaks_merged = True
+                    new_top_pos = peak_list[i][2]
+                    new_top_sc = peak_list[i][3]
+                    if peak_list[i][3] < peak_list[j][3]:
+                        new_top_pos = peak_list[j][2]
+                        new_top_sc = peak_list[j][3]
+                    new_peak = [peak_list[i][0], peak_list[j][1],
+                                new_top_pos, new_top_sc]
+                # If two peaks were merged.
+                if new_peak:
+                    merged_peak_list.append(new_peak)
+                    added_peaks_dic[i] = 1
+                    added_peaks_dic[j] = 1
+                else:
+                    merged_peak_list.append(peak_list[i])
+                    added_peaks_dic[i] = 1
+            if not peaks_merged:
+                iterate = False
+            peak_list = merged_peak_list
+            peaks_merged = False
+    # If peak coordinates should be in .bed format, make peak ends 1-based.
+    if coords == "bed":
+        for i in range(len(peak_list)):
+            peak_list[i][1] += 1
+            peak_list[i][2] += 1  # 1-base best score position too.
+    return peak_list
+
+
+################################################################################
+
+def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file,
+                               genomic_sites_bed, print_rows=False):
+    """
+    Given a .bed file of sequence peak regions (with coordinates ranging
+    from 0 to the sequence length), convert the peak coordinates to
+    genomic coordinates. Do this by taking the genomic regions of the
+    sequences as input.
+
+    >>> test_in = "test-data/test.peaks.bed"
+    >>> test_exp = "test-data/test_exp.peaks.bed"
+    >>> test_out = "test-data/test_out.peaks.bed"
+    >>> gen_in = "test-data/test.peaks_genomic.bed"
+    >>> bed_peaks_to_genomic_peaks(test_in, test_out, gen_in)
+    >>> diff_two_files_identical(test_out, test_exp)
+    True
+
+    """
+    # Read in genomic region info.
+    id2row_dic = {}
+    with open(genomic_sites_bed) as f:
+        for line in f:
+            row = line.strip()
+            cols = line.strip().split("\t")
+            site_id = cols[3]
+            assert site_id not in id2row_dic, \
+                "column 4 IDs not unique in given .bed file \"%s\"" % (genomic_sites_bed)
+            id2row_dic[site_id] = row
+
+    # Read in peaks file and convert coordinates.
+    OUTPEAKS = open(genomic_peak_file, "w")
+    with open(peak_file) as f:
+        for line in f:
+            cols = line.strip().split("\t")
+            site_id = cols[0]
+            site_s = int(cols[1])
+            site_e = int(cols[2])
+            site_id2 = cols[3]
+            site_sc = float(cols[4])
+            assert re.search(r".+,.+", site_id2), \
+                "regular expression failed for ID \"%s\"" % (site_id2)
+            m = re.search(r".+,(\d+)", site_id2)
+            sc_pos = int(m.group(1))  # 1-based.
+            assert site_id in id2row_dic, \
+                "site ID \"%s\" not found in genomic sites dictionary" % (site_id)
+            row = id2row_dic[site_id]
+            rowl = row.split("\t")
+            gen_chr = rowl[0]
+            gen_s = int(rowl[1])
+            gen_e = int(rowl[2])
+            gen_pol = rowl[5]
+            new_s = site_s + gen_s
+            new_e = site_e + gen_s
+            new_sc_pos = sc_pos + gen_s
+            if gen_pol == "-":
+                new_s = gen_e - site_e
+                new_e = gen_e - site_s
+                new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
+            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" \
+                      % (gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol)
+            OUTPEAKS.write("%s\n" % (new_row))
+            if print_rows:
+                print(new_row)
+    OUTPEAKS.close()
+
+
+################################################################################
+
+def diff_two_files_identical(file1, file2):
+    """
+    Check whether two files are identical. Return True if diff reports no
+    differences.
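+    Since the check shells out to the diff command, any output (including
+    error messages for missing files) makes the function return False.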
+
+    >>> file1 = "test-data/file1"
+    >>> file2 = "test-data/file2"
+    >>> diff_two_files_identical(file1, file2)
+    True
+    >>> file1 = "test-data/test1.bed"
+    >>> diff_two_files_identical(file1, file2)
+    False
+
+    """
+    same = True
+    check_cmd = "diff " + file1 + " " + file2
+    output = subprocess.getoutput(check_cmd)
+    if output:
+        same = False
+    return same
+
+
+################################################################################
+