comparison: gplib.py @ 5:ddcf35a868b8 (draft, default, tip)

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit ad60258f5759eaa205fec4af6143c728ea131419

author:   bgruening
date:     Wed, 05 Jun 2024 16:40:51 +0000
parents:  ace92c9a4653
children: (none)

comparison of 4:4ad83aed5c3c with 5:ddcf35a868b8; the updated revision is shown below, with regions the diff leaves unchanged elided as "...".
import gzip
import random
import re
import statistics
import subprocess
from distutils.spawn import find_executable

"""

Run doctests:

python3 -m doctest gplib.py


"""
#######################################################################


def graphprot_predictions_get_median(predictions_file):
    """
    Given a GraphProt .predictions file, read in site scores and return
    the median value.
    """
    ...
    f.close()
    # Return the median.
    return statistics.median(sc_list)
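# A hedged usage sketch (not part of the diff; the file name is a
# placeholder, and the .predictions column layout is handled by the
# elided code above, so treat the call as illustrative only):
#
# med = graphprot_predictions_get_median("RBP.train.predictions")
# print("median site score: %f" % med)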
#######################################################################


def graphprot_profile_get_tsm(
    profile_file, profile_type="profile", avg_profile_extlr=5
):

    """
    Given a GraphProt .profile file, extract for each site (identified by
    column 1 ID) the top (= highest) score. Then return the median of these
    top scores.
    """
    ...
        if profile_type == "profile":
            max_sc = max(lists_dic[seq_id])
            max_list.append(max_sc)
        elif profile_type == "avg_profile":
            # Convert profile score list to average profile scores list.
            aps_list = list_moving_window_average_values(
                lists_dic[seq_id], win_extlr=avg_profile_extlr
            )
            max_sc = max(aps_list)
            max_list.append(max_sc)
        else:
            assert 0, 'invalid profile_type argument given: "%s"' % (profile_type)
    # Return the median.
    return statistics.median(max_list)
#######################################################################


def list_moving_window_average_values(in_list, win_extlr=5, method=1):
    """
    Take a list of numeric values, and calculate for each position a new
    value by taking the mean value of the window of positions -win_extlr
    to +win_extlr. If full extension is not possible (at list ends), it
    just takes what it gets.
    """
    ...
    else:
        assert 0, "invalid method ID given (%i)" % (method)
    return new_list
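# A hedged worked example (not from the diff; assumes the truncated-window
# behavior described in the docstring):
#
# list_moving_window_average_values([1.0, 2.0, 3.0, 4.0, 5.0], win_extlr=1)
# -> [1.5, 2.0, 3.0, 4.0, 4.5]
# (position 0 averages [1.0, 2.0]; position 2 averages [2.0, 3.0, 4.0])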
#######################################################################


def echo_add_to_file(echo_string, out_file):
    """
    Add a string to a file, using the echo command.
    """
    ...
    if output:
        error = True
    assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)
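# Note (not part of the diff): the same effect without a subprocess is a
# plain append in Python; a minimal sketch:
#
# with open(out_file, "a") as f:
#     f.write(echo_string + "\n")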
#######################################################################


def is_tool(name):
    """Check whether tool "name" is in PATH."""
    return find_executable(name) is not None
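# Note (not part of the diff): distutils is deprecated since Python 3.10
# and was removed in Python 3.12; shutil.which() is the stdlib replacement
# for find_executable(). A minimal sketch (the name is_tool_which is
# hypothetical):
#
# import shutil
#
# def is_tool_which(name):
#     """Check whether tool "name" is in PATH."""
#     return shutil.which(name) is not None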
#######################################################################


def count_fasta_headers(fasta_file):
    """
    Count number of FASTA headers in fasta_file using grep.
    """
    ...
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count
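# A hedged pure-Python equivalent (not from the diff; assumes plain-text,
# uncompressed FASTA, and the name count_fasta_headers_py is hypothetical):
#
# def count_fasta_headers_py(fasta_file):
#     with open(fasta_file) as f:
#         return sum(1 for line in f if line.startswith(">"))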
#######################################################################


def make_file_copy(in_file, out_file):
    """
    Make a file copy by copying in_file to out_file.

    """
    check_cmd = "cat " + in_file + " > " + out_file
    assert in_file != out_file, "cat does not like to cat file into same file (%s)" % (
        check_cmd
    )
    output = subprocess.getoutput(check_cmd)
    error = False
    if output:
        error = True
    assert not error, "cat did not like your input (in_file: %s, out_file: %s):\n%s" % (
        in_file,
        out_file,
        output,
    )
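# Note (not part of the diff): shutil.copyfile() does the same job without
# spawning a shell, and raises shutil.SameFileError if both paths point to
# the same file:
#
# import shutil
#
# shutil.copyfile(in_file, out_file)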
#######################################################################


def split_fasta_into_test_train_files(
    in_fasta, test_out_fa, train_out_fa, test_size=500
):
    """
    Split in_fasta .fa file into two files (e.g. test, train).

    """
    # Read in in_fasta.
    ...
    c_out = 0
    TESTOUT = open(test_out_fa, "w")
    TRAINOUT = open(train_out_fa, "w")
    for seq_id in rand_ids_list:
        seq = seqs_dic[seq_id]
        if c_out >= test_size:
            TRAINOUT.write(">%s\n%s\n" % (seq_id, seq))
        else:
            TESTOUT.write(">%s\n%s\n" % (seq_id, seq))
        c_out += 1
    TESTOUT.close()
    TRAINOUT.close()
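# A hedged usage sketch (file names are placeholders): write 500 randomly
# chosen sequences from positives.fa to test.fa and the rest to train.fa.
#
# split_fasta_into_test_train_files("positives.fa", "test.fa", "train.fa",
#                                   test_size=500)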
#######################################################################


def check_seqs_dic_format(seqs_dic):
    """
    Check sequence dictionary for lowercase-only sequences or sequences
    which have lowercase nts in between uppercase nts.
    """
    ...
        if re.search("[ACGTUN][acgtun]+[ACGTUN]", seq):
            bad_seq_ids.append(seq_id)
    return bad_seq_ids
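# A hedged illustration (not from the diff) of the regex above: it flags
# lowercase stretches enclosed by uppercase nucleotides, but not lowercase
# stretches at the sequence ends.
#
# re.search("[ACGTUN][acgtun]+[ACGTUN]", "ACGacgACG")  # match -> flagged
# re.search("[ACGTUN][acgtun]+[ACGTUN]", "acgACGacg")  # None -> not flagged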
#######################################################################


def read_fasta_into_dic(
    fasta_file,
    seqs_dic=False,
    ids_dic=False,
    read_dna=False,
    short_ensembl=False,
    reject_lc=False,
    convert_to_uc=False,
    skip_n_seqs=True,
):
    """
    Read in FASTA sequences, convert to RNA, store in dictionary
    and return dictionary.

    >>> test_fasta = "test-data/test.fa"
    """
    ...
    seq_id = ""
    seq = ""

    # Go through FASTA file, extract sequences.
    if re.search(r".+\.gz$", fasta_file):
        f = gzip.open(fasta_file, "rt")
    else:
        f = open(fasta_file, "r")
    for line in f:
        if re.search(">.+", line):
            m = re.search(">(.+)", line)
            ...
            # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
            if short_ensembl:
                if re.search(r".+\..+", seq_id):
                    m = re.search(r"(.+?)\..+", seq_id)
                    seq_id = m.group(1)
            assert seq_id not in seqs_dic, 'non-unique FASTA header "%s" in "%s"' % (
                seq_id,
                fasta_file,
            )
            if ids_dic:
                if seq_id in ids_dic:
                    seqs_dic[seq_id] = ""
            else:
                seqs_dic[seq_id] = ""
        elif re.search("[ACGTUN]+", line, re.I):
            if seq_id in seqs_dic:
                m = re.search("([ACGTUN]+)", line, re.I)
                seq = m.group(1)
                if reject_lc:
                    assert not re.search(
                        "[a-z]", seq
                    ), 'lc char detected in seq "%s" (reject_lc=True)' % (seq_id)
                if convert_to_uc:
                    seq = seq.upper()
                # If sequences with N nucleotides should be skipped.
                if skip_n_seqs:
                    if "n" in m.group(1) or "N" in m.group(1):
                        print(
                            'WARNING: "%s" contains N. Discarding '
                            "sequence ... " % (seq_id)
                        )
                        del seqs_dic[seq_id]
                        continue
                # Convert to RNA, concatenate sequence.
                if read_dna:
                    seqs_dic[seq_id] += m.group(1).replace("U", "T").replace("u", "t")
                else:
                    seqs_dic[seq_id] += m.group(1).replace("T", "U").replace("t", "u")
    f.close()
    return seqs_dic
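# A hedged usage sketch (relies on the elided set-up code above to
# initialize seqs_dic; the path comes from the doctest in the docstring):
#
# seqs_dic = read_fasta_into_dic("test-data/test.fa")
# for seq_id, seq in seqs_dic.items():
#     print(seq_id, seq)  # sequences stored as RNA (T converted to U)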
#######################################################################


def random_order_dic_keys_into_list(in_dic):
    """
    Read in dictionary keys, and return random order list of IDs.
    """
    ...
        id_list.append(key)
    random.shuffle(id_list)
    return id_list
#######################################################################


def graphprot_get_param_string(params_file):
    """
    Get parameter string from GraphProt .params file.
    """
    ...
            if setting == "sequence":
                param_string += "-onlyseq "
            else:
                param_string += "-%s %s " % (par, setting)
        else:
            assert 0, 'pattern matching failed for string "%s"' % (param)
    return param_string
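# A hedged sketch of the expected .params layout (assumed from the parsing
# above: one parameter/setting pair per line, with a "sequence" model
# setting mapped to GraphProt's -onlyseq flag; file name and values are
# hypothetical):
#
# RBP.params contents:          contribution to the call string:
#   model_type: sequence   ->   "-onlyseq "
#   R: 1                   ->   "-R 1 "
#   D: 4                   ->   "-D 4 "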
#######################################################################


def seqs_dic_count_uc_nts(seqs_dic):
    """
    Count number of uppercase nucleotides in sequences stored in sequence
    dictionary.
    ...
    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_uc = 0
    for seq_id in seqs_dic:
        c_uc += len(re.findall(r"[A-Z]", seqs_dic[seq_id]))
    return c_uc
#######################################################################


def seqs_dic_count_lc_nts(seqs_dic):
    """
    Count number of lowercase nucleotides in sequences stored in sequence
    dictionary.
    ...
    """
    assert seqs_dic, "Given sequence dictionary empty"
    c_lc = 0
    for seq_id in seqs_dic:
        c_lc += len(re.findall(r"[a-z]", seqs_dic[seq_id]))
    return c_lc
#######################################################################


def count_file_rows(in_file):
    """
    Count number of file rows for given input file.
    """
    ...
    output = subprocess.getoutput(check_cmd)
    row_count = int(output.strip())
    return row_count
#######################################################################


def bed_check_six_col_format(bed_file):
    """
    Check whether given .bed file has 6 columns.
    """
    ...
            break
    f.close()
    return six_col_format
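# For reference (not part of the diff): a valid 6-column .bed line has
# tab-separated chrom, start (0-based), end (1-based, exclusive), name,
# score and strand, e.g.:
#
# chr1    1000    1050    site1    2.37    +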
#######################################################################


def bed_check_unique_ids(bed_file):
    """
    Check whether .bed file (6 column format with IDs in column 4)
    has unique column 4 IDs.
    """
    ...
        return False
    else:
        return True
#######################################################################


def get_seq_lengths_from_seqs_dic(seqs_dic):
    """
    Given a dictionary of sequences, return dictionary of sequence lengths.
    Mapping is sequence ID -> sequence length.
    """
    ...
        seq_l = len(seqs_dic[seq_id])
        seq_len_dic[seq_id] = seq_l
    return seq_len_dic
#######################################################################


def bed_get_region_lengths(bed_file):
    """
    Read in .bed file, store and return region lengths in dictionary.
    key : region ID (.bed col4)
    ...
    """
    ...
        cols = line.strip().split("\t")
        site_s = int(cols[1])
        site_e = int(cols[2])
        site_id = cols[3]
        site_l = site_e - site_s
        assert (
            site_id not in id2len_dic
        ), 'column 4 IDs not unique in given .bed file "%s"' % (bed_file)
        id2len_dic[site_id] = site_l
    f.close()
    assert (
        id2len_dic
    ), 'No IDs read into dic (input file "%s" empty or malformed?)' % (bed_file)
    return id2len_dic
#######################################################################


def graphprot_get_param_dic(params_file):
    """
    Read in GraphProt .params file and store in dictionary.
    key = parameter
    ...
    """
    ...
            param_dic[par] = setting
    f.close()
    return param_dic
#######################################################################


def graphprot_filter_predictions_file(in_file, out_file, sc_thr=0):
    """
    Filter GraphProt .predictions file by given score threshold sc_thr.
    """
    OUTPRED = open(out_file, "w")
    with open(in_file) as f:
        ...
            OUTPRED.write("%s\n" % (row))
    f.close()
    OUTPRED.close()
#######################################################################


def fasta_read_in_ids(fasta_file):
    """
    Given a .fa file, read in header IDs in order appearing in file,
    and store in list.
    """
    ...
            ids_list.append(seq_id)
    f.close()
    return ids_list
#######################################################################


def graphprot_profile_calc_avg_profile(
    in_file, out_file, ap_extlr=5, seq_ids_list=False, method=1
):
    """
    Given a GraphProt .profile file, calculate average profiles and output
    average profile file.
    Average profile means that the position-wise scores will get smoothed
    out by calculating for each position a new score, taking a sequence
    ...
    """
    ...
    f.close()
    # Check number of IDs (# FASTA IDs has to be same as # site IDs).
    if seq_ids_list:
        c_seq_ids = len(seq_ids_list)
        c_site_ids = len(site_starts_dic)
        assert (
            c_seq_ids == c_site_ids
        ), "# sequence IDs != # site IDs (%i != %i)" % (c_seq_ids, c_site_ids)
    OUTPROF = open(out_file, "w")
    # For each site, calculate average profile scores list.
    for site_id in lists_dic:
        # Convert profile score list to average profile scores list.
        aps_list = list_moving_window_average_values(
            lists_dic[site_id], win_extlr=ap_extlr
        )
        start_pos = site_starts_dic[site_id]
        # Get original FASTA sequence ID.
        if seq_ids_list:
            site_id = seq_ids_list[site_id]
        for i, sc in enumerate(aps_list):
            ...
    ...
            site_starts_dic[cur_id] = pos
        # Case: new site (new column 1 ID).
        if cur_id != old_id:
            # Process old id scores.
            if scores_list:
                aps_list = list_moving_window_average_values(
                    scores_list, win_extlr=ap_extlr
                )
                start_pos = site_starts_dic[old_id]
                seq_id = old_id
                # Get original FASTA sequence ID.
                if seq_ids_list:
                    seq_id = seq_ids_list[old_id]
                ...
            ...
            # Add to scores_list.
            scores_list.append(score)
    f.close()
    # Process last block.
    if scores_list:
        aps_list = list_moving_window_average_values(
            scores_list, win_extlr=ap_extlr
        )
        start_pos = site_starts_dic[old_id]
        seq_id = old_id
        # Get original FASTA sequence ID.
        if seq_ids_list:
            seq_id = seq_ids_list[old_id]
        ...
            pos = i + start_pos + 1  # make 1-based.
            OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
    OUTPROF.close()
#######################################################################


def graphprot_profile_extract_peak_regions(
    in_file, out_file, max_merge_dist=0, sc_thr=0
):
    """
    Extract peak regions from GraphProt .profile file.
    Store the peak regions (defined as regions with scores >= sc_thr)
    to out_file in 6-column .bed format.
    ...
    """
    ...
        # Case: new site (new column 1 ID).
        if cur_id != old_id:
            # Process old id scores.
            if scores_list:
                # Extract peaks from region.
                peak_list = list_extract_peaks(
                    scores_list,
                    max_merge_dist=max_merge_dist,
                    coords="bed",
                    sc_thr=sc_thr,
                )
                start_pos = site_starts_dic[old_id]
                # Print out peaks in .bed format.
                for ln in peak_list:
                    peak_s = start_pos + ln[0]
                    peak_e = start_pos + ln[1]
                    site_id = "%s,%i" % (old_id, ln[2])
                    OUTPEAKS.write(
                        "%s\t%i\t%i"
                        "\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3])
                    )
                # Reset list.
                scores_list = []
            old_id = cur_id
            scores_list.append(score)
        else:
            ...
            scores_list.append(score)
    f.close()
    # Process last block.
    if scores_list:
        # Extract peaks from region.
        peak_list = list_extract_peaks(
            scores_list, max_merge_dist=max_merge_dist, coords="bed", sc_thr=sc_thr
        )
        start_pos = site_starts_dic[old_id]
        # Print out peaks in .bed format.
        for ln in peak_list:
            peak_s = start_pos + ln[0]
            peak_e = start_pos + ln[1]
            site_id = "%s,%i" % (old_id, ln[2])  # best score also 1-based.
            OUTPEAKS.write(
                "%s\t%i\t%i\t%s\t%f\t+\n" % (old_id, peak_s, peak_e, site_id, ln[3])
            )
    OUTPEAKS.close()
#######################################################################


def list_extract_peaks(in_list, max_merge_dist=0, coords="list", sc_thr=0):
    """
    Extract peak regions from list.
    Peak region is defined as region >= score threshold.

    coords=bed : peak start 0-based, peak end 1-based.
    ...
    """
    ...
                new_top_pos = peak_list[i][2]
                new_top_sc = peak_list[i][3]
                if peak_list[i][3] < peak_list[j][3]:
                    new_top_pos = peak_list[j][2]
                    new_top_sc = peak_list[j][3]
                new_peak = [
                    peak_list[i][0],
                    peak_list[j][1],
                    new_top_pos,
                    new_top_sc,
                ]
            # If two peaks were merged.
            if new_peak:
                merged_peak_list.append(new_peak)
                added_peaks_dic[i] = 1
                added_peaks_dic[j] = 1
    ...
        peak_list[i][1] += 1
        peak_list[i][2] += 1  # 1-base best score position too.
    return peak_list
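# A hedged worked example (not from the diff; the [start, end, top_pos,
# top_sc] entry layout is taken from the merging code above, list
# coordinates, parts of the function are elided here):
#
# list_extract_peaks([0.0, 1.0, 2.0, 0.0, 0.0, 3.0], sc_thr=0.5)
# -> [[1, 2, 2, 2.0], [5, 5, 5, 3.0]]
# With coords="bed", ends and top positions are shifted to 1-based:
# -> [[1, 3, 3, 2.0], [5, 6, 6, 3.0]]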
#######################################################################


def bed_peaks_to_genomic_peaks(
    peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False
):
    """
    Given a .bed file of sequence peak regions (possible coordinates from
    0 to length of s), convert peak coordinates to genomic coordinates.
    Do this by taking genomic regions of sequences as input.
    ...
    """
    ...
    with open(genomic_sites_bed) as f:
        for line in f:
            row = line.strip()
            cols = line.strip().split("\t")
            site_id = cols[3]
            assert (
                site_id not in id2row_dic
            ), 'column 4 IDs not unique in given .bed file "%s"' % (genomic_sites_bed)
            id2row_dic[site_id] = row
    f.close()

    # Read in peaks file and convert coordinates.
    OUTPEAKS = open(genomic_peak_file, "w")
    ...
            site_id = cols[0]
            site_s = int(cols[1])
            site_e = int(cols[2])
            site_id2 = cols[3]
            site_sc = float(cols[4])
            assert re.search(
                ".+,.+", site_id2
            ), 'regular expression failed for ID "%s"' % (site_id2)
            m = re.search(r".+,(\d+)", site_id2)
            sc_pos = int(m.group(1))  # 1-based.
            assert (
                site_id in id2row_dic
            ), 'site ID "%s" not found in genomic sites dictionary' % (site_id)
            row = id2row_dic[site_id]
            rowl = row.split("\t")
            gen_chr = rowl[0]
            gen_s = int(rowl[1])
            gen_e = int(rowl[2])
            ...
            new_sc_pos = sc_pos + gen_s
            if gen_pol == "-":
                new_s = gen_e - site_e
                new_e = gen_e - site_s
                new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" % (
                gen_chr,
                new_s,
                new_e,
                site_id,
                new_sc_pos,
                site_sc,
                gen_pol,
            )
            OUTPEAKS.write("%s\n" % (new_row))
            if print_rows:
                print(new_row)
    OUTPEAKS.close()
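# A hedged worked example of the minus-strand arithmetic above: for a
# genomic region chr1:100-200(-) (gen_s=100, gen_e=200) and a sequence
# peak with site_s=10, site_e=20, sc_pos=5:
#
# new_s = 200 - 20 = 180
# new_e = 200 - 10 = 190
# new_sc_pos = 200 - 5 + 1 = 196  (kept 1-based)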
#######################################################################


def diff_two_files_identical(file1, file2):
    """
    Check whether two files are identical. Return true if diff reports no
    differences.
    """
    ...
    if output:
        same = False
    return same
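# Note (not part of the diff): the standard library offers the same check
# without calling diff; a minimal sketch:
#
# import filecmp
#
# same = filecmp.cmp(file1, file2, shallow=False)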
#######################################################################