# HG changeset patch
# User galaxyp
# Date 1694695759 0
# Node ID 867f17ede7f376659d8bf31b8fcf3d5d15a543b3
# Parent  6d93529d19d40556ccca504e2137d084ccb9c52f
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 42e5dfeaa309e6ac17b4616314498a3b628272d2

diff -r 6d93529d19d4 -r 867f17ede7f3 benchmarking.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/benchmarking.py	Thu Sep 14 12:49:19 2023 +0000
@@ -0,0 +1,270 @@
+import argparse
+import os
+
+import numpy as np
+import pandas as pd
+
+# Define the ArgumentParser
+parser = argparse.ArgumentParser(description="List of natural abundances of the isotopes")
+
+parser.add_argument(
+    "--input", type=str, metavar="data", help="Input file/folder", required=True
+)
+
+parser.add_argument(
+    "--isotope_abundance_matrix",
+    type=str,
+    metavar="data",
+    help="Isotope abundance matrix",
+    required=True,
+)
+parser.add_argument(
+    "--isotope",
+    type=str,
+    metavar="ISOTOPE",
+    help="Isotope",
+    required=True,
+)
+parser.add_argument(
+    "--out_summary",
+    type=str,
+    metavar="output",
+    help="Peptide summary output",
+    required=False,
+)
+parser.add_argument(
+    "--out_filtered", type=str, metavar="output", help="Filtered output", required=False
+)
+parser.add_argument(
+    "--nominal_values",
+    type=str,
+    metavar="nominal_values",
+    help="Table giving nominal values",
+    default=None,
+    required=False,
+)
+
+# Indicate end of argument definitions and parse args
+args = parser.parse_args()
+
+
+def parse_nominal_values(filename):
+    nominal_values = {}
+    if not filename:
+        return nominal_values
+    with open(filename) as fh:
+        for line in fh:
+            line = line.strip()
+            if len(line) == 0 or line[0] == "#":
+                continue
+            line = line.split()
+            nominal_values[line[0]] = float(line[1])  # store as a number; the benchmark math divides it by 100
+    return nominal_values
+
+
+# Benchmarking section
+# the functions for optimising calis-p data
+
+
+def load_calisp_data(filename, factor):
+    # (1) load data
+    file_count = 1
+    if os.path.isdir(filename):
+        file_data = []
+        file_count = len(os.listdir(filename))
+        for f in os.listdir(filename):
+            f = os.path.join(filename, f)
+            file_data.append(pd.read_feather(f))
+            base, _ = os.path.splitext(f)
+            file_data[-1].to_csv(f"{base}.tsv", sep="\t", index=False)
+        data = pd.concat(file_data)
+    else:
+        data = pd.read_feather(filename)
+        base, _ = os.path.splitext(filename)
+        data.to_csv(f"{base}.tsv", sep="\t", index=False)
+
+    file_success_count = len(data["ms_run"].unique())
+    # (2) calculate deltas
+    # ((1-f)/f) - 1 == 1/f - 2
+    data["delta_na"] = data["ratio_na"] / ((1 / factor) - 2) * 1000
+    data["delta_fft"] = data["ratio_fft"] / ((1 / factor) - 2) * 1000
+    print(
+        f"Loaded {len(data.index)} isotopic patterns from {file_success_count}/{file_count} file(s)"
+    )
+    return data
+
+
+def filter_calisp_data(data, target):
+    if target.lower() == "na":
+        subdata = data.loc[
+            lambda df: (df["flag_peak_at_minus_one_pos"] == False)  # noqa: E712
+            & (df["flag_pattern_is_wobbly"] == False)  # noqa: E712
+            & (df["flag_psm_has_low_confidence"] == False)  # noqa: E712
+            & (df["flag_psm_is_ambiguous"] == False)  # noqa: E712
+            & (df["flag_pattern_is_contaminated"] == False)  # noqa: E712
+            & (df["flag_peptide_assigned_to_multiple_bins"] == False),  # noqa: E712
+            :,
+        ]
+    elif target.lower() == "fft":
+        subdata = data.loc[
+            lambda df: (df["error_fft"] < 0.001)
+            & (df["flag_peptide_assigned_to_multiple_bins"] == False),  # noqa: E712
+            :,
+        ]
+    elif target.lower() == "clumpy":
+        subdata = data.loc[
+            lambda df: (df["error_clumpy"] < 0.001)
(df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 + :, + ] + + print( + f"{len(subdata.index)} ({len(subdata.index)/len(data.index)*100:.1f}%) remaining after filters." + ) + return subdata + + +def estimate_clumpiness(data): + subdata = data.loc[lambda df: df["error_clumpy"] < 0.001, :] + clumpiness = [] + for c in ["c1", "c2", "c3", "c4", "c5", "c6"]: + try: + count, division = np.histogram(subdata[c], bins=50) + count = count[1:-1] + opt = 0.02 * np.where(count == count.max())[0][0] / 0.96 + clumpiness.append(opt) + except ValueError: + pass + return clumpiness / sum(clumpiness) + + +# the function for benchmarking +def benchmark_sip_mock_community_data(data, factor, nominal_values): + background_isotope = 1 - factor + background_unlabelled = factor + + # For false positive discovery rates we set the threshold at the isotope/unlabelled associated with 1/4 of a generation + # of labeling. The E. coli values (1.7, 4.2 and 7.1) are for 1 generation at 1, 5 and 10% label, + # and we take the background (1.07) into account as well. + thresholds = { + 1: 1.07 + (1.7 - 1.07) / 4, + 5: 1.07 + (4.2 - 1.07) / 4, + 10: 1.07 + (7.1 - 1.07) / 4, + } + + filenames = data["ms_run"].unique() + for fname in filenames: + print(f"Using nominal value {nominal_values.get(fname, 0)} for {fname}") + + bin_names = data["bins"].unique() + peptide_sequences = data["peptide"].unique() + benchmarking = pd.DataFrame( + columns=[ + "file", + "bins", + "% label", + "ratio", + "peptide", + "psm_mz", + "n(patterns)", + "mean intensity", + "ratio_NA median", + "N mean", + "ratio_NA SEM", + "ratio_FFT median", + "ratio_FFT SEM", + "False Positive", + ] + ) + false_positives = 0 + for p in peptide_sequences: + pep_data = data.loc[lambda df: df["peptide"] == p, :] + for b in bin_names: + # bindata = data.loc[lambda df: df["bins"] == b, :] + for f in filenames: + nominal_value = nominal_values.get(fname, 0) + unlabeled_fraction = 1 - nominal_value / 100 + U = unlabeled_fraction * background_unlabelled + I = nominal_value / 100 + unlabeled_fraction * background_isotope + ratio = I / U * 100 + pepfiledata = pep_data.loc[lambda df: df["ms_run"] == f, :] + is_false_positive = 0 + try: + if ( + b != "K12" + and pepfiledata["ratio_na"].median() > thresholds[nominal_value] + ): + is_false_positive = 1 + false_positives += 1 + except KeyError: + pass + benchmarking.loc[len(benchmarking)] = [ + f, + b, + nominal_value, + ratio, + p, + pepfiledata["psm_mz"].median(), + len(pepfiledata.index), + pepfiledata["pattern_total_intensity"].mean(), + pepfiledata["ratio_na"].median(), + pepfiledata["psm_neutrons"].mean(), + pepfiledata["ratio_na"].sem(), + pepfiledata["ratio_fft"].median(), + pepfiledata["ratio_fft"].sem(), + is_false_positive, + ] + + benchmarking = benchmarking.sort_values(["bins", "peptide"]) + benchmarking = benchmarking.reset_index(drop=True) + return benchmarking + + +rowcol = { + "13C": (0, 1), + "14C": (0, 2), + "15N": (1, 1), + "17O": (2, 1), + "18O": (2, 2), + "2H": (3, 1), + "3H": (3, 2), + "33S": (4, 1), + "34S": (4, 2), + "36S": (4, 3), +} +with open(args.isotope_abundance_matrix) as iamf: + matrix = [] + for line in iamf: + line = line.strip() + line = line.split("#")[0] + if line == "": + continue + matrix.append([float(x) for x in line.split()]) +factor = matrix[rowcol[args.isotope][0]][rowcol[args.isotope][1]] +print(f"Using factor {factor}") + + +# cleaning and filtering data +data = load_calisp_data(args.input, factor) + +if args.out_filtered: + data = filter_calisp_data(data, "na") + 
data["peptide_clean"] = data["peptide"] + data["peptide_clean"] = data["peptide_clean"].replace("'Oxidation'", "", regex=True) + data["peptide_clean"] = data["peptide_clean"].replace( + "'Carbamidomethyl'", "", regex=True + ) + data["peptide_clean"] = data["peptide_clean"].replace(r"\s*\[.*\]", "", regex=True) + + data["ratio_na"] = data["ratio_na"] * 100 + data["ratio_fft"] = data["ratio_fft"] * 100 + data.to_csv(args.out_filtered, sep="\t", index=False) + + # The column "% label" indicates the amount of label applied (percentage of label in the glucose). The amount of + # labeled E. coli cells added corresponded to 1 generation of labeling (50% of E. coli cells were labeled in + # all experiments except controls) + +if args.out_summary: + nominal_values = parse_nominal_values(args.nominal_values) + benchmarks = benchmark_sip_mock_community_data(data, factor, nominal_values) + benchmarks.to_csv(args.out_summary, sep="\t", index=False) diff -r 6d93529d19d4 -r 867f17ede7f3 calisp.xml --- a/calisp.xml Thu Jun 01 08:34:14 2023 +0000 +++ b/calisp.xml Thu Sep 14 12:49:19 2023 +0000 @@ -1,7 +1,7 @@ Estimate isotopic composition of peptides from proteomics mass spectrometry data - 3.0.10 + 3.0.13 0 https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242 @@ -30,8 +30,28 @@ --bin_delimiter '$bin_delimiter' --threads "\${GALAXY_SLOTS:-1}" --isotope $isotope - $compute_clumps && -'$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/ + $compute_clumps +#if $isotope_abundance_matrix + --isotope_abundance_matrix '$isotope_abundance_matrix' +#end if + +#if $isotope_abundance_matrix + && ISOTOPE_ABUNDANCE_MATRIX="$isotope_abundance_matrix" +#else + && ISOTOPE_ABUNDANCE_MATRIX="\$(python -c 'import site; print(f"{site.getsitepackages()[0]}/calisp/isotope_matrix.txt")')" +#end if + + && python '$__tool_directory__/benchmarking.py' + --input calisp-output/ + --isotope_abundance_matrix "\$ISOTOPE_ABUNDANCE_MATRIX" + --isotope $isotope +#if $benchmark_cond.benchmark == 'yes' + --out_filtered '$filtered' + --out_summary '$summary' + #if $benchmark_cond.nominal_values + --nominal_values '$benchmark_cond.nominal_values' + #end if +#end if ]]> @@ -59,18 +79,36 @@ + + + + + + + + + + + + + benchmark_cond['benchmark'] == 'yes' + + + benchmark_cond['benchmark'] == 'yes' + + + + + + + + + + --> + + + + + + 10.1186/s40168-022-01454-1 10.1073/pnas.1722325115 - 10.1101/2021.03.29.437612 10.1093/bioinformatics/bty046 - \ No newline at end of file + diff -r 6d93529d19d4 -r 867f17ede7f3 feather2tsv.py --- a/feather2tsv.py Thu Jun 01 08:34:14 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -#!/usr/bin/env python -""" -based on https://github.com/kinestetika/Calisp/blob/master/benchmarking/sip%20benchmarking.ipynb -""" - -import argparse -import os - -import pandas as pd - - -def load_calisp_data(filename): - - # (1) load data - if os.path.isdir(filename): - file_data = [] - for f in os.listdir(filename): - if not f.endswith(".feather"): - continue - f = os.path.join(filename, f) - file_data.append(pd.read_feather(f)) - base, _ = os.path.splitext(f) - file_data[-1].to_csv(f"{base}.tsv", sep="\t") - data = pd.concat(file_data) - else: - data = pd.read_feather(filename) - base, _ = os.path.splitext(filename) - data.to_csv(f"{base}.tsv", sep="\t") - - -parser = argparse.ArgumentParser(description='feather2tsv') -parser.add_argument('--calisp_output', required=True, help='feather file') -args = parser.parse_args() - -data = 
diff -r 6d93529d19d4 -r 867f17ede7f3 test-data/summary.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/summary.tsv	Thu Sep 14 12:49:19 2023 +0000
@@ -0,0 +1,4 @@
+file	bins	% label	ratio	peptide	psm_mz	n(patterns)	mean intensity	ratio_NA median	N mean	ratio_NA SEM	ratio_FFT median	ratio_FFT SEM	False Positive
+MKH_260min_1800ng	HOMO	0	8944.383821398755	HEEAPGHRPTTNPNASK [] []	461.476684570313	11	860150.8512961648	0.9935866375027093	0.0	0.05207457939795477	1.0161266422773116	0.04270105864239165	0
+MKH_260min_1800ng	HOMO	0	8944.383821398755	HKQSLEEAAK [] []	380.871520996094	2	537188.1938476562	1.2002766734343886	0.0	0.0703566320302016	1.2079926603082054	0.13986270898184525	0
+MKH_260min_1800ng	HOMO	0	8944.383821398755	NHEEEMKDLR ['Oxidation'] [5]	439.534698486328	10	1699843.8217773438	0.988856432577341	0.0	0.07057874972895482	0.9880480975986452	0.06010014927442129	0
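
Note on standalone use: outside Galaxy, the new benchmarking.py can be pointed at a folder of calisp .feather output directly. A minimal sketch, assuming calisp has already written its results to calisp-output/; the matrix, nominal-values and output file names below are placeholders, and only --input, --isotope_abundance_matrix and --isotope are required by the argument parser above:

    # minimal sketch; file names are placeholders, calisp-output/ holds the .feather files
    python benchmarking.py \
        --input calisp-output/ \
        --isotope_abundance_matrix isotope_matrix.txt \
        --isotope 13C \
        --out_filtered filtered.tsv \
        --out_summary summary.tsv \
        --nominal_values nominal_values.txt

The nominal-values table is read by parse_nominal_values as whitespace-separated columns (ms_run name, then percent of label applied), with lines starting with '#' ignored, e.g. a single line "MKH_260min_1800ng 0" for the unlabelled test run.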