# HG changeset patch
# User galaxyp
# Date 1694695759 0
# Node ID 867f17ede7f376659d8bf31b8fcf3d5d15a543b3
# Parent  6d93529d19d40556ccca504e2137d084ccb9c52f
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/calisp commit 42e5dfeaa309e6ac17b4616314498a3b628272d2

diff -r 6d93529d19d4 -r 867f17ede7f3 benchmarking.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/benchmarking.py	Thu Sep 14 12:49:19 2023 +0000
@@ -0,0 +1,270 @@
+import argparse
+import os
+
+import numpy as np
+import pandas as pd
+
+# Define the ArgumentParser
+parser = argparse.ArgumentParser(description="List of natural abundances of the isotopes")
+
+parser.add_argument(
+    "--input", type=str, metavar="data", help="Input file/folder", required=True
+)
+
+parser.add_argument(
+    "--isotope_abundance_matrix",
+    type=str,
+    metavar="data",
+    help="Isotope abundance matrix",
+    required=True,
+)
+parser.add_argument(
+    "--isotope",
+    type=str,
+    metavar="ISOTOPE",
+    help="Isotope",
+    required=True,
+)
+parser.add_argument(
+    "--out_summary",
+    type=str,
+    metavar="output",
+    help="Peptide summary output",
+    required=False,
+)
+parser.add_argument(
+    "--out_filtered", type=str, metavar="output", help="Filtered output", required=False
+)
+parser.add_argument(
+    "--nominal_values",
+    type=str,
+    metavar="nominal_values",
+    help="Table giving nominal values",
+    default=None,
+    required=False,
+)
+
+# Indicate end of argument definitions and parse args
+args = parser.parse_args()
+
+
+def parse_nominal_values(filename):
+    nominal_values = {}
+    if not filename:
+        return nominal_values
+    with open(filename) as fh:
+        for line in fh:
+            line = line.strip()
+            if len(line) == 0 or line[0] == "#":
+                continue
+            line = line.split()
+            nominal_values[line[0]] = float(line[1])  # store as a number; the benchmark math divides it by 100
+    return nominal_values
+
+
+# Benchmarking section
+# the functions for optimising calis-p data
+
+
+def load_calisp_data(filename, factor):
+    # (1) load data
+    file_count = 1
+    if os.path.isdir(filename):
+        file_data = []
+        file_count = len(os.listdir(filename))
+        for f in os.listdir(filename):
+            f = os.path.join(filename, f)
+            file_data.append(pd.read_feather(f))
+            base, _ = os.path.splitext(f)
+            file_data[-1].to_csv(f"{base}.tsv", sep="\t", index=False)
+        data = pd.concat(file_data)
+    else:
+        data = pd.read_feather(filename)
+        base, _ = os.path.splitext(filename)
+        data.to_csv(f"{base}.tsv", sep="\t", index=False)
+
+    file_success_count = len(data["ms_run"].unique())
+    # (2) calculate deltas
+    # ((1-f)/f) - 1 == 1/f - 2
+    data["delta_na"] = data["ratio_na"] / ((1 / factor) - 2) * 1000
+    data["delta_fft"] = data["ratio_fft"] / ((1 / factor) - 2) * 1000
+    print(
+        f"Loaded {len(data.index)} isotopic patterns from {file_success_count}/{file_count} file(s)"
+    )
+    return data
+
+
+def filter_calisp_data(data, target):
+    if target.lower() == "na":
+        subdata = data.loc[
+            lambda df: (df["flag_peak_at_minus_one_pos"] == False)  # noqa: E712
+            & (df["flag_pattern_is_wobbly"] == False)  # noqa: E712
+            & (df["flag_psm_has_low_confidence"] == False)  # noqa: E712
+            & (df["flag_psm_is_ambiguous"] == False)  # noqa: E712
+            & (df["flag_pattern_is_contaminated"] == False)  # noqa: E712
+            & (df["flag_peptide_assigned_to_multiple_bins"] == False),  # noqa: E712
+            :,
+        ]
+    elif target.lower() == "fft":
+        subdata = data.loc[
+            lambda df: (df["error_fft"] < 0.001)
+            & (df["flag_peptide_assigned_to_multiple_bins"] == False),  # noqa: E712
+            :,
+        ]
+    elif target.lower() == "clumpy":
+        subdata = data.loc[
+            lambda df: (df["error_clumpy"] < 0.001)
(df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 + :, + ] + + print( + f"{len(subdata.index)} ({len(subdata.index)/len(data.index)*100:.1f}%) remaining after filters." + ) + return subdata + + +def estimate_clumpiness(data): + subdata = data.loc[lambda df: df["error_clumpy"] < 0.001, :] + clumpiness = [] + for c in ["c1", "c2", "c3", "c4", "c5", "c6"]: + try: + count, division = np.histogram(subdata[c], bins=50) + count = count[1:-1] + opt = 0.02 * np.where(count == count.max())[0][0] / 0.96 + clumpiness.append(opt) + except ValueError: + pass + return clumpiness / sum(clumpiness) + + +# the function for benchmarking +def benchmark_sip_mock_community_data(data, factor, nominal_values): + background_isotope = 1 - factor + background_unlabelled = factor + + # For false positive discovery rates we set the threshold at the isotope/unlabelled associated with 1/4 of a generation + # of labeling. The E. coli values (1.7, 4.2 and 7.1) are for 1 generation at 1, 5 and 10% label, + # and we take the background (1.07) into account as well. + thresholds = { + 1: 1.07 + (1.7 - 1.07) / 4, + 5: 1.07 + (4.2 - 1.07) / 4, + 10: 1.07 + (7.1 - 1.07) / 4, + } + + filenames = data["ms_run"].unique() + for fname in filenames: + print(f"Using nominal value {nominal_values.get(fname, 0)} for {fname}") + + bin_names = data["bins"].unique() + peptide_sequences = data["peptide"].unique() + benchmarking = pd.DataFrame( + columns=[ + "file", + "bins", + "% label", + "ratio", + "peptide", + "psm_mz", + "n(patterns)", + "mean intensity", + "ratio_NA median", + "N mean", + "ratio_NA SEM", + "ratio_FFT median", + "ratio_FFT SEM", + "False Positive", + ] + ) + false_positives = 0 + for p in peptide_sequences: + pep_data = data.loc[lambda df: df["peptide"] == p, :] + for b in bin_names: + # bindata = data.loc[lambda df: df["bins"] == b, :] + for f in filenames: + nominal_value = nominal_values.get(fname, 0) + unlabeled_fraction = 1 - nominal_value / 100 + U = unlabeled_fraction * background_unlabelled + I = nominal_value / 100 + unlabeled_fraction * background_isotope + ratio = I / U * 100 + pepfiledata = pep_data.loc[lambda df: df["ms_run"] == f, :] + is_false_positive = 0 + try: + if ( + b != "K12" + and pepfiledata["ratio_na"].median() > thresholds[nominal_value] + ): + is_false_positive = 1 + false_positives += 1 + except KeyError: + pass + benchmarking.loc[len(benchmarking)] = [ + f, + b, + nominal_value, + ratio, + p, + pepfiledata["psm_mz"].median(), + len(pepfiledata.index), + pepfiledata["pattern_total_intensity"].mean(), + pepfiledata["ratio_na"].median(), + pepfiledata["psm_neutrons"].mean(), + pepfiledata["ratio_na"].sem(), + pepfiledata["ratio_fft"].median(), + pepfiledata["ratio_fft"].sem(), + is_false_positive, + ] + + benchmarking = benchmarking.sort_values(["bins", "peptide"]) + benchmarking = benchmarking.reset_index(drop=True) + return benchmarking + + +rowcol = { + "13C": (0, 1), + "14C": (0, 2), + "15N": (1, 1), + "17O": (2, 1), + "18O": (2, 2), + "2H": (3, 1), + "3H": (3, 2), + "33S": (4, 1), + "34S": (4, 2), + "36S": (4, 3), +} +with open(args.isotope_abundance_matrix) as iamf: + matrix = [] + for line in iamf: + line = line.strip() + line = line.split("#")[0] + if line == "": + continue + matrix.append([float(x) for x in line.split()]) +factor = matrix[rowcol[args.isotope][0]][rowcol[args.isotope][1]] +print(f"Using factor {factor}") + + +# cleaning and filtering data +data = load_calisp_data(args.input, factor) + +if args.out_filtered: + data = filter_calisp_data(data, "na") + 
data["peptide_clean"] = data["peptide"] + data["peptide_clean"] = data["peptide_clean"].replace("'Oxidation'", "", regex=True) + data["peptide_clean"] = data["peptide_clean"].replace( + "'Carbamidomethyl'", "", regex=True + ) + data["peptide_clean"] = data["peptide_clean"].replace(r"\s*\[.*\]", "", regex=True) + + data["ratio_na"] = data["ratio_na"] * 100 + data["ratio_fft"] = data["ratio_fft"] * 100 + data.to_csv(args.out_filtered, sep="\t", index=False) + + # The column "% label" indicates the amount of label applied (percentage of label in the glucose). The amount of + # labeled E. coli cells added corresponded to 1 generation of labeling (50% of E. coli cells were labeled in + # all experiments except controls) + +if args.out_summary: + nominal_values = parse_nominal_values(args.nominal_values) + benchmarks = benchmark_sip_mock_community_data(data, factor, nominal_values) + benchmarks.to_csv(args.out_summary, sep="\t", index=False) diff -r 6d93529d19d4 -r 867f17ede7f3 calisp.xml --- a/calisp.xml Thu Jun 01 08:34:14 2023 +0000 +++ b/calisp.xml Thu Sep 14 12:49:19 2023 +0000 @@ -1,7 +1,7 @@ Estimate isotopic composition of peptides from proteomics mass spectrometry data - 3.0.10 + 3.0.13 0 https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242 @@ -30,8 +30,28 @@ --bin_delimiter '$bin_delimiter' --threads "\${GALAXY_SLOTS:-1}" --isotope $isotope - $compute_clumps && -'$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/ + $compute_clumps +#if $isotope_abundance_matrix + --isotope_abundance_matrix '$isotope_abundance_matrix' +#end if + +#if $isotope_abundance_matrix + && ISOTOPE_ABUNDANCE_MATRIX="$isotope_abundance_matrix" +#else + && ISOTOPE_ABUNDANCE_MATRIX="\$(python -c 'import site; print(f"{site.getsitepackages()[0]}/calisp/isotope_matrix.txt")')" +#end if + + && python '$__tool_directory__/benchmarking.py' + --input calisp-output/ + --isotope_abundance_matrix "\$ISOTOPE_ABUNDANCE_MATRIX" + --isotope $isotope +#if $benchmark_cond.benchmark == 'yes' + --out_filtered '$filtered' + --out_summary '$summary' + #if $benchmark_cond.nominal_values + --nominal_values '$benchmark_cond.nominal_values' + #end if +#end if ]]> @@ -59,18 +79,36 @@ + + + + + + + + + + + + + benchmark_cond['benchmark'] == 'yes' + + + benchmark_cond['benchmark'] == 'yes' + + + + + + + + + + --> + + + + + + 10.1186/s40168-022-01454-1 10.1073/pnas.1722325115 - 10.1101/2021.03.29.437612 10.1093/bioinformatics/bty046 - \ No newline at end of file + diff -r 6d93529d19d4 -r 867f17ede7f3 feather2tsv.py --- a/feather2tsv.py Thu Jun 01 08:34:14 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -#!/usr/bin/env python -""" -based on https://github.com/kinestetika/Calisp/blob/master/benchmarking/sip%20benchmarking.ipynb -""" - -import argparse -import os - -import pandas as pd - - -def load_calisp_data(filename): - - # (1) load data - if os.path.isdir(filename): - file_data = [] - for f in os.listdir(filename): - if not f.endswith(".feather"): - continue - f = os.path.join(filename, f) - file_data.append(pd.read_feather(f)) - base, _ = os.path.splitext(f) - file_data[-1].to_csv(f"{base}.tsv", sep="\t") - data = pd.concat(file_data) - else: - data = pd.read_feather(filename) - base, _ = os.path.splitext(filename) - data.to_csv(f"{base}.tsv", sep="\t") - - -parser = argparse.ArgumentParser(description='feather2tsv') -parser.add_argument('--calisp_output', required=True, help='feather file') -args = parser.parse_args() - -data = 
diff -r 6d93529d19d4 -r 867f17ede7f3 test-data/summary.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/summary.tsv	Thu Sep 14 12:49:19 2023 +0000
@@ -0,0 +1,4 @@
+file	bins	% label	ratio	peptide	psm_mz	n(patterns)	mean intensity	ratio_NA median	N mean	ratio_NA SEM	ratio_FFT median	ratio_FFT SEM	False Positive
+MKH_260min_1800ng	HOMO	0	8944.383821398755	HEEAPGHRPTTNPNASK [] []	461.476684570313	11	860150.8512961648	0.9935866375027093	0.0	0.05207457939795477	1.0161266422773116	0.04270105864239165	0
+MKH_260min_1800ng	HOMO	0	8944.383821398755	HKQSLEEAAK [] []	380.871520996094	2	537188.1938476562	1.2002766734343886	0.0	0.0703566320302016	1.2079926603082054	0.13986270898184525	0
+MKH_260min_1800ng	HOMO	0	8944.383821398755	NHEEEMKDLR ['Oxidation'] [5]	439.534698486328	10	1699843.8217773438	0.988856432577341	0.0	0.07057874972895482	0.9880480975986452	0.06010014927442129	0
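
Note on standalone use: outside Galaxy, the new benchmarking.py can be pointed at a folder of calisp .feather output directly. A minimal sketch, assuming calisp has already written its results to calisp-output/; the matrix, nominal-values and output file names below are placeholders, and only --input, --isotope_abundance_matrix and --isotope are required by the argument parser above:

    # minimal sketch; file names are placeholders, calisp-output/ holds the .feather files
    python benchmarking.py \
        --input calisp-output/ \
        --isotope_abundance_matrix isotope_matrix.txt \
        --isotope 13C \
        --out_filtered filtered.tsv \
        --out_summary summary.tsv \
        --nominal_values nominal_values.txt

The nominal-values table is read by parse_nominal_values as whitespace-separated columns (ms_run name, then percent of label applied), with lines starting with '#' ignored, e.g. a single line "MKH_260min_1800ng 0" for the unlabelled test run.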