Mercurial > repos > recetox > freqsap

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/freqSAP.py	Fri Jul 18 13:21:36 2025 +0000
@@ -0,0 +1,312 @@
+import argparse
+import json
+import os
+import sys
+
+import pandas as pd
+import requests
+
+VARIANT_INDEX = "NCBI Reference SNP (rs) Report ALPHA"
+
+
+def get_protein_variation(accession: str) -> tuple[dict, pd.DataFrame]:
+    requestURL = f"https://www.ebi.ac.uk/proteins/api/variation?offset=0&size=-1&accession={accession}"
+    r = requests.get(requestURL, headers={"Accept": "application/json"})
+
+    if not r.ok:
+        r.raise_for_status()
+        sys.exit()
+
+    responseBody = r.text
+
+    data = json.loads(responseBody)[0]
+
+    features = pd.DataFrame(data.pop("features"))
+    return data, features
+
+
+def get_position(feature: dict) -> str:
+    """
+    Get the position of a feature in the protein sequence.
+
+    Args:
+        feature (dict): A feature dictionary containing 'begin' and 'end'.
+
+    Returns:
+        str: The position in the format "start-end".
+    """
+    begin = feature.get("begin")
+    end = feature.get("end")
+    if begin == end:
+        return str(begin)
+    return f"{feature.get('begin')}-{feature.get('end')}"
+
+
+def get_frequency(variant: str) -> str:
+    if not variant:
+        return ""
+
+    try:
+        freq_url = f"https://www.ncbi.nlm.nih.gov/snp/{variant}/download/frequency"
+        r = requests.get(freq_url, headers={"Accept": "application/json"})
+        if not r.ok:
+            r.raise_for_status()
+        return r.text
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching frequency data for variant {variant}: {e}")
+        return ""
+
+
+def parse_frequency_reponse(responseBody: str) -> tuple[dict, pd.DataFrame]:
+    """
+    Parse the frequency response body.
+
+    Args:
+        responseBody (str): The response body as a string.
+
+    Returns:
+        dict: Parsed JSON data.
+    """
+    if responseBody == "":
+        return {}, pd.DataFrame()
+
+    lines = responseBody.splitlines()
+    n_lines = len(lines)
+    i = 0
+
+    metadata = {}
+    header = []
+    rows = []
+
+    while i < n_lines:
+        line = lines[i]
+        tokens = line.split("\t")
+
+        if len(tokens) == 2:
+            key = tokens[0].strip("# ")
+            value = tokens[1].strip()
+            metadata[key] = value
+        elif len(tokens) > 2:
+            if tokens[0].startswith("#"):
+                header = [t.strip("# ") for t in tokens]
+            else:
+                row = list(map(lambda x: "NA" if x == "" else x, tokens))
+                rows.append(row)
+        elif len(tokens) == 1:
+            pass
+        else:
+            print(f"Unexpected line format at line {i}: {line}")
+            sys.exit(1)
+        i += 1
+        continue
+
+    df = pd.DataFrame(rows, columns=header)
+    df[VARIANT_INDEX] = metadata.get(VARIANT_INDEX)
+    return metadata, df
+
+
+def get_variant_id(feature: dict) -> str:
+    """
+    Get the variant ID from a feature.
+
+    Args:
+        feature (dict): A feature dictionary.
+
+    Returns:
+        str: The variant ID or None if not applicable.
+    """
+    xrefs = feature.get("xrefs", [])
+    for xref in xrefs:
+        if xref.get("id").startswith("rs"):
+            return xref.get("id")
+    return ""
+
+
+def combine_alternative_sequences(df: pd.DataFrame) -> pd.DataFrame:
+    if "mutatedType" not in df.columns:
+        return df
+    return df.groupby(["begin", "end", "variant_id"], dropna=False, as_index=False).agg(
+        {
+            col: (
+                (lambda x: ",".join(x.astype(str).unique()))
+                if col == "mutatedType"
+                else "first"
+            )
+            for col in df.columns
+        }
+    )
+
+
+def get_populations(regions: list[str]) -> set[str]:
+    """Generate subgroups for larger groups.
+
+    Args:
+        groups (list[str]): List of main groups to include.
+
+    Returns:
+        list[str]: List of all subgroups in the main group.
+    """
+    mapping: dict[str, set[str]] = {
+        "Africa": set(["African"]),
+        "North America": set(
+            [
+                "American",
+                "African American",
+                "Mexican",
+                "Cuban",
+                "European American",
+                "NativeAmerican",
+                "NativeHawaiian",
+            ]
+        ),
+        "Asia": set(
+            [
+                "Asian",
+                "East Asian",
+                "Central Asia",
+                "JAPANESE",
+                "KOREAN",
+                "South Asian",
+                "SouthAsian",
+            ]
+        ),
+        "Europe": set(
+            [
+                "Europe",
+                "European",
+                "Finnish from FINRISK project",
+                "Spanish controls",
+                "TWIN COHORT",
+            ]
+        ),
+        "Global": set(["Global", "Total"]),
+        "South America": set(
+            [
+                "Latin American 1",
+                "Latin American 2",
+                "Dominican",
+                "PuertoRican",
+                "SouthAmerican",
+            ]
+        ),
+        "Middle East": set(["Middle Eastern", "Near_East"]),
+        "Other": set(["Other"]),
+    }
+
+    return set.union(*[mapping.get(group, set()) for group in regions])
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Protein Variance CLI Application")
+    parser.add_argument(
+        "-a", "--accession", type=str, required=True, help="Protein accession number."
+    )
+    parser.add_argument(
+        "-p",
+        "--populations",
+        type=str,
+        required=True,
+        help="Comma-separated list of populations.",
+    )
+    parser.add_argument(
+        "-f",
+        "--output-format",
+        type=str,
+        choices=["xlsx", "tabular", "csv"],
+        default="tabular",
+        help="Output format: xlsx, tabular (tsv), or csv. Default is tabular.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-file",
+        type=str,
+        default="protein_variation.tsv",
+        help="Output file name.",
+    )
+
+    args = parser.parse_args()
+
+    populations = get_populations(args.populations.split(","))
+
+    _, features_df = get_protein_variation(args.accession)
+
+    features_df["variant_id"] = features_df.apply(get_variant_id, axis=1)
+    features_df["variation_position"] = features_df.apply(get_position, axis=1)
+    features_df = combine_alternative_sequences(features_df)
+
+    results = list(
+        zip(
+            *[
+                parse_frequency_reponse(get_frequency(variant))
+                for variant in features_df["variant_id"]
+            ]
+        )
+    )
+
+    metadata_df = pd.DataFrame(results[0])
+    frequencies_df = pd.concat(results[1])
+
+    merged = pd.concat([features_df, metadata_df], axis=1)
+    final_merged = pd.merge(merged, frequencies_df, on=VARIANT_INDEX, how="outer")
+
+    final_merged[["Ref Allele NA", "Ref Allele Prob"]] = final_merged[
+        "Ref Allele"
+    ].str.split("=", n=1, expand=True)
+
+    alt_alleles = final_merged["Alt Allele"].str.split(",", expand=True)
+    if alt_alleles.columns.size == 2:
+        final_merged[["Alt Allele 1", "Alt Allele 2"]] = final_merged[
+            "Alt Allele"
+        ].str.split(",", expand=True)
+        final_merged[["Alt Allele NA 1", "Alt Allele Prob 1"]] = final_merged[
+            "Alt Allele 1"
+        ].str.split("=", n=1, expand=True)
+        final_merged[["Alt Allele NA 2", "Alt Allele Prob 2"]] = final_merged[
+            "Alt Allele 2"
+        ].str.split("=", n=1, expand=True)
+    else:
+        final_merged[["Alt Allele NA 1", "Alt Allele Prob 1"]] = final_merged[
+            "Alt Allele"
+        ].str.split("=", n=1, expand=True)
+        final_merged[["Alt Allele NA 2", "Alt Allele Prob 2"]] = None
+
+    subset_cols: list[str] = [
+        "variation_position",
+        "consequenceType",
+        "wildType",
+        "mutatedType",
+        "variant_id",
+        "Alleles",
+        "Study",
+        "Population",
+        "Group",
+        "Samplesize",
+        "Ref Allele",
+        "Alt Allele",
+        "BioProject ID",
+        "BioSample ID",
+        "somaticStatus",
+        "Ref Allele NA",
+        "Ref Allele Prob",
+        "Alt Allele NA 1",
+        "Alt Allele Prob 1",
+        "Alt Allele NA 2",
+        "Alt Allele Prob 2",
+    ]
+
+    subset = final_merged[subset_cols]
+    subset = subset[subset["Population"].isin(populations)]
+
+    if args.output_format == "xlsx":
+        outdir = os.path.dirname(args.output_file)
+        outpath = os.path.join(outdir, "results.xlsx")
+        subset.to_excel(outpath, index=False, na_rep="NA", engine="openpyxl")
+        os.rename(outpath, args.output_file)
+    elif args.output_format == "csv":
+        subset.to_csv(args.output_file, index=False, na_rep="NA")
+    else:  # tabular/tsv
+        subset.to_csv(args.output_file, index=False, sep="\t", na_rep="NA")
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/freqSAP.xml	Fri Jul 18 13:21:36 2025 +0000
@@ -0,0 +1,138 @@
+<tool id="freqsap" name="freqSAP" version="1.0.0+galaxy0" profile="23.0" license="MIT">
+    <description>Get frequencies of single amino-acid polymorphisms based on nucleid-acid polymorphism for different populations from UniProt and DbSNP</description>
+
+    <creator>
+        <person
+            givenName="Helge"
+            familyName="Hecht"
+            url="https://github.com/hechth"
+            identifier="0000-0001-6744-996X" />
+        <organization
+            url="https://www.recetox.muni.cz/"
+            email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"
+            name="RECETOX MUNI" />
+    </creator>
+
+    <edam_topics>
+        <edam_topic>topic_0121</edam_topic>
+        <edam_topic>topic_3366</edam_topic>
+    </edam_topics>
+
+    <edam_operations>
+        <edam_operation>operation_3197</edam_operation>
+        <edam_operation>operation_2479</edam_operation>
+        <edam_operation>operation_2422</edam_operation>
+    </edam_operations>
+
+    <requirements>
+        <requirement type="package" version="2.3.0">pandas</requirement>
+        <requirement type="package" version="2.32.4">requests</requirement>
+        <requirement type="package" version="3.1.5">openpyxl</requirement>
+    </requirements>
+
+    <required_files>
+        <include path="freqSAP.py"/>
+    </required_files>
+
+    <command detect_errors="exit_code"><![CDATA[
+    python3 '${__tool_directory__}/freqSAP.py' -a '$accession' -p '$populations' -f '$output_format' -o '$output_file'
+    ]]></command>
+    <inputs>
+        <param argument="--accession" type="text" label="Protein Accession in UniProt" help="UniProt accession of the protein to fetch variation data for.">
+            <sanitizer invalid_char="">
+                <valid initial="string.letters,string.digits">
+                    <add value="_" />
+                </valid>
+            </sanitizer>
+            <validator type="regex">[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}</validator>
+        </param>
+        <param argument="--populations" type="select" multiple="true" optional="false" label="Regions from which to choose the populations" help="The regions are broken down into sub-populations based on available entries in DbSNP.">
+            <option value="Africa">Africa</option>
+            <option value="North America">North America</option>
+            <option value="Asia">Asia</option>
+            <option value="Europe">Europe</option>
+            <option value="South America">South America</option>
+            <option value="Middle East">Middle East</option>
+            <option value="Other">Other</option>
+            <option value="Global" selected="true">Global</option>
+        </param>
+        <param argument="--output_format" type="select" label="Output Format" help="Format in which to store the output file.">
+            <option value="tabular" selected="true">Tab Separated Value (tabular)</option>
+            <option value="csv">Comma Separated Value (csv)</option>
+            <option value="xlsx">Excel (xlsx)</option>
+       </param>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="tabular" label="${tool.name} of ${accession}">
+            <change_format>
+                <when input="output_format" value="xlsx" format="xlsx"/>
+                <when input="output_format" value="csv" format="csv"/>
+            </change_format>
+        </data>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="accession" value="P0DJI9"/>
+            <param name="populations" value="Europe,Asia"/>
+            <output name="output_file" ftype="tabular">
+                <assert_contents>
+                    <has_n_columns n="21"/>
+                    <has_n_lines n="807" delta="20"/>
+                    <has_text text="rs1044068853"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="accession" value="P05067"/>
+            <param name="populations" value="Global"/>
+            <param name="output_format" value="xlsx"/>
+            <output name="output_file" ftype="xlsx">
+                <assert_contents>
+                    <has_size size="161" delta="10"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+freqSAP: Frequency of Single Amino-Acid Polymorphisms
+=====================================================
+
+This tool retrieves the frequencies of single amino-acid polymorphisms (SAPs) for a given protein accession from UniProt and DbSNP, broken down by population regions.
+
+Inputs
+------
+
+- **Protein Accession in UniProt:**
+  Enter the UniProt accession (e.g., ``P12345``) for the protein of interest.
+
+- **Regions from which to choose the populations:**
+  Select one or more population regions. The tool will fetch SAP frequencies for the selected regions.
+
+- **Output Format:**
+  Choose the format for the output file:
+
+  - Tab Separated Value (``tabular``)
+  - Comma Separated Value (``csv``)
+  - Excel (``xlsx``)
+
+Outputs
+-------
+
+- A table listing SAP frequencies for the selected protein and populations, in the chosen format.
+
+Example Usage
+-------------
+
+1. Enter a UniProt accession (e.g., ``P69905``).
+2. Select ``Europe`` and ``Asia`` as populations.
+3. Choose ``csv`` as the output format.
+4. Run the tool to download the SAP frequency table.
+    ]]></help>
+
+    <citations>
+        <citation type="doi">10.1093/nar/29.1.308</citation>
+        <citation type="doi">10.1093/nar/gkaf394</citation>
+    </citations>
+</tool>
\ No newline at end of file