Mercurial > repos > recetox > freqsap
changeset 0:ddabfd6ee2a2 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/main/tools/freqsap commit 202a898874d0de51b9923430ea0ef3040084c8d0
author | recetox |
---|---|
date | Fri, 18 Jul 2025 13:21:36 +0000 |
parents | |
children | |
files | freqSAP.py freqSAP.xml |
diffstat | 2 files changed, 450 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/freqSAP.py Fri Jul 18 13:21:36 2025 +0000 @@ -0,0 +1,312 @@ +import argparse +import json +import os +import sys + +import pandas as pd +import requests + +VARIANT_INDEX = "NCBI Reference SNP (rs) Report ALPHA" + + +def get_protein_variation(accession: str) -> tuple[dict, pd.DataFrame]: + requestURL = f"https://www.ebi.ac.uk/proteins/api/variation?offset=0&size=-1&accession={accession}" + r = requests.get(requestURL, headers={"Accept": "application/json"}) + + if not r.ok: + r.raise_for_status() + sys.exit() + + responseBody = r.text + + data = json.loads(responseBody)[0] + + features = pd.DataFrame(data.pop("features")) + return data, features + + +def get_position(feature: dict) -> str: + """ + Get the position of a feature in the protein sequence. + + Args: + feature (dict): A feature dictionary containing 'begin' and 'end'. + + Returns: + str: The position in the format "start-end". + """ + begin = feature.get("begin") + end = feature.get("end") + if begin == end: + return str(begin) + return f"{feature.get('begin')}-{feature.get('end')}" + + +def get_frequency(variant: str) -> str: + if not variant: + return "" + + try: + freq_url = f"https://www.ncbi.nlm.nih.gov/snp/{variant}/download/frequency" + r = requests.get(freq_url, headers={"Accept": "application/json"}) + if not r.ok: + r.raise_for_status() + return r.text + except requests.exceptions.RequestException as e: + print(f"Error fetching frequency data for variant {variant}: {e}") + return "" + + +def parse_frequency_reponse(responseBody: str) -> tuple[dict, pd.DataFrame]: + """ + Parse the frequency response body. + + Args: + responseBody (str): The response body as a string. + + Returns: + dict: Parsed JSON data. + """ + if responseBody == "": + return {}, pd.DataFrame() + + lines = responseBody.splitlines() + n_lines = len(lines) + i = 0 + + metadata = {} + header = [] + rows = [] + + while i < n_lines: + line = lines[i] + tokens = line.split("\t") + + if len(tokens) == 2: + key = tokens[0].strip("# ") + value = tokens[1].strip() + metadata[key] = value + elif len(tokens) > 2: + if tokens[0].startswith("#"): + header = [t.strip("# ") for t in tokens] + else: + row = list(map(lambda x: "NA" if x == "" else x, tokens)) + rows.append(row) + elif len(tokens) == 1: + pass + else: + print(f"Unexpected line format at line {i}: {line}") + sys.exit(1) + i += 1 + continue + + df = pd.DataFrame(rows, columns=header) + df[VARIANT_INDEX] = metadata.get(VARIANT_INDEX) + return metadata, df + + +def get_variant_id(feature: dict) -> str: + """ + Get the variant ID from a feature. + + Args: + feature (dict): A feature dictionary. + + Returns: + str: The variant ID or None if not applicable. + """ + xrefs = feature.get("xrefs", []) + for xref in xrefs: + if xref.get("id").startswith("rs"): + return xref.get("id") + return "" + + +def combine_alternative_sequences(df: pd.DataFrame) -> pd.DataFrame: + if "mutatedType" not in df.columns: + return df + return df.groupby(["begin", "end", "variant_id"], dropna=False, as_index=False).agg( + { + col: ( + (lambda x: ",".join(x.astype(str).unique())) + if col == "mutatedType" + else "first" + ) + for col in df.columns + } + ) + + +def get_populations(regions: list[str]) -> set[str]: + """Generate subgroups for larger groups. + + Args: + groups (list[str]): List of main groups to include. + + Returns: + list[str]: List of all subgroups in the main group. + """ + mapping: dict[str, set[str]] = { + "Africa": set(["African"]), + "North America": set( + [ + "American", + "African American", + "Mexican", + "Cuban", + "European American", + "NativeAmerican", + "NativeHawaiian", + ] + ), + "Asia": set( + [ + "Asian", + "East Asian", + "Central Asia", + "JAPANESE", + "KOREAN", + "South Asian", + "SouthAsian", + ] + ), + "Europe": set( + [ + "Europe", + "European", + "Finnish from FINRISK project", + "Spanish controls", + "TWIN COHORT", + ] + ), + "Global": set(["Global", "Total"]), + "South America": set( + [ + "Latin American 1", + "Latin American 2", + "Dominican", + "PuertoRican", + "SouthAmerican", + ] + ), + "Middle East": set(["Middle Eastern", "Near_East"]), + "Other": set(["Other"]), + } + + return set.union(*[mapping.get(group, set()) for group in regions]) + + +def main(): + parser = argparse.ArgumentParser(description="Protein Variance CLI Application") + parser.add_argument( + "-a", "--accession", type=str, required=True, help="Protein accession number." + ) + parser.add_argument( + "-p", + "--populations", + type=str, + required=True, + help="Comma-separated list of populations.", + ) + parser.add_argument( + "-f", + "--output-format", + type=str, + choices=["xlsx", "tabular", "csv"], + default="tabular", + help="Output format: xlsx, tabular (tsv), or csv. Default is tabular.", + ) + parser.add_argument( + "-o", + "--output-file", + type=str, + default="protein_variation.tsv", + help="Output file name.", + ) + + args = parser.parse_args() + + populations = get_populations(args.populations.split(",")) + + _, features_df = get_protein_variation(args.accession) + + features_df["variant_id"] = features_df.apply(get_variant_id, axis=1) + features_df["variation_position"] = features_df.apply(get_position, axis=1) + features_df = combine_alternative_sequences(features_df) + + results = list( + zip( + *[ + parse_frequency_reponse(get_frequency(variant)) + for variant in features_df["variant_id"] + ] + ) + ) + + metadata_df = pd.DataFrame(results[0]) + frequencies_df = pd.concat(results[1]) + + merged = pd.concat([features_df, metadata_df], axis=1) + final_merged = pd.merge(merged, frequencies_df, on=VARIANT_INDEX, how="outer") + + final_merged[["Ref Allele NA", "Ref Allele Prob"]] = final_merged[ + "Ref Allele" + ].str.split("=", n=1, expand=True) + + alt_alleles = final_merged["Alt Allele"].str.split(",", expand=True) + if alt_alleles.columns.size == 2: + final_merged[["Alt Allele 1", "Alt Allele 2"]] = final_merged[ + "Alt Allele" + ].str.split(",", expand=True) + final_merged[["Alt Allele NA 1", "Alt Allele Prob 1"]] = final_merged[ + "Alt Allele 1" + ].str.split("=", n=1, expand=True) + final_merged[["Alt Allele NA 2", "Alt Allele Prob 2"]] = final_merged[ + "Alt Allele 2" + ].str.split("=", n=1, expand=True) + else: + final_merged[["Alt Allele NA 1", "Alt Allele Prob 1"]] = final_merged[ + "Alt Allele" + ].str.split("=", n=1, expand=True) + final_merged[["Alt Allele NA 2", "Alt Allele Prob 2"]] = None + + subset_cols: list[str] = [ + "variation_position", + "consequenceType", + "wildType", + "mutatedType", + "variant_id", + "Alleles", + "Study", + "Population", + "Group", + "Samplesize", + "Ref Allele", + "Alt Allele", + "BioProject ID", + "BioSample ID", + "somaticStatus", + "Ref Allele NA", + "Ref Allele Prob", + "Alt Allele NA 1", + "Alt Allele Prob 1", + "Alt Allele NA 2", + "Alt Allele Prob 2", + ] + + subset = final_merged[subset_cols] + subset = subset[subset["Population"].isin(populations)] + + if args.output_format == "xlsx": + outdir = os.path.dirname(args.output_file) + outpath = os.path.join(outdir, "results.xlsx") + subset.to_excel(outpath, index=False, na_rep="NA", engine="openpyxl") + os.rename(outpath, args.output_file) + elif args.output_format == "csv": + subset.to_csv(args.output_file, index=False, na_rep="NA") + else: # tabular/tsv + subset.to_csv(args.output_file, index=False, sep="\t", na_rep="NA") + + +if __name__ == "__main__": + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/freqSAP.xml Fri Jul 18 13:21:36 2025 +0000 @@ -0,0 +1,138 @@ +<tool id="freqsap" name="freqSAP" version="1.0.0+galaxy0" profile="23.0" license="MIT"> + <description>Get frequencies of single amino-acid polymorphisms based on nucleid-acid polymorphism for different populations from UniProt and DbSNP</description> + + <creator> + <person + givenName="Helge" + familyName="Hecht" + url="https://github.com/hechth" + identifier="0000-0001-6744-996X" /> + <organization + url="https://www.recetox.muni.cz/" + email="GalaxyToolsDevelopmentandDeployment@space.muni.cz" + name="RECETOX MUNI" /> + </creator> + + <edam_topics> + <edam_topic>topic_0121</edam_topic> + <edam_topic>topic_3366</edam_topic> + </edam_topics> + + <edam_operations> + <edam_operation>operation_3197</edam_operation> + <edam_operation>operation_2479</edam_operation> + <edam_operation>operation_2422</edam_operation> + </edam_operations> + + <requirements> + <requirement type="package" version="2.3.0">pandas</requirement> + <requirement type="package" version="2.32.4">requests</requirement> + <requirement type="package" version="3.1.5">openpyxl</requirement> + </requirements> + + <required_files> + <include path="freqSAP.py"/> + </required_files> + + <command detect_errors="exit_code"><![CDATA[ + python3 '${__tool_directory__}/freqSAP.py' -a '$accession' -p '$populations' -f '$output_format' -o '$output_file' + ]]></command> + <inputs> + <param argument="--accession" type="text" label="Protein Accession in UniProt" help="UniProt accession of the protein to fetch variation data for."> + <sanitizer invalid_char=""> + <valid initial="string.letters,string.digits"> + <add value="_" /> + </valid> + </sanitizer> + <validator type="regex">[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}</validator> + </param> + <param argument="--populations" type="select" multiple="true" optional="false" label="Regions from which to choose the populations" help="The regions are broken down into sub-populations based on available entries in DbSNP."> + <option value="Africa">Africa</option> + <option value="North America">North America</option> + <option value="Asia">Asia</option> + <option value="Europe">Europe</option> + <option value="South America">South America</option> + <option value="Middle East">Middle East</option> + <option value="Other">Other</option> + <option value="Global" selected="true">Global</option> + </param> + <param argument="--output_format" type="select" label="Output Format" help="Format in which to store the output file."> + <option value="tabular" selected="true">Tab Separated Value (tabular)</option> + <option value="csv">Comma Separated Value (csv)</option> + <option value="xlsx">Excel (xlsx)</option> + </param> + </inputs> + <outputs> + <data name="output_file" format="tabular" label="${tool.name} of ${accession}"> + <change_format> + <when input="output_format" value="xlsx" format="xlsx"/> + <when input="output_format" value="csv" format="csv"/> + </change_format> + </data> + </outputs> + + <tests> + <test> + <param name="accession" value="P0DJI9"/> + <param name="populations" value="Europe,Asia"/> + <output name="output_file" ftype="tabular"> + <assert_contents> + <has_n_columns n="21"/> + <has_n_lines n="807" delta="20"/> + <has_text text="rs1044068853"/> + </assert_contents> + </output> + </test> + <test> + <param name="accession" value="P05067"/> + <param name="populations" value="Global"/> + <param name="output_format" value="xlsx"/> + <output name="output_file" ftype="xlsx"> + <assert_contents> + <has_size size="161" delta="10"/> + </assert_contents> + </output> + </test> + </tests> + + <help><![CDATA[ +freqSAP: Frequency of Single Amino-Acid Polymorphisms +===================================================== + +This tool retrieves the frequencies of single amino-acid polymorphisms (SAPs) for a given protein accession from UniProt and DbSNP, broken down by population regions. + +Inputs +------ + +- **Protein Accession in UniProt:** + Enter the UniProt accession (e.g., ``P12345``) for the protein of interest. + +- **Regions from which to choose the populations:** + Select one or more population regions. The tool will fetch SAP frequencies for the selected regions. + +- **Output Format:** + Choose the format for the output file: + + - Tab Separated Value (``tabular``) + - Comma Separated Value (``csv``) + - Excel (``xlsx``) + +Outputs +------- + +- A table listing SAP frequencies for the selected protein and populations, in the chosen format. + +Example Usage +------------- + +1. Enter a UniProt accession (e.g., ``P69905``). +2. Select ``Europe`` and ``Asia`` as populations. +3. Choose ``csv`` as the output format. +4. Run the tool to download the SAP frequency table. + ]]></help> + + <citations> + <citation type="doi">10.1093/nar/29.1.308</citation> + <citation type="doi">10.1093/nar/gkaf394</citation> + </citations> +</tool> \ No newline at end of file