Mercurial > repos > ufz > lm_get_projects

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_projects.py	Mon Mar 17 20:35:10 2025 +0000
@@ -0,0 +1,251 @@
+import argparse
+import os
+from json import load
+
+
+import pandas as pd
+import sqlalchemy as db
+
+
+def get_arguments() -> argparse.Namespace:
+    """
+    Parse and return the command-line arguments required for the script.
+
+    return: argparse.Namespace: Parsed arguments containing credentials_file (str) and login (str).
+    """
+
+    parser = argparse.ArgumentParser(
+        description="Read the projects of the current user from the Lambda-Miner Database"
+    )
+
+    # Add argument for the credentials file
+    parser.add_argument(
+        "-c",
+        "--credentials-file",
+        dest="credentials_file",
+        type=str,
+        required=False,  # Optional
+        help=(
+            "Credential file in JSON format including dialect, user, password, host, port, and "
+            "database. If not provided, the environment variable LAMBDAMINER_CREDENTIALS will be "
+            "used."
+        )
+    )
+
+    # Add argument for the login name
+    parser.add_argument(
+        "-l",
+        "--login-name",
+        dest="login",
+        type=str,
+        required=True,
+        help="UFZ login name of the user"
+    )
+
+    # Add argument for the output file
+    parser.add_argument(
+        "-o",
+        "--output-file",
+        dest="output",
+        type=str,
+        default="projects.csv",
+        help="Specifiy the output file path including the file name (default: 'projects.csv')"
+    )
+
+    # Add argument for the type
+    parser.add_argument(
+        "-t",
+        "--type",
+        choices=["generic", "import", "calibration", "assignment", "validation", "export"],
+        default="generic",
+        help="Specify the workflow type (default: 'generic')"
+    )
+
+    return parser.parse_args()
+
+
+def parse_check_args(args):
+    """
+    Parse and validate command line arguments.
+
+    Following actions are performed:
+    - Check of the existence of the specified directory in the output path.
+    - Assignment the correct credentials file to the arguments.
+
+    :param args: command line arguments.
+    :type args: argparse.Namespace
+    :raises FileNotFoundError: If the specified directory in the output path does not exist.
+    """
+
+    # Extract the directory part of the specified output path
+    dir_path = os.path.dirname(args.output) or "."
+
+    # Check if the directory exists and raise error if not
+    if not os.path.isdir(dir_path):
+        raise FileNotFoundError(f"Error: The directory does not exist: {dir_path}")
+
+    # Get environment variable LAMBDAMINER_CREDENTIALS
+    envar_credentials = os.getenv("LAMBDAMINER_CREDENTIALS")
+
+    # Use the provided argument or fallback to the environment variable
+    args.credentials_file = args.credentials_file or envar_credentials
+
+    assert args.credentials_file is not None, "Error: No credentials specified"
+
+
+def get_engine(credentials_path: str, echo: bool = False) -> db.engine.Engine:
+    """
+    Create and return a SQLAlchemy engine based on the supplied credentials.
+
+    The engine is created using the data from the supplied credentials file,
+    which should be in JSON format and include the following keys:
+        dialect, user, password, host, port, database
+
+    :param credentials_path: The path to the credentials file.
+    :type credentials_path: str
+    :return: The SQLAlchemy engine object.
+    :rtype: sqlalchemy.engine.Engine
+    """
+
+    with open(credentials_path) as file:
+        credentials = load(file)
+
+    dialect = credentials["dialect"]
+    username = credentials["user"]
+    password = credentials["password"]
+    host = credentials["host"]
+    port = credentials["port"]
+    database_name = credentials["database"]
+
+    database_url = f"{dialect}://{username}:{password}@{host}:{port}/{database_name}"
+
+    return db.create_engine(database_url, echo=echo)
+
+
+def get_user_id(connection, metadata, login: str) -> int:
+    """
+    Retrieve the user_id for a given login.
+
+    :param connection: The database connection.
+    :param metadata: The database metadata containing table definitions.
+    :param login: The login username to search for.
+    :return: The user_id if found, otherwise None.
+    """
+
+    # Access the 'ufz_user' table from metadata.
+    User = metadata.tables["ufz_user"]
+
+    # Construct a query to select the user_id where the login matches.
+    query = db.select(User.c.user_id).where(User.c.login == login)
+
+    # Execute the query and fetch the scalar result.
+    result = connection.execute(query).scalar()
+
+    # Return the user_id as an integer if found, otherwise return None.
+    return int(result) if result else None
+
+
+def get_projects_with_sample_count(connection, metadata, user_id):
+    """
+    Retrieve projects and their sample counts for a given user_id.
+
+    The query will return a pandas DataFrame with the columns:
+        project_id : The id of the project.
+        name : The name of the project.
+        sample_count : The number of samples associated with the project.
+
+    :param connection: The database connection.
+    :param metadata: The database metadata containing table definitions.
+    :param user_id: The user_id to search for.
+    :return: A pandas DataFrame with the projects and their sample counts.
+    """
+
+    User = metadata.tables["ufz_user"]
+    User_Project = metadata.tables["ufz_user_project"]
+    Project = metadata.tables["project"]
+    Sample = metadata.tables["sample"]
+
+    # Construct the query:
+    # 1. Select project_id, name and the count of sample_id as sample_count.
+    # 2. Join the tables ufz_user_project, ufz_user, project and sample.
+    #    - Join ufz_user_project with project on project_id.
+    #    - Join ufz_user with ufz_user_project on user_id.
+    #    - Join sample with project on project_id.
+    #      - Use an outer join to include projects without samples.
+    # 3. Filter the results to only include the given user_id.
+    # 4. Group the results by project_id and name.
+
+    query = (
+        db.select(
+            Project.c.project_id.label("Project ID"),
+            Project.c.name.label("Project Name"),
+            db.func.count(Sample.c.sample_id).label("Sample Count")
+        )
+        .join(User_Project, User_Project.c.project_id == Project.c.project_id)
+        .join(User, User.c.user_id == User_Project.c.user_id)
+        .join(Sample, Sample.c.project == Project.c.project_id, isouter=True)
+        .where(User.c.user_id == user_id)
+        .group_by(Project.c.project_id, Project.c.name)
+    )
+
+    # Execute the query, fetch the results, and return the DataFrame.
+    return pd.DataFrame(connection.execute(query).fetchall())
+
+
+def main():
+
+    # Parse command-line arguments
+    args = get_arguments()
+
+    # Parse and check the specified command line arguments
+    parse_check_args(args)
+
+    try:
+        # Load credentials and create the database engine
+        engine = get_engine(args.credentials_file)
+
+        # Reflect metadata and connect to the database
+        metadata = db.MetaData()
+        metadata.reflect(bind=engine, only=["ufz_user", "ufz_user_project", "project", "sample"])
+
+        with engine.connect() as conn:
+            # Get user ID
+            user_id = get_user_id(conn, metadata, args.login)
+
+            if not user_id:
+                raise ValueError(
+                    "No Lambda-Miner user found with the login name \"{}\". "
+                    "Please find the description on how to register for the Lambda-Miner at "
+                    "https://lambda-miner-project.pages.ufz.de/lambda-miner-workflows/getting-started/."
+                    .format(args.login)
+                )
+
+            # Get projects with sample counts
+            projects = get_projects_with_sample_count(conn, metadata, user_id)
+
+            # Write projects as a CSV file to the specified output
+            with open(args.output, "w") as f:
+                f.write(projects.to_csv(index=False))
+
+            # Display the result
+            if projects.empty:
+                raise ValueError(
+                    "No projects found for the user \"{}\". "
+                    "Please create a project before going on."
+                    .format(args.login)
+                )
+            else:
+                print(projects)
+
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Credentials file not found at \"{args.credentials_file}\".")
+
+    except db.exc.SQLAlchemyError as e:
+        raise RuntimeError(f"Database error occurred: {e}")
+
+    except Exception as e:
+        raise RuntimeError(f"An unexpected error occurred: {e}")
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_projects.xml	Mon Mar 17 20:35:10 2025 +0000
@@ -0,0 +1,49 @@
+<tool id="lm_get_projects" name="Get LMDB Projects" version="0.1.0+galaxy0" profile="21.05">
+    <requirements>
+        <requirement type="package" version="2.2.2">pandas</requirement>
+        <requirement type="package" version="2.9.9">psycopg2</requirement>
+        <requirement type="package" version="2.0.37">sqlalchemy</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        #if $test != ""
+            export LAMBDAMINER_CREDENTIALS="$__tool_directory__/test-data/$test";
+        #end if
+        python '$__tool_directory__/get_projects.py'
+        --login-name '$__user_name__'
+        --output-file '$projects'
+    ]]></command>
+    <inputs>
+        <param name="test" type="hidden"/>
+    </inputs>
+    <outputs>
+        <data name="projects" format="csv"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="test" value="test_credentials.json"/>
+            <output name="projects" value="test_projects.csv" compare="contains"/>
+        </test>
+        <test expect_failure="true">
+            <param name="test" value="test_credentials_false.json"/>
+        </test>
+        <test expect_failure="true">
+            <param name="test" value="test_credentials_nonexistent.json"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+This tool queries the Lambda-Miner database for the users projects.
+
+**Input**
+
+There is no input necessary from the user. The tool connects to the Lambda-Miner database and uses the users login name to query it.
+
+**Output**
+
+CSV file containing the projects (id, name, sample count) corresponding to the user.]]>
+    </help>
+    <citations>
+        <citation type="doi">10.5194/egusphere-egu24-15782</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_credentials.json	Mon Mar 17 20:35:10 2025 +0000
@@ -0,0 +1,12 @@
+{
+    "dialect" : "postgresql",
+    "host":"172.17.0.1",
+    "port":5432,
+    "database":"lmdb",
+    "schema":"public",
+    "user":"lmdb_rw",
+    "password":"SSeTjhTE42ba9kgjDrUY",
+    "parameters":{
+        "sslmode":"disable"
+    }
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_credentials_false.json	Mon Mar 17 20:35:10 2025 +0000
@@ -0,0 +1,12 @@
+{
+    "dialect" : "postgresql",
+    "host":"172.17.0.1",
+    "port":5432,
+    "database":"lmdb",
+    "schema":"public",
+    "user":"lmdb_rw",
+    "password":"THIS-IS-THE-WRONG-PASSWORD",
+    "parameters":{
+        "sslmode":"disable"
+    }
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_projects.csv	Mon Mar 17 20:35:10 2025 +0000
@@ -0,0 +1,3 @@
+Project ID,Project Name,Sample Count
+1,2024_BGO_TestProject,2
+2,2025_BGO_TestProject,2