cobraxy: COBRAxy/utils/general

comparison COBRAxy/utils/general_utils.py @ 456:a6e45049c1b9 draft default tip

Uploaded

author	francesco_lapi
date	Fri, 12 Sep 2025 17:28:45 +0000
parents	4a385fdb9e58
children

comparison

equal deleted inserted replaced

-:4e2bc80764b6
+:a6e45049c1b9
+"""
+General utilities for COBRAxy.
+This module provides:
+- File and path helpers (FileFormat, FilePath)
+- Error and result handling utilities (CustomErr, Result)
+- Basic I/O helpers (CSV/TSV, pickle, SVG)
+- Lightweight CLI argument parsers (Bool, Float)
+- Model loader utilities for COBRA models, including compressed formats
+"""
 import math
 import re
 import sys
 import csv
 import pickle
 import lxml.etree as ET
 from enum import Enum
 from itertools import count
-from typing import Any, Callable, Dict, Generic, List, Literal, Optional, TypeVar, Union, Set, Tuple
+from typing import Any, Callable, Dict, Generic, List, Literal, Optional, TypeVar, Union, Tuple
 import pandas as pd
 import cobra
-from cobra import Model as cobraModel, Reaction, Metabolite
 import zipfile
 import gzip
 import bz2
 from io import StringIO
+from typing import Any, Callable, Dict, Generic, List, Literal, Optional, TypeVar, Union, Tuple
 class ValueErr(Exception):
     """
     Error raised when a parameter holds a value other than the expected one.
     """
     def __init__(self, param_name, expected, actual):
         """
         Build the error message from the offending parameter and values.

         Args:
             param_name : name of the parameter that was checked.
             expected : description of the accepted value(s).
             actual : the value that was actually received.
         """
         message = f"Invalid value for {param_name}: expected {expected}, got {actual}"
         super().__init__(message)
 class PathErr(Exception):
 class FileFormat(Enum):
 """
 Encodes possible file extensions to conditionally save data in a different format.
 """
-DAT    = ("dat",) # this is how galaxy treats all your files!
+DAT    = ("dat",)
-CSV    = ("csv",) # this is how most editable input data is written
+CSV    = ("csv",)
-TSV    = ("tsv",) # this is how most editable input data is ACTUALLY written TODO:more support pls!!
+TSV    = ("tsv",)
-SVG    = ("svg",) # this is how most metabolic maps are written
+SVG    = ("svg",)
-PNG    = ("png",) # this is a common output format for images (such as metabolic maps)
+PNG    = ("png",)
-PDF    = ("pdf",) # this is also a common output format for images, as it's required in publications.
+PDF    = ("pdf",)
-# Updated to include compressed variants
+# Compressed variants for common model formats
-XML    = ("xml", "xml.gz", "xml.zip", "xml.bz2") # SBML files are XML files, sometimes compressed
+XML    = ("xml", "xml.gz", "xml.zip", "xml.bz2")
-JSON   = ("json", "json.gz", "json.zip", "json.bz2") # COBRA models can be stored as JSON files, sometimes compressed
+JSON   = ("json", "json.gz", "json.zip", "json.bz2")
-MAT    = ("mat", "mat.gz", "mat.zip", "mat.bz2") # COBRA models can be stored as MAT files, sometimes compressed
+MAT    = ("mat", "mat.gz", "mat.zip", "mat.bz2")
-YML    = ("yml", "yml.gz", "yml.zip", "yml.bz2") # COBRA models can be stored as YML files, sometimes compressed
+YML    = ("yml", "yml.gz", "yml.zip", "yml.bz2")
-TXT    = ("txt",) # this is how most output data is written
+TXT    = ("txt",)
-PICKLE = ("pickle", "pk", "p") # this is how all runtime data structures are saved
+PICKLE = ("pickle", "pk", "p")
 def __init__(self, *extensions):
 self.extensions = extensions
 # Store original extension when set via fromExt
 self._original_extension = None
 """
 (Private) converts to str representation. Good practice for usage with argparse.
 Returns:
 str : the string representation of the file extension.
 """
-# If we have an original extension stored (for compressed files only), use it
 if hasattr(self, '_original_extension') and self._original_extension:
 return self._original_extension
-# For XML, JSON, MAT and YML without original extension, use the base extension
 if self == FileFormat.XML:
 return "xml"
 elif self == FileFormat.JSON:
 return "json"
 elif self == FileFormat.MAT:
 return self.value[-1]
 class FilePath():
 """
-Represents a file path. View this as an attempt to standardize file-related operations by expecting
+Represents a file path with format-aware helpers.
-values of this type in any process requesting a file path.
 """
 def __init__(self, filePath: str, ext: FileFormat, *, prefix="") -> None:
 """
-(Private) Initializes an instance of FilePath.
+Initialize FilePath.
 Args:
-path : the end of the path, containing the file name.
+filePath: File name stem.
-ext : the file's extension.
+ext: File extension (FileFormat).
-prefix : anything before path, if the last '/' isn't there it's added by the code.
+prefix: Optional directory path (trailing '/' auto-added).
-Returns:
-None : practically, a FilePath instance.
 """
 self.ext = ext
 self.filePath = filePath
 if prefix and prefix[-1] != '/':
 self.prefix = prefix
 @classmethod
 def fromStrPath(cls, path: str) -> "FilePath":
 """
-Factory method to parse a string from which to obtain, if possible, a valid FilePath instance.
+Parse a string path into a FilePath, supporting double extensions for models (e.g., .json.gz).
-It detects double extensions such as .json.gz and .xml.bz2, which are common in COBRA models.
-These double extensions are not supported for other file types such as .csv.
 Args:
 path : the string containing the path
 Raises:
 PathErr : if the provided string doesn't represent a valid path.
 Returns:
 raise PathErr(path, "cannot recognize folder structure or extension in path")
 prefix = result["prefix"] if result["prefix"] else ""
 name, ext = result["name"], result["ext"]
-# Check for double extensions (json.gz, xml.zip, etc.)
 parts = path.split(".")
 if len(parts) >= 3:
 penultimate = parts[-2]
 last = parts[-1]
 double_ext = f"{penultimate}.{last}"
-# Try the double extension first
 try:
 ext_format = FileFormat.fromExt(double_ext)
 name = ".".join(parts[:-2])
-# Extract prefix if it exists
 if '/' in name:
 prefix = name[:name.rfind('/') + 1]
 name = name[name.rfind('/') + 1:]
 return cls(name, ext_format, prefix=prefix)
 except ValueErr:
-# If double extension doesn't work, fall back to single extension
 pass
-# Single extension fallback (original logic)
 try:
 ext_format = FileFormat.fromExt(ext)
 return cls(name, ext_format, prefix=prefix)
 except ValueErr:
 raise PathErr(path, f"unsupported file extension: {ext}")
 """
 Log a warning message to an output log file and print it to the console. The final period and a
 newline are added by the function.
 Args:
-s (str): The warning message to be logged and printed.
+msg (str): The warning message to be logged and printed.
 loggerPath : The file path of the output log file. Given as a string, parsed to a FilePath and
 immediately read back (beware: relatively expensive operation, log with caution).
 Returns:
 None
 """
-# building the path and then reading it immediately seems useless, but it's actually a way of
+# Note: validates path via FilePath; keep logging minimal to avoid overhead.
-# validating that reduces repetition on the caller's side. Besides, logging a message by writing
-# to a file is supposed to be computationally expensive anyway, so this is also a good deterrent from
-# mindlessly logging whenever something comes up, log at the very end and tell the user everything
-# that went wrong. If you don't like it: implement a persistent runtime buffer that gets dumped to
-# the file only at the end of the program's execution.
 with open(FilePath.fromStrPath(loggerPath).show(), 'a') as log: log.write(f"{msg}.\n")
 class CustomErr(Exception):
 """
 Custom error class to handle exceptions in a structured way, with a unique identifier and a message.
 self.id = max(explicitErrCode, next(CustomErr.__idGenerator))
 def throw(self, loggerPath = "") -> None:
 """
-Raises the current CustomErr instance, logging a warning message before doing so.
+Raises the current CustomErr instance, optionally logging it first.
+Args:
+loggerPath (str): Optional path to a log file to append this error before raising.
 Raises:
 self: The current CustomErr instance.
 Returns:
 None
 """
-if loggerPath: logWarning(str(self), loggerPath)
+if loggerPath:
+logWarning(str(self), loggerPath)
 raise self
 def abort(self) -> None:
 """
 Aborts the execution of the script.
 Class to handle the result of an operation, with a value and a boolean flag to indicate
 whether the operation was successful or not.
 """
 def __init__(self, value :Union[T, E], isOk :bool) -> None:
 """
-(Private) Initializes an instance of Result.
+Initialize an instance of Result.
 Args:
 value (Union[T, E]): The value to be stored in the Result instance.
 isOk (bool): A boolean flag to indicate whether the operation was successful or not.
 self.value = value
 @classmethod
 def Ok(cls,  value :T) -> "Result":
 """
-Constructs a new Result instance with a successful operation.
+Construct a successful Result.
 Args:
 value (T): The value to be stored in the Result instance, set as successful.
 Returns:
 return Result(value, isOk = True)
 @classmethod
 def Err(cls, value :E) -> "Result":
 """
-Constructs a new Result instance with a failed operation.
+Construct a failed Result.
 Args:
 value (E): The value to be stored in the Result instance, set as failed.
 Returns:
 def __str__(self):
 return f"Result::{'Ok' if self.isOk else 'Err'}({self.value})"
 # FILES
-def read_dataset(path :FilePath, datasetName = "Dataset (not actual file name!)") -> pd.DataFrame:
-"""
-Reads a .csv or .tsv file and returns it as a Pandas DataFrame.
-Args:
-path : the path to the dataset file.
-datasetName : the name of the dataset.
-Raises:
-DataErr: If anything goes wrong when trying to open the file, if pandas thinks the dataset is empty or if
-it has less than 2 columns.
-Returns:
-pandas.DataFrame: The dataset loaded as a Pandas DataFrame.
-"""
-# I advise against the use of this function. This is an attempt at standardizing bad legacy code rather than
-# removing / replacing it to avoid introducing as many bugs as possible in the tools still relying on this code.
-# First off, this is not the best way to distinguish between .csv and .tsv files and Galaxy itself makes it really
-# hard to implement anything better. Also, this function's name advertizes it as a dataset-specific operation and
-# contains dubious responsibility (how many columns..) while being a file-opening function instead. My suggestion is
-# TODO: stop using dataframes ever at all in anything and find a way to have tight control over file extensions.
-try: dataset = pd.read_csv(path.show(), sep = '\t', header = None, engine = "python")
-except:
-try: dataset = pd.read_csv(path.show(), sep = ',', header = 0, engine = "python")
-except Exception as err: raise DataErr(datasetName, f"encountered empty or wrongly formatted data: {err}")
-if len(dataset.columns) < 2: raise DataErr(datasetName, "a dataset is always meant to have at least 2 columns")
-return dataset
 def readPickle(path :FilePath) -> Any:
 """
 Reads the contents of a .pickle file, which needs to exist at the given path.
 Args:
 """
 with open(path.show(), "wb") as fd: fd.write(ET.tostring(data))
 # UI ARGUMENTS
 class Bool:
+"""Simple boolean CLI argument parser accepting 'true' or 'false' (case-insensitive)."""
 def __init__(self, argName :str) -> None:
 self.argName = argName
 def __call__(self, s :str) -> bool: return self.check(s)
 if s == "true" : return True
 if s == "false": return False
 raise ArgsErr(self.argName, "boolean string (true or false, not case sensitive)", f"\"{s}\"")
 class Float:
+"""Float CLI argument parser supporting NaN and None keywords (case-insensitive)."""
 def __init__(self, argName = "Dataset values, not an argument") -> None:
 self.argName = argName
 def __call__(self, s :str) -> float: return self.check(s)
 Recon   = "Recon"
 ENGRO2  = "ENGRO2"
 ENGRO2_no_legend = "ENGRO2_no_legend"
 HMRcore = "HMRcore"
 HMRcore_no_legend = "HMRcore_no_legend"
-Custom  = "Custom" # Exists as a valid variant in the UI, but doesn't point to valid file paths.
+Custom  = "Custom"
 def __raiseMissingPathErr(self, path :Optional[FilePath]) -> None:
     """
     Reject a missing path before it is used to read a file.

     Args:
         path : the path to validate; any falsy value counts as missing.

     Raises:
         PathErr : if no path was provided.

     Returns:
         None
     """
     if path:
         return

     raise PathErr("<<MISSING>>", "it's necessary to provide a custom path when retrieving files from a custom model")
 def getRules(self, toolDir :str, customPath :Optional[FilePath] = None) -> Dict[str, Dict[str, OldRule]]:
     """
     Read the pickle file holding the rules for this model.

     Args:
         toolDir : directory under which "local/pickle files/" is looked up for non-custom models.
         customPath : path to the pickle file, used only by the Custom variant.

     Raises:
         PathErr : if the resolved path is missing (Custom variant without a customPath).

     Returns:
         Dict[str, Dict[str, OldRule]] : the unpickled content of the resolved file.
     """
     if self is Model.Custom:
         rulesPath = customPath
     else:
         rulesPath = FilePath(f"{self.name}_genes", FileFormat.PICKLE, prefix = f"{toolDir}/local/pickle files/")

     self.__raiseMissingPathErr(rulesPath)
     return readPickle(rulesPath)
 def getMap(self, toolDir = ".", customPath :Optional[FilePath] = None) -> ET.ElementTree:
     """
     Open the SVG metabolic map for this model.

     Args:
         toolDir : directory under which "local/svg metabolic maps/" is looked up for non-custom models.
         customPath : path to the SVG file, used only by the Custom variant.

     Raises:
         PathErr : if the resolved path is missing (Custom variant without a customPath).

     Returns:
         ET.ElementTree : the map as returned by readSvg.
     """
     if self is Model.Custom:
         mapPath = customPath
     else:
         mapPath = FilePath(f"{self.name}_map", FileFormat.SVG, prefix = f"{toolDir}/local/svg metabolic maps/")

     self.__raiseMissingPathErr(mapPath)
     wrongFormatErr = DataErr(mapPath, "custom map in wrong format")
     return readSvg(mapPath, customErr = wrongFormatErr)
 def getCOBRAmodel(self, toolDir = ".", customPath :Optional[FilePath] = None, customExtension :Optional[FileFormat] = None) -> cobra.Model:
     """
     Load the COBRA model for this enum variant.

     Args:
         toolDir : directory under which "local/models/" is looked up for non-custom models.
         customPath : path to the model file, required by the Custom variant.
         customExtension : format to load the custom model as; forwarded to load_custom_model as its
             `ext` argument.

     Raises:
         PathErr : if this is the Custom variant and no customPath was provided.

     Returns:
         cobra.Model : the loaded model.
     """
     if self is Model.Custom:
         # Same guard getRules and getMap apply: fail with an explicit PathErr instead of letting
         # load_custom_model dereference a missing path.
         self.__raiseMissingPathErr(customPath)
         return self.load_custom_model(customPath, customExtension)

     builtinPath = FilePath(f"{self.name}", FileFormat.XML, prefix = f"{toolDir}/local/models/")
     return cobra.io.read_sbml_model(builtinPath.show())
 def load_custom_model(self, file_path :FilePath, ext :Optional[FileFormat] = None) -> cobra.Model:
+"""Load a COBRA model from a custom path, supporting XML, JSON, MAT, and YML (compressed or not)."""
 ext = ext if ext else file_path.ext
 try:
 if str(ext) in FileFormat.XML.value:
 return cobra.io.read_sbml_model(file_path.show())

Mercurial > repos > bimib > cobraxy

comparison COBRAxy/utils/general_utils.py @ 456:a6e45049c1b9 draft default tip