comparison COBRAxy/utils/general_utils.py @ 335:2b7911a8366f draft

Uploaded
author luca_milaz
date Thu, 04 Sep 2025 12:05:10 +0000
parents 63f5078627a9
children b89091ae2484
comparison
equal deleted inserted replaced
334:c561c060a55f 335:2b7911a8366f
5 import pickle 5 import pickle
6 import lxml.etree as ET 6 import lxml.etree as ET
7 7
8 from enum import Enum 8 from enum import Enum
9 from itertools import count 9 from itertools import count
10 from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union 10 from typing import Any, Callable, Dict, Generic, List, Literal, Optional, TypeVar, Union
11 11
12 import pandas as pd 12 import pandas as pd
13 import cobra 13 import cobra
14 14
15 import zipfile
16 import gzip
17 import bz2
18 from io import StringIO
19
15 # FILES 20 # FILES
class FileFormat(Enum):
    """
    Encodes possible file extensions to conditionally save data in a different format.

    The value of each member is the tuple of extensions recognized for that format. XML and JSON
    also list their compressed variants, and the first entry of their tuple is the plain extension.
    """
    DAT = ("dat",) # this is how galaxy treats all your files!
    CSV = ("csv",) # this is how most editable input data is written
    TSV = ("tsv",) # this is how most editable input data is ACTUALLY written

    SVG = ("svg",) # this is how most metabolic maps are written
    PNG = ("png",) # this is a common output format for images (such as metabolic maps)
    PDF = ("pdf",) # this is also a common output format for images, as it's required in publications.

    XML = ("xml", "xml.gz", "xml.zip", "xml.bz2") # SBML files are XML files, sometimes compressed
    JSON = ("json", "json.gz", "json.zip", "json.bz2") # COBRA models can be stored as JSON files, sometimes compressed

    TXT = ("txt",) # this is how most output data is written

    PICKLE = ("pickle", "pk", "p") # this is how all runtime data structures are saved

    def __init__(self, *extensions :str) -> None:
        # Enum unpacks a tuple value into the positional arguments of __init__, so they must be
        # accepted here even though the tuple itself remains available through self.value.
        # NOTE: enum members are singletons, so this attribute is shared by every user of the
        # member and always reflects the extension passed to the most recent fromExt call.
        self.original_extension = ""

    @classmethod
    def fromExt(cls, ext :str) -> "FileFormat":
        """
        Converts a file extension string to a FileFormat instance.

        The extension is first matched (case-insensitively) against the member names, then against
        every extension listed in the member values. The string received is recorded on the
        matching member as `original_extension`.

        Args:
            ext : The file extension, without the leading dot (e.g. "csv", "json.gz").

        Raises:
            ValueErr : if the extension doesn't correspond to any member.

        Returns:
            FileFormat: The FileFormat instance corresponding to the file extension.
        """
        variantName = ext.upper()
        if variantName in cls.__members__:
            member = cls[variantName]
        else:
            lowered = ext.lower()
            member = next((candidate for candidate in cls if lowered in candidate.value), None)
            if member is None:
                raise ValueErr("ext", "a valid FileFormat file extension", ext)

        member.original_extension = ext
        return member

    def __str__(self) -> str:
        """
        (Private) converts to str representation. Good practice for usage with argparse.

        Returns:
            str : the string representation of the file extension.
        """
        if self in (FileFormat.XML, FileFormat.JSON):
            # These formats may be compressed: give back the extension that was actually parsed,
            # or the plain one when the member was never obtained through fromExt.
            return self.original_extension or self.value[0]

        return self.value[-1] # for all other formats and pickle
66 81
67 class FilePath(): 82 class FilePath():
68 """ 83 """
69 Represents a file path. View this as an attempt to standardize file-related operations by expecting 84 Represents a file path. View this as an attempt to standardize file-related operations by expecting
70 values of this type in any process requesting a file path. 85 values of this type in any process requesting a file path.
89 104
@classmethod
def fromStrPath(cls, path :str) -> "FilePath":
    """
    Factory method to parse a string from which to obtain, if possible, a valid FilePath instance.
    It detects double extensions such as .json.gz and .xml.bz2, which are common in COBRA models.
    These double extensions are not supported for other file types such as .csv.

    Args:
        path : the string containing the path

    Raises:
        PathErr : if no file name or extension can be recognized in the path.

    Returns:
        FilePath : the constructed instance.
    """
    result = re.search(r"^(?P<prefix>.*\/)?(?P<name>.*)\.(?P<ext>[^.]*)$", path)
    if not result or not result["name"] or not result["ext"]:
        raise PathErr(path, "cannot recognize folder structure or extension in path")

    prefix = result["prefix"] if result["prefix"] else ""
    name, ext = result["name"], result["ext"]

    # Look for the inner extension in the file name only: splitting the whole path would pull the
    # directory prefix into the name (it is passed separately below) and would be confused by dots
    # in directory names.
    stem, dot, inner = name.rpartition(".")
    isCompressed = ext.lower() in {"gz", "zip", "bz2"}
    if dot and stem and isCompressed and inner.lower() in {"json", "xml"}:
        name = stem
        ext = f"{inner}.{ext}"

    return cls(name, FileFormat.fromExt(ext), prefix=prefix)
117 145
118 def show(self) -> str: 146 def show(self) -> str:
119 """ 147 """
120 Shows the path as a string. 148 Shows the path as a string.
121 149
560 return cobra.io.read_sbml_model(FilePath(f"{self.name}", FileFormat.XML, prefix = f"{toolDir}/local/models/").show()) 588 return cobra.io.read_sbml_model(FilePath(f"{self.name}", FileFormat.XML, prefix = f"{toolDir}/local/models/").show())
561 589
def load_custom_model(self, file_path :FilePath, ext :Optional[FileFormat] = None) -> cobra.Model:
    """
    Loads a user-provided COBRA model from an SBML (XML) or JSON file, optionally compressed.

    Args:
        file_path : path of the model file.
        ext : format to assume for the file, defaults to the one carried by file_path.

    Raises:
        DataErr : if the format is neither XML nor JSON, or if reading the model fails.

    Returns:
        cobra.Model : the loaded model.
    """
    ext = ext if ext else file_path.ext
    try:
        # Enum members are not containers: identity is the correct way to compare them.
        if ext is FileFormat.XML:
            return cobra.io.read_sbml_model(file_path.show())

        if ext is FileFormat.JSON:
            # Compressed files are not automatically handled by cobra
            declared = (getattr(ext, "original_extension", "") or "").lower()
            if declared.endswith((".gz", ".zip", ".bz2")):
                # Looked up on the class so the helper receives exactly (file_path, ext).
                return type(self).extract_json_model(file_path, ext)

            return cobra.io.load_json_model(file_path.show())

    except Exception as e: raise DataErr(file_path, str(e)) from e
    raise DataErr(file_path,
        f"Format \"{file_path.ext}\" is not recognized, only JSON and XML files are supported.")
606
607
@staticmethod
def extract_json_model(file_path:FilePath, ext :FileFormat) -> cobra.Model:
    """
    Extract json COBRA model from a compressed file (zip, gz, bz2).

    Declared as a static method since it takes no instance: this way it can be called both on the
    class and on an instance with the same two arguments.

    Args:
        file_path: File path of the model
        ext: File extensions of class FileFormat (should be .zip, .gz or .bz2)

    Returns:
        cobra.Model: COBRApy model

    Raises:
        Exception: Extraction errors
    """
    ext_str = str(ext)
    suffix = ext_str.lower()
    path = file_path.show()

    try:
        if suffix.endswith(".zip"):
            with zipfile.ZipFile(path, 'r') as zip_ref:
                # Directory entries end with "/" and cannot be opened as a model file.
                members = [entry for entry in zip_ref.namelist() if not entry.endswith("/")]
                if not members:
                    raise ValueError("the zip archive contains no files")

                # Only the first file of the archive is read.
                with zip_ref.open(members[0]) as json_file:
                    content = json_file.read().decode('utf-8')

            return cobra.io.load_json_model(StringIO(content))

        if suffix.endswith(".gz"):
            with gzip.open(path, 'rt', encoding='utf-8') as gz_ref:
                return cobra.io.load_json_model(gz_ref)

        if suffix.endswith(".bz2"):
            with bz2.open(path, 'rt', encoding='utf-8') as bz2_ref:
                return cobra.io.load_json_model(bz2_ref)

        raise ValueError(f"Compression format not supported: {ext_str}. Supported: .zip, .gz and .bz2")

    except Exception as e:
        # Same exception type and message as before, with the original cause kept attached.
        raise Exception(f"Error during model extraction: {str(e)}") from e
641
642
574 643
def __str__(self) -> str:
    """
    (Private) converts to str representation.

    Returns:
        str : the value held by this instance.
    """
    return self.value