changeset 300:24783af8a2f9 draft

Uploaded
author francesco_lapi
date Tue, 20 May 2025 14:47:38 +0000
parents 8ae278b4a5d5
children 5a595a737220
files COBRAxy/marea.py
diffstat 1 files changed, 67 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/COBRAxy/marea.py	Fri May 16 15:47:05 2025 +0000
+++ b/COBRAxy/marea.py	Tue May 20 14:47:38 2025 +0000
@@ -17,6 +17,10 @@
 from typing import Tuple, Union, Optional, List, Dict
 import copy
 
+from pydeseq2.dds import DeseqDataSet
+from pydeseq2.default_inference import DefaultInference
+from pydeseq2.ds import DeseqStats
+
 ERRORS = []
 ########################## argparse ##########################################
 ARGS :argparse.Namespace
@@ -55,7 +59,7 @@
         '-te' ,'--test',
         type = str, 
         default = 'ks', 
-        choices = ['ks', 'ttest_p', 'ttest_ind', 'wilcoxon', 'mw'],
+        choices = ['ks', 'ttest_p', 'ttest_ind', 'wilcoxon', 'mw', 'DESeq'],
         help = 'Statistical test to use (default: %(default)s)')
     
     parser.add_argument(
@@ -729,6 +733,8 @@
         case "mw":
             # Perform Mann-Whitney U test
             _, p_value = st.mannwhitneyu(dataset1Data, dataset2Data)
+        case _:
+            p_value = np.nan # No test above matches (e.g. DESeq, whose p-values are computed later by DESeqPValue)
     
     # Calculate means and standard deviations
     mean1 = np.mean(dataset1Data)
@@ -744,8 +750,48 @@
     
     return p_value, z_score
 
+
+def DESeqPValue(comparisonResult :Dict[str, List[Union[float, FoldChange]]], dataset1Data :List[List[float]], dataset2Data :List[List[float]], ids :List[str]) -> None:
+    """
+    Computes the p-value for each reaction in the comparisonResult dictionary using DESeq2.
+
+    Args:
+        comparisonResult : dictionary mapping a p-value and a fold-change value (values) to each reaction ID as encoded in the SVG map (keys)
+        dataset1Data : data from the 1st dataset.
+        dataset2Data : data from the 2nd dataset.
+        ids : list of reaction IDs.
+
+    Returns:
+        None : mutates the comparisonResult dictionary in place with the p-values.
+    """
+
+    # pyDESeq2 is built on pandas, so the data is converted into DataFrames and rows containing NaN values are dropped
+    dataframe1 = pd.DataFrame(dataset1Data, index=ids)
+    dataframe2 = pd.DataFrame(dataset2Data, index=ids)
+
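+    # Drop reactions with missing values, transpose so that samples become rows, and cast to int (DESeq2 requires raw integer counts)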
+    dataframe1_clean = dataframe1.dropna(axis=0, how="any").T.astype(int)
+    dataframe2_clean = dataframe2.dropna(axis=0, how="any").T.astype(int)
+
+    # pyDESeq2 expects a counts DataFrame plus a metadata DataFrame describing each sample's condition
+    dataframe = pd.concat([dataframe1_clean, dataframe2_clean], axis=0)
+    metadata = pd.DataFrame(np.concatenate([np.full(dataframe1_clean.shape[0], "dataset1"), np.full(dataframe2_clean.shape[0], "dataset2")]), columns=["dataset"])
+    metadata.index = dataframe.index
+
+    # Prepare and run pyDESeq2
+    inference = DefaultInference()
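+    # The "~dataset" design formula models the condition ("dataset1" vs "dataset2") declared in the metadata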
+    dds = DeseqDataSet(counts=dataframe, metadata=metadata, design="~dataset", inference=inference, quiet=True, low_memory=True)
+    dds.deseq2()
+    ds = DeseqStats(dds, contrast=["dataset", "dataset1", "dataset2"], inference=inference, quiet=True)
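+    # summary() runs the Wald tests and stores the results (p-values, log2 fold changes) in ds.results_df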
+    ds.summary()
+
+    # Retrieve the p-values from the DESeq2 results
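+    # Reactions dropped for NaN values are absent from results_df and keep the NaN p-value assigned by computePValue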
+    for reactId in ds.results_df.index:
+        comparisonResult[reactId][0] = ds.results_df.loc[reactId, "pvalue"]
+
+
 # TODO: the net RPS computation should be done in the RPS module
 def compareDatasetPair(dataset1Data :List[List[float]], dataset2Data :List[List[float]], ids :List[str]) -> Tuple[Dict[str, List[Union[float, FoldChange]]], float, Dict[str, Tuple[np.ndarray, np.ndarray]]]:
+
     #TODO: the following code still suffers from "dumbvarnames-osis"
     netRPS :Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
     comparisonResult :Dict[str, List[Union[float, FoldChange]]] = {}
@@ -768,6 +814,7 @@
                 nets2 = np.subtract(l2, dataset2Data[position])
                 netRPS[reactId] = (nets1, nets2)
 
+                # Compute p-value and z-score for the RPS scores; if the DESeq test is selected, computePValue returns NaN here and the p-values are computed later by DESeqPValue
                 p_value, z_score = computePValue(nets1, nets2)
                 avg1 = sum(nets1)   / len(nets1)
                 avg2 = sum(nets2)   / len(nets2)
@@ -782,6 +829,7 @@
                     continue
 
             # fallthrough is intended, regular scores need to be computed when tips aren't net but RAS datasets aren't used
+            # Compute p-value and z-score for the RAS scores; if the DESeq test is selected, computePValue returns NaN here and the p-values are computed later by DESeqPValue
             p_value, z_score = computePValue(l1, l2)
             avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
             # vvv TODO: Check numpy version compatibility
@@ -790,17 +838,22 @@
         
         except (TypeError, ZeroDivisionError): continue
     
+    if ARGS.test == "DESeq":
+        # Compute p-values using DESeq2
+        DESeqPValue(comparisonResult, dataset1Data, dataset2Data, ids)
+
     # Apply multiple testing correction if set by the user
     if ARGS.adjusted:
-        
-        # Retrieve the p-values from the comparisonResult dictionary
-        reactIds = list(comparisonResult.keys())
-        pValues = [comparisonResult[reactId][0] for reactId in reactIds]
-        
-        # Apply the Benjamini-Hochberg correction and update
-        adjustedPValues = st.false_discovery_control(pValues)[1]
-        for i, reactId in enumerate(reactIds):
-            comparisonResult[reactId][0] = adjustedPValues[i]
+
+        # Retrieve the p-values from the comparisonResult dictionary, keeping only entries that are not NaN
+        validPValues = [(reactId, result[0]) for reactId, result in comparisonResult.items() if not np.isnan(result[0])]
+        # Unpack the valid p-values
+        reactIds, pValues = zip(*validPValues)
+        # Adjust the p-values using the Benjamini-Hochberg method
+        adjustedPValues = st.false_discovery_control(pValues)
+        # Update the comparisonResult dictionary with the adjusted p-values
+        for reactId, adjustedPValue in zip(reactIds, adjustedPValues):
+            comparisonResult[reactId][0] = adjustedPValue
 
     return comparisonResult, max_z_score, netRPS
 
@@ -1002,9 +1055,11 @@
             writeToCsv(
                 [[reactId, *netValues] for reactId, netValues in rows.items()],
                 # vvv In weird comparison modes the dataset names are not recorded properly..
-                columnNames.get(datasetName, ["Reactions", "other cool column names.."]),
+                columnNames.get(datasetName, ["Reactions"]),
                 utils.FilePath(
-                    datasetName, ext = utils.FileFormat.CSV, prefix = ARGS.output_path))
+                    "Net_RPS_" + datasetName,
+                    ext = utils.FileFormat.CSV,
+                    prefix = ARGS.output_path))
 
     print('Execution succeeded')
 ###############################################################################