comparison COBRAxy/marea.py @ 313:e796d29076be draft

Uploaded
author francesco_lapi
date Mon, 26 May 2025 16:00:58 +0000
parents 38c9a958ea78
children bfe98b0f04fc
comparison
equal deleted inserted replaced
312:a99667e35947 313:e796d29076be
769 769
770 Returns: 770 Returns:
771 None : mutates the comparisonResult dictionary in place with the p-values. 771 None : mutates the comparisonResult dictionary in place with the p-values.
772 """ 772 """
773 773
774 # pyDESeq2 needs at least 2 replicates per dataset, so validate this before doing any work
775 if len(dataset1Data[0]) < 2 or len(dataset2Data[0]) < 2:
776 raise ValueError("Datasets must have at least 2 replicates each")
777
774 # pyDESeq2 is based on pandas, so we need to convert the data into a DataFrame and clean it from NaN values 778 # pyDESeq2 is based on pandas, so we need to convert the data into a DataFrame and clean it from NaN values
775 dataframe1 = pd.DataFrame(dataset1Data, index=ids) 779 dataframe1 = pd.DataFrame(dataset1Data, index=ids)
776 dataframe2 = pd.DataFrame(dataset2Data, index=ids) 780 dataframe2 = pd.DataFrame(dataset2Data, index=ids)
777 781
782 # pyDESeq2 requires datasets to be samples x reactions and integer values
778 dataframe1_clean = dataframe1.dropna(axis=0, how="any").T.astype(int) 783 dataframe1_clean = dataframe1.dropna(axis=0, how="any").T.astype(int)
779 dataframe2_clean = dataframe2.dropna(axis=0, how="any").T.astype(int) 784 dataframe2_clean = dataframe2.dropna(axis=0, how="any").T.astype(int)
780 785 dataframe1_clean.index = [f"ds1_rep{i+1}" for i in range(dataframe1_clean.shape[0])]
781 # pyDESeq2 works on a DataFrame with values and another with infos about samples and conditions 786 dataframe2_clean.index = [f"ds2_rep{j+1}" for j in range(dataframe2_clean.shape[0])]
787
788 # pyDESeq2 works on a DataFrame with values and another with information about how samples are split (e.g. by dataset class)
782 dataframe = pd.concat([dataframe1_clean, dataframe2_clean], axis=0) 789 dataframe = pd.concat([dataframe1_clean, dataframe2_clean], axis=0)
783 metadata = pd.DataFrame(np.concatenate([np.full(dataframe1_clean.shape[0], "dataset1"), np.full(dataframe2_clean.shape[0], "dataset2")]), columns=["dataset"]) 790 metadata = pd.DataFrame({"dataset": (["dataset1"]*dataframe1_clean.shape[0] + ["dataset2"]*dataframe2_clean.shape[0])}, index=dataframe.index)
784 metadata.index = dataframe.index 791
792 # Ensure the index of the metadata matches the index of the dataframe
793 if not dataframe.index.equals(metadata.index):
794 raise ValueError("The index of the metadata DataFrame must match the index of the counts DataFrame.")
785 795
786 # Prepare and run pyDESeq2 796 # Prepare and run pyDESeq2
787 inference = DefaultInference() 797 inference = DefaultInference()
788 dds = DeseqDataSet(counts=dataframe, metadata=metadata, design="~dataset", inference=inference, quiet=True, low_memory=True) 798 dds = DeseqDataSet(counts=dataframe, metadata=metadata, design="~dataset", inference=inference, quiet=True, low_memory=True)
789 dds.deseq2() 799 dds.deseq2()