Mercurial > repos > ebi-gxa > score_genes_aucell

--- a/decoupler_pseudobulk.py	Sun Sep 15 09:56:28 2024 +0000
+++ b/decoupler_pseudobulk.py	Wed Oct 02 08:27:01 2024 +0000
@@ -298,7 +298,7 @@
         for group in args.adata_obs_fields_to_merge.split(":"):
             fields = group.split(",")
             check_fields(fields, adata)
-            adata = merge_adata_obs_fields(fields, adata)
+            merge_adata_obs_fields(fields, adata)

     check_fields([args.groupby, args.sample_key], adata)

@@ -322,10 +322,10 @@

     print("Created pseudo-bulk AnnData, checking if fields still make sense.")
     print(
-        "If this fails this check, it might mean that you asked for factors "
-        + "that are not compatible with you sample identifiers (ie. asked for "
-        + "phase in the factors, but each sample contains more than one phase, "
-        + "try joining fields)"
+        "If this fails this check, it might mean that you asked for factors \
+        that are not compatible with you sample identifiers (ie. asked for \
+        phase in the factors, but each sample contains more than one phase,\
+        try joining fields)."
     )
     if factor_fields:
         check_fields(
@@ -374,6 +374,21 @@
         min_counts_per_sample_marking=args.min_counts_per_sample_marking,
     )

+    # if contrasts file is provided, produce a file with genes that should be
+    # filtered for each contrasts
+    if args.contrasts_file:
+        contrast_genes_df = identify_genes_to_filter_per_contrast(
+            contrast_file=args.contrasts_file,
+            min_perc_cells_expression=args.min_gene_exp_perc_per_cell,
+            adata=adata,
+            obs_field=args.groupby
+        )
+        contrast_genes_df.to_csv(
+            f"{args.save_path}/genes_to_filter_by_contrast.tsv",
+            sep="\t",
+            index=False,
+        )
+

 def merge_adata_obs_fields(obs_fields_to_merge, adata):
     """
@@ -394,7 +409,7 @@
     docstring tests:
     >>> import scanpy as sc
     >>> ad = sc.datasets.pbmc68k_reduced()
-    >>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad)
+    >>> merge_adata_obs_fields(["bulk_labels","louvain"], ad)
     >>> ad.obs.columns
     Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score',
            'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'],
@@ -412,7 +427,115 @@
             adata.obs[field_name] = (
                 adata.obs[field_name] + "_" + adata.obs[field].astype(str)
             )
-    return adata
+
+
+def identify_genes_to_filter_per_contrast(
+    contrast_file, min_perc_cells_expression, adata, obs_field
+):
+    """
+    Identify genes to filter per contrast based on expression percentage.
+    We need those genes to be under the threshold for all conditions
+    in a contrast to be identified for further filtering. If
+    one condition has the gene expressed above the threshold, the gene
+    becomes of interest (it can be highly up or down regulated).
+
+    Parameters
+    ----------
+    contrast_file : str
+        Path to the contrasts file.
+    min_perc_cells_expression : float
+        Minimum percentage of cells that should express a gene.
+    adata: adata
+        Original AnnData file
+    obs_field: str
+        Field in the AnnData observations where the contrasts are defined.
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> import anndata
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> import os
+    >>> from io import StringIO
+    >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\
+condition2{os.linesep}")
+    >>> min_perc_cells_expression = 30.0
+    >>> data = {
+    ...     'obs': pd.DataFrame({'condition': ['condition1', 'condition1',
+    ...                          'condition2', 'condition2']}),
+    ...     'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0],
+    ...     [0, 0, 1, 1, 0], [0, 0, 0, 2, 0]]),
+    ... }
+    >>> adata = anndata.AnnData(X=data['X'], obs=data['obs'])
+    >>> df = identify_genes_to_filter_per_contrast(
+    ...     contrast_file, min_perc_cells_expression, adata, 'condition'
+    ... ) # doctest:+ELLIPSIS
+    Identifying genes to filter using ...
+    >>> df.head() # doctest:+ELLIPSIS
+                    contrast gene
+    0  condition1-condition2    ...
+    1  condition1-condition2    ...
+    """
+    import re
+
+    # Implement the logic to identify genes to filter per contrast
+    # This is a placeholder implementation
+    print(
+        f"Identifying genes to filter using {contrast_file} "
+        f"with min expression {min_perc_cells_expression}%"
+    )
+    sides_regex = re.compile(r"[\+\-\*\/\(\)\^]+")
+
+    contrasts = pd.read_csv(contrast_file, sep="\t")
+    # Iterate over each line in the contrast file
+    genes_filter_for_contrast = dict()
+    for contrast in contrasts.iloc[:, 0]:
+        conditions = set(sides_regex.split(contrast))
+        # we want to find the genes that are below the threshold
+        # of % of cells expressed for ALL the conditions in the
+        # contrast. It is enough for one of the conditions
+        # of the contrast to have the genes expressed above
+        # the threshold of % of cells to be of interest.
+        for condition in conditions:
+            # remove any starting or trailing whitespaces from condition
+            condition = condition.strip()
+            # check the percentage of cells that express each gene
+            # Filter the AnnData object based on the obs_field value
+            adata_filtered = adata[adata.obs[obs_field] == condition]
+            # Calculate the percentage of cells expressing each gene
+            gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100
+            genes_to_filter = set(adata_filtered.var[
+                gene_expression.transpose() < min_perc_cells_expression
+            ].index.tolist())
+            # Update the genes_filter_for_contrast dictionary
+            if contrast in genes_filter_for_contrast.keys():
+                genes_filter_for_contrast[contrast].intersection_update(
+                    genes_to_filter
+                )
+            else:
+                genes_filter_for_contrast[contrast] = genes_to_filter
+
+    # write the genes_filter_for_contrast to pandas dataframe of two columns:
+    # contrast and gene
+
+    # Initialize an empty list to store the expanded pairs
+    expanded_pairs = []
+
+    # Iterate over the dictionary
+    for contrast, genes in genes_filter_for_contrast.items():
+        for gene in genes:
+            expanded_pairs.append((contrast, gene))
+
+    # Create the DataFrame
+    contrast_genes_df = pd.DataFrame(
+        expanded_pairs, columns=["contrast", "gene"]
+    )
+
+    return contrast_genes_df


 if __name__ == "__main__":
@@ -474,6 +597,25 @@
         default=10,
         help="Minimum number of cells for pseudobulk analysis",
     )
+    # add argument for min percentage of cells that should express a gene
+    parser.add_argument(
+        "--min_gene_exp_perc_per_cell",
+        type=float,
+        default=50,
+        help="If all the conditions of one side of a contrast express a \
+            gene in less than this percentage of cells, then the genes \
+            will be added to a list of genes to ignore for that contrast.\
+            Requires the contrast file to be provided.",
+    )
+    parser.add_argument(
+        "--contrasts_file",
+        type=str,
+        required=False,
+        help="Contrasts file, a one column tsv with a header, each line \
+            represents a contrast as a combination of conditions at each \
+            side of a substraction.",
+    )
+
     parser.add_argument(
         "--save_path", type=str, help="Path to save the plot (optional)"
     )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_contrasts.txt	Wed Oct 02 08:27:01 2024 +0000
@@ -0,0 +1,2 @@
+Contrasts
+N705_male+N707_male-N703_female
\ No newline at end of file