decoupler_pseudobulk: decoupler_pseudobulk.py comparison

comparison decoupler_pseudobulk.py @ 10:f6040492b499 draft default tip

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit dea8a066ccf04e241457719bf5162f9d39fe6c48

author	ebi-gxa
date	Wed, 02 Oct 2024 08:26:56 +0000
parents	bd4b54b75888
children

comparison

equal deleted inserted replaced

-:bd4b54b75888
+:f6040492b499
 if args.adata_obs_fields_to_merge:
 # first split potential groups by ":" and iterate over them
 for group in args.adata_obs_fields_to_merge.split(":"):
 fields = group.split(",")
 check_fields(fields, adata)
-adata = merge_adata_obs_fields(fields, adata)
+merge_adata_obs_fields(fields, adata)
 check_fields([args.groupby, args.sample_key], adata)
 factor_fields = None
 if args.factor_fields:
 min_counts=args.min_counts,
 )
 print("Created pseudo-bulk AnnData, checking if fields still make sense.")
 print(
-"If this fails this check, it might mean that you asked for factors "
+"If this fails this check, it might mean that you asked for factors \
-+ "that are not compatible with you sample identifiers (ie. asked for "
+that are not compatible with you sample identifiers (ie. asked for \
-+ "phase in the factors, but each sample contains more than one phase, "
+phase in the factors, but each sample contains more than one phase,\
-+ "try joining fields)"
+try joining fields)."
 )
 if factor_fields:
 check_fields(
 factor_fields,
 pseudobulk_data,
 output_dir=args.deseq2_output_path,
 factor_fields=factor_fields,
 min_counts_per_sample_marking=args.min_counts_per_sample_marking,
 )
+# if contrasts file is provided, produce a file with genes that should be
+# filtered for each contrasts
+if args.contrasts_file:
+contrast_genes_df = identify_genes_to_filter_per_contrast(
+contrast_file=args.contrasts_file,
+min_perc_cells_expression=args.min_gene_exp_perc_per_cell,
+adata=adata,
+obs_field=args.groupby
+)
+contrast_genes_df.to_csv(
+f"{args.save_path}/genes_to_filter_by_contrast.tsv",
+sep="\t",
+index=False,
+)
 def merge_adata_obs_fields(obs_fields_to_merge, adata):
 """
 Merge adata.obs fields specified in args.adata_obs_fields_to_merge
 The merged AnnData object
 docstring tests:
 >>> import scanpy as sc
 >>> ad = sc.datasets.pbmc68k_reduced()
->>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad)
+>>> merge_adata_obs_fields(["bulk_labels","louvain"], ad)
 >>> ad.obs.columns
 Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score',
 'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'],
 dtype='object')
 """
 adata.obs[field_name] = adata.obs[field].astype(str)
 else:
 adata.obs[field_name] = (
 adata.obs[field_name] + "_" + adata.obs[field].astype(str)
 )
-return adata
+def identify_genes_to_filter_per_contrast(
+contrast_file, min_perc_cells_expression, adata, obs_field
+):
+"""
+Identify genes to filter per contrast based on expression percentage.
+We need those genes to be under the threshold for all conditions
+in a contrast to be identified for further filtering. If
+one condition has the gene expressed above the threshold, the gene
+becomes of interest (it can be highly up or down regulated).
+Parameters
+----------
+contrast_file : str
+Path to the contrasts file.
+min_perc_cells_expression : float
+Minimum percentage of cells that should express a gene.
+adata: adata
+Original AnnData file
+obs_field: str
+Field in the AnnData observations where the contrasts are defined.
+Returns
+-------
+None
+Examples
+--------
+>>> import anndata
+>>> import pandas as pd
+>>> import numpy as np
+>>> import os
+>>> from io import StringIO
+>>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\
+condition2{os.linesep}")
+>>> min_perc_cells_expression = 30.0
+>>> data = {
+...     'obs': pd.DataFrame({'condition': ['condition1', 'condition1',
+...                          'condition2', 'condition2']}),
+...     'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0],
+...     [0, 0, 1, 1, 0], [0, 0, 0, 2, 0]]),
+... }
+>>> adata = anndata.AnnData(X=data['X'], obs=data['obs'])
+>>> df = identify_genes_to_filter_per_contrast(
+...     contrast_file, min_perc_cells_expression, adata, 'condition'
+... ) # doctest:+ELLIPSIS
+Identifying genes to filter using ...
+>>> df.head() # doctest:+ELLIPSIS
+contrast gene
+0  condition1-condition2    ...
+1  condition1-condition2    ...
+"""
+import re
+# Implement the logic to identify genes to filter per contrast
+# This is a placeholder implementation
+print(
+f"Identifying genes to filter using {contrast_file} "
+f"with min expression {min_perc_cells_expression}%"
+)
+sides_regex = re.compile(r"[\+\-\*\/\(\)\^]+")
+contrasts = pd.read_csv(contrast_file, sep="\t")
+# Iterate over each line in the contrast file
+genes_filter_for_contrast = dict()
+for contrast in contrasts.iloc[:, 0]:
+conditions = set(sides_regex.split(contrast))
+# we want to find the genes that are below the threshold
+# of % of cells expressed for ALL the conditions in the
+# contrast. It is enough for one of the conditions
+# of the contrast to have the genes expressed above
+# the threshold of % of cells to be of interest.
+for condition in conditions:
+# remove any starting or trailing whitespaces from condition
+condition = condition.strip()
+# check the percentage of cells that express each gene
+# Filter the AnnData object based on the obs_field value
+adata_filtered = adata[adata.obs[obs_field] == condition]
+# Calculate the percentage of cells expressing each gene
+gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100
+genes_to_filter = set(adata_filtered.var[
+gene_expression.transpose() < min_perc_cells_expression
+].index.tolist())
+# Update the genes_filter_for_contrast dictionary
+if contrast in genes_filter_for_contrast.keys():
+genes_filter_for_contrast[contrast].intersection_update(
+genes_to_filter
+)
+else:
+genes_filter_for_contrast[contrast] = genes_to_filter
+# write the genes_filter_for_contrast to pandas dataframe of two columns:
+# contrast and gene
+# Initialize an empty list to store the expanded pairs
+expanded_pairs = []
+# Iterate over the dictionary
+for contrast, genes in genes_filter_for_contrast.items():
+for gene in genes:
+expanded_pairs.append((contrast, gene))
+# Create the DataFrame
+contrast_genes_df = pd.DataFrame(
+expanded_pairs, columns=["contrast", "gene"]
+)
+return contrast_genes_df
 if __name__ == "__main__":
 # Create argument parser
 parser = argparse.ArgumentParser(
 "--min_cells",
 type=int,
 default=10,
 help="Minimum number of cells for pseudobulk analysis",
 )
+# add argument for min percentage of cells that should express a gene
+parser.add_argument(
+"--min_gene_exp_perc_per_cell",
+type=float,
+default=50,
+help="If all the conditions of one side of a contrast express a \
+gene in less than this percentage of cells, then the genes \
+will be added to a list of genes to ignore for that contrast.\
+Requires the contrast file to be provided.",
+)
+parser.add_argument(
+"--contrasts_file",
+type=str,
+required=False,
+help="Contrasts file, a one column tsv with a header, each line \
+represents a contrast as a combination of conditions at each \
+side of a substraction.",
+)
 parser.add_argument(
 "--save_path", type=str, help="Path to save the plot (optional)"
 )
 parser.add_argument(
 "--min_counts",

Mercurial > repos > ebi-gxa > decoupler_pseudobulk

comparison decoupler_pseudobulk.py @ 10:f6040492b499 draft default tip