# HG changeset patch # User ebi-gxa # Date 1727857621 0 # Node ID 617e5076721515cdbfbfbc32af8c3a52e75cc007 # Parent a33eb7d3b05314de1a74278974f9afc642f8844f planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit dea8a066ccf04e241457719bf5162f9d39fe6c48 diff -r a33eb7d3b053 -r 617e50767215 decoupler_pseudobulk.py --- a/decoupler_pseudobulk.py Sun Sep 15 09:56:28 2024 +0000 +++ b/decoupler_pseudobulk.py Wed Oct 02 08:27:01 2024 +0000 @@ -298,7 +298,7 @@ for group in args.adata_obs_fields_to_merge.split(":"): fields = group.split(",") check_fields(fields, adata) - adata = merge_adata_obs_fields(fields, adata) + merge_adata_obs_fields(fields, adata) check_fields([args.groupby, args.sample_key], adata) @@ -322,10 +322,10 @@ print("Created pseudo-bulk AnnData, checking if fields still make sense.") print( - "If this fails this check, it might mean that you asked for factors " - + "that are not compatible with you sample identifiers (ie. asked for " - + "phase in the factors, but each sample contains more than one phase, " - + "try joining fields)" + "If this fails this check, it might mean that you asked for factors \ + that are not compatible with you sample identifiers (ie. asked for \ + phase in the factors, but each sample contains more than one phase,\ + try joining fields)." ) if factor_fields: check_fields( @@ -374,6 +374,21 @@ min_counts_per_sample_marking=args.min_counts_per_sample_marking, ) + # if contrasts file is provided, produce a file with genes that should be + # filtered for each contrasts + if args.contrasts_file: + contrast_genes_df = identify_genes_to_filter_per_contrast( + contrast_file=args.contrasts_file, + min_perc_cells_expression=args.min_gene_exp_perc_per_cell, + adata=adata, + obs_field=args.groupby + ) + contrast_genes_df.to_csv( + f"{args.save_path}/genes_to_filter_by_contrast.tsv", + sep="\t", + index=False, + ) + def merge_adata_obs_fields(obs_fields_to_merge, adata): """ @@ -394,7 +409,7 @@ docstring tests: >>> import scanpy as sc >>> ad = sc.datasets.pbmc68k_reduced() - >>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad) + >>> merge_adata_obs_fields(["bulk_labels","louvain"], ad) >>> ad.obs.columns Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'], @@ -412,7 +427,115 @@ adata.obs[field_name] = ( adata.obs[field_name] + "_" + adata.obs[field].astype(str) ) - return adata + + +def identify_genes_to_filter_per_contrast( + contrast_file, min_perc_cells_expression, adata, obs_field +): + """ + Identify genes to filter per contrast based on expression percentage. + We need those genes to be under the threshold for all conditions + in a contrast to be identified for further filtering. If + one condition has the gene expressed above the threshold, the gene + becomes of interest (it can be highly up or down regulated). + + Parameters + ---------- + contrast_file : str + Path to the contrasts file. + min_perc_cells_expression : float + Minimum percentage of cells that should express a gene. + adata: adata + Original AnnData file + obs_field: str + Field in the AnnData observations where the contrasts are defined. + + Returns + ------- + None + + Examples + -------- + >>> import anndata + >>> import pandas as pd + >>> import numpy as np + >>> import os + >>> from io import StringIO + >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\ +condition2{os.linesep}") + >>> min_perc_cells_expression = 30.0 + >>> data = { + ... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1', + ... 'condition2', 'condition2']}), + ... 'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0], + ... [0, 0, 1, 1, 0], [0, 0, 0, 2, 0]]), + ... } + >>> adata = anndata.AnnData(X=data['X'], obs=data['obs']) + >>> df = identify_genes_to_filter_per_contrast( + ... contrast_file, min_perc_cells_expression, adata, 'condition' + ... ) # doctest:+ELLIPSIS + Identifying genes to filter using ... + >>> df.head() # doctest:+ELLIPSIS + contrast gene + 0 condition1-condition2 ... + 1 condition1-condition2 ... + """ + import re + + # Implement the logic to identify genes to filter per contrast + # This is a placeholder implementation + print( + f"Identifying genes to filter using {contrast_file} " + f"with min expression {min_perc_cells_expression}%" + ) + sides_regex = re.compile(r"[\+\-\*\/\(\)\^]+") + + contrasts = pd.read_csv(contrast_file, sep="\t") + # Iterate over each line in the contrast file + genes_filter_for_contrast = dict() + for contrast in contrasts.iloc[:, 0]: + conditions = set(sides_regex.split(contrast)) + # we want to find the genes that are below the threshold + # of % of cells expressed for ALL the conditions in the + # contrast. It is enough for one of the conditions + # of the contrast to have the genes expressed above + # the threshold of % of cells to be of interest. + for condition in conditions: + # remove any starting or trailing whitespaces from condition + condition = condition.strip() + # check the percentage of cells that express each gene + # Filter the AnnData object based on the obs_field value + adata_filtered = adata[adata.obs[obs_field] == condition] + # Calculate the percentage of cells expressing each gene + gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100 + genes_to_filter = set(adata_filtered.var[ + gene_expression.transpose() < min_perc_cells_expression + ].index.tolist()) + # Update the genes_filter_for_contrast dictionary + if contrast in genes_filter_for_contrast.keys(): + genes_filter_for_contrast[contrast].intersection_update( + genes_to_filter + ) + else: + genes_filter_for_contrast[contrast] = genes_to_filter + + # write the genes_filter_for_contrast to pandas dataframe of two columns: + # contrast and gene + + # Initialize an empty list to store the expanded pairs + expanded_pairs = [] + + # Iterate over the dictionary + for contrast, genes in genes_filter_for_contrast.items(): + for gene in genes: + expanded_pairs.append((contrast, gene)) + + # Create the DataFrame + contrast_genes_df = pd.DataFrame( + expanded_pairs, columns=["contrast", "gene"] + ) + + return contrast_genes_df if __name__ == "__main__": @@ -474,6 +597,25 @@ default=10, help="Minimum number of cells for pseudobulk analysis", ) + # add argument for min percentage of cells that should express a gene + parser.add_argument( + "--min_gene_exp_perc_per_cell", + type=float, + default=50, + help="If all the conditions of one side of a contrast express a \ + gene in less than this percentage of cells, then the genes \ + will be added to a list of genes to ignore for that contrast.\ + Requires the contrast file to be provided.", + ) + parser.add_argument( + "--contrasts_file", + type=str, + required=False, + help="Contrasts file, a one column tsv with a header, each line \ + represents a contrast as a combination of conditions at each \ + side of a substraction.", + ) + parser.add_argument( "--save_path", type=str, help="Path to save the plot (optional)" ) diff -r a33eb7d3b053 -r 617e50767215 test-data/test_contrasts.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_contrasts.txt Wed Oct 02 08:27:01 2024 +0000 @@ -0,0 +1,2 @@ +Contrasts +N705_male+N707_male-N703_female \ No newline at end of file