Mercurial > repos > ebi-gxa > decoupler_pseudobulk
changeset 10:f6040492b499 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit dea8a066ccf04e241457719bf5162f9d39fe6c48
| author | ebi-gxa |
|---|---|
| date | Wed, 02 Oct 2024 08:26:56 +0000 |
| parents | bd4b54b75888 |
| children | 70a5c6f2408d |
| files | decoupler_pseudobulk.py decoupler_pseudobulk.xml test-data/test_contrasts.txt |
| diffstat | 3 files changed, 238 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
--- a/decoupler_pseudobulk.py Sun Sep 15 09:56:34 2024 +0000 +++ b/decoupler_pseudobulk.py Wed Oct 02 08:26:56 2024 +0000 @@ -298,7 +298,7 @@ for group in args.adata_obs_fields_to_merge.split(":"): fields = group.split(",") check_fields(fields, adata) - adata = merge_adata_obs_fields(fields, adata) + merge_adata_obs_fields(fields, adata) check_fields([args.groupby, args.sample_key], adata) @@ -322,10 +322,10 @@ print("Created pseudo-bulk AnnData, checking if fields still make sense.") print( - "If this fails this check, it might mean that you asked for factors " - + "that are not compatible with you sample identifiers (ie. asked for " - + "phase in the factors, but each sample contains more than one phase, " - + "try joining fields)" + "If this fails this check, it might mean that you asked for factors \ + that are not compatible with you sample identifiers (ie. asked for \ + phase in the factors, but each sample contains more than one phase,\ + try joining fields)." ) if factor_fields: check_fields( @@ -374,6 +374,21 @@ min_counts_per_sample_marking=args.min_counts_per_sample_marking, ) + # if contrasts file is provided, produce a file with genes that should be + # filtered for each contrasts + if args.contrasts_file: + contrast_genes_df = identify_genes_to_filter_per_contrast( + contrast_file=args.contrasts_file, + min_perc_cells_expression=args.min_gene_exp_perc_per_cell, + adata=adata, + obs_field=args.groupby + ) + contrast_genes_df.to_csv( + f"{args.save_path}/genes_to_filter_by_contrast.tsv", + sep="\t", + index=False, + ) + def merge_adata_obs_fields(obs_fields_to_merge, adata): """ @@ -394,7 +409,7 @@ docstring tests: >>> import scanpy as sc >>> ad = sc.datasets.pbmc68k_reduced() - >>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad) + >>> merge_adata_obs_fields(["bulk_labels","louvain"], ad) >>> ad.obs.columns Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'], @@ -412,7 +427,115 @@ adata.obs[field_name] = ( adata.obs[field_name] + "_" + adata.obs[field].astype(str) ) - return adata + + +def identify_genes_to_filter_per_contrast( + contrast_file, min_perc_cells_expression, adata, obs_field +): + """ + Identify genes to filter per contrast based on expression percentage. + We need those genes to be under the threshold for all conditions + in a contrast to be identified for further filtering. If + one condition has the gene expressed above the threshold, the gene + becomes of interest (it can be highly up or down regulated). + + Parameters + ---------- + contrast_file : str + Path to the contrasts file. + min_perc_cells_expression : float + Minimum percentage of cells that should express a gene. + adata: adata + Original AnnData file + obs_field: str + Field in the AnnData observations where the contrasts are defined. + + Returns + ------- + None + + Examples + -------- + >>> import anndata + >>> import pandas as pd + >>> import numpy as np + >>> import os + >>> from io import StringIO + >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\ +condition2{os.linesep}") + >>> min_perc_cells_expression = 30.0 + >>> data = { + ... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1', + ... 'condition2', 'condition2']}), + ... 'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0], + ... [0, 0, 1, 1, 0], [0, 0, 0, 2, 0]]), + ... } + >>> adata = anndata.AnnData(X=data['X'], obs=data['obs']) + >>> df = identify_genes_to_filter_per_contrast( + ... contrast_file, min_perc_cells_expression, adata, 'condition' + ... ) # doctest:+ELLIPSIS + Identifying genes to filter using ... + >>> df.head() # doctest:+ELLIPSIS + contrast gene + 0 condition1-condition2 ... + 1 condition1-condition2 ... + """ + import re + + # Implement the logic to identify genes to filter per contrast + # This is a placeholder implementation + print( + f"Identifying genes to filter using {contrast_file} " + f"with min expression {min_perc_cells_expression}%" + ) + sides_regex = re.compile(r"[\+\-\*\/\(\)\^]+") + + contrasts = pd.read_csv(contrast_file, sep="\t") + # Iterate over each line in the contrast file + genes_filter_for_contrast = dict() + for contrast in contrasts.iloc[:, 0]: + conditions = set(sides_regex.split(contrast)) + # we want to find the genes that are below the threshold + # of % of cells expressed for ALL the conditions in the + # contrast. It is enough for one of the conditions + # of the contrast to have the genes expressed above + # the threshold of % of cells to be of interest. + for condition in conditions: + # remove any starting or trailing whitespaces from condition + condition = condition.strip() + # check the percentage of cells that express each gene + # Filter the AnnData object based on the obs_field value + adata_filtered = adata[adata.obs[obs_field] == condition] + # Calculate the percentage of cells expressing each gene + gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100 + genes_to_filter = set(adata_filtered.var[ + gene_expression.transpose() < min_perc_cells_expression + ].index.tolist()) + # Update the genes_filter_for_contrast dictionary + if contrast in genes_filter_for_contrast.keys(): + genes_filter_for_contrast[contrast].intersection_update( + genes_to_filter + ) + else: + genes_filter_for_contrast[contrast] = genes_to_filter + + # write the genes_filter_for_contrast to pandas dataframe of two columns: + # contrast and gene + + # Initialize an empty list to store the expanded pairs + expanded_pairs = [] + + # Iterate over the dictionary + for contrast, genes in genes_filter_for_contrast.items(): + for gene in genes: + expanded_pairs.append((contrast, gene)) + + # Create the DataFrame + contrast_genes_df = pd.DataFrame( + expanded_pairs, columns=["contrast", "gene"] + ) + + return contrast_genes_df if __name__ == "__main__": @@ -474,6 +597,25 @@ default=10, help="Minimum number of cells for pseudobulk analysis", ) + # add argument for min percentage of cells that should express a gene + parser.add_argument( + "--min_gene_exp_perc_per_cell", + type=float, + default=50, + help="If all the conditions of one side of a contrast express a \ + gene in less than this percentage of cells, then the genes \ + will be added to a list of genes to ignore for that contrast.\ + Requires the contrast file to be provided.", + ) + parser.add_argument( + "--contrasts_file", + type=str, + required=False, + help="Contrasts file, a one column tsv with a header, each line \ + represents a contrast as a combination of conditions at each \ + side of a substraction.", + ) + parser.add_argument( "--save_path", type=str, help="Path to save the plot (optional)" )
--- a/decoupler_pseudobulk.xml Sun Sep 15 09:56:34 2024 +0000 +++ b/decoupler_pseudobulk.xml Wed Oct 02 08:26:56 2024 +0000 @@ -1,4 +1,4 @@ -<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy4" profile="20.05"> +<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy5" profile="20.05"> <description>aggregates single cell RNA-seq data for running bulk RNA-seq methods</description> <requirements> <requirement type="package" version="1.4.0">decoupler</requirement> @@ -43,6 +43,10 @@ #if $factor_fields: --factor_fields '$factor_fields' #end if + #if $filter_per_contrast.filter == 'yes': + --contrasts_file '$filter_per_contrast.contrasts_file' + --min_gene_exp_perc_per_cell '$filter_per_contrast.min_cells_perc_per_contrast_cond' + #end if --deseq2_output_path deseq_output_dir --plot_samples_figsize $plot_samples_figsize --plot_filtering_figsize $plot_filtering_figsize @@ -53,6 +57,18 @@ </environment_variables> <inputs> <param type="data" name="input_file" format="data" label="Input AnnData file"/> + <conditional name="filter_per_contrast"> + <param name="filter" type="select" label="Produce a list of genes to filter out per contrast?" help="TODO"> + <option value="yes">Yes</option> + <option value="no" selected="true">No</option> + </param> + <when value="yes"> + <param type="data" name="contrasts_file" format="txt,tabular" label="Contrasts file" help="A file with header and arithmetic operations between existing values in the Groupby column in the AnnData file."/> + <param type="float" name="min_cells_perc_per_contrast_cond" value="20" label="Min. percentage of cells that need to be expressing a gene in any of the conditions of a contrast" help="Genes whose expression across all conditions of a contrast are below this threshold are tagged for removal from the contrast on a separate file"/> + </when> + <when value="no"> + </when> + </conditional> <param type="text" name="adata_obs_fields_to_merge" label="Obs Fields to Merge" optional="true" help="Fields in adata.obs to merge, comma separated (optional). They will be available as field1_field2_field3 in the AnnData Obs dataframe. You can have multiple groups to merge, separated by colon (:)."/> <param type="text" name="groupby" label="Groupby column" help="The column in adata.obs that defines the groups. Merged columns in the above field are available here."/> <param type="text" name="sample_key" label="Sample Key column" help="The column in adata.obs that defines the samples. Merged columns in the above field are available here."/> @@ -90,6 +106,9 @@ <data name="genes_ignore_per_contrast_field" format="tabular" label="{tool.name} on ${on_string}: Genes to ignore by contrast field" from_work_dir="deseq_output_dir/genes_to_ignore_per_contrast_field.tsv"> <filter>factor_fields</filter> </data> + <data name="genes_ignore_per_contrast" format="tabular" label="{tool.name} on ${on_string}: Genes to ignore by contrast" from_work_dir="plots_output_dir/genes_to_filter_by_contrast.tsv"> + <filter>filter_per_contrast['filter'] == 'yes'</filter> + </data> </outputs> <tests> <test expect_num_outputs="7"> @@ -147,12 +166,77 @@ </assert_contents> </output> </test> + <test expect_num_outputs="8"> + <param name="input_file" value="mito_counted_anndata.h5ad"/> + <param name="filter" value="yes"/> + <param name="contrasts_file" value="test_contrasts.txt" ftype="txt"/> + <param name="min_cells_perc_per_contrast_cond" value="25"/> + <param name="adata_obs_fields_to_merge" value="batch,sex:batch,genotype"/> + <param name="groupby" value="batch_sex"/> + <param name="sample_key" value="genotype"/> + <param name="factor_fields" value="genotype,batch_sex"/> + <param name="mode" value="sum"/> + <param name="min_cells" value="10"/> + <param name="produce_plots" value="true"/> + <param name="produce_anndata" value="true"/> + <param name="min_counts" value="10"/> + <param name="min_counts_per_sample" value="50"/> + <param name="min_total_counts" value="1000"/> + <param name="filter_expr" value="true"/> + <param name="plot_samples_figsize" value="10 10"/> + <param name="plot_filtering_figsize" value="10 10"/> + <output name="pbulk_anndata" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="obs/psbulk_n_cells"/> + </assert_contents> + </output> + <output name="count_matrix" ftype="tabular"> + <assert_contents> + <has_n_lines n="3620"/> + <has_n_columns n="8"/> + </assert_contents> + </output> + <output name="samples_metadata" ftype="tabular"> + <assert_contents> + <has_n_lines n="8"/> + <has_n_columns n="3"/> + </assert_contents> + </output> + <output name="genes_metadata" ftype="tabular"> + <assert_contents> + <has_n_lines n="3620"/> + <has_n_columns n="13"/> + </assert_contents> + </output> + <output name="plot_output" ftype="png"> + <assert_contents> + <has_size value="31853" delta="3000"/> + </assert_contents> + </output> + <output name="genes_ignore_per_contrast_field" ftype="tabular"> + <assert_contents> + <has_n_lines n="5"/> + </assert_contents> + </output> + <output name="filter_by_expr_plot" ftype="png"> + <assert_contents> + <has_size value="21656" delta="2000"/> + </assert_contents> + </output> + <output name="genes_ignore_per_contrast" ftype="tabular"> + <assert_contents> + <has_n_lines n="35478"/> + </assert_contents> + </output> + </test> </tests> <help> <