Mercurial > repos > ebi-gxa > score_genes_aucell

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/decoupler_aucell_score.py	Thu Nov 09 11:36:08 2023 +0000
@@ -0,0 +1,181 @@
+import argparse
+import os
+import tempfile
+
+import anndata
+import decoupler as dc
+import pandas as pd
+
+
+def read_gmt(gmt_file):
+    """
+    Reads a GMT file into a Pandas DataFrame.
+
+    Parameters
+    ----------
+    gmt_file : str
+        Path to the GMT file.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame with the gene sets. Each row represents a gene set, and the columns are "gene_set_name", "gene_set_url", and "genes".
+    >>> line = "HALLMARK_NOTCH_SIGNALING\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_NOTCH_SIGNALING\\tJAG1\\tNOTCH3\\tNOTCH2\\tAPH1A\\tHES1\\tCCND1\\tFZD1\\tPSEN2\\tFZD7\\tDTX1\\tDLL1\\tFZD5\\tMAML2\\tNOTCH1\\tPSENEN\\tWNT5A\\tCUL1\\tWNT2\\tDTX4\\tSAP30\\tPPARD\\tKAT2A\\tHEYL\\tSKP1\\tRBX1\\tTCF7L2\\tARRB1\\tLFNG\\tPRKCA\\tDTX2\\tST3GAL6\\tFBXW11\\n"
+    >>> line2 = "HALLMARK_APICAL_SURFACE\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_APICAL_SURFACE\\tB4GALT1\\tRHCG\\tMAL\\tLYPD3\\tPKHD1\\tATP6V0A4\\tCRYBG1\\tSHROOM2\\tSRPX\\tMDGA1\\tTMEM8B\\tTHY1\\tPCSK9\\tEPHB4\\tDCBLD2\\tGHRL\\tLYN\\tGAS1\\tFLOT2\\tPLAUR\\tAKAP7\\tATP8B1\\tEFNA5\\tSLC34A3\\tAPP\\tGSTM3\\tHSPB1\\tSLC2A4\\tIL2RB\\tRTN4RL1\\tNCOA6\\tSULF2\\tADAM10\\tBRCA1\\tGATA3\\tAFAP1L2\\tIL2RG\\tCD160\\tADIPOR2\\tSLC22A12\\tNTNG1\\tSCUBE1\\tCX3CL1\\tCROCC\\n"
+    >>> temp_dir = tempfile.gettempdir()
+    >>> temp_gmt = os.path.join(temp_dir, "temp_file.gmt")
+    >>> with open(temp_gmt, "w") as f:
+    ...   f.write(line)
+    ...   f.write(line2)
+    288
+    380
+    >>> df = read_gmt(temp_gmt)
+    >>> df.shape[0]
+    2
+    >>> df.columns == ["gene_set_name", "genes"]
+    array([ True,  True])
+    >>> df.loc[df["gene_set_name"] == "HALLMARK_APICAL_SURFACE"].genes.tolist()[0].startswith("B4GALT1")
+    True
+    """
+    # Read the GMT file into a list of lines
+    with open(gmt_file, "r") as f:
+        lines = f.readlines()
+
+    # Create a list of dictionaries, where each dictionary represents a gene set
+    gene_sets = []
+    for line in lines:
+        fields = line.strip().split("\t")
+        gene_set = {"gene_set_name": fields[0], "genes": ",".join(fields[2:])}
+        gene_sets.append(gene_set)
+
+    # Convert the list of dictionaries to a DataFrame
+    return pd.DataFrame(gene_sets)
+
+
+def score_genes_aucell(
+    adata: anndata.AnnData, gene_list: list, score_name: str, use_raw=False
+):
+    """Score genes using Aucell.
+
+    Parameters
+    ----------
+    adata : anndata.AnnData
+    gene_list : list
+    score_names : str
+    use_raw : bool, optional
+
+    >>> import scanpy as sc
+    >>> import decoupler as dc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> gene_list = adata.var[adata.var.index.str.startswith("RP")].index.tolist()
+    >>> score_genes_aucell(adata, gene_list, "ribosomal_aucell", use_raw=False)
+    >>> "ribosomal_aucell" in adata.obs.columns
+    True
+    """
+    # make a data.frame with two columns, geneset and gene_id, geneset filled with score_names and gene_id with gene_list, one row per element
+    geneset_df = pd.DataFrame(
+        {
+            "gene_id": gene_list,
+            "geneset": score_name,
+        }
+    )
+    # run decoupler's run_aucell
+    dc.run_aucell(
+        adata, net=geneset_df, source="geneset", target="gene_id", use_raw=use_raw
+    )
+    # copy .obsm['aucell_estimate'] matrix columns to adata.obs using the column names
+    adata.obs[score_name] = adata.obsm["aucell_estimate"][score_name]
+
+
+def run_for_genelists(
+    adata, gene_lists, score_names, use_raw=False, gene_symbols_field="gene_symbols"
+):
+    if len(gene_lists) == len(score_names):
+        for gene_list, score_names in zip(gene_lists, score_names):
+            genes = gene_list.split(",")
+            ens_gene_ids = adata.var[adata.var[gene_symbols_field].isin(genes)].index
+            score_genes_aucell(
+                adata,
+                ens_gene_ids,
+                f"AUCell_{score_names}",
+                use_raw,
+            )
+    else:
+        raise ValueError(
+            "The number of gene lists (separated by :) and score names (separated by :) must be the same"
+        )
+
+
+if __name__ == "__main__":
+    # Create command-line arguments parser
+    parser = argparse.ArgumentParser(description="Score genes using Aucell")
+    parser.add_argument("--input_file", type=str, help="Path to input AnnData file")
+    parser.add_argument("--output_file", type=str, help="Path to output file")
+    parser.add_argument("--gmt_file", type=str, help="Path to GMT file", required=False)
+    # add argument for gene sets to score
+    parser.add_argument(
+        "--gene_sets_to_score",
+        type=str,
+        required=False,
+        help="Comma separated list of gene sets to score (the need to be in the gmt file)",
+    )
+    # add argument for gene list (comma separated) to score
+    parser.add_argument(
+        "--gene_lists_to_score",
+        type=str,
+        required=False,
+        help="Comma separated list of genes to score. You can have more than one set of genes, separated by colon :",
+    )
+    # argument for the score name when using the gene list
+    parser.add_argument(
+        "--score_names",
+        type=str,
+        required=False,
+        help="Name of the score column when using the gene list. You can have more than one set of score names, separated by colon :. It should be the same length as the number of gene lists.",
+    )
+    parser.add_argument(
+        "--gene_symbols_field",
+        type=str,
+        help="Name of the gene symbols field in the AnnData object",
+    )
+    parser.add_argument("--use_raw", action="store_true", help="Use raw data")
+    parser.add_argument(
+        "--write_anndata", action="store_true", help="Write the modified AnnData object"
+    )
+
+    # Parse command-line arguments
+    args = parser.parse_args()
+
+    # Load input AnnData object
+    adata = anndata.read_h5ad(args.input_file)
+
+    if args.gene_sets_to_score is not None and args.gmt_file is not None:
+        # Load MSigDB file in GMT format
+        msigdb = read_gmt(args.gmt_file)
+
+        gene_sets_to_score = args.gene_sets_to_score.split(",")
+        # Score genes by their ensembl ids using the score_genes_aucell function
+        for _, row in msigdb.iterrows():
+            gene_set_name = row["gene_set_name"]
+            if gene_set_name in gene_sets_to_score:
+                genes = row["genes"].split(",")
+                # Convert gene symbols to ensembl ids by using the columns gene_symbols and index in adata.var specific to the gene set
+                ens_gene_ids = adata.var[
+                    adata.var[args.gene_symbols_field].isin(genes)
+                ].index
+                score_genes_aucell(
+                    adata, ens_gene_ids, f"AUCell_{gene_set_name}", args.use_raw
+                )
+    elif args.gene_lists_to_score is not None and args.score_names is not None:
+        gene_lists = args.gene_lists_to_score.split(":")
+        score_names = args.score_names.split(",")
+        run_for_genelists(
+            adata, gene_lists, score_names, args.use_raw, args.gene_symbols_field
+        )
+
+    # Save the modified AnnData object or generate a file with cells as rows and the new score_names columns
+    if args.write_anndata:
+        adata.write_h5ad(args.output_file)
+    else:
+        new_columns = [col for col in adata.obs.columns if col.startswith("AUCell_")]
+        adata.obs[new_columns].to_csv(args.output_file, sep="\t", index=True)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/decoupler_aucell_score.xml	Thu Nov 09 11:36:08 2023 +0000
@@ -0,0 +1,137 @@
+<?xml version="1.0"?>
+<tool id="score_genes_aucell" name="Decoupler AUCell" version="1.4.0+galaxy0" profile="20.05">
+    <description>
+        scores cells using the AUCell method for gene sets.
+    </description>
+    <requirements>
+        <requirement type="package" version="1.4.0">decoupler</requirement>
+    </requirements>
+    <command>
+        python '$__tool_directory__/decoupler_aucell_score.py'
+            --input_file '$input_file'
+            #if $gene_lists_source.source == "gmt"
+            --gmt_file '$gmt_file'
+            --gene_sets_to_score '$gene_sets_to_score'
+            #else:
+            --gene_lists_to_score '$gene_lists_to_score'
+            --score_names '$score_names'
+            #end if
+            --gene_symbols_field '$gene_symbols_field'
+            $use_raw
+            #if $write_anndata:
+            --write_anndata
+            --output_file anndata_aucell.h5ad
+            #else:
+            --output_file anndata_aucell_per_cell.tsv
+            #end if
+    </command>
+    <inputs>
+        <param name="input_file" type="data" format="h5ad" label="Input AnnData file" />
+        <conditional name="gene_lists_source">
+            <param name="source" type="select" label="Source of gene lists to score" help="Choose between using a GMT file and specified gene sets or enumerated genes and gene set names.">
+                <option value="gmt" selected="true">Gene sets from GMT file</option>
+                <option value="enumerated">Gene sets enumerated manually</option>
+            </param>
+            <when value="gmt">
+                <param name="gmt_file" type="data" format="txt" label="GMT file with gene sets" />
+                <param name="gene_sets_to_score" type="text" label="Gene sets to score within the GMT file" />
+            </when>
+            <when value="enumerated">
+                <param name="gene_lists_to_score" type="text" label="Genes to score" />
+                <param name="score_names" type="text" label="Score names" />
+            </when>
+        </conditional>
+        <param name="gene_symbols_field" type="text" label="Gene symbols field" help="The field in the AnnData var table where gene symbols are stored."/>
+        <param name="use_raw" type="boolean" value="false" truevalue="--use_raw" falsevalue="" label="Use raw data" help="Use RAW data in the AnnData instead of the X matrix."/>
+        <param name="write_anndata" type="boolean" value="false" truevalue="--write_anndata" falsevalue="" label="Write the modified AnnData object" help="Whether to write or not the same AnnData file again with the signatures on it. If unselected, a text files of cells in rows and signatures in columns (as in Obs) is produced."/>
+    </inputs>
+    <outputs>
+        <data name="output_ad" format="h5ad" from_work_dir="anndata_aucell.h5ad" label="${tool.name} on ${on_string}: Output AnnData file">
+            <filter>write_anndata</filter>
+        </data>
+        <data name="output_table" format="tabular" from_work_dir="anndata_aucell_per_cell.tsv" label="${tool.name} on ${on_string}: Output AUCell table">
+            <filter>not write_anndata</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="input_file" value="mito_counted_anndata.h5ad"/>
+            <param name="gene_sets_to_score" value="HALLMARK_NOTCH_SIGNALING,HALLMARK_APICAL_SURFACE"/>
+            <param name="gmt_file" value="mouse_hallmark_ss.gmt"/>
+            <param name="gene_symbols_field" value="Symbol"/>
+            <param name="write_anndata" value="true"/>
+            <conditional name="gene_lists_source">
+                <param name="source" value="gmt"/>
+            </conditional>
+            <output name="output_ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/AUCell_HALLMARK_NOTCH_SIGNALING"/>
+                    <has_h5_keys keys="obs/AUCell_HALLMARK_APICAL_SURFACE"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_file" value="mito_counted_anndata.h5ad"/>
+            <param name="gene_lists_to_score" value="Cd8b1,Cd8b2,Cd8a,Cd4,Nrp1,Cd80:Il1a,Il1b,Il6,Nos2,Tlr2,Tlr4,Cd80"/>
+            <param name="score_names" value="TCell,Macro"/>
+            <param name="gene_symbols_field" value="Symbol"/>
+            <param name="write_anndata" value="true"/>
+            <conditional name="gene_lists_source">
+                <param name="source" value="enumerated"/>
+            </conditional>
+            <output name="output_ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/AUCell_TCell"/>
+                    <has_h5_keys keys="obs/AUCell_Macro"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_file" value="mito_counted_anndata.h5ad"/>
+            <param name="gene_sets_to_score" value="HALLMARK_NOTCH_SIGNALING,HALLMARK_APICAL_SURFACE"/>
+            <param name="gmt_file" value="mouse_hallmark_ss.gmt"/>
+            <param name="gene_symbols_field" value="Symbol"/>
+            <param name="write_anndata" value="False"/>
+            <conditional name="gene_lists_source">
+                <param name="source" value="gmt"/>
+            </conditional>
+            <output name="output_table">
+                <assert_contents>
+                    <has_n_columns n="3"/>
+                    <has_text_matching expression="AUCell_HALLMARK_NOTCH_SIGNALING"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+**Description**
+
+This tool scores cells using the AUCell method for gene sets.
+
+**Input**
+
+The input file should be an AnnData object in H5AD format. The tool accepts an H5AD file containing raw or normalized data.
+
+The tool takes one of two input formats for specifying the gene sets to score:
+
+1. A GMT file containing gene sets. You can specify which gene sets to use within the GMT file using the "gene_sets_to_score" parameter.
+2. A list of genes and their associated score names. You can specify the genes to score using the "gene_lists_to_score" parameter, and their associated score names using the "score_names" parameter.
+
+In both cases, you must specify the name of the field in the AnnData object that contains the gene symbols using the "gene_symbols_field" parameter.
+
+You can also specify whether to use the raw data in the AnnData object instead of the X matrix using the "use_raw" parameter.
+
+**Output**
+
+The tool outputs an AnnData object containing the scores in the "obs" field, or a tab-separated text file containing the scores for each cell.
+
+If the "write_anndata" parameter is set to "true", the tool will write the modified AnnData object to an H5AD file. Otherwise, it will output a tab-separated text file containing the scores for each cell.
+
+**Example**
+
+
+    </help>
+    <citations>
+        <citation type="doi">10.1093/bioadv/vbac016</citation>
+    </citations>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/decoupler_pseudobulk.py	Thu Nov 09 11:36:08 2023 +0000
@@ -0,0 +1,367 @@
+import argparse
+
+import anndata
+import decoupler
+import pandas as pd
+
+
+def get_pseudobulk(
+    adata,
+    sample_col,
+    groups_col,
+    layer=None,
+    mode="sum",
+    min_cells=10,
+    min_counts=1000,
+    use_raw=False,
+):
+    """
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> adata.X = abs(adata.X).astype(int)
+    >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain")
+    """
+
+    return decoupler.get_pseudobulk(
+        adata,
+        sample_col=sample_col,
+        groups_col=groups_col,
+        layer=layer,
+        mode=mode,
+        use_raw=use_raw,
+        min_cells=min_cells,
+        min_counts=min_counts,
+    )
+
+
+def prepend_c_to_index(index_value):
+    if index_value and index_value[0].isdigit():
+        return "C" + index_value
+    return index_value
+
+
+# write results for loading into DESeq2
+def write_DESeq2_inputs(pdata, layer=None, output_dir="", factor_fields=None):
+    """
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> adata.X = abs(adata.X).astype(int)
+    >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain")
+    >>> write_DESeq2_inputs(pseudobulk)
+    """
+    # add / to output_dir if is not empty or if it doesn't end with /
+    if output_dir != "" and not output_dir.endswith("/"):
+        output_dir = output_dir + "/"
+    obs_for_deseq = pdata.obs.copy()
+    # replace any index starting with digits to start with C instead.
+    obs_for_deseq.rename(index=prepend_c_to_index, inplace=True)
+    # avoid dash that is read as point on R colnames.
+    obs_for_deseq.index = obs_for_deseq.index.str.replace("-", "_")
+    obs_for_deseq.index = obs_for_deseq.index.str.replace(" ", "_")
+    col_metadata_file = f"{output_dir}col_metadata.tsv"
+    # write obs to a col_metadata file
+    if factor_fields:
+        # only output the index plus the columns in factor_fields in that order
+        obs_for_deseq[factor_fields].to_csv(col_metadata_file, sep="\t", index=True)
+    else:
+        obs_for_deseq.to_csv(col_metadata_file, sep="\t", index=True)
+    # write var to a gene_metadata file
+    pdata.var.to_csv(f"{output_dir}gene_metadata.tsv", sep="\t", index=True)
+    # write the counts matrix of a specified layer to file
+    if layer is None:
+        # write the X numpy matrix transposed to file
+        df = pd.DataFrame(pdata.X.T, index=pdata.var.index, columns=obs_for_deseq.index)
+    else:
+        df = pd.DataFrame(
+            pdata.layers[layer].T, index=pdata.var.index, columns=obs_for_deseq.index
+        )
+    df.to_csv(f"{output_dir}counts_matrix.tsv", sep="\t", index_label="")
+
+
+def plot_pseudobulk_samples(
+    pseudobulk_data,
+    groupby,
+    figsize=(10, 10),
+    save_path=None,
+):
+    """
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> adata.X = abs(adata.X).astype(int)
+    >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain")
+    >>> plot_pseudobulk_samples(pseudobulk, groupby=["bulk_labels", "louvain"], figsize=(10, 10))
+    """
+    fig = decoupler.plot_psbulk_samples(
+        pseudobulk_data, groupby=groupby, figsize=figsize, return_fig=True
+    )
+    if save_path:
+        fig.savefig(f"{save_path}/pseudobulk_samples.png")
+    else:
+        fig.show()
+
+
+def plot_filter_by_expr(
+    pseudobulk_data, group, min_count=None, min_total_count=None, save_path=None
+):
+    """
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> adata.X = abs(adata.X).astype(int)
+    >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain")
+    >>> plot_filter_by_expr(pseudobulk, group="bulk_labels", min_count=10, min_total_count=200)
+    """
+    fig = decoupler.plot_filter_by_expr(
+        pseudobulk_data,
+        group=group,
+        min_count=min_count,
+        min_total_count=min_total_count,
+        return_fig=True,
+    )
+    if save_path:
+        fig.savefig(f"{save_path}/filter_by_expr.png")
+    else:
+        fig.show()
+
+
+def filter_by_expr(pdata, min_count=None, min_total_count=None):
+    """
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> adata.X = abs(adata.X).astype(int)
+    >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain")
+    >>> pdata_filt = filter_by_expr(pseudobulk, min_count=10, min_total_count=200)
+    """
+    genes = decoupler.filter_by_expr(
+        pdata, min_count=min_count, min_total_count=min_total_count
+    )
+    return pdata[:, genes].copy()
+
+
+def check_fields(fields, adata, obs=True, context=None):
+    """
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> check_fields(["bulk_labels", "louvain"], adata, obs=True)
+    """
+
+    legend = ""
+    if context:
+        legend = f", passed in {context},"
+    if obs:
+        if not set(fields).issubset(set(adata.obs.columns)):
+            raise ValueError(
+                f"Some of the following fields {legend} are not present in adata.obs: {fields}. Possible fields are: {list(set(adata.obs.columns))}"
+            )
+    else:
+        if not set(fields).issubset(set(adata.var.columns)):
+            raise ValueError(
+                f"Some of the following fields {legend} are not present in adata.var: {fields}. Possible fields are: {list(set(adata.var.columns))}"
+            )
+
+
+def main(args):
+    # Load AnnData object from file
+    adata = anndata.read_h5ad(args.adata_file)
+
+    # Merge adata.obs fields specified in args.adata_obs_fields_to_merge
+    if args.adata_obs_fields_to_merge:
+        # first split potential groups by ":" and iterate over them
+        for group in args.adata_obs_fields_to_merge.split(":"):
+            fields = group.split(",")
+            check_fields(fields, adata)
+            adata = merge_adata_obs_fields(fields, adata)
+
+    check_fields([args.groupby, args.sample_key], adata)
+
+    factor_fields = None
+    if args.factor_fields:
+        factor_fields = args.factor_fields.split(",")
+        check_fields(factor_fields, adata)
+
+    print(f"Using mode: {args.mode}")
+    # Perform pseudobulk analysis
+    pseudobulk_data = get_pseudobulk(
+        adata,
+        sample_col=args.sample_key,
+        groups_col=args.groupby,
+        layer=args.layer,
+        mode=args.mode,
+        use_raw=args.use_raw,
+        min_cells=args.min_cells,
+        min_counts=args.min_counts,
+    )
+
+    # Plot pseudobulk samples
+    plot_pseudobulk_samples(
+        pseudobulk_data,
+        args.groupby,
+        save_path=args.save_path,
+        figsize=args.plot_samples_figsize,
+    )
+
+    plot_filter_by_expr(
+        pseudobulk_data,
+        group=args.groupby,
+        min_count=args.min_counts,
+        min_total_count=args.min_total_counts,
+        save_path=args.save_path,
+    )
+
+    # Filter by expression if enabled
+    if args.filter_expr:
+        filtered_adata = filter_by_expr(
+            pseudobulk_data,
+            min_count=args.min_counts,
+            min_total_count=args.min_total_counts,
+        )
+
+        pseudobulk_data = filtered_adata
+
+    # Save the pseudobulk data
+    if args.anndata_output_path:
+        pseudobulk_data.write_h5ad(args.anndata_output_path, compression="gzip")
+
+    write_DESeq2_inputs(
+        pseudobulk_data, output_dir=args.deseq2_output_path, factor_fields=factor_fields
+    )
+
+
+def merge_adata_obs_fields(obs_fields_to_merge, adata):
+    """
+    Merge adata.obs fields specified in args.adata_obs_fields_to_merge
+
+    Parameters
+    ----------
+    obs_fields_to_merge : str
+        Fields in adata.obs to merge, comma separated
+    adata : anndata.AnnData
+        The AnnData object
+
+    Returns
+    -------
+    anndata.AnnData
+        The merged AnnData object
+
+    docstring tests:
+    >>> import scanpy as sc
+    >>> ad = sc.datasets.pbmc68k_reduced()
+    >>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad)
+    >>> ad.obs.columns
+    Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score',
+           'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'],
+          dtype='object')
+    """
+    field_name = "_".join(obs_fields_to_merge)
+    for field in obs_fields_to_merge:
+        if field not in adata.obs.columns:
+            raise ValueError(f"The '{field}' column is not present in adata.obs.")
+        if field_name not in adata.obs.columns:
+            adata.obs[field_name] = adata.obs[field].astype(str)
+        else:
+            adata.obs[field_name] = (
+                adata.obs[field_name] + "_" + adata.obs[field].astype(str)
+            )
+    return adata
+
+
+if __name__ == "__main__":
+    # Create argument parser
+    parser = argparse.ArgumentParser(
+        description="Perform pseudobulk analysis on an AnnData object"
+    )
+
+    # Add arguments
+    parser.add_argument("adata_file", type=str, help="Path to the AnnData file")
+    parser.add_argument(
+        "-m",
+        "--adata_obs_fields_to_merge",
+        type=str,
+        help="Fields in adata.obs to merge, comma separated. You can have more than one set of fields, separated by semi-colon ;",
+    )
+    parser.add_argument(
+        "--groupby",
+        type=str,
+        required=True,
+        help="The column in adata.obs that defines the groups",
+    )
+    parser.add_argument(
+        "--sample_key",
+        required=True,
+        type=str,
+        help="The column in adata.obs that defines the samples",
+    )
+    # add argument for layer
+    parser.add_argument(
+        "--layer",
+        type=str,
+        default=None,
+        help="The name of the layer of the AnnData object to use",
+    )
+    # add argument for mode
+    parser.add_argument(
+        "--mode",
+        type=str,
+        default="sum",
+        help="The mode for Decoupler pseudobulk analysis",
+        choices=["sum", "mean", "median"],
+    )
+    # add boolean argument for use_raw
+    parser.add_argument(
+        "--use_raw",
+        action="store_true",
+        default=False,
+        help="Whether to use the raw part of the AnnData object",
+    )
+    # add argument for min_cells
+    parser.add_argument(
+        "--min_cells",
+        type=int,
+        default=10,
+        help="Minimum number of cells for pseudobulk analysis",
+    )
+    parser.add_argument(
+        "--save_path", type=str, help="Path to save the plot (optional)"
+    )
+    parser.add_argument(
+        "--min_counts",
+        type=int,
+        help="Minimum count threshold for filtering by expression",
+    )
+    parser.add_argument(
+        "--min_total_counts",
+        type=int,
+        help="Minimum total count threshold for filtering by expression",
+    )
+    parser.add_argument(
+        "--anndata_output_path",
+        type=str,
+        help="Path to save the filtered AnnData object or pseudobulk data",
+    )
+    parser.add_argument(
+        "--filter_expr", action="store_true", help="Enable filtering by expression"
+    )
+    parser.add_argument(
+        "--factor_fields",
+        type=str,
+        help="Comma separated list of fields for the factors",
+    )
+    parser.add_argument(
+        "--deseq2_output_path",
+        type=str,
+        help="Path to save the DESeq2 inputs",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot_samples_figsize",
+        type=int,
+        default=[10, 10],
+        nargs=2,
+        help="Size of the samples plot as a tuple (two arguments)",
+    )
+    parser.add_argument("--plot_filtering_figsize", type=int, default=[10, 10], nargs=2)
+
+    # Parse the command line arguments
+    args = parser.parse_args()
+
+    # Call the main function
+    main(args)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_test_data.sh	Thu Nov 09 11:36:08 2023 +0000
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+BASENAME_FILE='mito_counted_anndata.h5ad'
+
+MTX_LINK='https://zenodo.org/record/7053673/files/Mito-counted_AnnData'
+
+# convenience for getting data
+function get_data {
+  local link=$1
+  local fname=$2
+
+  if [ ! -f $fname ]; then
+    echo "$fname not available locally, downloading.."
+    wget -O $fname --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 $link
+  fi
+}
+
+# get matrix data
+mkdir -p test-data
+pushd test-data
+get_data $MTX_LINK $BASENAME_FILE
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mouse_hallmark_ss.gmt	Thu Nov 09 11:36:08 2023 +0000
@@ -0,0 +1,2 @@
+HALLMARK_NOTCH_SIGNALING	http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/HALLMARK_NOTCH_SIGNALING	Jag1	Notch1	Notch2	Notch3	Ccnd1	Tcf7l2	Wnt5a	Lfng	Psenen	Psen2	Heyl	Fzd1	Rbx1	Hes1	Arrb1	Ppard	Prkca	Wnt2	Fzd5	Dtx1	Sap30	Dtx2	Kat2a	Dll1	Fzd7	St3gal6	Fbxw11	Cul1	Aph1a	Dtx4	Skp1	Maml2
+HALLMARK_APICAL_SURFACE	http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/HALLMARK_APICAL_SURFACE	Adam10	Gata3	B4galt1	Hspb1	App	Il2rg	Atp8b1	Brca1	Slc34a3	Atp6v0a4	Ghrl	Ncoa6	Rhcg	Scube1	Efna5	Crybg1	Ephb4	Flot2	Gas1	Gstm5	Il2rb	Shroom2	Lyn	Mal	Plaur	Slc2a4	Thy1	Akap7	Srpx	Cx3cl1	Dcbld2	Lypd3	Pkhd1	Sulf2	Pcsk9	Tmem8b	Rtn4rl1	Ntng1	Slc22a12	Adipor2	Afap1l2	Mdga1	Cd160	Crocc