Mercurial > repos > ebi-gxa > decoupler_pseudobulk
comparison decoupler_pseudobulk.py @ 16:508a93e34599 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 487508282bda9dbb68138d5c7091f46ef54fe52a
| author | ebi-gxa |
|---|---|
| date | Wed, 19 Feb 2025 16:55:58 +0000 |
| parents | a559be56720c |
| children | 2557d7869e78 |
comparison
equal
deleted
inserted
replaced
| 15:09c833d9b03b | 16:508a93e34599 |
|---|---|
| 1 import argparse | 1 import argparse |
| 2 | 2 |
| 3 import anndata | 3 import anndata |
| 4 import decoupler | 4 import decoupler |
| 5 import numpy as np | |
| 5 import pandas as pd | 6 import pandas as pd |
| 6 | 7 |
| 7 | 8 |
| 8 def get_pseudobulk( | 9 def get_pseudobulk( |
| 9 adata, | 10 adata, |
| 30 mode=mode, | 31 mode=mode, |
| 31 use_raw=use_raw, | 32 use_raw=use_raw, |
| 32 min_cells=min_cells, | 33 min_cells=min_cells, |
| 33 min_counts=min_counts, | 34 min_counts=min_counts, |
| 34 ) | 35 ) |
| 36 | |
| 37 | |
def create_pseudo_replicates(adata, sample_key, num_replicates, seed=None):
    """
    Randomly split the cells of each sample into pseudo replicates.

    A new column named ``f"{sample_key}_pseudo"`` is written to
    ``adata.obs``. Every cell receives the string form of its sample label
    followed by ``_rep<k>``, with ``k`` running from 1 to
    ``num_replicates``. Cells are shuffled within their sample and then
    divided as evenly as possible, so replicate sizes differ by at most
    one cell and no cell is left without a replicate suffix. A sample
    with fewer cells than ``num_replicates`` yields only as many
    replicates as it has cells.

    Parameters
    ----------
    adata : anndata.AnnData
        The AnnData object. Its ``obs`` table is modified in place.
    sample_key : str
        The column in adata.obs that defines the samples.
    num_replicates : int
        Number of pseudo replicates to create per sample.
    seed : int, optional
        Seed for the shuffle. When given, a private random state is used
        so NumPy's global generator is left untouched. When ``None``, the
        global generator is used.

    Returns
    -------
    anndata.AnnData
        The same AnnData object, with the pseudo replicate column added.

    Raises
    ------
    ValueError
        If ``num_replicates`` is smaller than 1.

    Examples
    --------
    >>> import anndata
    >>> import pandas as pd
    >>> import numpy as np
    >>> data = {
    ...     'obs': pd.DataFrame({'sample': ['A', 'A', 'B', 'B']}),
    ...     'X': np.array([[1, 0], [0, 1], [1, 1], [0, 0]])
    ... }
    >>> adata = anndata.AnnData(X=data['X'], obs=data['obs'])
    >>> adata = create_pseudo_replicates(adata, 'sample', 2, seed=0)
    >>> sorted(adata.obs['sample_pseudo'].tolist())
    ['A_rep1', 'A_rep2', 'B_rep1', 'B_rep2']
    """
    if num_replicates < 1:
        raise ValueError("num_replicates must be a positive integer")

    # A private RandomState keeps seeded runs reproducible without
    # reseeding NumPy's global generator for the rest of the program.
    if seed is None:
        shuffle = np.random.shuffle
    else:
        shuffle = np.random.RandomState(seed).shuffle

    new_sample_key = f"{sample_key}_pseudo"
    samples = adata.obs[sample_key]
    pseudo_labels = samples.astype(str).to_numpy(dtype=object).copy()

    for sample in samples.unique():
        # Work with row positions rather than index labels so that
        # duplicated obs names cannot cause cells to be labelled twice.
        positions = np.flatnonzero((samples == sample).to_numpy())
        shuffle(positions)  # Randomize which cells land in which replicate
        # array_split spreads any remainder over the first chunks instead
        # of dropping it, so every cell ends up in exactly one replicate.
        chunks = np.array_split(positions, num_replicates)
        for rep_number, chunk in enumerate(chunks, start=1):
            if chunk.size == 0:
                continue
            suffix = f"_rep{rep_number}"
            pseudo_labels[chunk] = [
                f"{label}{suffix}" for label in pseudo_labels[chunk]
            ]

    adata.obs[new_sample_key] = pseudo_labels
    return adata
| 35 | 90 |
| 36 | 91 |
| 37 def prepend_c_to_index(index_value): | 92 def prepend_c_to_index(index_value): |
| 38 if index_value and index_value[0].isdigit(): | 93 if index_value and index_value[0].isdigit(): |
| 39 return "C" + index_value | 94 return "C" + index_value |
| 304 | 359 |
| 305 factor_fields = None | 360 factor_fields = None |
| 306 if args.factor_fields: | 361 if args.factor_fields: |
| 307 factor_fields = args.factor_fields.split(",") | 362 factor_fields = args.factor_fields.split(",") |
| 308 check_fields(factor_fields, adata) | 363 check_fields(factor_fields, adata) |
| 364 | |
| 365 # Create pseudo replicates if specified | |
| 366 if args.num_pseudo_replicates: | |
| 367 adata = create_pseudo_replicates( | |
| 368 adata, args.sample_key, args.num_pseudo_replicates, seed=args.seed | |
| 369 ) | |
| 370 args.sample_key = f"{args.sample_key}_pseudo" | |
| 309 | 371 |
| 310 print(f"Using mode: {args.mode}") | 372 print(f"Using mode: {args.mode}") |
| 311 # Perform pseudobulk analysis | 373 # Perform pseudobulk analysis |
| 312 pseudobulk_data = get_pseudobulk( | 374 pseudobulk_data = get_pseudobulk( |
| 313 adata, | 375 adata, |
| 662 "--min_total_counts", | 724 "--min_total_counts", |
| 663 type=int, | 725 type=int, |
| 664 help="Minimum total count threshold for filtering by expression", | 726 help="Minimum total count threshold for filtering by expression", |
| 665 ) | 727 ) |
| 666 parser.add_argument( | 728 parser.add_argument( |
| 729 "--num_pseudo_replicates", | |
| 730 type=int, | |
| 731 choices=range(3, 1000), | |
| 732 help="Number of pseudo replicates to create per sample (at least 3)", | |
| 733 required=False | |
| 734 ) | |
| 735 parser.add_argument( | |
| 736 "--seed", | |
| 737 type=int, | |
| 738 default=None, | |
| 739 help="Random seed for pseudo replicate sampling", | |
| 740 ) | |
| 741 parser.add_argument( | |
| 667 "--anndata_output_path", | 742 "--anndata_output_path", |
| 668 type=str, | 743 type=str, |
| 669 help="Path to save the filtered AnnData object or pseudobulk data", | 744 help="Path to save the filtered AnnData object or pseudobulk data", |
| 670 ) | 745 ) |
| 671 parser.add_argument( | 746 parser.add_argument( |
