Mercurial > repos > ebi-gxa > decoupler_pathway_inference
comparison decoupler_pseudobulk.py @ 11:db14ac3f6b43 draft default tip
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 487508282bda9dbb68138d5c7091f46ef54fe52a
author | ebi-gxa |
---|---|
date | Wed, 19 Feb 2025 16:55:51 +0000 |
parents | 2c5686d627c0 |
children |
comparison
equal
deleted
inserted
replaced
10:97c2c52a7ab4 | 11:db14ac3f6b43 |
---|---|
1 import argparse | 1 import argparse |
2 | 2 |
3 import anndata | 3 import anndata |
4 import decoupler | 4 import decoupler |
5 import numpy as np | |
5 import pandas as pd | 6 import pandas as pd |
6 | 7 |
7 | 8 |
8 def get_pseudobulk( | 9 def get_pseudobulk( |
9 adata, | 10 adata, |
30 mode=mode, | 31 mode=mode, |
31 use_raw=use_raw, | 32 use_raw=use_raw, |
32 min_cells=min_cells, | 33 min_cells=min_cells, |
33 min_counts=min_counts, | 34 min_counts=min_counts, |
34 ) | 35 ) |
36 | |
37 | |
def create_pseudo_replicates(adata, sample_key, num_replicates, seed=None):
    """
    Create pseudo replicates for each sample in the sample_key groups.

    The cells of every sample are shuffled and split into
    ``num_replicates`` groups whose sizes differ by at most one, so every
    cell ends up in exactly one pseudo replicate. The assignment is written
    to a new ``adata.obs`` column named ``f"{sample_key}_pseudo"`` with
    values of the form ``"<sample>_rep<i>"``.

    Parameters
    ----------
    adata : anndata.AnnData
        The AnnData object. Its ``obs`` table is modified in place.
    sample_key : str
        The column in adata.obs that defines the samples.
    num_replicates : int
        Number of pseudo replicates to create per sample. A sample with
        fewer cells than ``num_replicates`` yields one replicate per cell.
    seed : int, optional
        Seed for the shuffle. When given, a private random generator is
        used so the global NumPy random state is left untouched; when
        ``None``, the global ``np.random`` state is used.

    Returns
    -------
    anndata.AnnData
        The same AnnData object, with the pseudo replicate column added.

    Raises
    ------
    ValueError
        If ``num_replicates`` is smaller than 1.

    Examples
    --------
    >>> import anndata
    >>> import pandas as pd
    >>> import numpy as np
    >>> data = {
    ...     'obs': pd.DataFrame({'sample': ['A', 'A', 'B', 'B']}),
    ...     'X': np.array([[1, 0], [0, 1], [1, 1], [0, 0]])
    ... }
    >>> adata = anndata.AnnData(X=data['X'], obs=data['obs'])
    >>> adata = create_pseudo_replicates(adata, 'sample', 2, seed=0)
    >>> sorted(adata.obs['sample_pseudo'])
    ['A_rep1', 'A_rep2', 'B_rep1', 'B_rep2']
    """
    if num_replicates < 1:
        raise ValueError("num_replicates must be a positive integer")

    # A private RandomState seeded with `seed` draws the same legacy stream
    # as np.random.seed(seed) would, without clobbering global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    new_sample_key = f"{sample_key}_pseudo"
    adata.obs[new_sample_key] = adata.obs[sample_key].astype(str)

    for sample in adata.obs[sample_key].unique():
        in_sample = (adata.obs[sample_key] == sample).to_numpy()
        # Copy before shuffling so the in-place shuffle can never touch
        # memory that backs the obs index.
        sample_indices = adata.obs.index[in_sample].to_numpy(copy=True)
        rng.shuffle(sample_indices)

        # array_split spreads the remainder over the first chunks instead of
        # dropping it, so no cell keeps the bare (un-suffixed) sample name.
        chunks = np.array_split(sample_indices, num_replicates)
        for rep_number, replicate_indices in enumerate(chunks, start=1):
            if replicate_indices.size == 0:
                continue
            adata.obs.loc[replicate_indices, new_sample_key] = (
                adata.obs.loc[replicate_indices, new_sample_key]
                + f"_rep{rep_number}"
            )

    return adata
35 | 90 |
36 | 91 |
37 def prepend_c_to_index(index_value): | 92 def prepend_c_to_index(index_value): |
38 if index_value and index_value[0].isdigit(): | 93 if index_value and index_value[0].isdigit(): |
39 return "C" + index_value | 94 return "C" + index_value |
304 | 359 |
305 factor_fields = None | 360 factor_fields = None |
306 if args.factor_fields: | 361 if args.factor_fields: |
307 factor_fields = args.factor_fields.split(",") | 362 factor_fields = args.factor_fields.split(",") |
308 check_fields(factor_fields, adata) | 363 check_fields(factor_fields, adata) |
364 | |
365 # Create pseudo replicates if specified | |
366 if args.num_pseudo_replicates: | |
367 adata = create_pseudo_replicates( | |
368 adata, args.sample_key, args.num_pseudo_replicates, seed=args.seed | |
369 ) | |
370 args.sample_key = f"{args.sample_key}_pseudo" | |
309 | 371 |
310 print(f"Using mode: {args.mode}") | 372 print(f"Using mode: {args.mode}") |
311 # Perform pseudobulk analysis | 373 # Perform pseudobulk analysis |
312 pseudobulk_data = get_pseudobulk( | 374 pseudobulk_data = get_pseudobulk( |
313 adata, | 375 adata, |
662 "--min_total_counts", | 724 "--min_total_counts", |
663 type=int, | 725 type=int, |
664 help="Minimum total count threshold for filtering by expression", | 726 help="Minimum total count threshold for filtering by expression", |
665 ) | 727 ) |
666 parser.add_argument( | 728 parser.add_argument( |
729 "--num_pseudo_replicates", | |
730 type=int, | |
731 choices=range(3, 1000), | |
732 help="Number of pseudo replicates to create per sample (at least 3)", | |
733 required=False | |
734 ) | |
735 parser.add_argument( | |
736 "--seed", | |
737 type=int, | |
738 default=None, | |
739 help="Random seed for pseudo replicate sampling", | |
740 ) | |
741 parser.add_argument( | |
667 "--anndata_output_path", | 742 "--anndata_output_path", |
668 type=str, | 743 type=str, |
669 help="Path to save the filtered AnnData object or pseudobulk data", | 744 help="Path to save the filtered AnnData object or pseudobulk data", |
670 ) | 745 ) |
671 parser.add_argument( | 746 parser.add_argument( |