Mercurial > repos > ebi-gxa > decoupler_pseudobulk
comparison decoupler_pseudobulk.py @ 10:f6040492b499 draft default tip
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit dea8a066ccf04e241457719bf5162f9d39fe6c48
author | ebi-gxa |
---|---|
date | Wed, 02 Oct 2024 08:26:56 +0000 |
parents | bd4b54b75888 |
children |
comparison
equal
deleted
inserted
replaced
9:bd4b54b75888 | 10:f6040492b499 |
---|---|
296 if args.adata_obs_fields_to_merge: | 296 if args.adata_obs_fields_to_merge: |
297 # first split potential groups by ":" and iterate over them | 297 # first split potential groups by ":" and iterate over them |
298 for group in args.adata_obs_fields_to_merge.split(":"): | 298 for group in args.adata_obs_fields_to_merge.split(":"): |
299 fields = group.split(",") | 299 fields = group.split(",") |
300 check_fields(fields, adata) | 300 check_fields(fields, adata) |
301 adata = merge_adata_obs_fields(fields, adata) | 301 merge_adata_obs_fields(fields, adata) |
302 | 302 |
303 check_fields([args.groupby, args.sample_key], adata) | 303 check_fields([args.groupby, args.sample_key], adata) |
304 | 304 |
305 factor_fields = None | 305 factor_fields = None |
306 if args.factor_fields: | 306 if args.factor_fields: |
320 min_counts=args.min_counts, | 320 min_counts=args.min_counts, |
321 ) | 321 ) |
322 | 322 |
323 print("Created pseudo-bulk AnnData, checking if fields still make sense.") | 323 print("Created pseudo-bulk AnnData, checking if fields still make sense.") |
324 print( | 324 print( |
325 "If this fails this check, it might mean that you asked for factors " | 325 "If this fails this check, it might mean that you asked for factors \ |
326 + "that are not compatible with you sample identifiers (ie. asked for " | 326 that are not compatible with you sample identifiers (ie. asked for \ |
327 + "phase in the factors, but each sample contains more than one phase, " | 327 phase in the factors, but each sample contains more than one phase,\ |
328 + "try joining fields)" | 328 try joining fields)." |
329 ) | 329 ) |
330 if factor_fields: | 330 if factor_fields: |
331 check_fields( | 331 check_fields( |
332 factor_fields, | 332 factor_fields, |
333 pseudobulk_data, | 333 pseudobulk_data, |
372 output_dir=args.deseq2_output_path, | 372 output_dir=args.deseq2_output_path, |
373 factor_fields=factor_fields, | 373 factor_fields=factor_fields, |
374 min_counts_per_sample_marking=args.min_counts_per_sample_marking, | 374 min_counts_per_sample_marking=args.min_counts_per_sample_marking, |
375 ) | 375 ) |
376 | 376 |
377 # if contrasts file is provided, produce a file with genes that should be | |
378 # filtered for each contrasts | |
379 if args.contrasts_file: | |
380 contrast_genes_df = identify_genes_to_filter_per_contrast( | |
381 contrast_file=args.contrasts_file, | |
382 min_perc_cells_expression=args.min_gene_exp_perc_per_cell, | |
383 adata=adata, | |
384 obs_field=args.groupby | |
385 ) | |
386 contrast_genes_df.to_csv( | |
387 f"{args.save_path}/genes_to_filter_by_contrast.tsv", | |
388 sep="\t", | |
389 index=False, | |
390 ) | |
391 | |
377 | 392 |
378 def merge_adata_obs_fields(obs_fields_to_merge, adata): | 393 def merge_adata_obs_fields(obs_fields_to_merge, adata): |
379 """ | 394 """ |
380 Merge adata.obs fields specified in args.adata_obs_fields_to_merge | 395 Merge adata.obs fields specified in args.adata_obs_fields_to_merge |
381 | 396 |
392 The merged AnnData object | 407 The merged AnnData object |
393 | 408 |
394 docstring tests: | 409 docstring tests: |
395 >>> import scanpy as sc | 410 >>> import scanpy as sc |
396 >>> ad = sc.datasets.pbmc68k_reduced() | 411 >>> ad = sc.datasets.pbmc68k_reduced() |
397 >>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad) | 412 >>> merge_adata_obs_fields(["bulk_labels","louvain"], ad) |
398 >>> ad.obs.columns | 413 >>> ad.obs.columns |
399 Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', | 414 Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', |
400 'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'], | 415 'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'], |
401 dtype='object') | 416 dtype='object') |
402 """ | 417 """ |
410 adata.obs[field_name] = adata.obs[field].astype(str) | 425 adata.obs[field_name] = adata.obs[field].astype(str) |
411 else: | 426 else: |
412 adata.obs[field_name] = ( | 427 adata.obs[field_name] = ( |
413 adata.obs[field_name] + "_" + adata.obs[field].astype(str) | 428 adata.obs[field_name] + "_" + adata.obs[field].astype(str) |
414 ) | 429 ) |
415 return adata | 430 |
431 | |
432 def identify_genes_to_filter_per_contrast( | |
433 contrast_file, min_perc_cells_expression, adata, obs_field | |
434 ): | |
435 """ | |
436 Identify genes to filter per contrast based on expression percentage. | |
437 We need those genes to be under the threshold for all conditions | |
438 in a contrast to be identified for further filtering. If | |
439 one condition has the gene expressed above the threshold, the gene | |
440 becomes of interest (it can be highly up or down regulated). | |
441 | |
442 Parameters | |
443 ---------- | |
444 contrast_file : str | |
445 Path to the contrasts file. | |
446 min_perc_cells_expression : float | |
447 Minimum percentage of cells that should express a gene. | |
448 adata: adata | |
449 Original AnnData file | |
450 obs_field: str | |
451 Field in the AnnData observations where the contrasts are defined. | |
452 | |
453 Returns | |
454 ------- | |
455 None | |
456 | |
457 Examples | |
458 -------- | |
459 >>> import anndata | |
460 >>> import pandas as pd | |
461 >>> import numpy as np | |
462 >>> import os | |
463 >>> from io import StringIO | |
464 >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\ | |
465 condition2{os.linesep}") | |
466 >>> min_perc_cells_expression = 30.0 | |
467 >>> data = { | |
468 ... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1', | |
469 ... 'condition2', 'condition2']}), | |
470 ... 'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0], | |
471 ... [0, 0, 1, 1, 0], [0, 0, 0, 2, 0]]), | |
472 ... } | |
473 >>> adata = anndata.AnnData(X=data['X'], obs=data['obs']) | |
474 >>> df = identify_genes_to_filter_per_contrast( | |
475 ... contrast_file, min_perc_cells_expression, adata, 'condition' | |
476 ... ) # doctest:+ELLIPSIS | |
477 Identifying genes to filter using ... | |
478 >>> df.head() # doctest:+ELLIPSIS | |
479 contrast gene | |
480 0 condition1-condition2 ... | |
481 1 condition1-condition2 ... | |
482 """ | |
483 import re | |
484 | |
485 # Implement the logic to identify genes to filter per contrast | |
486 # This is a placeholder implementation | |
487 print( | |
488 f"Identifying genes to filter using {contrast_file} " | |
489 f"with min expression {min_perc_cells_expression}%" | |
490 ) | |
491 sides_regex = re.compile(r"[\+\-\*\/\(\)\^]+") | |
492 | |
493 contrasts = pd.read_csv(contrast_file, sep="\t") | |
494 # Iterate over each line in the contrast file | |
495 genes_filter_for_contrast = dict() | |
496 for contrast in contrasts.iloc[:, 0]: | |
497 conditions = set(sides_regex.split(contrast)) | |
498 # we want to find the genes that are below the threshold | |
499 # of % of cells expressed for ALL the conditions in the | |
500 # contrast. It is enough for one of the conditions | |
501 # of the contrast to have the genes expressed above | |
502 # the threshold of % of cells to be of interest. | |
503 for condition in conditions: | |
504 # remove any starting or trailing whitespaces from condition | |
505 condition = condition.strip() | |
506 # check the percentage of cells that express each gene | |
507 # Filter the AnnData object based on the obs_field value | |
508 adata_filtered = adata[adata.obs[obs_field] == condition] | |
509 # Calculate the percentage of cells expressing each gene | |
510 gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100 | |
511 genes_to_filter = set(adata_filtered.var[ | |
512 gene_expression.transpose() < min_perc_cells_expression | |
513 ].index.tolist()) | |
514 # Update the genes_filter_for_contrast dictionary | |
515 if contrast in genes_filter_for_contrast.keys(): | |
516 genes_filter_for_contrast[contrast].intersection_update( | |
517 genes_to_filter | |
518 ) | |
519 else: | |
520 genes_filter_for_contrast[contrast] = genes_to_filter | |
521 | |
522 # write the genes_filter_for_contrast to pandas dataframe of two columns: | |
523 # contrast and gene | |
524 | |
525 # Initialize an empty list to store the expanded pairs | |
526 expanded_pairs = [] | |
527 | |
528 # Iterate over the dictionary | |
529 for contrast, genes in genes_filter_for_contrast.items(): | |
530 for gene in genes: | |
531 expanded_pairs.append((contrast, gene)) | |
532 | |
533 # Create the DataFrame | |
534 contrast_genes_df = pd.DataFrame( | |
535 expanded_pairs, columns=["contrast", "gene"] | |
536 ) | |
537 | |
538 return contrast_genes_df | |
416 | 539 |
417 | 540 |
418 if __name__ == "__main__": | 541 if __name__ == "__main__": |
419 # Create argument parser | 542 # Create argument parser |
420 parser = argparse.ArgumentParser( | 543 parser = argparse.ArgumentParser( |
472 "--min_cells", | 595 "--min_cells", |
473 type=int, | 596 type=int, |
474 default=10, | 597 default=10, |
475 help="Minimum number of cells for pseudobulk analysis", | 598 help="Minimum number of cells for pseudobulk analysis", |
476 ) | 599 ) |
600 # add argument for min percentage of cells that should express a gene | |
601 parser.add_argument( | |
602 "--min_gene_exp_perc_per_cell", | |
603 type=float, | |
604 default=50, | |
605 help="If all the conditions of one side of a contrast express a \ | |
606 gene in less than this percentage of cells, then the genes \ | |
607 will be added to a list of genes to ignore for that contrast.\ | |
608 Requires the contrast file to be provided.", | |
609 ) | |
610 parser.add_argument( | |
611 "--contrasts_file", | |
612 type=str, | |
613 required=False, | |
614 help="Contrasts file, a one column tsv with a header, each line \ | |
615 represents a contrast as a combination of conditions at each \ | |
616 side of a substraction.", | |
617 ) | |
618 | |
477 parser.add_argument( | 619 parser.add_argument( |
478 "--save_path", type=str, help="Path to save the plot (optional)" | 620 "--save_path", type=str, help="Path to save the plot (optional)" |
479 ) | 621 ) |
480 parser.add_argument( | 622 parser.add_argument( |
481 "--min_counts", | 623 "--min_counts", |