changeset 4:0b27f323c80b draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_filter_cells commit c5b2e910bd79d92566b2c2e9bad508d090a75841
author artbio
date Thu, 07 Nov 2024 18:33:17 +0000
parents 5407dc697e24
children
files filter_cells.R filter_cells.xml
diffstat 2 files changed, 171 insertions(+), 120 deletions(-) [+]
line wrap: on
line diff
--- a/filter_cells.R	Mon Oct 16 22:33:23 2023 +0000
+++ b/filter_cells.R	Thu Nov 07 18:33:17 2024 +0000
@@ -3,11 +3,12 @@
 # percentiles or raw values of number of genes detected or
 # total aligned reads
 
-options(show.error.messages = FALSE,
-  error = function() {
-    cat(geterrmessage(), file = stderr())
-    q("no", 1, FALSE)
-  }
+options(
+    show.error.messages = FALSE,
+    error = function() {
+        cat(geterrmessage(), file = stderr())
+        q("no", 1, FALSE)
+    }
 )
 
 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
@@ -18,38 +19,58 @@
 
 # Arguments
 option_list <- list(
-  make_option(c("-f", "--file"), default = NA, type = "character",
-              help = "Input file that contains values to filter"),
-  make_option("--sep", default = "\t", type = "character",
-              help = "File column separator [default : '%default' ]"),
-  make_option("--percentile_genes", default = 0, type = "integer",
-              help = "nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"),
-  make_option("--percentile_counts", default = 0, type = "integer",
-              help = "nth Percentile of the total counts per cell distribution [default : '%default' ]"),
-  make_option("--absolute_genes", default = 0, type = "integer",
-              help = "Remove cells that did not express at least this number of genes [default : '%default' ]"),
-  make_option("--absolute_counts", default = 0, type = "integer",
-              help = "Number of transcript threshold for cell filtering [default : '%default' ]"),
-  make_option("--manage_cutoffs", default = "intersect", type = "character",
-              help = "combine or intersect cutoffs for filtering"),
-  make_option("--pdfplot", type = "character",
-              help = "Path to pdf file of the plots"),
-  make_option("--output", type = "character",
-              help = "Path to tsv file of filtered cell data"),
-  make_option("--output_metada", type = "character",
-              help = "Path to tsv file of filtered cell metadata")
+    make_option(c("-f", "--file"),
+        default = NA, type = "character",
+        help = "Input file that contains values to filter"
+    ),
+    make_option("--sep",
+        default = "\t", type = "character",
+        help = "File column separator [default : '%default' ]"
+    ),
+    make_option("--percentile_genes",
+        default = 0, type = "integer",
+        help = "nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"
+    ),
+    make_option("--percentile_counts",
+        default = 0, type = "integer",
+        help = "nth Percentile of the total counts per cell distribution [default : '%default' ]"
+    ),
+    make_option("--absolute_genes",
+        default = 0, type = "integer",
+        help = "Remove cells that did not express at least this number of genes [default : '%default' ]"
+    ),
+    make_option("--absolute_counts",
+        default = 0, type = "integer",
+        help = "Number of transcript threshold for cell filtering [default : '%default' ]"
+    ),
+    make_option("--manage_cutoffs",
+        default = "intersect", type = "character",
+        help = "combine or intersect cutoffs for filtering"
+    ),
+    make_option("--pdfplot",
+        type = "character",
+        help = "Path to pdf file of the plots"
+    ),
+    make_option("--output",
+        type = "character",
+        help = "Path to tsv file of filtered cell data"
+    ),
+    make_option("--output_metada",
+        type = "character",
+        help = "Path to tsv file of filtered cell metadata"
+    )
 )
 opt <- parse_args(OptionParser(option_list = option_list),
-  args = commandArgs(trailingOnly = TRUE)
+    args = commandArgs(trailingOnly = TRUE)
 )
 if (opt$sep == "tab") {
-  opt$sep <- "\t"
+    opt$sep <- "\t"
 }
 if (opt$sep == "comma") {
-  opt$sep <- ","
+    opt$sep <- ","
 }
 if (opt$sep == "space") {
-  opt$sep <- " "
+    opt$sep <- " "
 }
 
 
@@ -57,56 +78,64 @@
 
 # if input parameters are not consistent (one or either method, not both), no filtering
 if ((opt$percentile_counts > 0) && (opt$absolute_counts > 0)) {
-  opt$percentile_counts <- 0
+    opt$percentile_counts <- 0
 }
 
 # if input parameters are not consistent (one or either method, not both), no filtering
 if ((opt$percentile_genes > 0) && (opt$absolute_genes > 0)) {
-  opt$percentile_genes <- 0
+    opt$percentile_genes <- 0
 }
 
 # Import datasets
 data_counts <- read.delim(
-  opt$file,
-  header = TRUE,
-  stringsAsFactors = FALSE,
-  sep = opt$sep,
-  check.names = FALSE,
-  row.names = 1
+    opt$file,
+    header = TRUE,
+    stringsAsFactors = FALSE,
+    sep = opt$sep,
+    check.names = FALSE,
+    row.names = 1
 )
 
-QC_metrics <- data.frame(cell_id = colnames(data_counts),
-                         nGenes = colSums(data_counts != 0),  # nGenes is Number of detected genes for each cell
-                         total_counts = colSums(data_counts),  # total_counts is Total counts per cell
-                         stringsAsFactors = FALSE)
+QC_metrics <- data.frame(
+    cell_id = colnames(data_counts),
+    nGenes = colSums(data_counts != 0), # nGenes is Number of detected genes for each cell
+    total_counts = colSums(data_counts), # total_counts is Total counts per cell
+    stringsAsFactors = FALSE
+)
 
 
 plot_hist <- function(mydata, variable, title, cutoff) {
-  mybinwidth <- round(max(mydata[, variable]) * 5 / 100)
-  mylabel <- paste0("cutoff= ", cutoff)
-  hist_plot <- ggplot(as.data.frame(mydata[, variable]),
-                      aes(x = mydata[, variable], colour = I("white"))) +
-    geom_histogram(binwidth = mybinwidth) +
-    labs(title = title, x = variable, y = "count") +
-    scale_x_continuous() +
-    geom_vline(xintercept = cutoff) +
-    annotate(geom = "text",
-             x = cutoff + mybinwidth, y = 1,
-             label = mylabel, color = "white")
-  plot(hist_plot)
-  return
+    mybinwidth <- round(max(mydata[, variable]) * 5 / 100)
+    mylabel <- paste0("cutoff= ", cutoff)
+    hist_plot <- ggplot(
+        as.data.frame(mydata[, variable]),
+        aes(x = mydata[, variable], colour = I("white"))
+    ) +
+        geom_histogram(binwidth = mybinwidth) +
+        labs(title = title, x = variable, y = "count") +
+        scale_x_continuous() +
+        geom_vline(xintercept = cutoff) +
+        annotate(
+            geom = "text",
+            x = cutoff + mybinwidth, y = 1,
+            label = mylabel, color = "white"
+        )
+    plot(hist_plot)
+    return
 }
 
 # returns the highest value such as the sum of the ordered values including this highest
 # value is lower (below) than the percentile threshold (n)
 percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...) {
-  p <- n / 100
-  percentile_threshold <- quantile(qcmetrics[, variable], p)[[1]]
-  plot_hist(qcmetrics,
-            variable,
-            plot_title,
-            percentile_threshold)
-  return(percentile_threshold)
+    p <- n / 100
+    percentile_threshold <- quantile(qcmetrics[, variable], p)[[1]]
+    plot_hist(
+        qcmetrics,
+        variable,
+        plot_title,
+        percentile_threshold
+    )
+    return(percentile_threshold)
 }
 
 pdf(file = opt$pdfplot)
@@ -114,97 +143,116 @@
 # Determine thresholds based on percentile
 
 if (opt$percentile_counts > 0) {
-  counts_threshold <- percentile_cutoff(opt$percentile_counts,
-                                        QC_metrics,
-                                        "total_counts",
-                                        "Histogram of Aligned read counts per cell")
+    counts_threshold <- percentile_cutoff(
+        opt$percentile_counts,
+        QC_metrics,
+        "total_counts",
+        "Histogram of Aligned read counts per cell"
+    )
 } else {
-  counts_threshold <- opt$absolute_counts
-  plot_hist(QC_metrics,
-            variable = "total_counts",
-            title = "Histogram of Total counts per cell",
-            cutoff = counts_threshold)
+    counts_threshold <- opt$absolute_counts
+    plot_hist(QC_metrics,
+        variable = "total_counts",
+        title = "Histogram of Total counts per cell",
+        cutoff = counts_threshold
+    )
 }
 
 if (opt$percentile_genes > 0) {
-  genes_threshold <- percentile_cutoff(opt$percentile_genes,
-                                       QC_metrics,
-                                       "nGenes",
-                                       "Histogram of Number of detected genes per cell")
+    genes_threshold <- percentile_cutoff(
+        opt$percentile_genes,
+        QC_metrics,
+        "nGenes",
+        "Histogram of Number of detected genes per cell"
+    )
 } else {
-  genes_threshold <- opt$absolute_genes
-  plot_hist(QC_metrics,
-            variable = "nGenes",
-            title = "Histogram of Number of detected genes per cell",
-            cutoff = genes_threshold)
+    genes_threshold <- opt$absolute_genes
+    plot_hist(QC_metrics,
+        variable = "nGenes",
+        title = "Histogram of Number of detected genes per cell",
+        cutoff = genes_threshold
+    )
 }
 
 # Filter out rows below thresholds (genes and read counts)
 if (opt$manage_cutoffs == "union") {
-  QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
+    QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
 } else {
-  QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
+    QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
 }
 
 ## Plot the results
 
 # Determine title from the parameter logics
 if (opt$percentile_counts > 0) {
-  part_one <- paste0("Cells with aligned reads counts below the ",
-                     opt$percentile_counts,
-                     "th percentile of aligned read counts")
+    part_one <- paste0(
+        "Cells with aligned reads counts below the ",
+        opt$percentile_counts,
+        "th percentile of aligned read counts"
+    )
 } else {
-  part_one <- paste0("Cells with aligned read counts below ",
-                     opt$absolute_counts)
+    part_one <- paste0(
+        "Cells with aligned read counts below ",
+        opt$absolute_counts
+    )
 }
 
 if (opt$percentile_genes > 0) {
-  part_two <- paste0("with number of detected genes below the ",
-                     opt$percentile_genes,
-                     "th percentile of detected gene counts")
+    part_two <- paste0(
+        "with number of detected genes below the ",
+        opt$percentile_genes,
+        "th percentile of detected gene counts"
+    )
 } else {
-  part_two <- paste0("with number of detected genes below ",
-                     opt$absolute_genes)
+    part_two <- paste0(
+        "with number of detected genes below ",
+        opt$absolute_genes
+    )
 }
 
 if (opt$manage_cutoffs == "intersect") {
-  conjunction <- " and\n"
+    conjunction <- " and\n"
 } else {
-  conjunction <- " or\n"
+    conjunction <- " or\n"
 }
 
 # plot with ggplot2
 ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) +
-  geom_point() +
-  scale_y_log10() +
-  scale_colour_discrete(name  = "",
-    breaks = c(FALSE, TRUE),
-    labels = c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
-               paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))
-  ) +
-  xlab("Detected genes per cell") +
-  ylab("Aligned reads per cell (log10 scale)") +
-  geom_vline(xintercept = genes_threshold) +
-  geom_hline(yintercept = counts_threshold) +
-  ggtitle(paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
-  theme(plot.title = element_text(size = 8, face = "bold"))
+    geom_point() +
+    scale_y_log10() +
+    scale_colour_discrete(
+        name = "",
+        breaks = c(FALSE, TRUE),
+        labels = c(
+            paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
+            paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)")
+        )
+    ) +
+    xlab("Detected genes per cell") +
+    ylab("Aligned reads per cell (log10 scale)") +
+    geom_vline(xintercept = genes_threshold) +
+    geom_hline(yintercept = counts_threshold) +
+    ggtitle(paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
+    theme(plot.title = element_text(size = 8, face = "bold"))
 
 dev.off()
 
 # Retrieve identifier of kept_cells
 kept_cells <- QC_metrics$cell_id[!QC_metrics$filtered]
 
-data_counts <- data.frame(Genes = rownames(data_counts[, kept_cells]),
-                          data_counts[, kept_cells],
-                          check.names = FALSE)
+data_counts <- data.frame(
+    Genes = rownames(data_counts[, kept_cells]),
+    data_counts[, kept_cells],
+    check.names = FALSE
+)
 
 # Save filtered cells
 write.table(data_counts,
-  opt$output,
-  sep = "\t",
-  quote = FALSE,
-  col.names = TRUE,
-  row.names = FALSE
+    opt$output,
+    sep = "\t",
+    quote = FALSE,
+    col.names = TRUE,
+    row.names = FALSE
 )
 
 # Add QC metrics of filtered cells to a metadata file
@@ -212,9 +260,9 @@
 
 # Save the metadata (QC metrics) file
 write.table(metadata,
-  opt$output_metada,
-  sep = "\t",
-  quote = FALSE,
-  col.names = TRUE,
-  row.names = FALSE
+    opt$output_metada,
+    sep = "\t",
+    quote = FALSE,
+    col.names = TRUE,
+    row.names = FALSE
 )
--- a/filter_cells.xml	Mon Oct 16 22:33:23 2023 +0000
+++ b/filter_cells.xml	Thu Nov 07 18:33:17 2024 +0000
@@ -1,5 +1,8 @@
-<tool id="filter_cells" name="Filter cells data" version="4.3.1+galaxy0" profile="21.01">
+<tool id="filter_cells" name="Filter cells data" version="4.3.1+galaxy1" profile="21.01">
     <description>on total aligned reads and/or number of detected genes</description>
+    <xrefs>
+        <xref type="bio.tools">galaxy_single_cell_suite</xref>
+    </xrefs>
     <requirements>
         <requirement type="package" version="1.7.3">r-optparse</requirement>
         <requirement type="package" version="3.4.4">r-ggplot2</requirement>