Mercurial > repos > artbio > gsc_filter_cells

--- a/filter_cells.R	Sun Jul 07 08:29:39 2019 -0400
+++ b/filter_cells.R	Mon Oct 16 22:33:23 2023 +0000
@@ -1,96 +1,107 @@
 # First step of the signature-based workflow
-# Remove low quality cells below a user-fixed cutoff of
+# Remove low quality cells below a user-fixed cutoff of
 # percentiles or raw values of number of genes detected or
 # total aligned reads

-# Example of command (that generates output files) :
-# Rscript filter_cells.R -f <input> --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot <plotfile.pdf>  --output <filtered_cells.tsv> --output_metada <filtered_metadata.tsv>
+options(show.error.messages = FALSE,
+  error = function() {
+    cat(geterrmessage(), file = stderr())
+    q("no", 1, FALSE)
+  }
+)

-# load packages that are provided in the conda env
-options( show.error.messages=F,
-       error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
 warnings()
+
 library(optparse)
 library(ggplot2)

 # Arguments
-option_list = list(
-  make_option(c("-f", "--file"), default=NA, type='character',
-              help="Input file that contains values to filter"),
-  make_option("--sep", default="\t", type='character',
-              help="File column separator [default : '%default' ]"),
-  make_option("--percentile_genes", default=0, type='integer',
-              help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"),
-  make_option("--percentile_counts", default=0, type='integer',
-              help="nth Percentile of the total counts per cell distribution [default : '%default' ]"),
-  make_option("--absolute_genes", default=0, type='integer',
-              help="Remove cells that didn't express at least this number of genes [default : '%default' ]"),
-  make_option("--absolute_counts", default=0, type='integer',
-              help="Number of transcript threshold for cell filtering [default : '%default' ]"),
-  make_option("--manage_cutoffs", default="intersect", type='character',
-              help="combine or intersect cutoffs for filtering"),
-  make_option("--pdfplot", type = 'character',
-              help="Path to pdf file of the plots"),
-  make_option("--output", type = 'character',
-              help="Path to tsv file of filtered cell data"),
-  make_option("--output_metada", type = 'character',
-              help="Path to tsv file of filtered cell metadata")
+option_list <- list(
+  make_option(c("-f", "--file"), default = NA, type = "character",
+              help = "Input file that contains values to filter"),
+  make_option("--sep", default = "\t", type = "character",
+              help = "File column separator [default : '%default' ]"),
+  make_option("--percentile_genes", default = 0, type = "integer",
+              help = "nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"),
+  make_option("--percentile_counts", default = 0, type = "integer",
+              help = "nth Percentile of the total counts per cell distribution [default : '%default' ]"),
+  make_option("--absolute_genes", default = 0, type = "integer",
+              help = "Remove cells that did not express at least this number of genes [default : '%default' ]"),
+  make_option("--absolute_counts", default = 0, type = "integer",
+              help = "Number of transcript threshold for cell filtering [default : '%default' ]"),
+  make_option("--manage_cutoffs", default = "intersect", type = "character",
+              help = "combine or intersect cutoffs for filtering"),
+  make_option("--pdfplot", type = "character",
+              help = "Path to pdf file of the plots"),
+  make_option("--output", type = "character",
+              help = "Path to tsv file of filtered cell data"),
+  make_option("--output_metada", type = "character",
+              help = "Path to tsv file of filtered cell metadata")
 )
-opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE))
-if (opt$sep == "tab") {opt$sep = "\t"}
-if (opt$sep == "comma") {opt$sep = ","}
-if (opt$sep == "space") {opt$sep = " "}
+opt <- parse_args(OptionParser(option_list = option_list),
+  args = commandArgs(trailingOnly = TRUE)
+)
+if (opt$sep == "tab") {
+  opt$sep <- "\t"
+}
+if (opt$sep == "comma") {
+  opt$sep <- ","
+}
+if (opt$sep == "space") {
+  opt$sep <- " "
+}


-# check consistency of filtering options
-if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) {
-  opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
-# if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) {
-#   opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
-if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) {
-  opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
-# if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) {
-#   opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering
+## check consistency of filtering options
+
+# if input parameters are not consistent (one or either method, not both), no filtering
+if ((opt$percentile_counts > 0) && (opt$absolute_counts > 0)) {
+  opt$percentile_counts <- 0
+}
+
+# if input parameters are not consistent (one or either method, not both), no filtering
+if ((opt$percentile_genes > 0) && (opt$absolute_genes > 0)) {
+  opt$percentile_genes <- 0
+}

 # Import datasets
-data.counts <- read.table(
+data_counts <- read.delim(
   opt$file,
   header = TRUE,
-  stringsAsFactors = F,
+  stringsAsFactors = FALSE,
   sep = opt$sep,
   check.names = FALSE,
   row.names = 1
 )

-QC_metrics <-
-  data.frame(cell_id = colnames(data.counts),
-             nGenes = colSums(data.counts != 0),    # nGenes : Number of detected genes for each cell
-             total_counts = colSums(data.counts),  # total_counts : Total counts per cell
-             stringsAsFactors = F)
+QC_metrics <- data.frame(cell_id = colnames(data_counts),
+                         nGenes = colSums(data_counts != 0),  # nGenes is Number of detected genes for each cell
+                         total_counts = colSums(data_counts),  # total_counts is Total counts per cell
+                         stringsAsFactors = FALSE)
+

-plot_hist <- function(mydata, variable, title, cutoff){
-  mybinwidth = round(max(mydata[, variable]) * 5 / 100)
-  mylabel = paste0("cutoff= ", cutoff)
-  hist_plot <- qplot(
-    mydata[, variable],
-    main = title,
-    xlab = variable,
-    geom="histogram",
-    binwidth = mybinwidth,
-    col = I("white")) +
+plot_hist <- function(mydata, variable, title, cutoff) {
+  mybinwidth <- round(max(mydata[, variable]) * 5 / 100)
+  mylabel <- paste0("cutoff= ", cutoff)
+  hist_plot <- ggplot(as.data.frame(mydata[, variable]),
+                      aes(x = mydata[, variable], colour = I("white"))) +
+    geom_histogram(binwidth = mybinwidth) +
+    labs(title = title, x = variable, y = "count") +
+    scale_x_continuous() +
     geom_vline(xintercept = cutoff) +
-    annotate(geom="text",
-             x=cutoff + mybinwidth, y=1,
-             label=mylabel, color="white")
-    plot(hist_plot)
+    annotate(geom = "text",
+             x = cutoff + mybinwidth, y = 1,
+             label = mylabel, color = "white")
+  plot(hist_plot)
+  return
 }

-# returns the highest value such as the sum of the ordered values including this highest value
-# is lower (below) than the percentile threshold (n)
-percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){
-  p = n / 100
-  percentile_threshold = quantile(qcmetrics[, variable], p)[[1]]
+# returns the highest value such as the sum of the ordered values including this highest
+# value is lower (below) than the percentile threshold (n)
+percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...) {
+  p <- n / 100
+  percentile_threshold <- quantile(qcmetrics[, variable], p)[[1]]
   plot_hist(qcmetrics,
             variable,
             plot_title,
@@ -103,12 +114,11 @@
 # Determine thresholds based on percentile

 if (opt$percentile_counts > 0) {
-  counts_threshold <- percentile_cutoff(
-    opt$percentile_counts,
-    QC_metrics,
-    "total_counts",
-    "Histogram of Aligned read counts per cell"
-  )} else {
+  counts_threshold <- percentile_cutoff(opt$percentile_counts,
+                                        QC_metrics,
+                                        "total_counts",
+                                        "Histogram of Aligned read counts per cell")
+} else {
   counts_threshold <- opt$absolute_counts
   plot_hist(QC_metrics,
             variable = "total_counts",
@@ -117,88 +127,94 @@
 }

 if (opt$percentile_genes > 0) {
-
-  genes_threshold <- percentile_cutoff(
-    opt$percentile_genes,
-    QC_metrics,
-    "nGenes",
-    "Histogram of Number of detected genes per cell"
-  )} else {
+  genes_threshold <- percentile_cutoff(opt$percentile_genes,
+                                       QC_metrics,
+                                       "nGenes",
+                                       "Histogram of Number of detected genes per cell")
+} else {
   genes_threshold <- opt$absolute_genes
   plot_hist(QC_metrics,
             variable = "nGenes",
-            title = "Histogram of Number of detected genes per cell",
+            title = "Histogram of Number of detected genes per cell",
             cutoff = genes_threshold)
 }

 # Filter out rows below thresholds (genes and read counts)
-if (opt$manage_cutoffs == 'union'){
-    QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
+if (opt$manage_cutoffs == "union") {
+  QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
 } else {
-    QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
+  QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
 }

 ## Plot the results

 # Determine title from the parameter logics
-if (opt$percentile_counts > 0){
-    part_one = paste0("Cells with aligned reads counts below the ",
-                      opt$percentile_counts,
-                      "th percentile of aligned read counts")} else {
-    part_one = paste0("Cells with aligned read counts below ",
-                      opt$absolute_counts)
+if (opt$percentile_counts > 0) {
+  part_one <- paste0("Cells with aligned reads counts below the ",
+                     opt$percentile_counts,
+                     "th percentile of aligned read counts")
+} else {
+  part_one <- paste0("Cells with aligned read counts below ",
+                     opt$absolute_counts)
 }

-if(opt$percentile_genes > 0){
-    part_two = paste0("with number of detected genes below the ",
-                      opt$percentile_genes,
-                      "th percentile of detected gene counts")} else {
-    part_two = paste0("with number of detected genes below ",
-                      opt$absolute_genes)
+if (opt$percentile_genes > 0) {
+  part_two <- paste0("with number of detected genes below the ",
+                     opt$percentile_genes,
+                     "th percentile of detected gene counts")
+} else {
+  part_two <- paste0("with number of detected genes below ",
+                     opt$absolute_genes)
 }
+
 if (opt$manage_cutoffs == "intersect") {
-    conjunction = " and\n" } else {
-    conjunction = " or\n"
+  conjunction <- " and\n"
+} else {
+  conjunction <- " or\n"
 }

 # plot with ggplot2
 ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) +
-     geom_point() + scale_y_log10() +
-     scale_colour_discrete(name  = "",
-                           breaks= c(FALSE, TRUE),
-                           labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
-                                     paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) +
-     xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") +
-     geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) +
-     ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
-     theme(plot.title = element_text(size = 8, face = "bold"))
+  geom_point() +
+  scale_y_log10() +
+  scale_colour_discrete(name  = "",
+    breaks = c(FALSE, TRUE),
+    labels = c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
+               paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))
+  ) +
+  xlab("Detected genes per cell") +
+  ylab("Aligned reads per cell (log10 scale)") +
+  geom_vline(xintercept = genes_threshold) +
+  geom_hline(yintercept = counts_threshold) +
+  ggtitle(paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
+  theme(plot.title = element_text(size = 8, face = "bold"))

 dev.off()

-# Retrieve identifier of kept cells
-kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered]
+# Retrieve identifier of kept_cells
+kept_cells <- QC_metrics$cell_id[!QC_metrics$filtered]

-data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE)
+data_counts <- data.frame(Genes = rownames(data_counts[, kept_cells]),
+                          data_counts[, kept_cells],
+                          check.names = FALSE)

-# Save filtered cells
-write.table(
-  data.counts,
+# Save filtered cells
+write.table(data_counts,
   opt$output,
   sep = "\t",
-  quote = F,
-  col.names = T,
-  row.names = F
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
 )

 # Add QC metrics of filtered cells to a metadata file
 metadata <- QC_metrics

 # Save the metadata (QC metrics) file
-write.table(
-  metadata,
+write.table(metadata,
   opt$output_metada,
   sep = "\t",
-  quote = F,
-  col.names = T,
-  row.names = F
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
 )
--- a/filter_cells.xml	Sun Jul 07 08:29:39 2019 -0400
+++ b/filter_cells.xml	Mon Oct 16 22:33:23 2023 +0000
@@ -1,8 +1,8 @@
-<tool id="filter_cells" name="Filter cells data" version="0.9.2">
+<tool id="filter_cells" name="Filter cells data" version="4.3.1+galaxy0" profile="21.01">
     <description>on total aligned reads and/or number of detected genes</description>
     <requirements>
-        <requirement type="package" version="1.3.2=r3.3.2_0">r-optparse</requirement>
-        <requirement type="package" version="2.2.1=r3.3.2_0">r-ggplot2</requirement>
+        <requirement type="package" version="1.7.3">r-optparse</requirement>
+        <requirement type="package" version="3.4.4">r-ggplot2</requirement>
     </requirements>
     <stdio>
         <exit_code range="1:" level="fatal" description="Tool exception" />
Binary file test-data/absolute_counts-only.pdf has changed
Binary file test-data/absolute_gene-and-counts.pdf has changed
Binary file test-data/absolute_gene-only.pdf has changed
Binary file test-data/intersect_percentile_gene-and-counts.pdf has changed
Binary file test-data/no-filtering.pdf has changed
Binary file test-data/percentile_counts-only.pdf has changed
Binary file test-data/percentile_gene-and-counts.pdf has changed
Binary file test-data/percentile_gene-only.pdf has changed