Mercurial > repos > artbio > gsc_filter_cells
diff filter_cells.R @ 0:e63bd8f13679 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_filter_cells commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author | artbio |
---|---|
date | Mon, 24 Jun 2019 13:37:45 -0400 |
parents | |
children | 5407dc697e24 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_cells.R Mon Jun 24 13:37:45 2019 -0400 @@ -0,0 +1,204 @@ +# First step of the signature-based workflow +# Remove low quality cells below a user-fixed cutoff of +# percentiles or raw values of number of genes detected or +# total aligned reads + +# Example of command (that generates output files) : +# Rscript filter_cells.R -f <input> --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot <plotfile.pdf> --output <filtered_cells.tsv> --output_metada <filtered_metadata.tsv> + +# load packages that are provided in the conda env +options( show.error.messages=F, + error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +warnings() +library(optparse) +library(ggplot2) + +# Arguments +option_list = list( + make_option(c("-f", "--file"), default=NA, type='character', + help="Input file that contains values to filter"), + make_option("--sep", default="\t", type='character', + help="File column separator [default : '%default' ]"), + make_option("--percentile_genes", default=0, type='integer', + help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"), + make_option("--percentile_counts", default=0, type='integer', + help="nth Percentile of the total counts per cell distribution [default : '%default' ]"), + make_option("--absolute_genes", default=0, type='integer', + help="Remove cells that didn't express at least this number of genes [default : '%default' ]"), + make_option("--absolute_counts", default=0, type='integer', + help="Number of transcript threshold for cell filtering [default : '%default' ]"), + make_option("--manage_cutoffs", default="intersect", type='character', + help="combine or intersect cutoffs for filtering"), + make_option("--pdfplot", type = 'character', + help="Path to pdf file of the plots"), + make_option("--output", type = 'character', + help="Path to tsv file of filtered cell data"), + make_option("--output_metada", type = 'character', + help="Path to tsv file of filtered cell metadata") +) +opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE)) +if (opt$sep == "tab") {opt$sep = "\t"} +if (opt$sep == "comma") {opt$sep = ","} +if (opt$sep == "space") {opt$sep = " "} + + +# check consistency of filtering options +if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) { + opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering +# if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) { +# opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering +if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) { + opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering +# if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) { +# opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering + +# Import datasets +data.counts <- read.table( + opt$file, + header = TRUE, + stringsAsFactors = F, + sep = opt$sep, + check.names = FALSE, + row.names = 1 +) + +QC_metrics <- + data.frame(cell_id = colnames(data.counts), + nGenes = colSums(data.counts != 0), # nGenes : Number of detected genes for each cell + total_counts = colSums(data.counts), # total_counts : Total counts per cell + stringsAsFactors = F) + +plot_hist <- function(mydata, variable, title, cutoff){ + mybinwidth = round(max(mydata[, variable]) * 5 / 100) + mylabel = paste0("cutoff= ", cutoff) + hist_plot <- qplot( + mydata[, variable], + main = title, + xlab = variable, + geom="histogram", + binwidth = mybinwidth, + col = I("white")) + + geom_vline(xintercept = cutoff) + + annotate(geom="text", + x=cutoff + mybinwidth, y=1, + label=mylabel, color="white") + plot(hist_plot) +} + +# returns the highest value such as the sum of the ordered values including this highest value +# is lower (below) than the percentile threshold (n) +percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){ + p = n / 100 + percentile_threshold = quantile(qcmetrics[, variable], p)[[1]] + plot_hist(qcmetrics, + variable, + plot_title, + percentile_threshold) + return(percentile_threshold) +} + +pdf(file = opt$pdfplot) + +# Determine thresholds based on percentile + +if (opt$percentile_counts > 0) { + counts_threshold <- percentile_cutoff( + opt$percentile_counts, + QC_metrics, + "total_counts", + "Histogram of Aligned read counts per cell" + )} else { + counts_threshold <- opt$absolute_counts + plot_hist(QC_metrics, + variable = "total_counts", + title = "Histogram of Total counts per cell", + cutoff = counts_threshold) +} + +if (opt$percentile_genes > 0) { + + genes_threshold <- percentile_cutoff( + opt$percentile_genes, + QC_metrics, + "nGenes", + "Histogram of Number of detected genes per cell" + )} else { + genes_threshold <- opt$absolute_genes + plot_hist(QC_metrics, + variable = "nGenes", + title = "Histogram of Number of detected genes per cell", + cutoff = genes_threshold) +} + +# Filter out rows below thresholds (genes and read counts) +if (opt$manage_cutoffs == 'union'){ + QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold) +} else { + QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold) +} + +## Plot the results + +# Determine title from the parameter logics +if (opt$percentile_counts > 0){ + part_one = paste0("Cells with aligned reads counts below the ", + opt$percentile_counts, + "th percentile of aligned read counts")} else { + part_one = paste0("Cells with aligned read counts below ", + opt$absolute_counts) +} + +if(opt$percentile_genes > 0){ + part_two = paste0("with number of detected genes below the ", + opt$percentile_genes, + "th percentile of detected gene counts")} else { + part_two = paste0("with number of detected genes below ", + opt$absolute_genes) +} +if (opt$manage_cutoffs == "intersect") { + conjunction = " and\n" } else { + conjunction = " or\n" +} + +# plot with ggplot2 +ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) + + geom_point() + scale_y_log10() + + scale_colour_discrete(name = "", + breaks= c(FALSE, TRUE), + labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"), + paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) + + xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") + + geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) + + ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) + + theme(plot.title = element_text(size = 8, face = "bold")) + +dev.off() + +# Retrieve identifier of kept cells +kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered] + +data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE) + +# Save filtered cells +write.table( + data.counts, + opt$output, + sep = "\t", + quote = F, + col.names = T, + row.names = F +) + +# Add QC metrics of filtered cells to a metadata file +metadata <- QC_metrics + +# Save the metadata (QC metrics) file +write.table( + metadata, + opt$output_metada, + sep = "\t", + quote = F, + col.names = T, + row.names = F +)