Mercurial > repos > artbio > gsc_filter_cells
view filter_cells.R @ 0:e63bd8f13679 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_filter_cells commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author | artbio |
---|---|
date | Mon, 24 Jun 2019 13:37:45 -0400 |
parents | |
children | 5407dc697e24 |
line wrap: on
line source
# First step of the signature-based workflow # Remove low quality cells below a user-fixed cutoff of # percentiles or raw values of number of genes detected or # total aligned reads # Example of command (that generates output files) : # Rscript filter_cells.R -f <input> --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot <plotfile.pdf> --output <filtered_cells.tsv> --output_metada <filtered_metadata.tsv> # load packages that are provided in the conda env options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() library(optparse) library(ggplot2) # Arguments option_list = list( make_option(c("-f", "--file"), default=NA, type='character', help="Input file that contains values to filter"), make_option("--sep", default="\t", type='character', help="File column separator [default : '%default' ]"), make_option("--percentile_genes", default=0, type='integer', help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"), make_option("--percentile_counts", default=0, type='integer', help="nth Percentile of the total counts per cell distribution [default : '%default' ]"), make_option("--absolute_genes", default=0, type='integer', help="Remove cells that didn't express at least this number of genes [default : '%default' ]"), make_option("--absolute_counts", default=0, type='integer', help="Number of transcript threshold for cell filtering [default : '%default' ]"), make_option("--manage_cutoffs", default="intersect", type='character', help="combine or intersect cutoffs for filtering"), make_option("--pdfplot", type = 'character', help="Path to pdf file of the plots"), make_option("--output", type = 'character', help="Path to tsv file of filtered cell data"), make_option("--output_metada", type = 'character', help="Path to tsv file of filtered cell metadata") ) opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE)) if (opt$sep == "tab") {opt$sep = "\t"} if (opt$sep == "comma") {opt$sep = ","} if (opt$sep == "space") {opt$sep = " "} # check consistency of filtering options if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) { opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering # if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) { # opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) { opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering # if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) { # opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering # Import datasets data.counts <- read.table( opt$file, header = TRUE, stringsAsFactors = F, sep = opt$sep, check.names = FALSE, row.names = 1 ) QC_metrics <- data.frame(cell_id = colnames(data.counts), nGenes = colSums(data.counts != 0), # nGenes : Number of detected genes for each cell total_counts = colSums(data.counts), # total_counts : Total counts per cell stringsAsFactors = F) plot_hist <- function(mydata, variable, title, cutoff){ mybinwidth = round(max(mydata[, variable]) * 5 / 100) mylabel = paste0("cutoff= ", cutoff) hist_plot <- qplot( mydata[, variable], main = title, xlab = variable, geom="histogram", binwidth = mybinwidth, col = I("white")) + geom_vline(xintercept = cutoff) + annotate(geom="text", x=cutoff + mybinwidth, y=1, label=mylabel, color="white") plot(hist_plot) } # returns the highest value such as the sum of the ordered values including this highest value # is lower (below) than the percentile threshold (n) percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){ p = n / 100 percentile_threshold = quantile(qcmetrics[, variable], p)[[1]] plot_hist(qcmetrics, variable, plot_title, percentile_threshold) return(percentile_threshold) } pdf(file = opt$pdfplot) # Determine thresholds based on percentile if (opt$percentile_counts > 0) { counts_threshold <- percentile_cutoff( opt$percentile_counts, QC_metrics, "total_counts", "Histogram of Aligned read counts per cell" )} else { counts_threshold <- opt$absolute_counts plot_hist(QC_metrics, variable = "total_counts", title = "Histogram of Total counts per cell", cutoff = counts_threshold) } if (opt$percentile_genes > 0) { genes_threshold <- percentile_cutoff( opt$percentile_genes, QC_metrics, "nGenes", "Histogram of Number of detected genes per cell" )} else { genes_threshold <- opt$absolute_genes plot_hist(QC_metrics, variable = "nGenes", title = "Histogram of Number of detected genes per cell", cutoff = genes_threshold) } # Filter out rows below thresholds (genes and read counts) if (opt$manage_cutoffs == 'union'){ QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold) } else { QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold) } ## Plot the results # Determine title from the parameter logics if (opt$percentile_counts > 0){ part_one = paste0("Cells with aligned reads counts below the ", opt$percentile_counts, "th percentile of aligned read counts")} else { part_one = paste0("Cells with aligned read counts below ", opt$absolute_counts) } if(opt$percentile_genes > 0){ part_two = paste0("with number of detected genes below the ", opt$percentile_genes, "th percentile of detected gene counts")} else { part_two = paste0("with number of detected genes below ", opt$absolute_genes) } if (opt$manage_cutoffs == "intersect") { conjunction = " and\n" } else { conjunction = " or\n" } # plot with ggplot2 ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) + geom_point() + scale_y_log10() + scale_colour_discrete(name = "", breaks= c(FALSE, TRUE), labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"), paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) + xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") + geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) + ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) + theme(plot.title = element_text(size = 8, face = "bold")) dev.off() # Retrieve identifier of kept cells kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered] data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE) # Save filtered cells write.table( data.counts, opt$output, sep = "\t", quote = F, col.names = T, row.names = F ) # Add QC metrics of filtered cells to a metadata file metadata <- QC_metrics # Save the metadata (QC metrics) file write.table( metadata, opt$output_metada, sep = "\t", quote = F, col.names = T, row.names = F )