Mercurial > repos > artbio > gsc_filter_genes
diff filter_genes.R @ 0:f689c4ea8c43 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_filter_genes commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author | artbio |
---|---|
date | Mon, 24 Jun 2019 13:38:10 -0400 |
parents | |
children | 5d2304b09f58 |
line wrap: on
line diff
# ########################
#      filter genes      #
# ########################

# Filter out low expressed genes

# Example of command (used for generate output file) :
# Rscript filter_genes.R -f <input file> -o <output file>

# On any error, print the message to stderr and quit with exit status 1
# (without saving the workspace) so the calling process sees the failure.
options(
  show.error.messages = FALSE,
  error = function() {
    cat(geterrmessage(), file = stderr())
    q("no", 1, FALSE)
  }
)
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")

# load packages that are provided in the conda env
library(optparse)

# Arguments
option_list <- list(
  make_option(
    c("-f", "--input"),
    default = NA,
    type = "character",
    help = "Input file that contains count values to filter"
  ),
  make_option(
    c("-s", "--sep"),
    default = "\t",
    type = "character",
    help = "File separator [default : '%default' ]"
  ),
  make_option(
    c("-c", "--colnames"),
    default = TRUE,
    type = "logical",
    help = "first line is a header [default : '%default' ]"
  ),
  make_option(
    "--percentile_detection",
    default = 0,
    type = "numeric",
    help = "Include genes with detected expression in at least \
    this fraction of cells [default : '%default' ]"
  ),
  make_option(
    "--absolute_detection",
    default = 0,
    type = "numeric",
    help = "Include genes with detected expression in at least \
    this number of cells [default : '%default' ]"
  ),
  make_option(
    c("-o", "--output"),
    default = NA,
    type = "character",
    help = "Output name [default : '%default' ]"
  )
)

opt <- parse_args(
  OptionParser(option_list = option_list),
  args = commandArgs(trailingOnly = TRUE)
)

# Fail early with a clear message instead of an obscure read.table() /
# write.table() error when a mandatory path is missing.
if (is.null(opt$input) || is.na(opt$input)) {
  stop("An input file must be provided with -f/--input", call. = FALSE)
}
if (is.null(opt$output) || is.na(opt$output)) {
  stop("An output file must be provided with -o/--output", call. = FALSE)
}

# Accept the symbolic separator names "tab" and "comma" on the command line
if (opt$sep == "tab") {
  opt$sep <- "\t"
}
if (opt$sep == "comma") {
  opt$sep <- ","
}

# Open files: the first column is used as row names
data_counts <- read.table(
  opt$input,
  header = opt$colnames,
  row.names = 1,
  sep = opt$sep,
  check.names = FALSE
)

# Number of columns (cells) in which each row (gene) has a non-zero count.
# Missing values are ignored so that an NA count cannot turn the whole
# row's mask into NA (which would inject all-NA rows when subsetting).
detected_cells <- rowSums(data_counts != 0, na.rm = TRUE)

# note the [if else] below, to handle percentile_detection=absolute_detection=0
if (opt$percentile_detection > 0) {
  # Search for genes that are expressed in a certain percent of cells
  kept_genes <- detected_cells >=
    (opt$percentile_detection * ncol(data_counts))
} else {
  # Search for genes that are expressed in at least an absolute number of
  # cells
  kept_genes <- detected_cells >= opt$absolute_detection
}

# Filter matrix; drop = FALSE keeps a data.frame (with its row and column
# names) even when the input holds a single cell column.
data_counts <- data_counts[kept_genes, , drop = FALSE]

# Save filtered matrix
write.table(
  data_counts,
  opt$output,
  sep = "\t",
  quote = FALSE,
  col.names = TRUE,
  row.names = TRUE
)