diff filter_genes.R @ 0:f689c4ea8c43 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_filter_genes commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author artbio
date Mon, 24 Jun 2019 13:38:10 -0400
parents
children 5d2304b09f58
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_genes.R	Mon Jun 24 13:38:10 2019 -0400
@@ -0,0 +1,93 @@
+# ########################
+#      filter genes     #
+# ########################
+
+# Filter out low expressed genes
+
+# Example command (used to generate the output file):
+# Rscript filter_genes.R -f <input file> -o <output file>
+
+# Report errors on stderr and exit 1; load packages provided by the conda env
+options(show.error.messages = FALSE,
+        error = function() { cat(geterrmessage(), file = stderr()); q("no", 1, FALSE) })
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+library(optparse)
+
+# Command-line options understood by the script
+option_list <- list(
+  # Count table to filter
+  make_option(c("-f", "--input"),
+    type = "character",
+    default = NA,
+    help = "Input file that contains count values to filter"
+  ),
+  # Field separator of the input table
+  make_option(c("-s", "--sep"),
+    type = "character",
+    default = "\t",
+    help = "File separator [default : '%default' ]"
+  ),
+  # Whether the first line of the input table is a header
+  make_option(c("-c", "--colnames"),
+    type = "logical",
+    default = TRUE,
+    help = "first line is a header [default : '%default' ]"
+  ),
+  # Minimal fraction of cells in which a gene must be detected
+  make_option("--percentile_detection",
+    type = "numeric",
+    default = 0,
+    help = "Include genes with detected expression in at least \
+    this fraction of cells [default : '%default' ]"
+  ),
+  # Minimal number of cells in which a gene must be detected
+  make_option("--absolute_detection",
+    type = "numeric",
+    default = 0,
+    help = "Include genes with detected expression in at least \
+    this number of cells [default : '%default' ]"
+  ),
+  # Path of the filtered table
+  make_option(c("-o", "--output"),
+    type = "character",
+    default = NA,
+    help = "Output name [default : '%default' ]"
+  )
+)
+
+# Parse the command line; "tab" and "comma" are aliases for the real separators
+cli_parser <- OptionParser(option_list = option_list)
+opt <- parse_args(cli_parser, args = commandArgs(trailingOnly = TRUE))
+opt$sep <- switch(opt$sep, tab = "\t", comma = ",", opt$sep)
+
+# Read the count table: first column = row names, column names kept verbatim
+data.counts <- read.table(
+  opt$input,
+  header = opt$colnames,
+  row.names = 1,
+  sep = opt$sep,
+  check.names = FALSE
+)
+
+# Number of cells (columns) in which each gene (row) has a non-zero, non-NA count
+n_detected <- rowSums(data.counts != 0, na.rm = TRUE)
+# A positive --percentile_detection takes precedence over --absolute_detection;
+# when both are 0 the threshold is 0 and every gene is kept.
+if (opt$percentile_detection > 0) {
+  min_cells <- opt$percentile_detection * ncol(data.counts)
+} else {
+  min_cells <- opt$absolute_detection
+}
+kept_genes <- n_detected >= min_cells
+# drop = FALSE keeps a data frame (and the gene names) with a single cell column
+data.counts <- data.counts[kept_genes, , drop = FALSE]
+
+# Write the filtered table as tab-separated text, with row and column names
+write.table(
+  data.counts,
+  file = opt$output,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = TRUE
+)
\ No newline at end of file