diff add_expression_HPA.R @ 12:dbeabf9bf091 draft

planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
author proteore
date Thu, 07 Mar 2019 09:08:33 -0500
parents 5c260bd3552e
children 133309fd6875
line wrap: on
line diff
--- a/add_expression_HPA.R	Wed Jan 02 04:40:04 2019 -0500
+++ b/add_expression_HPA.R	Thu Mar 07 09:08:33 2019 -0500
@@ -19,6 +19,24 @@
   }
 }
 
+stopQuietly <- function(...) {  # Abort via stop() with an all-blank error message; arguments are ignored.
+  blankMsg <- sprintf("\r%s\r", paste(rep(" ", getOption("width") - 1L), collapse = ""))  # width-1 blanks, so it fits on one console line
+  stop(simpleError(blankMsg))
+} # stopQuietly()
+
+check_ensembl_geneids <- function(vector,type) {  # Check IDs against the Ensembl gene ID pattern; `type` is accepted but unused.
+  ensembl_geneid_pattern <- "^ENS[A-Z]+[0-9]{11}$|^[A-Z]{3}[0-9]{3}[A-Za-z](-[A-Za-z])?$|^CG[0-9]+$|^[A-Z0-9]+[.][0-9]+$|^YM[A-Z][0-9]{3}[a-z][0-9]$"
+  res <- grepl(ensembl_geneid_pattern, vector)  # TRUE where an ID matches; grepl() yields FALSE for NA
+  if (!any(res)) {  # nothing matched (also the case for empty input): report and abort
+    cat("No Ensembl geneIDs found in entered ids")
+    stopQuietly()
+  } else if (!all(res)) {  # only some IDs are rejected: list them and carry on
+    cat(paste(sum(!res), "IDs are not ENSG IDs, please check:\n"))
+    # One rejected ID per line; sep = "" stops cat() from prefixing later lines with a space.
+    cat(paste0(vector[!res], "\n"), sep = "")
+  }
+}
+
 add_expression = function(input, atlas, options) {
   input <- unique(input[!is.na(input)])
   input <- gsub("[[:blank:]]|\u00A0","",input)
@@ -89,7 +107,7 @@
   return(res)
 }
 
-main = function() {
+get_args <- function(){
   args <- commandArgs(TRUE)
   if(length(args)<1) {
     args <- c("--help")
@@ -116,13 +134,28 @@
   argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
   args <- as.list(as.character(argsDF$V2))
   names(args) <- argsDF$V1
+  
+  return(args)
+}
+
+is_col_in_file <- function(file,ncol) {  # Print a message and abort unless `ncol` is a valid column index of `file`.
+  is_in_file <- isTRUE(ncol > 0 && ncol <= ncol(file))  # isTRUE(): an NA index counts as "not found" instead of crashing if()
+  if (!is_in_file) {
+    cat(paste("Column", ncol, "not found in file"))
+    stopQuietly()
+  }
+}
+
+main = function() {
+  
+  args = get_args()
 
   #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda")
   #load("/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda")
   
   inputtype = args$inputtype
   if (inputtype == "copypaste") {
-    input = strsplit(args$input, "[ \t\n]+")[[1]]
+    ids = strsplit(args$input, "[ \t\n]+")[[1]]
   } else if (inputtype == "tabfile") {
     filename = args$input
     ncol = args$column
@@ -134,10 +167,12 @@
     }
     header = str2bool(args$header)
     file = read_file(filename, header)
+    is_col_in_file(file,ncol)
     file = one_id_one_line(file,ncol)
-    input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE))
-    input = input[which(!is.na(input))]
+    ids = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE))
+    ids = ids[which(!is.na(ids))]
   }
+  check_ensembl_geneids(ids)
 
   # Read protein atlas
   protein_atlas = args$atlas
@@ -146,15 +181,15 @@
   # Add expression
   output = args$output
   options = strsplit(args$select, ",")[[1]]
-  res = add_expression(input, protein_atlas, options)
+  res = add_expression(ids, protein_atlas, options)
   
   # Write output
   if (is.null(res)) {
-    write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
+    write.table("None of the ENSG ids entered can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
   } else {
     if (inputtype == "copypaste") {
-      input <- data.frame(input)
-      output_content = merge(input,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T)
+      ids <- data.frame(ids)
+      output_content = merge(ids,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T)
       colnames(output_content)[1] = "Ensembl"
     } else if (inputtype == "tabfile") {
       output_content = merge(file, res, by.x=ncol, by.y="row.names", incomparables = NA, all.x=T)