Mercurial > repos > proteore > proteore_expression_rnaseq_abbased

--- a/add_expression_HPA.R	Wed Jan 02 04:40:04 2019 -0500
+++ b/add_expression_HPA.R	Thu Mar 07 09:08:33 2019 -0500
@@ -19,6 +19,24 @@
   }
 }

+stopQuietly <- function(...) { # Abort execution by raising an error whose message holds only carriage returns and spaces; `...` is accepted but never used.
+  blankMsg <- sprintf("\r%s\r", paste(rep(" ", getOption("width")-1L), collapse=" ")); # (console width - 1) spaces joined by spaces, wrapped in "\r" -- presumably meant to blank out the error line on a terminal
+  stop(simpleError(blankMsg)); # signal the error condition; the callers in this file print their own explanation with cat() beforehand
+} # stopQuietly()
+
+check_ensembl_geneids <- function(vector,type) { # Check the IDs in `vector` against the gene-ID pattern below; `type` is never read in the body.
+  ensembl_geneid_pattern = "^ENS[A-Z]+[0-9]{11}$|^[A-Z]{3}[0-9]{3}[A-Za-z](-[A-Za-z])?$|^CG[0-9]+$|^[A-Z0-9]+[.][0-9]+$|^YM[A-Z][0-9]{3}[a-z][0-9]$" # five anchored alternatives; the first is "ENS" + uppercase letters + exactly 11 digits
+  res = grepl(ensembl_geneid_pattern,vector) # logical vector: TRUE where an ID matches any alternative
+  if (all(!res)){ # no ID matched at all (all() is also TRUE when `res` is empty)
+    cat("No Ensembl geneIDs found in entered ids")
+    stopQuietly() # abort after printing the message above
+  } else if (any(!res)) { # some IDs matched and some did not: report the non-matching ones and carry on
+    cat(paste(sep="",collapse = " ",c(sum(!res, na.rm=TRUE),'IDs are not ENSG IDs, please check:\n'))) # prints the count of non-matching IDs followed by the notice
+    not_geneids <- sapply(vector[which(!res)], function(x) paste(sep="",collapse = "",x,"\n"),USE.NAMES = F) # append a newline to each non-matching ID
+    cat(not_geneids)
+  }
+}
+
 add_expression = function(input, atlas, options) {
   input <- unique(input[!is.na(input)])
   input <- gsub("[[:blank:]]|\u00A0","",input)
@@ -89,7 +107,7 @@
   return(res)
 }

-main = function() {
+get_args <- function(){
   args <- commandArgs(TRUE)
   if(length(args)<1) {
     args <- c("--help")
@@ -116,13 +134,28 @@
   argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
   args <- as.list(as.character(argsDF$V2))
   names(args) <- argsDF$V1
+
+  return(args)
+}
+
+is_col_in_file <- function(file,ncol) { # Print a message and abort unless 0 < ncol <= ncol(file); otherwise do nothing.
+  is_in_file = (ncol <= ncol(file) && ncol > 0) # the parameter `ncol` shadows the function name, but the call ncol(file) still resolves to the function because R skips non-function bindings when looking up a call
+  if (!is_in_file){
+    cat(paste(sep = "", collapse = " ", c("Column",ncol,"not found in file") )) # message is written with cat() before aborting
+    stopQuietly() # abort via the blank-message error helper
+  }
+}
+
+main = function() {
+
+  args = get_args()

   #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda")
   #load("/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda")

   inputtype = args$inputtype
   if (inputtype == "copypaste") {
-    input = strsplit(args$input, "[ \t\n]+")[[1]]
+    ids = strsplit(args$input, "[ \t\n]+")[[1]]
   } else if (inputtype == "tabfile") {
     filename = args$input
     ncol = args$column
@@ -134,10 +167,12 @@
     }
     header = str2bool(args$header)
     file = read_file(filename, header)
+    is_col_in_file(file,ncol)
     file = one_id_one_line(file,ncol)
-    input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE))
-    input = input[which(!is.na(input))]
+    ids = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE))
+    ids = ids[which(!is.na(ids))]
   }
+  check_ensembl_geneids(ids)

   # Read protein atlas
   protein_atlas = args$atlas
@@ -146,15 +181,15 @@
   # Add expression
   output = args$output
   options = strsplit(args$select, ",")[[1]]
-  res = add_expression(input, protein_atlas, options)
+  res = add_expression(ids, protein_atlas, options)

   # Write output
   if (is.null(res)) {
-    write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
+    write.table("None of the ENSG ids entered can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
   } else {
     if (inputtype == "copypaste") {
-      input <- data.frame(input)
-      output_content = merge(input,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T)
+      ids <- data.frame(ids)
+      output_content = merge(ids,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T)
       colnames(output_content)[1] = "Ensembl"
     } else if (inputtype == "tabfile") {
       output_content = merge(file, res, by.x=ncol, by.y="row.names", incomparables = NA, all.x=T)
--- a/add_expression_data.xml	Wed Jan 02 04:40:04 2019 -0500
+++ b/add_expression_data.xml	Thu Mar 07 09:08:33 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="rna_abbased_data" name="Add expression data" version="2019.01.02">
+<tool id="rna_abbased_data" name="Add expression data" version="2019.03.07">
 <description> (RNAseq or Immuno-assays)[Human Protein Atlas]
 </description>
 <requirements>
@@ -24,7 +24,7 @@

 <inputs>
   <conditional name="inputtype">
-    <param name="filetype" type="select" label="Enter your IDs (Ensembl gene ENSG IDs only)" help="Copy/paste or from a file (e.g. table)">
+    <param name="filetype" type="select" label="Enter your IDs (Ensembl gene IDs only, e.g. ENSG00000064787)" help="Copy/paste or from a file (e.g. table)">
       <option value="file_all" selected="true">Input file containing your IDs</option>
       <option value="copy_paste">Copy/paste your list of IDs</option>
     </param>
@@ -42,11 +42,10 @@
     </when>
     <when value="file_all">
       <param name="genelist" type="data" format="txt,tabular" label="Select your file" help=""/>
-      <param name="column" type="text" label="Column IDs (e.g : Enter c1 if ENSG ID are in column n°1)" value="c1"/>
-      <param name="header" type="select" label="Does file contain header?" multiple="false" optional="false">
- 		      <option value="true" selected="true">Yes</option>
-          <option value="false" selected="false">No</option>
+      <param name="column" type="text" label="Column IDs (e.g : Enter c1 if ENSG ID are in column n°1)" value="c1">
+        <validator type="regex" message="Please enter a column number, for example: 'c1' for the first column">[c]{0,1}[0-9]+</validator>
       </param>
+      <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does file contain header?"/>
     </when>
   </conditional>
   <section name="options" title="RNAseq/Ab-based expression data" expanded="True">
@@ -95,7 +94,7 @@

 **Input**

-Input can be either a list of Ensembl gene (ENSG) IDsds (copy/paste mode) or a file containing multiple fields with at least one column of Ensembl gene IDs. If your input file contains other type of IDs, please use the ID_Converter tool to create a column of Ensembl gene IDs.
+Input can be either a list of Ensembl gene (ENSG) IDs (copy/paste mode) or a file containing multiple fields with at least one column of Ensembl gene IDs. If your input file contains other types of IDs, please use the ID_Converter tool to create a column of Ensembl gene IDs.

 -----

@@ -127,13 +126,13 @@

 **Output**

-The output is a tabular file containing original columns and new columns including selected annotation.
+The output is a tabular file containing the initial columns and new columns with annotations from HPA.

 -----

 **Data sources (release date)**

-HPA source file (Human Protein Atlas version 18):  http://www.proteinatlas.org/download/proteinatlas.tab.gz
+HPA source file (Human Protein Atlas version 18.1):  http://www.proteinatlas.org/download/proteinatlas.tab.gz

 -----

@@ -141,7 +140,7 @@

 **Authors**

-Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+Lisa Peru, David Christiany, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR

 Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR