Mercurial > repos > proteore > proteore_expression_rnaseq_abbased

--- a/expression_rnaseq_abbased.xml	Wed Mar 14 11:27:05 2018 -0400
+++ b/expression_rnaseq_abbased.xml	Mon Mar 19 10:07:38 2018 -0400
@@ -12,11 +12,11 @@

   #if $inputtype.filetype == "copy_paste":

-    Rscript --vanilla $__tool_directory__/get_data_HPA_v2.R --inputtype copypaste --input '$inputtype.genelist' --header FALSE --proteinatlas $__tool_directory__/proteinatlas.csv --column c1 --select '$options.hpaparams' --output '$output'
+    Rscript $__tool_directory__/add_expression_HPA.R --inputtype="copypaste" --input='$inputtype.genelist' --atlas="$__tool_directory__/proteinatlas.csv" --select='$options.hpaparams' --output='$output'

   #else

-    Rscript --vanilla $__tool_directory__/get_data_HPA_v2.R --inputtype tabfile --input '$inputtype.genelist' --header '$inputtype.header' --proteinatlas $__tool_directory__/proteinatlas.csv --column '$inputtype.column' --select '$options.hpaparams' --output '$output'
+    Rscript $__tool_directory__/add_expression_HPA.R --inputtype="tabfile" --input='$inputtype.genelist' --header='$inputtype.header' --atlas="$__tool_directory__/proteinatlas.csv" --column='$inputtype.column' --select='$options.hpaparams' --output='$output'

   #end if

@@ -36,23 +36,23 @@
       <param name="genelist" type="data" format="txt,tabular" label="Choose your file" help="This file must imperatively have 1 column filled with Ensembl Gene IDs (ENSG). Please use the ID_Converter tool if this is not the case."/>
       <param name="column" type="text" label="Please specify the column where are your Ensembl gene IDs (e.g : Enter c1 if ENSG ID are in column n°1)" value="c1"/>
       <param name="header" type="select" label="Does your file have a header?" multiple="false" optional="false">
- 		      <option value="TRUE" selected="true">Yes</option>
-          <option value="FALSE" selected="false">No</option>
+ 		      <option value="true" selected="true">Yes</option>
+          <option value="false" selected="false">No</option>
       </param>
     </when>
   </conditional>
   <section name="options" title="RNAseq/Ab-based expression data" expanded="True">
       <param name="hpaparams" type="select" label="Choose the information from RNAseq/ab-based data you want to add to your list (see below for details)" multiple="True" display="checkboxes">
           <option value="Gene" selected="true">Gene name</option>
-          <option value="Gene.description" selected="false">Gene description</option>
+          <option value="Gene description" selected="false">Gene description</option>
           <option value="Evidence">Evidence (at protein level, at transcript level or no evidence)</option>
           <option value="Antibody">Antibody reference</option>
-          <option value="RNA.tissue.category">RNA tissue category</option>
- 		  <option value="Reliability.IH">IH detection level</option>
-          <option value="Reliability.IF">IF detection level</option>
-          <option value="Subcellular.location">Subcellular location</option>
-          <option value="RNA.TS.TPM">RNA tissue specificity abundance in 'Transcript Per Million'</option>
-          <option value="TPM.max.in.non.specific">RNA non-specific tissue abundance in 'Transcript Per Million'</option>
+          <option value="RNA tissue category">RNA tissue category</option>
+ 		  <option value="Reliability (IH)">IH detection level</option>
+          <option value="Reliability (IF)">IF detection level</option>
+          <option value="Subcellular location">Subcellular location</option>
+          <option value="RNA TS TPM">RNA tissue specificity abundance in 'Transcript Per Million'</option>
+          <option value="TPM max in non-specific">RNA non-specific tissue abundance in 'Transcript Per Million'</option>
     </param>
   </section>
--- a/get_data_HPA_v2.R	Wed Mar 14 11:27:05 2018 -0400
+++ b/get_data_HPA_v2.R	Mon Mar 19 10:07:38 2018 -0400
@@ -17,6 +17,26 @@
 # --output : output file name
 # Useful functions

+# Read the tab-separated file `filename` into a data.frame; `header` is the string "true" when the first line holds the column names. Rows whose fields are all NA/empty are dropped.
+readfile = function(filename, header) {
+  if (header == "true") {  # string comparison: `header` is "true"/"false", not a logical
+    # Read only the first line of the file; its fields become the column names:
+    headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
+    # Read the rest of the file as data (skip = 1 skips the header line; quote = "" turns off quote handling)
+    file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
+    # Drop rows in which every field is NA or an empty string (drop=FALSE keeps a data.frame even with one column)
+    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
+    # Assign the values read from the first line as the column names of the data
+    names(file) <- headers
+  }
+  else {  # any other `header` value: every line is data, columns keep read.table's default names
+    file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
+    # Drop rows in which every field is NA or an empty string (drop=FALSE keeps a data.frame even with one column)
+    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
+  }
+  return(file)
+}
+
 '%!in%' <- function(x,y)!('%in%'(x,y))

 args = commandArgs(trailingOnly = TRUE)
@@ -52,9 +72,9 @@
 if (typeinput=="tabfile"){

   if (header=="TRUE"){
-    listfile = read.table(listfile,header=TRUE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA"))
+    listfile = readfile(listfile, "true")
   }else{
-    listfile = read.table(listfile,header=FALSE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA"))
+    listfile = readfile(listfile, "false")
   }
   sample = listfile[,column]

@@ -86,7 +106,7 @@
   # the file with the fields "Protein not found in proteinatlas"
   if (length(which(sample %!in% proteinatlas[,3]))!=0){
     proteins_not_found = as.data.frame(sample[which(sample %!in% proteinatlas[,3])])
-	proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1))
+	  proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1))

     colnames(proteins_not_found)=colnames(data)