Mercurial > repos > proteore > proteore_expression_rnaseq_abbased

--- a/README.rst	Fri Mar 23 10:31:59 2018 -0400
+++ b/README.rst	Tue Dec 18 08:23:48 2018 -0500
@@ -2,7 +2,7 @@
 =========================================================
 **Authors**

-Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+Lisa Perus, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR

 Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform

@@ -20,7 +20,7 @@

 **Databases**

-HPA source file (Human Protein Atlas version 18):  http://www.proteinatlas.org/download/proteinatlas.tab.gz
+HPA source file:  http://www.proteinatlas.org/download/proteinatlas.tab.gz

 **Annotation**

@@ -46,4 +46,4 @@

 **Outputs**

-The output is a tabular file. The initial columns are kept and new columns are added according to what type of annotation data you chose.
+The output is a tabular file. The initial columns are kept and new columns are added according to what type of annotation data you chose.
--- a/add_expression_HPA.R	Fri Mar 23 10:31:59 2018 -0400
+++ b/add_expression_HPA.R	Tue Dec 18 08:23:48 2018 -0500
@@ -1,40 +1,94 @@
 # Read file and return file content as data.frame
-readfile = function(filename, header) {
-  if (header == "true") {
-    # Read only first line of the file as header:
-    headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "", comment.char = "")
-    #Read the data of the files (skipping the first row)
-    file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "", comment.char = "")
-    # Remove empty rows
-    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
-    #And assign the header to the data
-    names(file) <- headers
+read_file <- function(path,header){
+  file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE)
+  if (inherits(file,"try-error")){
+    stop("File not found !")
+  }else{
+    return(file)
   }
-  else {
-    file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "", comment.char = "")
-    # Remove empty rows
-    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
+}
+
+#convert a string to boolean
+str2bool <- function(x){
+  if (any(is.element(c("t","true"),tolower(x)))){
+    return (TRUE)
+  }else if (any(is.element(c("f","false"),tolower(x)))){
+    return (FALSE)
+  }else{
+    return(NULL)
   }
-  return(file)
 }

 add_expression = function(input, atlas, options) {
+  input <- unique(input[!is.na(input)])
+  input <- gsub("[[:blank:]]|\u00A0","",input)
   if (all(!input %in% atlas$Ensembl)) {
     return(NULL)
-  }
-  else {
-    res = matrix(nrow=length(input), ncol=0)
-    names = c()
-    for (opt in options) {
-      names = c(names, opt)
-      info = atlas[match(input, atlas$Ensembl,incomparable="NA"),][opt][,]
-      res = cbind(res, info)
-    }
-    colnames(res) = names
+  } else {
+    res = atlas[match(input,atlas$Ensembl),c("Ensembl",options)]
+    res = res[which(!is.na(res[,1])),]
+    row.names(res)=res[,1]
+    res=res[2:ncol(res)]
+    res <- as.data.frame(apply(res, c(1,2), function(x) gsub("^$|^ $", NA, x)))  #convert "" et " " to NA
     return(res)
   }
 }

+order_columns <- function (df,ncol){
+  if (ncol==1){ #already at the right position
+    return (df)
+  } else {
+    df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])]
+  }
+  return (df)
+}
+
+#take data frame, return  data frame
+split_ids_per_line <- function(line,ncol){
+
+  #print (line)
+  header = colnames(line)
+  line[ncol] = gsub("[[:blank:]]","",line[ncol])
+
+  if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) {
+    if (length(line)==1 ) {
+      lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F)
+    } else {
+      if (ncol==1) {                                #first column
+        lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)]))
+      } else if (ncol==length(line)) {                 #last column
+        lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";"))))
+      } else {
+        lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)]))
+      }
+    }
+    colnames(lines)=header
+    return(lines)
+  } else {
+    return(line)
+  }
+}
+
+#create new lines if there's more than one id per cell in the columns in order to have only one id per line
+one_id_one_line <-function(tab,ncol){
+
+  if (ncol(tab)>1){
+
+    tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x))
+    header=colnames(tab)
+    res=as.data.frame(matrix(ncol=ncol(tab),nrow=0))
+    for (i in 1:nrow(tab) ) {
+      lines = split_ids_per_line(tab[i,],ncol)
+      res = rbind(res,lines)
+    }
+  }else {
+    res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F)
+    res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F)
+    colnames(res)=colnames(tab)
+  }
+  return(res)
+}
+
 main = function() {
   args <- commandArgs(TRUE)
   if(length(args)<1) {
@@ -63,59 +117,52 @@
   args <- as.list(as.character(argsDF$V2))
   names(args) <- argsDF$V1

+  #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda")
+  #load("/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda")
+
   inputtype = args$inputtype
   if (inputtype == "copypaste") {
     input = strsplit(args$input, "[ \t\n]+")[[1]]
-  }
-  else if (inputtype == "tabfile") {
+  } else if (inputtype == "tabfile") {
     filename = args$input
     ncol = args$column
     # Check ncol
     if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
       stop("Please enter an integer for level")
-    }
-    else {
+    } else {
       ncol = as.numeric(gsub("c", "", ncol))
     }
-    header = args$header
-    # Get file content
-    file = readfile(filename, header)
-    # Extract Protein IDs list
-    input = c()
-    for (row in as.character(file[,ncol])) {
-      input = c(input, strsplit(row, ";")[[1]][1])
-    }
+    header = str2bool(args$header)
+    file = read_file(filename, header)
+    file = one_id_one_line(file,ncol)
+    input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE))
+    input = input[which(!is.na(input))]
   }

   # Read protein atlas
   protein_atlas = args$atlas
-  protein_atlas = readfile(protein_atlas, "true")
+  protein_atlas = read_file(protein_atlas, T)

   # Add expression
   output = args$output
-  names = c()
   options = strsplit(args$select, ",")[[1]]
   res = add_expression(input, protein_atlas, options)
-
+
   # Write output
   if (is.null(res)) {
     write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
-  }
-  else {
+  } else {
     if (inputtype == "copypaste") {
-      names = c("Ensembl", colnames(res))
-      res = cbind(as.matrix(input), res)
-      colnames(res) = names
-      write.table(res, output, row.names = FALSE, sep = "\t", quote = FALSE)
+      input <- data.frame(input)
+      output_content = merge(input,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T)
+      colnames(output_content)[1] = "Ensembl"
+    } else if (inputtype == "tabfile") {
+      output_content = merge(file, res, by.x=ncol, by.y="row.names", incomparables = NA, all.x=T)
+      output_content = order_columns(output_content,ncol)
     }
-    else if (inputtype == "tabfile") {
-      names = c(names(file), colnames(res))
-      output_content = cbind(file, res)
-      colnames(output_content) = names
-      write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE)
-    }
+  output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x)))
+  write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE)
   }
-
 }

 main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/add_expression_data.xml	Tue Dec 18 08:23:48 2018 -0500
@@ -0,0 +1,157 @@
+<tool id="rna_abbased_data" name="Add expression data" version="2018.12.12">
+<description> (RNAseq or Immuno-assays)[Human Protein Atlas]
+</description>
+<requirements>
+  <requirement type="package" version="3.4.1">R</requirement>
+</requirements>
+<stdio>
+  <exit_code range="1:" />
+</stdio>
+<command><![CDATA[
+
+  #if $inputtype.filetype == "copy_paste":
+
+    Rscript $__tool_directory__/add_expression_HPA.R --inputtype="copypaste" --input='$inputtype.genelist' --atlas="$__tool_directory__/proteinatlas.csv" --select='$options.hpaparams' --output='$output'
+
+  #else
+
+    Rscript $__tool_directory__/add_expression_HPA.R --inputtype="tabfile" --input='$inputtype.genelist' --header='$inputtype.header' --atlas="$__tool_directory__/proteinatlas.csv" --column='$inputtype.column' --select='$options.hpaparams' --output='$output'
+
+  #end if
+
+
+]]></command>
+
+<inputs>
+  <conditional name="inputtype">
+    <param name="filetype" type="select" label="Enter your IDs (Ensembl gene ENSG IDs only)" help="Copy/paste or from a file (e.g. table)">
+      <option value="file_all" selected="true">Input file containing your IDs</option>
+      <option value="copy_paste">Copy/paste your list of IDs</option>
+    </param>
+    <when value="copy_paste">
+      <param name="genelist" type="text" label="Enter a list of IDs">
+        <sanitizer>
+            <valid initial="string.printable">
+                <remove value="&apos;"/>
+            </valid>
+            <mapping initial="none">
+                <add source="&apos;" target="__sq__"/>
+            </mapping>
+        </sanitizer>
+      </param>
+    </when>
+    <when value="file_all">
+      <param name="genelist" type="data" format="txt,tabular" label="Select your file" help=""/>
+      <param name="column" type="text" label="Column IDs (e.g : Enter c1 if ENSG ID are in column n°1)" value="c1"/>
+      <param name="header" type="select" label="Does file contain header?" multiple="false" optional="false">
+ 		      <option value="true" selected="true">Yes</option>
+          <option value="false" selected="false">No</option>
+      </param>
+    </when>
+  </conditional>
+  <section name="options" title="RNAseq/Ab-based expression data" expanded="True">
+      <param name="hpaparams" type="select" label="Select information to add to your list" multiple="True" display="checkboxes" optional="false" >
+          <option value="Gene" selected="true">Gene name</option>
+          <option value="Gene description" selected="false">Gene description</option>
+          <option value="Evidence">Evidence (at protein level, at transcript level or no evidence)</option>
+          <option value="Antibody">Antibody reference</option>
+          <option value="RNA tissue category">RNA tissue category</option>
+ 		  <option value="Reliability (IH)">IH detection level</option>
+          <option value="Reliability (IF)">IF detection level</option>
+          <option value="Subcellular location">Subcellular location</option>
+          <option value="RNA TS TPM">RNA tissue specificity abundance in 'Transcript Per Million'</option>
+          <option value="TPM max in non-specific">RNA non-specific tissue abundance in 'Transcript Per Million'</option>
+    </param>
+  </section>
+
+</inputs>
+
+
+<outputs>
+  <data name="output" format="tsv" label=""/>
+</outputs>
+
+<tests>
+  <test>
+    <conditional name="inputtype">
+      <param name="filetype " value="file_all"/>
+      <param name="genelist" value="ID_Converter_Lacombe_et_al_2017_OK.txt"/>
+      <param name="column" value="c8"/>
+      <param name="header" value="TRUE"/>
+    </conditional>
+    <section name="options">
+      <param name="hpaparams" value="Gene,Gene.description,Evidence,Antibody,RNA.tissue.category,Reliability.IH,Reliability.IF,Subcellular.location,RNA.TS.TPM,TPM.max.in.non.specific"/>
+    </section>
+    <output name="output" file="Get_annotation_RNAseq.txt"/>
+  </test>
+</tests>
+
+<help><![CDATA[
+**Description**
+
+This tool adds expression annotation (RNAseq- or antibody-based experimental data - see "Parameters" below) from the Human Protein Atlas (HPA) database (https://www.proteinatlas.org/) to your gene/protein list.
+
+-----
+
+**Input**
+
+Input can be either a list of Ensembl gene (ENSG) IDs (copy/paste mode) or a file containing multiple fields with at least one column of Ensembl gene IDs. If your input file contains other types of IDs, please use the ID_Converter tool to create a column of Ensembl gene IDs.
+
+-----
+
+**Parameters**
+
+"Select information to add to your list": choose by clicking the following information:
+
+- Gene name: according to the HGNC (Hugo Gene Nomenclature Committee)
+
+- Gene description: entry description (full text)
+
+- Evidence: at protein level, at transcript level or no evidence
+
+- Antibody reference: reference of the HPA antibody used for immunohistochemistry and immunocytochemistry/IF
+
+- RNA tissue category: categories based on RNA-Seq data to estimate the transcript abundance of each protein-coding gene in tissues. For more information, please refer to http://www.proteinatlas.org/about/assays+annotation#rna .
+
+- IH detection level: level of detection of the protein associated to the coding gene tissues based on immunohistochemistry. For more information, please refer to http://www.proteinatlas.org/about/assays+annotation#ih .
+
+- IF detection level: level of detection of the protein associated to the coding gene tissues based on immunofluorescence. For more information, please refer to http://www.proteinatlas.org/about/assays+annotation#if .
+
+- Subcellular location: according to HPA data. For more information, please refer to https://www.proteinatlas.org/about/assays+annotation#ifa
+
+- RNA tissue specificity abundance in 'Transcript Per Million': for each gene, the tissue specificity abundance is reported in 'Transcript Per Million' (TPM) as the sum of the TPM values of all its protein-coding transcripts.
+
+- RNA non-specific tissue abundance in 'Transcript Per Million': please refer to http://www.proteinatlas.org/about/assays+annotation#rna.
+
+-----
+
+**Output**
+
+The output is a tabular file containing original columns and new columns including selected annotation.
+
+-----
+
+**Data sources (release date)**
+
+HPA source file (Human Protein Atlas version 18):  http://www.proteinatlas.org/download/proteinatlas.tab.gz
+
+-----
+
+.. class:: infomark
+
+**Authors**
+
+Lisa Perus, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+
+Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR
+
+This work has been partially funded through the French National Agency for Research (ANR) IFB project.
+
+Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.
+
+]]></help>
+
+<citations>
+</citations>
+
+</tool>
--- a/expression_rnaseq_abbased.xml	Fri Mar 23 10:31:59 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,147 +0,0 @@
-<tool id="rna_abbased_data" name="Add expression data to your protein list" version="0.1.0">
-<description>
-mRNA and protein level data from Human Protein Atlas
-</description>
-<requirements>
-  <requirement type="package" version="3.4.1">R</requirement>
-</requirements>
-<stdio>
-  <exit_code range="1:" />
-</stdio>
-<command><![CDATA[
-
-  #if $inputtype.filetype == "copy_paste":
-
-    Rscript $__tool_directory__/add_expression_HPA.R --inputtype="copypaste" --input='$inputtype.genelist' --atlas="$__tool_directory__/proteinatlas.csv" --select='$options.hpaparams' --output='$output'
-
-  #else
-
-    Rscript $__tool_directory__/add_expression_HPA.R --inputtype="tabfile" --input='$inputtype.genelist' --header='$inputtype.header' --atlas="$__tool_directory__/proteinatlas.csv" --column='$inputtype.column' --select='$options.hpaparams' --output='$output'
-
-  #end if
-
-
-]]></command>
-
-<inputs>
-  <conditional name="inputtype">
-    <param name="filetype" type="select" label="Enter your list of Ensembl gene ID">
-      <option value="file_all" selected="true">Input file containing your IDs</option>
-      <option value="copy_paste">Copy/paste your list of IDs</option>
-    </param>
-    <when value="copy_paste">
-      <param name="genelist" type="text" label="Enter a list of identifiers">
-        <sanitizer>
-            <valid initial="string.printable">
-                <remove value="&apos;"/>
-            </valid>
-            <mapping initial="none">
-                <add source="&apos;" target="__sq__"/>
-            </mapping>
-        </sanitizer>
-      </param>
-    </when>
-    <when value="file_all">
-      <param name="genelist" type="data" format="txt,tabular" label="Choose your file" help="This file must imperatively have 1 column filled with Ensembl Gene IDs (ENSG). Please use the ID_Converter tool if this is not the case."/>
-      <param name="column" type="text" label="Please specify the column where are your Ensembl gene IDs (e.g : Enter c1 if ENSG ID are in column n°1)" value="c1"/>
-      <param name="header" type="select" label="Does your file have a header?" multiple="false" optional="false">
- 		      <option value="true" selected="true">Yes</option>
-          <option value="false" selected="false">No</option>
-      </param>
-    </when>
-  </conditional>
-  <section name="options" title="RNAseq/Ab-based expression data" expanded="True">
-      <param name="hpaparams" type="select" label="Choose the information from RNAseq/ab-based data you want to add to your list (see below for details)" multiple="True" display="checkboxes">
-          <option value="Gene" selected="true">Gene name</option>
-          <option value="Gene description" selected="false">Gene description</option>
-          <option value="Evidence">Evidence (at protein level, at transcript level or no evidence)</option>
-          <option value="Antibody">Antibody reference</option>
-          <option value="RNA tissue category">RNA tissue category</option>
- 		  <option value="Reliability (IH)">IH detection level</option>
-          <option value="Reliability (IF)">IF detection level</option>
-          <option value="Subcellular location">Subcellular location</option>
-          <option value="RNA TS TPM">RNA tissue specificity abundance in 'Transcript Per Million'</option>
-          <option value="TPM max in non-specific">RNA non-specific tissue abundance in 'Transcript Per Million'</option>
-    </param>
-  </section>
-
-</inputs>
-
-
-<outputs>
-  <data name="output" format="tabular" label=""/>
-</outputs>
-
-<tests>
-  <test>
-    <conditional name="inputtype">
-      <param name="filetype " value="file_all"/>
-      <param name="genelist" value="ID_Converter_Lacombe_et_al_2017_OK.txt"/>
-      <param name="column" value="c8"/>
-      <param name="header" value="TRUE"/>
-    </conditional>
-    <section name="options">
-      <param name="hpaparams" value="Gene,Gene.description,Evidence,Antibody,RNA.tissue.category,Reliability.IH,Reliability.IF,Subcellular.location,RNA.TS.TPM,TPM.max.in.non.specific"/>
-    </section>
-    <output name="output" file="Get_annotation_RNAseq.txt"/>
-  </test>
-</tests>
-
-<help><![CDATA[
-
-This tool adds expression information (RNAseq- or antibody-based experiments) from the Human Protein Atlas (HPA) database (https://www.proteinatlas.org/) to your protein list.
-
-**Input**
-
-Input can be either a list of Ensembl gene ids (copy/paste) or a file containing multiple fields but with **at least one column of Ensembl gene IDs**. If your input file contains other type of IDs, please use the ID_Converter tool to create a column of Ensembl gene IDs.
-
-**Databases**
-
-HPA source file (Human Protein Atlas version 18):  http://www.proteinatlas.org/download/proteinatlas.tab.gz
-
-**Annotation**
-
-- Gene name: according to the HGNC (Hugo Gene Nomenclature Committee)
-
-- Gene description: entry description (full text)
-
-- Evidence: at protein level, at transcript level or no evidence
-
-- Antibody reference: reference of the HPA antibody used for immunohistochemistry and immunocytochemistry/IF
-
-- RNA tissue category: categories based on RNA-Seq data to estimate the transcript abundance of each protein-coding gene in tissues. For more information, please refer to http://www.proteinatlas.org/about/assays+annotation#rna .
-
-- IH detection level: level of detection of the protein associated to the coding gene tissues based on immunofluorescency. For more information, please refer to http://www.proteinatlas.org/about/assays+annotation#if .
-
-- IF detection level:level of detection of the protein associated to the coding gene tissues based on immunohistochemistry. For more information, please refer to http://www.proteinatlas.org/about/assays+annotation#ih .
-
-- Subcellular location:according to HPA data. For more information, please refer to https://www.proteinatlas.org/about/assays+annotation#ifa
-
-- RNA tissue specificity abundance in 'Transcript Per Million': For each gene is reported the tissue specificity abundance in 'Transcript Per Million' (TPM) as the sum of the TPM values of all its protein-coding transcripts.
-
-- RNA non-specific tissue abundance in 'Transcript Per Million': please refer to http://www.proteinatlas.org/about/assays+annotation#rna.
-
-**Outputs**
-
-The output is a tabular file. The initial columns are kept and new columns are added according to what type of annotation data you chose.
-
------
-
-.. class:: infomark
-
-**Authors**
-
-Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
-
-Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
-
-This work has been partially funded through the French National Agency for Research (ANR) IFB project.
-
-Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.
-
-]]></help>
-
-<citations>
-</citations>
-
-</tool>