Mercurial > repos > proteore > proteore_expression_rnaseq_abbased
changeset 12:dbeabf9bf091 draft
planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
author | proteore |
---|---|
date | Thu, 07 Mar 2019 09:08:33 -0500 |
parents | e109cacd75b2 |
children | 0b279190f90d |
files | add_expression_HPA.R add_expression_data.xml |
diffstat | 2 files changed, 52 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/add_expression_HPA.R Wed Jan 02 04:40:04 2019 -0500 +++ b/add_expression_HPA.R Thu Mar 07 09:08:33 2019 -0500 @@ -19,6 +19,24 @@ } } +stopQuietly <- function(...) { + blankMsg <- sprintf("\r%s\r", paste(rep(" ", getOption("width")-1L), collapse=" ")); + stop(simpleError(blankMsg)); +} # stopQuietly() + +check_ensembl_geneids <- function(vector,type) { + ensembl_geneid_pattern = "^ENS[A-Z]+[0-9]{11}$|^[A-Z]{3}[0-9]{3}[A-Za-z](-[A-Za-z])?$|^CG[0-9]+$|^[A-Z0-9]+[.][0-9]+$|^YM[A-Z][0-9]{3}[a-z][0-9]$" + res = grepl(ensembl_geneid_pattern,vector) + if (all(!res)){ + cat("No Ensembl geneIDs found in entered ids") + stopQuietly() + } else if (any(!res)) { + cat(paste(sep="",collapse = " ",c(sum(!res, na.rm=TRUE),'IDs are not ENSG IDs, please check:\n'))) + not_geneids <- sapply(vector[which(!res)], function(x) paste(sep="",collapse = "",x,"\n"),USE.NAMES = F) + cat(not_geneids) + } +} + add_expression = function(input, atlas, options) { input <- unique(input[!is.na(input)]) input <- gsub("[[:blank:]]|\u00A0","",input) @@ -89,7 +107,7 @@ return(res) } -main = function() { +get_args <- function(){ args <- commandArgs(TRUE) if(length(args)<1) { args <- c("--help") @@ -116,13 +134,28 @@ argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) args <- as.list(as.character(argsDF$V2)) names(args) <- argsDF$V1 + + return(args) +} + +is_col_in_file <- function(file,ncol) { + is_in_file = (ncol <= ncol(file) && ncol > 0) + if (!is_in_file){ + cat(paste(sep = "", collapse = " ", c("Column",ncol,"not found in file") )) + stopQuietly() + } +} + +main = function() { + + args = get_args() #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda") #load("/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda") inputtype = args$inputtype if (inputtype == "copypaste") { - input = strsplit(args$input, "[ \t\n]+")[[1]] + ids = strsplit(args$input, "[ \t\n]+")[[1]] } else if (inputtype == "tabfile") { filename = args$input ncol = args$column @@ -134,10 +167,12 @@ } header = str2bool(args$header) file = read_file(filename, header) + is_col_in_file(file,ncol) file = one_id_one_line(file,ncol) - input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) - input = input[which(!is.na(input))] + ids = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) + ids = ids[which(!is.na(ids))] } + check_ensembl_geneids(ids) # Read protein atlas protein_atlas = args$atlas @@ -146,15 +181,15 @@ # Add expression output = args$output options = strsplit(args$select, ",")[[1]] - res = add_expression(input, protein_atlas, options) + res = add_expression(ids, protein_atlas, options) # Write output if (is.null(res)) { - write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) + write.table("None of the ENSG ids entered can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) } else { if (inputtype == "copypaste") { - input <- data.frame(input) - output_content = merge(input,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T) + ids <- data.frame(ids) + output_content = merge(ids,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T) colnames(output_content)[1] = "Ensembl" } else if (inputtype == "tabfile") { output_content = merge(file, res, by.x=ncol, by.y="row.names", incomparables = NA, all.x=T)
--- a/add_expression_data.xml Wed Jan 02 04:40:04 2019 -0500 +++ b/add_expression_data.xml Thu Mar 07 09:08:33 2019 -0500 @@ -1,4 +1,4 @@ -<tool id="rna_abbased_data" name="Add expression data" version="2019.01.02"> +<tool id="rna_abbased_data" name="Add expression data" version="2019.03.07"> <description> (RNAseq or Immuno-assays)[Human Protein Atlas] </description> <requirements> @@ -24,7 +24,7 @@ <inputs> <conditional name="inputtype"> - <param name="filetype" type="select" label="Enter your IDs (Ensembl gene ENSG IDs only)" help="Copy/paste or from a file (e.g. table)"> + <param name="filetype" type="select" label="Enter your IDs (Ensembl gene IDs only, e.g. ENSG00000064787)" help="Copy/paste or from a file (e.g. table)"> <option value="file_all" selected="true">Input file containing your IDs</option> <option value="copy_paste">Copy/paste your list of IDs</option> </param> @@ -42,11 +42,10 @@ </when> <when value="file_all"> <param name="genelist" type="data" format="txt,tabular" label="Select your file" help=""/> - <param name="column" type="text" label="Column IDs (e.g : Enter c1 if ENSG ID are in column n°1)" value="c1"/> - <param name="header" type="select" label="Does file contain header?" multiple="false" optional="false"> - <option value="true" selected="true">Yes</option> - <option value="false" selected="false">No</option> + <param name="column" type="text" label="Column IDs (e.g : Enter c1 if ENSG ID are in column n°1)" value="c1"> + <validator type="regex" message="Please enter a column number, for example: 'c1' for the first column">[c]{0,1}[0-9]+</validator> </param> + <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does file contain header?"/> </when> </conditional> <section name="options" title="RNAseq/Ab-based expression data" expanded="True"> @@ -95,7 +94,7 @@ **Input** -Input can be either a list of Ensembl gene (ENSG) IDsds (copy/paste mode) or a file containing multiple fields with at least one column of Ensembl gene IDs. If your input file contains other type of IDs, please use the ID_Converter tool to create a column of Ensembl gene IDs. +Input can be either a list of Ensembl gene (ENSG) IDs (copy/paste mode) or a file containing multiple fields with at least one column of Ensembl gene IDs. If your input file contains other type of IDs, please use the ID_Converter tool to create a column of Ensembl gene IDs. ----- @@ -127,13 +126,13 @@ **Output** -The output is a tabular file containing original columns and new columns including selected annotation. +The output is a tabular file containing initial columns and new columns with annotation from HPA. ----- **Data sources (release date)** -HPA source file (Human Protein Atlas version 18): http://www.proteinatlas.org/download/proteinatlas.tab.gz +HPA source file (Human Protein Atlas version 18.1): http://www.proteinatlas.org/download/proteinatlas.tab.gz ----- @@ -141,7 +140,7 @@ **Authors** -Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR +Lisa Peru, David Christiany, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR