Mercurial > repos > proteore > proteore_expression_rnaseq_abbased
diff get_data_HPA_v2.R @ 0:cf2fa609625b draft
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
author | proteore |
---|---|
date | Sun, 26 Nov 2017 20:49:17 -0500 |
parents | |
children | f15cdeeba4b4 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_data_HPA_v2.R Sun Nov 26 20:49:17 2017 -0500 @@ -0,0 +1,107 @@ +# Usage : +# Rscript --vanilla get_data_HPA_v2.R --typeinput copypaste --input +# ENSG00000283071 --header FALSE --proteinatlas proteinatlas.csv --column c1 +# --select RNA.tissue.category,Reliability..IH.,Reliability..IF. --output +# output.txt + +# INPUTS : +# --typeinput : "copypaste" or "tabfile" +# --input : either a file name (e.g : input.txt) or a list of blank-separated +# ENSG identifiers (e.g : ENSG00000283071 ENSG00000283072) +# --header : "TRUE" or "FALSE" : indicates in case the input is a file if said +# file has an header +# --proteinatlas : HPA proteinatlas tab file +# --column : column containing in input ENSG identifiers +# --select : information from HPA to select, may be +# : RNA.tissue.category,Reliability..IH.,Reliability..IF. (comma-separated) +# --output : output file name +# Useful functions + +'%!in%' <- function(x,y)!('%in%'(x,y)) + +args = commandArgs(trailingOnly = TRUE) + +# create a list of the arguments from the command line, separated by a blank space +hh <- paste(unlist(args),collapse=' ') +# delete the first element of the list which is always a blank space +listoptions <- unlist(strsplit(hh,'--'))[-1] +# for each input, split the arguments with blank space as separator, unlist, and delete the first element which is the input name (e.g --protatlas) +options.args <- sapply(listoptions,function(x){ + unlist(strsplit(x, ' '))[-1] + }) +# same as the step above, except that only the names are kept +options.names <- sapply(listoptions,function(x){ + option <- unlist(strsplit(x, ' '))[1] +}) +names(options.args) <- unlist(options.names) + + +typeinput = as.character(options.args[1]) +proteinatlas = read.table(as.character(options.args[4]),header=TRUE,sep="\t",quote="\"",fill=TRUE,blank.lines.skip=TRUE, na.strings=c("NA"," ","")) +listfile = options.args[2] + +header = as.character(options.args[3]) +column = as.numeric(gsub("c","",options.args[5])) +select = as.character(options.args[6]) +output = as.character(options.args[7]) + +if (typeinput=="copypaste"){ + sample = as.data.frame(unlist(listfile)) + sample = sample[,column] +} +if (typeinput=="tabfile"){ + + if (header=="TRUE"){ + listfile = read.table(listfile,header=TRUE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA")) + }else{ + listfile = read.table(listfile,header=FALSE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA")) + } + sample = listfile[,column] + +} + +# Select user input ensembl ids in HPA protein atlas file + +if ((length(sample[sample %in% proteinatlas[,3]]))==0){ + write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) + +}else{ + + + to_keep = c() + + if (select!="None"){ + select = unlist(strsplit(select,",")) + for (arg in select){ + colnb = which(colnames(proteinatlas) %in% c(arg)) + to_keep = c(to_keep,colnb) + } + } + + to_keep = c(3,to_keep) + lines = which(proteinatlas[,3] %in% sample) + data = proteinatlas[lines,] + data = data[,to_keep] + # if only some of the proteins were not found in proteinatlas they will be added to + # the file with the fields "Protein not found in proteinatlas" + if (length(which(sample %!in% proteinatlas[,3]))!=0){ + proteins_not_found = as.data.frame(sample[which(sample %!in% proteinatlas[,3])]) + proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1)) + + colnames(proteins_not_found)=colnames(data) + + data = rbind(data,proteins_not_found) + } + + # Merge original data and data selected from proteinatlas + + # Before that, if the initial ids were uniprot ids change them back from + # proteinatlas to uniprot ids in data + data = merge(listfile, data, by.x = column, by.y=1) + colnames(data)[1] = "Ensembl gene ids" + # Write result + write.table(data,file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) + +} + +