comparison get_data_HPA_v2.R @ 5:f15cdeeba4b4 draft

planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
author proteore
date Mon, 19 Mar 2018 10:07:38 -0400
parents cf2fa609625b
children
comparison
equal deleted inserted replaced
4:2f95774977ff 5:f15cdeeba4b4
14 # --column : column containing in input ENSG identifiers 14 # --column : column containing in input ENSG identifiers
15 # --select : information from HPA to select, may be 15 # --select : information from HPA to select, may be
16 # : RNA.tissue.category,Reliability..IH.,Reliability..IF. (comma-separated) 16 # : RNA.tissue.category,Reliability..IH.,Reliability..IF. (comma-separated)
17 # --output : output file name 17 # --output : output file name
18 # Useful functions 18 # Useful functions
19
# Read a tab-separated file and return its content as a data.frame.
#
# Args:
#   filename: path of the tab-separated file to read.
#   header:   "true" (matched case-insensitively) when the first line of the
#             file holds the column names; any other value means the file has
#             no header line and default V1, V2, ... names are kept.
#
# Returns:
#   A data.frame with the file content. Empty strings and "NA" are read as NA,
#   and rows whose cells are all NA/empty are dropped.
readfile <- function(filename, header) {
  # Single place for the read.table options shared by every read below.
  read_tsv <- function(...) {
    read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE,
               fill = TRUE, na.strings = c("", "NA"), blank.lines.skip = TRUE,
               quote = "", ...)
  }

  has_header <- identical(tolower(as.character(header)), "true")

  if (has_header) {
    # Read only the first line of the file as header. colClasses = "character"
    # keeps the names verbatim: without it read.table type-converts the cells
    # and e.g. "001" becomes "1" and "T" becomes "TRUE".
    header_row <- read_tsv(nrows = 1, colClasses = "character")
    # Read the data of the file (skipping the header line).
    content <- read_tsv(skip = 1)
  } else {
    content <- read_tsv()
  }

  # Remove empty rows (every cell NA or "").
  is_blank <- is.na(content) | content == ""
  content <- content[!apply(is_blank, 1, all), , drop = FALSE]

  if (has_header) {
    column_names <- unlist(header_row[1, , drop = FALSE], use.names = FALSE)
    # Blank header cells are read as NA; name them "NA" as before.
    column_names[is.na(column_names)] <- "NA"
    names(content) <- column_names
  }

  content
}
19 39
20 '%!in%' <- function(x,y)!('%in%'(x,y)) 40 '%!in%' <- function(x,y)!('%in%'(x,y))
21 41
22 args = commandArgs(trailingOnly = TRUE) 42 args = commandArgs(trailingOnly = TRUE)
23 43
50 sample = sample[,column] 70 sample = sample[,column]
51 } 71 }
52 if (typeinput=="tabfile"){ 72 if (typeinput=="tabfile"){
53 73
54 if (header=="TRUE"){ 74 if (header=="TRUE"){
55 listfile = read.table(listfile,header=TRUE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA")) 75 listfile = readfile(listfile, "true")
56 }else{ 76 }else{
57 listfile = read.table(listfile,header=FALSE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA")) 77 listfile = readfile(listfile, "false")
58 } 78 }
59 sample = listfile[,column] 79 sample = listfile[,column]
60 80
61 } 81 }
62 82
84 data = data[,to_keep] 104 data = data[,to_keep]
85 # if only some of the proteins were not found in proteinatlas they will be added to 105 # if only some of the proteins were not found in proteinatlas they will be added to
86 # the file with the fields "Protein not found in proteinatlas" 106 # the file with the fields "Protein not found in proteinatlas"
87 if (length(which(sample %!in% proteinatlas[,3]))!=0){ 107 if (length(which(sample %!in% proteinatlas[,3]))!=0){
88 proteins_not_found = as.data.frame(sample[which(sample %!in% proteinatlas[,3])]) 108 proteins_not_found = as.data.frame(sample[which(sample %!in% proteinatlas[,3])])
89 proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1)) 109 proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1))
90 110
91 colnames(proteins_not_found)=colnames(data) 111 colnames(proteins_not_found)=colnames(data)
92 112
93 data = rbind(data,proteins_not_found) 113 data = rbind(data,proteins_not_found)
94 } 114 }