Mercurial > repos > proteore > proteore_expression_levels_by_tissue
comparison sel_ann_hpa.R @ 0:5501e74891e4 draft
planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
author | proteore |
---|---|
date | Fri, 16 Feb 2018 04:22:54 -0500 |
parents | |
children | 69cf9e6283f8 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5501e74891e4 |
---|---|
1 | |
2 # Read file and return file content as data.frame | |
3 readfile = function(filename, header) { | |
4 if (header == "true") { | |
5 # Read only first line of the file as header: | |
6 headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) | |
7 #Read the data of the files (skipping the first row): | |
8 file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) | |
9 # Remove empty rows | |
10 #file <- file[!apply(is.na(file) | file == "", 1, all),] | |
11 #And assign the header to the data: | |
12 names(file) <- headers | |
13 } | |
14 else { | |
15 file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) | |
16 } | |
17 return(file) | |
18 } | |
19 | |
20 | |
21 # input has to be a list of IDs in ENSG format | |
22 # tissue is one of unique(HPA.normal.tissue$Tissue) | |
23 # level is one, or several, or 0 (=ALL) of "Not detected", "Medium", "High", "Low" | |
24 # reliability is one, or several, or 0 (=ALL) of "Approved", "Supported", "Uncertain" | |
25 annot.HPAnorm<-function(input, HPA_normal_tissue, tissue, level, reliability, not_mapped_option) { | |
26 dat <- subset(HPA_normal_tissue, Gene %in% input) | |
27 | |
28 if (length(tissue)==1) { | |
29 res.Tissue<-subset(dat, Tissue==tissue) | |
30 } | |
31 if (length(tissue)>1) { | |
32 res.Tissue<-subset(dat, Tissue %in% tissue) | |
33 } | |
34 | |
35 if (length(level)==1) { | |
36 res.Level<-subset(res.Tissue, Level==level) | |
37 } | |
38 if (length(level)>1) { | |
39 print(level) | |
40 res.Level<-subset(res.Tissue, Level %in% level) | |
41 } | |
42 | |
43 if (length(reliability)==1) { | |
44 res.Rel<-subset(res.Level, Reliability==reliability) | |
45 } | |
46 if (length(reliability)>1) { | |
47 print(reliability) | |
48 res.Rel<-subset(res.Level, Reliability %in% reliability) | |
49 } | |
50 | |
51 if (not_mapped_option == "true") { | |
52 if (length(setdiff(intersect(input, unique(dat$Gene)), unique(res.Rel$Gene)))>0) { | |
53 not_match_IDs <- matrix(setdiff(intersect(input, unique(dat$Gene)), unique(res.Rel$Gene)), ncol = 1, nrow = length(setdiff(intersect(input, unique(dat$Gene)), unique(res.Rel$Gene)))) | |
54 not.match <- matrix("not match", ncol = ncol(HPA_normal_tissue) - 1, nrow = length(not_match_IDs)) | |
55 not.match <- cbind(not_match_IDs, unname(not.match)) | |
56 colnames(not.match) <- colnames(HPA_normal_tissue) | |
57 res <- rbind(res.Rel, not.match) | |
58 } | |
59 else { | |
60 res <- res.Rel | |
61 } | |
62 if (length(setdiff(input, unique(dat$Gene)))>0) { | |
63 not.mapped <- matrix(ncol = ncol(HPA_normal_tissue) - 1, nrow = length(setdiff(input, unique(dat$Gene)))) | |
64 not.mapped <- cbind(matrix(setdiff(input, unique(dat$Gene)), ncol = 1, nrow = length(setdiff(input, unique(dat$Gene)))), unname(not.mapped)) | |
65 colnames(not.mapped) <- colnames(HPA_normal_tissue) | |
66 res <- rbind(res, not.mapped) | |
67 } | |
68 } | |
69 else { | |
70 res <- res.Rel | |
71 } | |
72 | |
73 return(res) | |
74 | |
75 } | |
76 | |
77 annot.HPAcancer<-function(input, HPA_cancer_tissue, cancer, not_mapped_option) { | |
78 dat <- subset(HPA_cancer_tissue, Gene %in% input) | |
79 | |
80 if (length(cancer)==1) { | |
81 res.Cancer<-subset(dat, Cancer==cancer) | |
82 } | |
83 if (length(cancer)>1) { | |
84 res.Cancer<-subset(dat, Cancer %in% cancer) | |
85 } | |
86 | |
87 if (not_mapped_option == "true") { | |
88 not.mapped <- matrix(ncol=ncol(HPA_cancer_tissue)-1, nrow=length(setdiff(input, unique(dat$Gene)))) | |
89 not.mapped <- cbind(matrix(setdiff(input, unique(dat$Gene)), ncol = 1, nrow = length(setdiff(input, unique(dat$Gene)))), unname(not.mapped)) | |
90 colnames(not.mapped) <- colnames(HPA_cancer_tissue) | |
91 res <- rbind(res.Cancer, not.mapped) | |
92 } | |
93 else { | |
94 res <- res.Cancer | |
95 } | |
96 return(res) | |
97 } | |
98 | |
99 | |
100 main <- function() { | |
101 args <- commandArgs(TRUE) | |
102 if(length(args)<1) { | |
103 args <- c("--help") | |
104 } | |
105 | |
106 # Help section | |
107 if("--help" %in% args) { | |
108 cat("Selection and Annotation HPA | |
109 Arguments: | |
110 --ref_file: HPA normal/cancer tissue file path | |
111 --input_type: type of input (list of id or filename) | |
112 --input: list of IDs in ENSG format | |
113 --column_number: the column number which you would like to apply... | |
114 --header: true/false if your file contains a header | |
115 --atlas: normal/cancer | |
116 if normal: | |
117 --tissue: list of tissues | |
118 --level: Not detected, Low, Medium, High | |
119 --reliability: Supportive, Uncertain | |
120 if cancer: | |
121 --cancer: Cancer tissues | |
122 --not_mapped: true/false if your output file should contain not-mapped and not-match IDs | |
123 --output: output filename \n") | |
124 q(save="no") | |
125 } | |
126 | |
127 # Parse arguments | |
128 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") | |
129 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) | |
130 args <- as.list(as.character(argsDF$V2)) | |
131 names(args) <- argsDF$V1 | |
132 | |
133 # Extract input | |
134 input_type = args$input_type | |
135 if (input_type == "list") { | |
136 list_id = strsplit(args$input, " ")[[1]] | |
137 } | |
138 else if (input_type == "file") { | |
139 filename = args$input | |
140 column_number = as.numeric(gsub("c", "" ,args$column_number)) | |
141 header = args$header | |
142 file = readfile(filename, header) | |
143 list_id = c() | |
144 print(file) | |
145 list_id = sapply(strsplit(file[,column_number], ";"), "[", 1) | |
146 } | |
147 input = list_id | |
148 | |
149 # Read reference file | |
150 reference_file = read.table(args$ref_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) | |
151 print(colnames(reference_file)) | |
152 | |
153 # Extract other options | |
154 atlas = args$atlas | |
155 not_mapped_option = args$not_mapped | |
156 if (atlas=="normal") { | |
157 tissue = strsplit(args$tissue, ",")[[1]] | |
158 level = strsplit(args$level, ",")[[1]] | |
159 reliability = strsplit(args$reliability, ",")[[1]] | |
160 # Calculation | |
161 res = annot.HPAnorm(input, reference_file, tissue, level, reliability, not_mapped_option) | |
162 } | |
163 else if (atlas=="cancer") { | |
164 cancer = strsplit(args$cancer, ",")[[1]] | |
165 # Calculation | |
166 res = annot.HPAcancer(input, reference_file, cancer, not_mapped_option) | |
167 } | |
168 | |
169 # Write output | |
170 output = args$output | |
171 write.table(res, output, sep = "\t", quote = FALSE, row.names = FALSE) | |
172 } | |
173 | |
174 main() | |
175 | |
176 # Example commands | |
177 # Rscript sel_ann_hpa.R --input_type="file" --input="./test-data/ENSGid.txt" --ref_file="./pathology.tsv" --cancer="lung cancer,carcinoid" --not_mapped="true" --column_number="c1" --header="true" --output="test-data/ENSG_tissue_output_cancer.txt" | |
178 # Rscript sel_ann_hpa.R --input_type="file" --input="./test-data/ENSGid.txt" --ref_file="./normal_tissue.tsv" --tissue="lung" --level="Not detected,Medium,High,Low" --reliability="Approved,Supported,Uncertain" --column_number="c1" --header="true" --not_mapped="false" --output="./test-data/ENSG_tissue_output.txt" | |
179 # Rscript sel_ann_hpa.R --input_type="file" --input="./test-data/ENSG_no_not_match.txt" --ref_file="/Users/LinCun/Documents/ProteoRE/usecase1/normal_tissue.csv" --tissue="lung" --level="Not detected,Medium,High,Low" --reliability="Approved,Supportive,Uncertain" --column_number="c1" --header="true" --output="./test-data/ENSG_tissue_output2.txt" |