annotate expression_calculator/expression.r @ 0:27ebc30a2024 draft default tip

Uploaded
author daumsoft
date Wed, 18 Apr 2018 03:35:12 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
1 args <- commandArgs(TRUE)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
2
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
3 if (length(args)!=1) {
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
4 stop("USAGE: Rscript expression.r htseq-count.out(mapping count)", call.=FALSE)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
5 }
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
6
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
7
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
8 htseq_count.result=args[1]
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
9 expression.out<-paste("sample",".txt",sep="")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
10 expression.out<-paste("./out_tmp/",expression.out,sep="")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
11 #gene_region.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt";
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
12 #gene_symbol.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt";
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
13 gene_region.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt";
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
14 gene_symbol.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt";
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
15
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
16 gene_region<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
17 gene_len<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
18 htseq_count<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
19 expression<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
20 expression_FPKM<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
21 expression_FPKM_sort<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
22 expression_FPKM_sort.tmp<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
23 Quartile75<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
24 Quartile75.tmp<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
25 expression_FPKM_UQ<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
26 expression_FPKM_UQ.tmp<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
27 expression.tmp<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
28 expression.tmp.1<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
29 hgnc_swissprot<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
30 gene_symbol<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
31
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
32 gene_region<-read.delim(gene_region.dat,header=F,sep="\t")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
33 gene_len<-cbind(gene_region[1],gene_region[2])
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
34 htseq_count<-read.delim(htseq_count.result,header=F,sep="\t")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
35 colnames(gene_len)<- c("GeneName","Len")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
36 colnames(htseq_count)<- c("GeneName","Count")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
37 htseq_count<-htseq_count[ !grepl("__", htseq_count$GeneName) ,]
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
38 total_count<- sum(htseq_count$Count)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
39 oneB<-10^9
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
40 expression <- merge(gene_len,htseq_count,by="GeneName")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
41
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
42 expression$FPKM<-0
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
43 expression[,2:4] <- (sapply(expression[,2:4], as.double))
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
44 expression$FPKM<- (oneB*expression$Count)/(total_count*expression$Len)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
45
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
46 expression$TPM<-0
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
47 expression[,2:5] <- (sapply(expression[,2:5], as.double))
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
48 expression$TPM<-(expression$FPKM / sum(expression$FPKM) ) * (10^6)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
49
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
50 expression$FPKM_UQ<-0
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
51 expression[,2:6] <- (sapply(expression[,2:6], as.double))
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
52 expression_FPKM<-cbind(expression[1],expression[4])
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
53 expression_FPKM[,2] <- (sapply(expression_FPKM[,2], as.double))
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
54 #expression_FPKM_sort.tmp<-expression_FPKM[which(expression_FPKM$FPKM > 0),]
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
55 expression_FPKM_sort.tmp<-expression_FPKM
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
56 expression_FPKM_sort<-expression_FPKM_sort.tmp[order(expression_FPKM_sort.tmp$FPKM) , ]
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
57
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
58 Quartile75.tmp<-t(quantile(expression_FPKM_sort$FPKM))
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
59 Quartile75<-Quartile75.tmp[4]
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
60
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
61 #expression_FPKM_UQ.tmp<-expression_FPKM[2]/Quartile75
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
62 expression_FPKM_UQ.tmp<-(oneB*expression$Count)/(Quartile75*expression$Len)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
63 expression_FPKM_UQ<-cbind(expression_FPKM[1],expression_FPKM_UQ.tmp)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
64 colnames(expression_FPKM_UQ)<-c("GeneName","FPKM_UQ")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
65
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
66 expression$FPKM_UQ<-expression_FPKM_UQ$FPKM_UQ
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
67
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
68 colnames(expression) <- c("ENSEMBL_GENE_ID","GENE_LENGTH","HTSEQ_COUNT","FPKM","TPM","FPKM_UQ")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
69 colnames(gene_region)<-c("ENSEMBL_GENE_ID","GENE_LENGTH","CHROMOSOME","START","END","STRAND")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
70
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
71
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
72 expression.tmp<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
73 expression.tmp<-merge(expression,gene_region[-2],by="ENSEMBL_GENE_ID")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
74 expression<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
75 expression<-expression.tmp
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
76
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
77 gene_symbol<-read.delim(gene_symbol.dat,header=T,sep="\t")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
78 ensembl_gene_id.split<-strsplit(as.character(expression$ENSEMBL_GENE_ID),'.',fixed=TRUE)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
79 ensembl_gene_id.split.1.1<-as.data.frame(unlist(ensembl_gene_id.split)[2*(1:length(as.character(expression$ENSEMBL_GENE_ID)))-1])
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
80 colnames(ensembl_gene_id.split.1.1)<-c("ENSEMBL_GENE_ID2")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
81 ensembl_gene_id.split.1<-cbind(expression[1],ensembl_gene_id.split.1.1)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
82
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
83 expression.tmp.0<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
84 expression.tmp.0<-merge(ensembl_gene_id.split.1,expression,by="ENSEMBL_GENE_ID")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
85 expression.tmp.1<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
86 expression.tmp.1<-merge(gene_symbol,expression.tmp.0,by="ENSEMBL_GENE_ID2")
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
87 expression.tmp.2<-expression.tmp.1[-1]
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
88 expression.tmp.3<-cbind(expression.tmp.2[2],expression.tmp.2[-2])
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
89 expression<-NULL
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
90 expression<-expression.tmp.3
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
91
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
92 write.table(expression, expression.out, sep="\t", quote=F, row.names=F, col.names=T)
27ebc30a2024 Uploaded
daumsoft
parents:
diff changeset
93