view expression_calculator/expression.r @ 0:27ebc30a2024 draft default tip

Uploaded
author daumsoft
date Wed, 18 Apr 2018 03:35:12 -0400
parents
children
line wrap: on
line source

args <- commandArgs(TRUE)

if (length(args)!=1) {
  stop("USAGE: Rscript  expression.r  htseq-count.out(mapping count)", call.=FALSE)
} 


htseq_count.result=args[1]
expression.out<-paste("sample",".txt",sep="")
expression.out<-paste("./out_tmp/",expression.out,sep="")
#gene_region.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt";
#gene_symbol.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt";
gene_region.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt";
gene_symbol.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt";

gene_region<-NULL
gene_len<-NULL
htseq_count<-NULL
expression<-NULL
expression_FPKM<-NULL
expression_FPKM_sort<-NULL
expression_FPKM_sort.tmp<-NULL
Quartile75<-NULL
Quartile75.tmp<-NULL
expression_FPKM_UQ<-NULL
expression_FPKM_UQ.tmp<-NULL
expression.tmp<-NULL
expression.tmp.1<-NULL
hgnc_swissprot<-NULL
gene_symbol<-NULL

gene_region<-read.delim(gene_region.dat,header=F,sep="\t")
gene_len<-cbind(gene_region[1],gene_region[2])
htseq_count<-read.delim(htseq_count.result,header=F,sep="\t")
colnames(gene_len)<- c("GeneName","Len")
colnames(htseq_count)<- c("GeneName","Count")  
htseq_count<-htseq_count[ !grepl("__", htseq_count$GeneName) ,]
total_count<- sum(htseq_count$Count)
oneB<-10^9
expression <- merge(gene_len,htseq_count,by="GeneName")

expression$FPKM<-0
expression[,2:4] <- (sapply(expression[,2:4], as.double))
expression$FPKM<- (oneB*expression$Count)/(total_count*expression$Len)

expression$TPM<-0
expression[,2:5] <- (sapply(expression[,2:5], as.double))
expression$TPM<-(expression$FPKM / sum(expression$FPKM) ) * (10^6)      

expression$FPKM_UQ<-0
expression[,2:6] <- (sapply(expression[,2:6], as.double))
expression_FPKM<-cbind(expression[1],expression[4])
expression_FPKM[,2] <- (sapply(expression_FPKM[,2], as.double))
#expression_FPKM_sort.tmp<-expression_FPKM[which(expression_FPKM$FPKM > 0),] 
expression_FPKM_sort.tmp<-expression_FPKM
expression_FPKM_sort<-expression_FPKM_sort.tmp[order(expression_FPKM_sort.tmp$FPKM) , ]

Quartile75.tmp<-t(quantile(expression_FPKM_sort$FPKM))
Quartile75<-Quartile75.tmp[4]

#expression_FPKM_UQ.tmp<-expression_FPKM[2]/Quartile75
expression_FPKM_UQ.tmp<-(oneB*expression$Count)/(Quartile75*expression$Len)
expression_FPKM_UQ<-cbind(expression_FPKM[1],expression_FPKM_UQ.tmp)
colnames(expression_FPKM_UQ)<-c("GeneName","FPKM_UQ")

expression$FPKM_UQ<-expression_FPKM_UQ$FPKM_UQ

colnames(expression) <- c("ENSEMBL_GENE_ID","GENE_LENGTH","HTSEQ_COUNT","FPKM","TPM","FPKM_UQ")
colnames(gene_region)<-c("ENSEMBL_GENE_ID","GENE_LENGTH","CHROMOSOME","START","END","STRAND")


expression.tmp<-NULL
expression.tmp<-merge(expression,gene_region[-2],by="ENSEMBL_GENE_ID")
expression<-NULL
expression<-expression.tmp

gene_symbol<-read.delim(gene_symbol.dat,header=T,sep="\t")
ensembl_gene_id.split<-strsplit(as.character(expression$ENSEMBL_GENE_ID),'.',fixed=TRUE)
ensembl_gene_id.split.1.1<-as.data.frame(unlist(ensembl_gene_id.split)[2*(1:length(as.character(expression$ENSEMBL_GENE_ID)))-1])
colnames(ensembl_gene_id.split.1.1)<-c("ENSEMBL_GENE_ID2")
ensembl_gene_id.split.1<-cbind(expression[1],ensembl_gene_id.split.1.1)

expression.tmp.0<-NULL
expression.tmp.0<-merge(ensembl_gene_id.split.1,expression,by="ENSEMBL_GENE_ID")
expression.tmp.1<-NULL
expression.tmp.1<-merge(gene_symbol,expression.tmp.0,by="ENSEMBL_GENE_ID2")
expression.tmp.2<-expression.tmp.1[-1]
expression.tmp.3<-cbind(expression.tmp.2[2],expression.tmp.2[-2])
expression<-NULL
expression<-expression.tmp.3

write.table(expression, expression.out, sep="\t", quote=F, row.names=F, col.names=T)