Mercurial > repos > daumsoft > htseq_count_normalized2
diff expression_calculator/expression.r @ 0:4f85170b4fd3 draft default tip
Uploaded
author | daumsoft |
---|---|
date | Wed, 18 Apr 2018 04:24:45 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/expression_calculator/expression.r Wed Apr 18 04:24:45 2018 -0400 @@ -0,0 +1,93 @@ +args <- commandArgs(TRUE) + +if (length(args)!=1) { + stop("USAGE: Rscript expression.r htseq-count.out(mapping count)", call.=FALSE) +} + + +htseq_count.result=args[1] +expression.out<-paste("sample",".txt",sep="") +expression.out<-paste("./out_tmp/",expression.out,sep="") +#gene_region.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt"; +#gene_symbol.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt"; +gene_region.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt"; +gene_symbol.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt"; + +gene_region<-NULL +gene_len<-NULL +htseq_count<-NULL +expression<-NULL +expression_FPKM<-NULL +expression_FPKM_sort<-NULL +expression_FPKM_sort.tmp<-NULL +Quartile75<-NULL +Quartile75.tmp<-NULL +expression_FPKM_UQ<-NULL +expression_FPKM_UQ.tmp<-NULL +expression.tmp<-NULL +expression.tmp.1<-NULL +hgnc_swissprot<-NULL +gene_symbol<-NULL + +gene_region<-read.delim(gene_region.dat,header=F,sep="\t") +gene_len<-cbind(gene_region[1],gene_region[2]) +htseq_count<-read.delim(htseq_count.result,header=F,sep="\t") +colnames(gene_len)<- c("GeneName","Len") +colnames(htseq_count)<- c("GeneName","Count") +htseq_count<-htseq_count[ !grepl("__", htseq_count$GeneName) ,] +total_count<- sum(htseq_count$Count) +oneB<-10^9 +expression <- merge(gene_len,htseq_count,by="GeneName") + +expression$FPKM<-0 +expression[,2:4] <- (sapply(expression[,2:4], as.double)) +expression$FPKM<- (oneB*expression$Count)/(total_count*expression$Len) + +expression$TPM<-0 +expression[,2:5] <- (sapply(expression[,2:5], as.double)) +expression$TPM<-(expression$FPKM / sum(expression$FPKM) ) * (10^6) + +expression$FPKM_UQ<-0 +expression[,2:6] <- (sapply(expression[,2:6], as.double)) +expression_FPKM<-cbind(expression[1],expression[4]) +expression_FPKM[,2] <- (sapply(expression_FPKM[,2], as.double)) +#expression_FPKM_sort.tmp<-expression_FPKM[which(expression_FPKM$FPKM > 0),] +expression_FPKM_sort.tmp<-expression_FPKM +expression_FPKM_sort<-expression_FPKM_sort.tmp[order(expression_FPKM_sort.tmp$FPKM) , ] + +Quartile75.tmp<-t(quantile(expression_FPKM_sort$FPKM)) +Quartile75<-Quartile75.tmp[4] + +#expression_FPKM_UQ.tmp<-expression_FPKM[2]/Quartile75 +expression_FPKM_UQ.tmp<-(oneB*expression$Count)/(Quartile75*expression$Len) +expression_FPKM_UQ<-cbind(expression_FPKM[1],expression_FPKM_UQ.tmp) +colnames(expression_FPKM_UQ)<-c("GeneName","FPKM_UQ") + +expression$FPKM_UQ<-expression_FPKM_UQ$FPKM_UQ + +colnames(expression) <- c("ENSEMBL_GENE_ID","GENE_LENGTH","HTSEQ_COUNT","FPKM","TPM","FPKM_UQ") +colnames(gene_region)<-c("ENSEMBL_GENE_ID","GENE_LENGTH","CHROMOSOME","START","END","STRAND") + + +expression.tmp<-NULL +expression.tmp<-merge(expression,gene_region[-2],by="ENSEMBL_GENE_ID") +expression<-NULL +expression<-expression.tmp + +gene_symbol<-read.delim(gene_symbol.dat,header=T,sep="\t") +ensembl_gene_id.split<-strsplit(as.character(expression$ENSEMBL_GENE_ID),'.',fixed=TRUE) +ensembl_gene_id.split.1.1<-as.data.frame(unlist(ensembl_gene_id.split)[2*(1:length(as.character(expression$ENSEMBL_GENE_ID)))-1]) +colnames(ensembl_gene_id.split.1.1)<-c("ENSEMBL_GENE_ID2") +ensembl_gene_id.split.1<-cbind(expression[1],ensembl_gene_id.split.1.1) + +expression.tmp.0<-NULL +expression.tmp.0<-merge(ensembl_gene_id.split.1,expression,by="ENSEMBL_GENE_ID") +expression.tmp.1<-NULL +expression.tmp.1<-merge(gene_symbol,expression.tmp.0,by="ENSEMBL_GENE_ID2") +expression.tmp.2<-expression.tmp.1[-1] +expression.tmp.3<-cbind(expression.tmp.2[2],expression.tmp.2[-2]) +expression<-NULL +expression<-expression.tmp.3 + +write.table(expression, expression.out, sep="\t", quote=F, row.names=F, col.names=T) +