Mercurial > repos > daumsoft > htseq_count_normalized
view expression_calculator/expression.r @ 0:27ebc30a2024 draft default tip
Uploaded
author | daumsoft |
---|---|
date | Wed, 18 Apr 2018 03:35:12 -0400 |
parents | |
children |
line wrap: on
line source
args <- commandArgs(TRUE) if (length(args)!=1) { stop("USAGE: Rscript expression.r htseq-count.out(mapping count)", call.=FALSE) } htseq_count.result=args[1] expression.out<-paste("sample",".txt",sep="") expression.out<-paste("./out_tmp/",expression.out,sep="") #gene_region.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt"; #gene_symbol.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt"; gene_region.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt"; gene_symbol.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt"; gene_region<-NULL gene_len<-NULL htseq_count<-NULL expression<-NULL expression_FPKM<-NULL expression_FPKM_sort<-NULL expression_FPKM_sort.tmp<-NULL Quartile75<-NULL Quartile75.tmp<-NULL expression_FPKM_UQ<-NULL expression_FPKM_UQ.tmp<-NULL expression.tmp<-NULL expression.tmp.1<-NULL hgnc_swissprot<-NULL gene_symbol<-NULL gene_region<-read.delim(gene_region.dat,header=F,sep="\t") gene_len<-cbind(gene_region[1],gene_region[2]) htseq_count<-read.delim(htseq_count.result,header=F,sep="\t") colnames(gene_len)<- c("GeneName","Len") colnames(htseq_count)<- c("GeneName","Count") htseq_count<-htseq_count[ !grepl("__", htseq_count$GeneName) ,] total_count<- sum(htseq_count$Count) oneB<-10^9 expression <- merge(gene_len,htseq_count,by="GeneName") expression$FPKM<-0 expression[,2:4] <- (sapply(expression[,2:4], as.double)) expression$FPKM<- (oneB*expression$Count)/(total_count*expression$Len) expression$TPM<-0 expression[,2:5] <- (sapply(expression[,2:5], as.double)) expression$TPM<-(expression$FPKM / sum(expression$FPKM) ) * (10^6) expression$FPKM_UQ<-0 expression[,2:6] <- (sapply(expression[,2:6], as.double)) expression_FPKM<-cbind(expression[1],expression[4]) expression_FPKM[,2] <- (sapply(expression_FPKM[,2], as.double)) #expression_FPKM_sort.tmp<-expression_FPKM[which(expression_FPKM$FPKM > 0),] expression_FPKM_sort.tmp<-expression_FPKM expression_FPKM_sort<-expression_FPKM_sort.tmp[order(expression_FPKM_sort.tmp$FPKM) , ] Quartile75.tmp<-t(quantile(expression_FPKM_sort$FPKM)) Quartile75<-Quartile75.tmp[4] #expression_FPKM_UQ.tmp<-expression_FPKM[2]/Quartile75 expression_FPKM_UQ.tmp<-(oneB*expression$Count)/(Quartile75*expression$Len) expression_FPKM_UQ<-cbind(expression_FPKM[1],expression_FPKM_UQ.tmp) colnames(expression_FPKM_UQ)<-c("GeneName","FPKM_UQ") expression$FPKM_UQ<-expression_FPKM_UQ$FPKM_UQ colnames(expression) <- c("ENSEMBL_GENE_ID","GENE_LENGTH","HTSEQ_COUNT","FPKM","TPM","FPKM_UQ") colnames(gene_region)<-c("ENSEMBL_GENE_ID","GENE_LENGTH","CHROMOSOME","START","END","STRAND") expression.tmp<-NULL expression.tmp<-merge(expression,gene_region[-2],by="ENSEMBL_GENE_ID") expression<-NULL expression<-expression.tmp gene_symbol<-read.delim(gene_symbol.dat,header=T,sep="\t") ensembl_gene_id.split<-strsplit(as.character(expression$ENSEMBL_GENE_ID),'.',fixed=TRUE) ensembl_gene_id.split.1.1<-as.data.frame(unlist(ensembl_gene_id.split)[2*(1:length(as.character(expression$ENSEMBL_GENE_ID)))-1]) colnames(ensembl_gene_id.split.1.1)<-c("ENSEMBL_GENE_ID2") ensembl_gene_id.split.1<-cbind(expression[1],ensembl_gene_id.split.1.1) expression.tmp.0<-NULL expression.tmp.0<-merge(ensembl_gene_id.split.1,expression,by="ENSEMBL_GENE_ID") expression.tmp.1<-NULL expression.tmp.1<-merge(gene_symbol,expression.tmp.0,by="ENSEMBL_GENE_ID2") expression.tmp.2<-expression.tmp.1[-1] expression.tmp.3<-cbind(expression.tmp.2[2],expression.tmp.2[-2]) expression<-NULL expression<-expression.tmp.3 write.table(expression, expression.out, sep="\t", quote=F, row.names=F, col.names=T)