Mercurial > repos > daumsoft > htseq_count_normalized2
comparison expression_calculator/expression.r @ 0:4f85170b4fd3 draft default tip
Uploaded
| author | daumsoft |
|---|---|
| date | Wed, 18 Apr 2018 04:24:45 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4f85170b4fd3 |
|---|---|
| 1 args <- commandArgs(TRUE) | |
| 2 | |
| 3 if (length(args)!=1) { | |
| 4 stop("USAGE: Rscript expression.r htseq-count.out(mapping count)", call.=FALSE) | |
| 5 } | |
| 6 | |
| 7 | |
| 8 htseq_count.result=args[1] | |
| 9 expression.out<-paste("sample",".txt",sep="") | |
| 10 expression.out<-paste("./out_tmp/",expression.out,sep="") | |
| 11 #gene_region.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt"; | |
| 12 #gene_symbol.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt"; | |
| 13 gene_region.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt"; | |
| 14 gene_symbol.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt"; | |
| 15 | |
| 16 gene_region<-NULL | |
| 17 gene_len<-NULL | |
| 18 htseq_count<-NULL | |
| 19 expression<-NULL | |
| 20 expression_FPKM<-NULL | |
| 21 expression_FPKM_sort<-NULL | |
| 22 expression_FPKM_sort.tmp<-NULL | |
| 23 Quartile75<-NULL | |
| 24 Quartile75.tmp<-NULL | |
| 25 expression_FPKM_UQ<-NULL | |
| 26 expression_FPKM_UQ.tmp<-NULL | |
| 27 expression.tmp<-NULL | |
| 28 expression.tmp.1<-NULL | |
| 29 hgnc_swissprot<-NULL | |
| 30 gene_symbol<-NULL | |
| 31 | |
| 32 gene_region<-read.delim(gene_region.dat,header=F,sep="\t") | |
| 33 gene_len<-cbind(gene_region[1],gene_region[2]) | |
| 34 htseq_count<-read.delim(htseq_count.result,header=F,sep="\t") | |
| 35 colnames(gene_len)<- c("GeneName","Len") | |
| 36 colnames(htseq_count)<- c("GeneName","Count") | |
| 37 htseq_count<-htseq_count[ !grepl("__", htseq_count$GeneName) ,] | |
| 38 total_count<- sum(htseq_count$Count) | |
| 39 oneB<-10^9 | |
| 40 expression <- merge(gene_len,htseq_count,by="GeneName") | |
| 41 | |
| 42 expression$FPKM<-0 | |
| 43 expression[,2:4] <- (sapply(expression[,2:4], as.double)) | |
| 44 expression$FPKM<- (oneB*expression$Count)/(total_count*expression$Len) | |
| 45 | |
| 46 expression$TPM<-0 | |
| 47 expression[,2:5] <- (sapply(expression[,2:5], as.double)) | |
| 48 expression$TPM<-(expression$FPKM / sum(expression$FPKM) ) * (10^6) | |
| 49 | |
| 50 expression$FPKM_UQ<-0 | |
| 51 expression[,2:6] <- (sapply(expression[,2:6], as.double)) | |
| 52 expression_FPKM<-cbind(expression[1],expression[4]) | |
| 53 expression_FPKM[,2] <- (sapply(expression_FPKM[,2], as.double)) | |
| 54 #expression_FPKM_sort.tmp<-expression_FPKM[which(expression_FPKM$FPKM > 0),] | |
| 55 expression_FPKM_sort.tmp<-expression_FPKM | |
| 56 expression_FPKM_sort<-expression_FPKM_sort.tmp[order(expression_FPKM_sort.tmp$FPKM) , ] | |
| 57 | |
| 58 Quartile75.tmp<-t(quantile(expression_FPKM_sort$FPKM)) | |
| 59 Quartile75<-Quartile75.tmp[4] | |
| 60 | |
| 61 #expression_FPKM_UQ.tmp<-expression_FPKM[2]/Quartile75 | |
| 62 expression_FPKM_UQ.tmp<-(oneB*expression$Count)/(Quartile75*expression$Len) | |
| 63 expression_FPKM_UQ<-cbind(expression_FPKM[1],expression_FPKM_UQ.tmp) | |
| 64 colnames(expression_FPKM_UQ)<-c("GeneName","FPKM_UQ") | |
| 65 | |
| 66 expression$FPKM_UQ<-expression_FPKM_UQ$FPKM_UQ | |
| 67 | |
| 68 colnames(expression) <- c("ENSEMBL_GENE_ID","GENE_LENGTH","HTSEQ_COUNT","FPKM","TPM","FPKM_UQ") | |
| 69 colnames(gene_region)<-c("ENSEMBL_GENE_ID","GENE_LENGTH","CHROMOSOME","START","END","STRAND") | |
| 70 | |
| 71 | |
| 72 expression.tmp<-NULL | |
| 73 expression.tmp<-merge(expression,gene_region[-2],by="ENSEMBL_GENE_ID") | |
| 74 expression<-NULL | |
| 75 expression<-expression.tmp | |
| 76 | |
| 77 gene_symbol<-read.delim(gene_symbol.dat,header=T,sep="\t") | |
| 78 ensembl_gene_id.split<-strsplit(as.character(expression$ENSEMBL_GENE_ID),'.',fixed=TRUE) | |
| 79 ensembl_gene_id.split.1.1<-as.data.frame(unlist(ensembl_gene_id.split)[2*(1:length(as.character(expression$ENSEMBL_GENE_ID)))-1]) | |
| 80 colnames(ensembl_gene_id.split.1.1)<-c("ENSEMBL_GENE_ID2") | |
| 81 ensembl_gene_id.split.1<-cbind(expression[1],ensembl_gene_id.split.1.1) | |
| 82 | |
| 83 expression.tmp.0<-NULL | |
| 84 expression.tmp.0<-merge(ensembl_gene_id.split.1,expression,by="ENSEMBL_GENE_ID") | |
| 85 expression.tmp.1<-NULL | |
| 86 expression.tmp.1<-merge(gene_symbol,expression.tmp.0,by="ENSEMBL_GENE_ID2") | |
| 87 expression.tmp.2<-expression.tmp.1[-1] | |
| 88 expression.tmp.3<-cbind(expression.tmp.2[2],expression.tmp.2[-2]) | |
| 89 expression<-NULL | |
| 90 expression<-expression.tmp.3 | |
| 91 | |
| 92 write.table(expression, expression.out, sep="\t", quote=F, row.names=F, col.names=T) | |
| 93 |
