Mercurial > repos > daumsoft > htseq_count_normalized2
changeset 0:4f85170b4fd3 draft default tip
Uploaded
author | daumsoft |
---|---|
date | Wed, 18 Apr 2018 04:24:45 -0400 |
parents | |
children | |
files | expression_calculator/expression.r expression_calculator/expression.sh expression_calculator/expression_run.sh expression_calculator/htseq-count expression_calculator/htseq_count_normalized_wrapper.xml expression_calculator/readme.txt expression_calculator/samtools |
diffstat | 7 files changed, 192 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/expression_calculator/expression.r Wed Apr 18 04:24:45 2018 -0400 @@ -0,0 +1,93 @@ +args <- commandArgs(TRUE) + +if (length(args)!=1) { + stop("USAGE: Rscript expression.r htseq-count.out(mapping count)", call.=FALSE) +} + + +htseq_count.result=args[1] +expression.out<-paste("sample",".txt",sep="") +expression.out<-paste("./out_tmp/",expression.out,sep="") +#gene_region.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt"; +#gene_symbol.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt"; +gene_region.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt"; +gene_symbol.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt"; + +gene_region<-NULL +gene_len<-NULL +htseq_count<-NULL +expression<-NULL +expression_FPKM<-NULL +expression_FPKM_sort<-NULL +expression_FPKM_sort.tmp<-NULL +Quartile75<-NULL +Quartile75.tmp<-NULL +expression_FPKM_UQ<-NULL +expression_FPKM_UQ.tmp<-NULL +expression.tmp<-NULL +expression.tmp.1<-NULL +hgnc_swissprot<-NULL +gene_symbol<-NULL + +gene_region<-read.delim(gene_region.dat,header=F,sep="\t") +gene_len<-cbind(gene_region[1],gene_region[2]) +htseq_count<-read.delim(htseq_count.result,header=F,sep="\t") +colnames(gene_len)<- c("GeneName","Len") +colnames(htseq_count)<- c("GeneName","Count") +htseq_count<-htseq_count[ !grepl("__", htseq_count$GeneName) ,] +total_count<- sum(htseq_count$Count) +oneB<-10^9 +expression <- merge(gene_len,htseq_count,by="GeneName") + +expression$FPKM<-0 +expression[,2:4] <- (sapply(expression[,2:4], as.double)) +expression$FPKM<- (oneB*expression$Count)/(total_count*expression$Len) + +expression$TPM<-0 +expression[,2:5] <- (sapply(expression[,2:5], as.double)) +expression$TPM<-(expression$FPKM / sum(expression$FPKM) ) * (10^6) + +expression$FPKM_UQ<-0 +expression[,2:6] <- (sapply(expression[,2:6], as.double)) +expression_FPKM<-cbind(expression[1],expression[4]) +expression_FPKM[,2] <- (sapply(expression_FPKM[,2], as.double)) +#expression_FPKM_sort.tmp<-expression_FPKM[which(expression_FPKM$FPKM > 0),] +expression_FPKM_sort.tmp<-expression_FPKM +expression_FPKM_sort<-expression_FPKM_sort.tmp[order(expression_FPKM_sort.tmp$FPKM) , ] + +Quartile75.tmp<-t(quantile(expression_FPKM_sort$FPKM)) +Quartile75<-Quartile75.tmp[4] + +#expression_FPKM_UQ.tmp<-expression_FPKM[2]/Quartile75 +expression_FPKM_UQ.tmp<-(oneB*expression$Count)/(Quartile75*expression$Len) +expression_FPKM_UQ<-cbind(expression_FPKM[1],expression_FPKM_UQ.tmp) +colnames(expression_FPKM_UQ)<-c("GeneName","FPKM_UQ") + +expression$FPKM_UQ<-expression_FPKM_UQ$FPKM_UQ + +colnames(expression) <- c("ENSEMBL_GENE_ID","GENE_LENGTH","HTSEQ_COUNT","FPKM","TPM","FPKM_UQ") +colnames(gene_region)<-c("ENSEMBL_GENE_ID","GENE_LENGTH","CHROMOSOME","START","END","STRAND") + + +expression.tmp<-NULL +expression.tmp<-merge(expression,gene_region[-2],by="ENSEMBL_GENE_ID") +expression<-NULL +expression<-expression.tmp + +gene_symbol<-read.delim(gene_symbol.dat,header=T,sep="\t") +ensembl_gene_id.split<-strsplit(as.character(expression$ENSEMBL_GENE_ID),'.',fixed=TRUE) +ensembl_gene_id.split.1.1<-as.data.frame(unlist(ensembl_gene_id.split)[2*(1:length(as.character(expression$ENSEMBL_GENE_ID)))-1]) +colnames(ensembl_gene_id.split.1.1)<-c("ENSEMBL_GENE_ID2") +ensembl_gene_id.split.1<-cbind(expression[1],ensembl_gene_id.split.1.1) + +expression.tmp.0<-NULL +expression.tmp.0<-merge(ensembl_gene_id.split.1,expression,by="ENSEMBL_GENE_ID") +expression.tmp.1<-NULL +expression.tmp.1<-merge(gene_symbol,expression.tmp.0,by="ENSEMBL_GENE_ID2") +expression.tmp.2<-expression.tmp.1[-1] +expression.tmp.3<-cbind(expression.tmp.2[2],expression.tmp.2[-2]) +expression<-NULL +expression<-expression.tmp.3 + +write.table(expression, expression.out, sep="\t", quote=F, row.names=F, col.names=T) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/expression_calculator/expression.sh Wed Apr 18 04:24:45 2018 -0400 @@ -0,0 +1,37 @@ +#!/bin/bash + +if [ "$#" -ne 1 ]; then + echo "[usage:] expression.sh bam.file" + exit 1; +fi + +INPUT_BAM=$1 +SAMPLE_ID="sample" +SAMTOOLS=$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/samtools +HTSEQ_COUNT=$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/htseq-count +GTF=$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gencode.v22.annotation.gtf +EXPRESSION_R=$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/expression.r + +HTSEQ_COUNT_OUT=./$SAMPLE_ID".htseq-count.out" +EXPRESSION_OUT=./out +EXPRESSION_OUT_TMP=./out_tmp + +rm -rf $EXPRESSION_OUT +rm -rf $EXPRESSION_OUT_TMP + +mkdir $EXPRESSION_OUT +mkdir $EXPRESSION_OUT_TMP + + +$SAMTOOLS view -F 4 $INPUT_BAM | +$HTSEQ_COUNT \ +-m intersection-nonempty \ +-i gene_id \ +-r pos \ +-s no \ +- $GTF > $HTSEQ_COUNT_OUT + +Rscript $EXPRESSION_R $HTSEQ_COUNT_OUT + +mv $EXPRESSION_OUT_TMP/$SAMPLE_ID".txt" $EXPRESSION_OUT/ +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/expression_calculator/expression_run.sh Wed Apr 18 04:24:45 2018 -0400 @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ "$#" -ne 1 ]; then + echo "[usage:] expression_run.sh bam.file" + exit 1; +fi + +INPUT_BAM=$1 + +$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/expression.sh $INPUT_BAM > log.out 2>&1 +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/expression_calculator/htseq-count Wed Apr 18 04:24:45 2018 -0400 @@ -0,0 +1,5 @@ +#!/storage/home/galaxy/package/Python-2.7.6/bin/python + +import HTSeq.scripts.count + +HTSeq.scripts.count.main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/expression_calculator/htseq_count_normalized_wrapper.xml Wed Apr 18 04:24:45 2018 -0400 @@ -0,0 +1,43 @@ +<tool id="daumsoft_wts_htseq_count_normalization" name="HTSeq-Count-Normalized" version="HTSeq-0.6.1p1"> + <description>Count, TPM, FPKM, FPKM_UQ</description> + <stdio> + <regex match="Exception|Error" source="both" level="fatal" description="Tool execution failed"/> + </stdio> + <version_command></version_command> + <command> + \$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/expression_run.sh ${bam} + </command> + <inputs> + <param format="bam" name="bam" type="data" label="Aligned Read BAM" help="" /> + </inputs> + <outputs> + <data format="txt" name="expression_out" label="${tool.name} on ${on_string}: Gene Expression (HTSeq count,tpm,fpkm,fpkm_uq)" from_work_dir="out/sample.txt"/> + </outputs> + <tests><test><output name="expression_out"/></test></tests> + <help> +For more detailed manual, please visit The GDC mRNA quantification analysis pipeline website: + +https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/ + +mRNA Expression Workflow + +Following alignment, BAM files are processed through the RNA Expression Workflow. + +First the BAM files are filtered for reads that were aligned to protein-coding genes +using the samtools view function. + +The reads mapped to each protein-coding gene are enumerated using HT-Seq count. +The number of reads mapped to each gene (raw count), gene length and +the total number of reads mapped to protein-coding genes are then used +to calculate normalized expression levels using FPKM and FPKM-UQ (upper quartile normalization). +Expression values are provided in a tab-delimited format. +GENCODE v22 was used for gene annotation. + +Add: Custom Script (tpm, fpkm, fpkm_uq) + +Input : Aligned Reads BAM + +Output: Gene Expression (HTSeq count, TPM, FPKM, FPKM_UQ) TXT + </help> + <citations></citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/expression_calculator/readme.txt Wed Apr 18 04:24:45 2018 -0400 @@ -0,0 +1,3 @@ +# download https://api.gdc.cancer.gov/data/fe1750e4-fc2d-4a2c-ba21-5fc969a24f27 +# for gencode.v22.annotation.gtf +wget https://api.gdc.cancer.gov/data/fe1750e4-fc2d-4a2c-ba21-5fc969a24f27