Mercurial > repos > daumsoft > htseq_count_normalized2

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/expression_calculator/expression.r	Wed Apr 18 04:24:45 2018 -0400
@@ -0,0 +1,93 @@
+args <- commandArgs(TRUE)
+
+if (length(args)!=1) {
+  stop("USAGE: Rscript  expression.r  htseq-count.out(mapping count)", call.=FALSE)
+}
+
+
+htseq_count.result=args[1]
+expression.out<-paste("sample",".txt",sep="")
+expression.out<-paste("./out_tmp/",expression.out,sep="")
+#gene_region.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt";
+#gene_symbol.dat="$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt";
+gene_region.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_hg38_region.txt";
+gene_symbol.dat="/storage/home/galaxy/package/DAUMSOFT/RNA-seq/Normalization/gene_symbol_20170220.txt";
+
+gene_region<-NULL
+gene_len<-NULL
+htseq_count<-NULL
+expression<-NULL
+expression_FPKM<-NULL
+expression_FPKM_sort<-NULL
+expression_FPKM_sort.tmp<-NULL
+Quartile75<-NULL
+Quartile75.tmp<-NULL
+expression_FPKM_UQ<-NULL
+expression_FPKM_UQ.tmp<-NULL
+expression.tmp<-NULL
+expression.tmp.1<-NULL
+hgnc_swissprot<-NULL
+gene_symbol<-NULL
+
+gene_region<-read.delim(gene_region.dat,header=F,sep="\t")
+gene_len<-cbind(gene_region[1],gene_region[2])
+htseq_count<-read.delim(htseq_count.result,header=F,sep="\t")
+colnames(gene_len)<- c("GeneName","Len")
+colnames(htseq_count)<- c("GeneName","Count")
+htseq_count<-htseq_count[ !grepl("__", htseq_count$GeneName) ,]
+total_count<- sum(htseq_count$Count)
+oneB<-10^9
+expression <- merge(gene_len,htseq_count,by="GeneName")
+
+expression$FPKM<-0
+expression[,2:4] <- (sapply(expression[,2:4], as.double))
+expression$FPKM<- (oneB*expression$Count)/(total_count*expression$Len)
+
+expression$TPM<-0
+expression[,2:5] <- (sapply(expression[,2:5], as.double))
+expression$TPM<-(expression$FPKM / sum(expression$FPKM) ) * (10^6)
+
+expression$FPKM_UQ<-0
+expression[,2:6] <- (sapply(expression[,2:6], as.double))
+expression_FPKM<-cbind(expression[1],expression[4])
+expression_FPKM[,2] <- (sapply(expression_FPKM[,2], as.double))
+#expression_FPKM_sort.tmp<-expression_FPKM[which(expression_FPKM$FPKM > 0),]
+expression_FPKM_sort.tmp<-expression_FPKM
+expression_FPKM_sort<-expression_FPKM_sort.tmp[order(expression_FPKM_sort.tmp$FPKM) , ]
+
+Quartile75.tmp<-t(quantile(expression_FPKM_sort$FPKM))
+Quartile75<-Quartile75.tmp[4]
+
+#expression_FPKM_UQ.tmp<-expression_FPKM[2]/Quartile75
+expression_FPKM_UQ.tmp<-(oneB*expression$Count)/(Quartile75*expression$Len)
+expression_FPKM_UQ<-cbind(expression_FPKM[1],expression_FPKM_UQ.tmp)
+colnames(expression_FPKM_UQ)<-c("GeneName","FPKM_UQ")
+
+expression$FPKM_UQ<-expression_FPKM_UQ$FPKM_UQ
+
+colnames(expression) <- c("ENSEMBL_GENE_ID","GENE_LENGTH","HTSEQ_COUNT","FPKM","TPM","FPKM_UQ")
+colnames(gene_region)<-c("ENSEMBL_GENE_ID","GENE_LENGTH","CHROMOSOME","START","END","STRAND")
+
+
+expression.tmp<-NULL
+expression.tmp<-merge(expression,gene_region[-2],by="ENSEMBL_GENE_ID")
+expression<-NULL
+expression<-expression.tmp
+
+gene_symbol<-read.delim(gene_symbol.dat,header=T,sep="\t")
+ensembl_gene_id.split<-strsplit(as.character(expression$ENSEMBL_GENE_ID),'.',fixed=TRUE)
+ensembl_gene_id.split.1.1<-as.data.frame(unlist(ensembl_gene_id.split)[2*(1:length(as.character(expression$ENSEMBL_GENE_ID)))-1])
+colnames(ensembl_gene_id.split.1.1)<-c("ENSEMBL_GENE_ID2")
+ensembl_gene_id.split.1<-cbind(expression[1],ensembl_gene_id.split.1.1)
+
+expression.tmp.0<-NULL
+expression.tmp.0<-merge(ensembl_gene_id.split.1,expression,by="ENSEMBL_GENE_ID")
+expression.tmp.1<-NULL
+expression.tmp.1<-merge(gene_symbol,expression.tmp.0,by="ENSEMBL_GENE_ID2")
+expression.tmp.2<-expression.tmp.1[-1]
+expression.tmp.3<-cbind(expression.tmp.2[2],expression.tmp.2[-2])
+expression<-NULL
+expression<-expression.tmp.3
+
+write.table(expression, expression.out, sep="\t", quote=F, row.names=F, col.names=T)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/expression_calculator/expression.sh	Wed Apr 18 04:24:45 2018 -0400
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+if [ "$#" -ne 1 ]; then
+ echo "[usage:] expression.sh bam.file"
+ exit 1;
+fi
+
+INPUT_BAM=$1
+SAMPLE_ID="sample"
+SAMTOOLS=$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/samtools
+HTSEQ_COUNT=$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/htseq-count
+GTF=$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/gencode.v22.annotation.gtf
+EXPRESSION_R=$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/expression.r
+
+HTSEQ_COUNT_OUT=./$SAMPLE_ID".htseq-count.out"
+EXPRESSION_OUT=./out
+EXPRESSION_OUT_TMP=./out_tmp
+
+rm -rf  $EXPRESSION_OUT
+rm -rf  $EXPRESSION_OUT_TMP
+
+mkdir $EXPRESSION_OUT
+mkdir $EXPRESSION_OUT_TMP
+
+
+$SAMTOOLS view -F 4 $INPUT_BAM |
+$HTSEQ_COUNT \
+-m intersection-nonempty \
+-i gene_id \
+-r pos \
+-s no \
+- $GTF   > $HTSEQ_COUNT_OUT
+
+Rscript $EXPRESSION_R  $HTSEQ_COUNT_OUT
+
+mv $EXPRESSION_OUT_TMP/$SAMPLE_ID".txt" $EXPRESSION_OUT/
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/expression_calculator/expression_run.sh	Wed Apr 18 04:24:45 2018 -0400
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if [ "$#" -ne 1 ]; then
+ echo "[usage:] expression_run.sh bam.file"
+ exit 1;
+fi
+
+INPUT_BAM=$1
+
+$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/expression.sh $INPUT_BAM > log.out 2>&1
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/expression_calculator/htseq-count	Wed Apr 18 04:24:45 2018 -0400
@@ -0,0 +1,5 @@
+#!/storage/home/galaxy/package/Python-2.7.6/bin/python
+
+import HTSeq.scripts.count
+
+HTSeq.scripts.count.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/expression_calculator/htseq_count_normalized_wrapper.xml	Wed Apr 18 04:24:45 2018 -0400
@@ -0,0 +1,43 @@
+<tool id="daumsoft_wts_htseq_count_normalization" name="HTSeq-Count-Normalized" version="HTSeq-0.6.1p1">
+	<description>Count, TPM, FPKM, FPKM_UQ</description>
+        <stdio>
+         <regex match="Exception|Error" source="both" level="fatal" description="Tool execution failed"/>
+        </stdio>
+	<version_command></version_command>
+	<command>
+          \$GALAXY_HOME/package/DAUMSOFT/RNA-seq/Normalization/expression_run.sh ${bam}
+	</command>
+	<inputs>
+	 <param format="bam" name="bam" type="data" label="Aligned Read	BAM" help="" />
+	</inputs>
+	<outputs>
+	 <data format="txt" name="expression_out" label="${tool.name} on ${on_string}: Gene Expression (HTSeq count,tpm,fpkm,fpkm_uq)" from_work_dir="out/sample.txt"/>
+	</outputs>
+        <tests><test><output name="expression_out"/></test></tests>
+        <help>
+For more detailed manual, please visit The GDC mRNA quantification analysis pipeline website:
+
+https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/
+
+mRNA Expression Workflow
+
+Following alignment, BAM files are processed through the RNA Expression Workflow.
+
+First the BAM files are filtered for reads that were aligned to protein-coding genes
+using the samtools view function.
+
+The reads mapped to each protein-coding gene are enumerated using HT-Seq count.
+The number of reads mapped to each gene (raw count), gene length and
+the total number of reads mapped to protein-coding genes are then used
+to calculate normalized expression levels using FPKM and FPKM-UQ (upper quartile normalization).
+Expression values are provided in a tab-delimited format.
+GENCODE v22 was used for gene annotation.
+
+Add: Custom Script (tpm, fpkm, fpkm_uq)
+
+Input :  Aligned Reads BAM
+
+Output:  Gene Expression (HTSeq count, TPM, FPKM, FPKM_UQ) TXT
+	</help>
+        <citations></citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/expression_calculator/readme.txt	Wed Apr 18 04:24:45 2018 -0400
@@ -0,0 +1,3 @@
+# download https://api.gdc.cancer.gov/data/fe1750e4-fc2d-4a2c-ba21-5fc969a24f27
+# for gencode.v22.annotation.gtf
+wget https://api.gdc.cancer.gov/data/fe1750e4-fc2d-4a2c-ba21-5fc969a24f27
Binary file expression_calculator/samtools has changed