Mercurial > repos > mingchen0919 > rmarkdown_feature_counts

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rmarkdown_feature_counts.Rmd	Fri Dec 29 15:03:18 2017 -0500
@@ -0,0 +1,104 @@
+---
+title: 'Feature Counts'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = opt$echo,
+  error = TRUE
+)
+```
+
+
+# User input
+
+```{r 'user input'}
+opt
+```
+
+# Calculate feature counts
+
+```{r 'ste[ 2'}
+res = featureCounts(
+  files = strsplit(opt$input_bam_paths, ',')[[1]],
+  # annotation
+	annot.inbuilt=opt$annot_inbuilt,
+	annot.ext=opt$annot_ext,
+	isGTFAnnotationFile=opt$isGTFAnnotationFile,
+	GTF.featureType=opt$gtf_feature_type,
+	GTF.attrType=opt$gtf_attr_type,
+	chrAliases=opt$chr_aliases,
+
+	# level of summarization
+	useMetaFeatures=opt$use_meta_features,
+
+	# overlap between reads and features
+	allowMultiOverlap=opt$allow_multi_overlap,
+	minOverlap=opt$min_overlap,
+	largestOverlap=opt$largest_overlap,
+	readExtension5=opt$read_extension_5,
+	readExtension3=opt$read_extension_3,
+	read2pos=opt$read_2_pos,
+
+	# multi-mapping reads
+	countMultiMappingReads=opt$count_multi_mapping_reads,
+	fraction=opt$fraction,
+
+	# read filtering
+	minMQS=opt$min_mqs,
+	splitOnly=opt$split_only,
+	nonSplitOnly=opt$non_split_only,
+	primaryOnly=opt$primary_only,
+	ignoreDup=opt$ignore_dup,
+
+	# strandness
+	strandSpecific=opt$strand_specific,
+
+	# exon-exon junctions
+	juncCounts=opt$junc_counts,
+	genome=opt$genome,
+
+	# parameters specific to paired end reads
+	isPairedEnd=opt$is_paired_end,
+	requireBothEndsMapped=opt$require_both_ends_mapped,
+	checkFragLength=opt$check_frag_length,
+	minFragLength=opt$min_frag_length,
+	maxFragLength=opt$max_frag_length,
+	countChimericFragments=opt$count_chimeric_fragments,
+	autosort=opt$auto_sort,
+
+	# miscellaneous
+	nthreads=opt$n_threads,
+	maxMOp=opt$max_mop,
+	reportReads=opt$report_reads
+)
+```
+
+# Write counts into CSV file
+
+```{r}
+colnames(res$counts) = strsplit(opt$input_bam_names, ',')[[1]]
+# write count into csv file
+write.table(res$counts, file = 'feature_counts.txt')
+```
+
+Display the first 100 rows.
+
+```{r}
+datatable(head(res$counts, 100))
+```
+
+# Save results into RData file
+
+```{r}
+save(res, file = 'feature_counts.RData')
+str(res)
+```
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rmarkdown_feature_counts.xml	Fri Dec 29 15:03:18 2017 -0500
@@ -0,0 +1,206 @@
+<tool id="feature_counts" name="featureCounts" version="1.0.0">
+    <requirements>
+        <requirement type="package" version="1.15.0.6-0">pandoc</requirement>
+        <requirement type="package" version="1.20.0">r-getopt</requirement>
+        <requirement type="package" version="1.3">r-rmarkdown</requirement>
+        <requirement type="package" version="0.3.5">r-htmltools</requirement>
+        <requirement type="package" version="0.5.0">r-dplyr</requirement>
+        <requirement type="package" version="0.2">r-dt</requirement>
+        <requirement type="package" version="1.25.2">bioconductor-rsubread</requirement>
+    </requirements>
+    <description>This function assigns mapped sequencing reads to genomic features</description>
+    <stdio>
+        <regex source="stderr" match="XXX" level="warning"
+               description="Check the warnings_and_errors.txt file for more details."/>
+    </stdio>
+    <command><![CDATA[
+                    Rscript '${__tool_directory__}/rmarkdown_feature_counts_render.R'
+                        -e $echo
+                        ##----- code chunk to get file paths and raw file names for a multiple inputs data field ----
+                        #set $sep = ''
+                        #set $input_bam_paths = ''
+                        #set $input_bam_names = ''
+                        #for $input_bam in $input_bams:
+                            #set $input_bam_paths += $sep + str($input_bam)
+                            #set $input_bam_names += $sep + str($input_bam.name)
+                            #set $sep = ','
+                        #end for
+                        ##----------------- end for getting file names and file paths ------------------------------
+                        -a '$input_bam_paths'
+                        -N '$input_bam_names'
+                        -b $annot_inbuilt
+                        -c '$annot_ext'
+                        -f $isGTFAnnotationFile
+                        -g $gtf_feature_type
+                        -h $gtf_attr_type
+                        -i $chr_aliases
+                        -j $use_meta_features
+                        -k $allow_multi_overlap
+                        -l $min_overlap
+                        -m $largest_overlap
+                        -n $read_extension_5
+                        -o $read_extension_3
+                        -p $read_2_pos
+                        -q $count_multi_mapping_reads
+                        -u $fraction
+                        -v $min_mqs
+                        -w $split_only
+                        -x $non_split_only
+                        -y $primary_only
+                        -z $ignore_dup
+                        -A $strand_specific
+                        -B $junc_counts
+                        -C $genome
+                        -D $is_paired_end
+                        -E $require_both_ends_mapped
+                        -F $check_frag_length
+                        -G $min_frag_length
+                        -H $max_frag_length
+                        -I $count_chimeric_fragments
+                        -J $auto_sort
+                        -K $n_threads
+                        -L $max_mop
+                        -M $report_reads
+                        -r $report
+                        -d $report.files_path
+                        -s $sink_message
+                        -t '${__tool_directory__}/rmarkdown_feature_counts.Rmd'
+                    ]]></command>
+    <inputs>
+        <param type="boolean" name="echo" label="Display analysis code in report?" optional="False" checked="False"
+               truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="data" name="input_bams" label="BAM/SAM files"
+               help="The files can be in either SAM format or BAM format. The file format is automatically detected by the function"
+               optional="False" format="bam,sam" multiple="True"/>
+        <param type="text" name="annot_inbuilt" label="annot.inbuilt"
+               help="a character string specifying an in-built annotation used for read summarization. It has four possible values including mm10, mm9, hg38 and hg19, corresponding to the NCBI RefSeq annotations for genomes 'mm10', 'mm9', 'hg38' and 'hg19', respectively. mm10 by default. The in-built annotation has a SAF format (see below)."
+               optional="False" value="mm10"/>
+        <param type="data" name="annot_ext" label="External annotation (GTF)"
+               help="A character string giving name of a user-provided annotation file or a data frame including user-provided annotation data"
+               optional="False" format="gtf,saf" multiple="False"/>
+        <param type="boolean" name="isGTFAnnotationFile" optional="False" checked="False" truevalue="TRUE"
+               falsevalue="FALSE"/>
+        <param type="text" name="gtf_feature_type" label="GTF.featureType"
+               help="a character string giving the feature type used to select rows in the GTF annotation which will be used for read summarization. exon by default. This argument is only applicable when isGTFAnnotationFile is TRUE"
+               optional="False" value="exon"/>
+        <param type="text" name="gtf_attr_type" label="GTF.attrType"
+               help="a character string giving the attribute type in the GTF annotation which will be used to group features (eg. exons) into meta-features (eg. genes). gene_id by default. This argument is only applicable when isGTFAnnotationFile is TRUE"
+               optional="False" value="gene_id"/>
+        <param type="data" name="chr_aliases" label="chrAliases"
+               help="a character string giving the name of a file that contains aliases of chromosome names. The file should be a comma delimited text file that includes two columns. The first column gives the chromosome names used in the annotation and the second column gives the chromosome names used by reads. This file should not contain header lines. Names included in this file are case sensitive."
+               optional="True" format="csv"/>
+        <param type="boolean" name="use_meta_features" label="useMetaFeatures"
+               help="logical indicating whether the read summarization should be performed at the feature level (eg. exons) or meta-feature level (eg genes). If TRUE, features in the annotation (each row is a feature) will be grouped into meta-features using their values in the &quot;GeneID&quot; column in the SAF-format annotation file or using the &quot;gene_id&quot; attribute in the GTF-format annotation file, and reads will assiged to the meta-features instead of the features. See below for more details."
+               optional="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="boolean" name="allow_multi_overlap" label="allowMultiOverlap"
+               help="logical indicating if a read is allowed to be assigned to more than one feature (or meta-feature) if it is found to overlap with more than one feature (or meta-feature). FALSE by default."
+               optional="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="integer" name="min_overlap" label="minOverlap"
+               help="integer giving the minimum number of overlapped bases required for assigning a read to a feature (or a meta-feature). For assignment of read pairs (fragments), numbers of overlapping bases from each read in the same pair will be summed. If a negative value is provided, the read will be extended from both ends. 1 by default."
+               optional="False" value="1"/>
+        <param type="boolean" name="largest_overlap" label="largestOverlap"
+               help="If TRUE, a read (or read pair) will be assigned to the feature (or meta-feature) that has the largest number of overlapping bases, if the read (or read pair) overlaps with multiple features (or meta-features)."
+               optional="False" checked="True" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="integer" name="read_extension_5" label="readExtension5"
+               help="integer giving the number of bases extended upstream from 5' end of each read. 0 by default."
+               optional="False" value="1"/>
+        <param type="integer" name="read_extension_3" label="readExtension3"
+               help="integer giving the number of bases extended downstream from 3' end of each read. 0 by default."
+               optional="False" value="1"/>
+        <param type="text" name="read_2_pos" label="read2pos"
+               help="Specifying whether each read should be reduced to its 5' most base or 3' most base. It has three possible values: NULL, 5 (denoting 5' most base) and 3 (denoting 3' most base). The default value is NULL. With the default value, the whole read is used for summarization. When read2pos is set to 5 (or 3), read summarization will be performed based on the 5' (or 3') most base position. read2pos can be used together with readExtension5 and readExtension3 parameters to set any desired length for reads."
+               optional="False" value="NULL"/>
+        <param type="boolean" name="count_multi_mapping_reads" label="countMultiMappingReads"
+               help="logical indicating if multi-mapping reads/fragments should be counted, FALSE by default. If TRUE, a multi-mapping read will be counted up to N times if it has N reported mapping locations. This function uses the 'NH' tag to find multi-mapping reads."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="boolean" name="fraction" label="fraction"
+               help="logical indicating if fractional counts will be produced for multi-mapping reads. If TRUE, a fractional count, 1/n, will be generated for each reported alignment of a multi-mapping read, where n is the total number of alignments reported for that read. countMultiMappingReads must be set to TRUE when fraction is TRUE."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="integer" name="min_mqs" label="minMQS"
+               help="integer giving the minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criteria. 0 by default."
+               optional="False" value="0"/>
+        <param type="boolean" name="split_only" label="splitOnly"
+               help="logical indicating whether only split alignments (their CIGAR strings contain letter 'N') should be included for summarization. FALSE by default. Example split alignments are exon-spanning reads from RNA-seq data. useMetaFeatures should be set to FALSE and allowMultiOverlap should be set to TRUE, if the purpose of summarization is to assign exon-spanning reads to all their overlapping exons."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="boolean" name="non_split_only" label="nonSplitOnly"
+               help="logical indicating whether only non-split alignments (their CIGAR strings do not contain letter 'N') should be included for summarization. FALSE by default."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="boolean" name="primary_only" label="primaryOnly"
+               help="logical indicating if only primary alignments should be counted. Primary and secondary alignments are identified using bit 0x100 in the Flag field of SAM/BAM files. If TRUE, all primary alignments in a dataset will be counted no matter they are from multi-mapping reads or not (ie. countMultiMappingReads is ignored)."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="boolean" name="ignore_dup" label="ignoreDup"
+               help="logical indicating whether reads marked as duplicates should be ignored. FALSE by default. Read duplicates are identified using bit Ox400 in the FLAG field in SAM/BAM files. The whole fragment (read pair) will be ignored if paired end."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="integer" name="strand_specific" label="strandSpecific"
+               help="integer indicating if strand-specific read counting should be performed. It has three possible values: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). 0 by default."
+               optional="False" value="0"/>
+        <param type="boolean" name="junc_counts" label="juncCounts"
+               help="integer indicating if strand-specific read counting should be performed. It has three possible values: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). 0 by default."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="data" name="genome" label="genome"
+               help="a character string giving the name of a FASTA-format file that includes the reference genome sequences. The reference genome provided here should be the same as the one used in read mapping. NULL by default."
+               optional="False" format="fasta"/>
+        <param type="boolean" name="is_paired_end" label="isPairedEnd"
+               help="logical indicating if paired-end reads are used. If TRUE, fragments (templates or read pairs) will be counted instead of individual reads. FALSE by default."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="boolean" name="require_both_ends_mapped" label="requireBothEndsMapped"
+               help="logical indicating if both ends from the same fragment are required to be successfully aligned before the fragment can be assigned to a feature or meta-feature. This parameter is only appliable when isPairedEnd is TRUE."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="boolean" name="check_frag_length" label="checkFragLength"
+               help="logical indicating if the two ends from the same fragment are required to satisify the fragment length criteria before the fragment can be assigned to a feature or meta-feature. This parameter is only appliable when isPairedEnd is TRUE. The fragment length criteria are specified via minFragLength and maxFragLength."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="integer" name="min_frag_length" label="minFragLength"
+               help="integer giving the minimum fragment length for paired-end reads. 50 by default." optional="False"
+               value="50"/>
+        <param type="integer" name="max_frag_length" label="maxFragLength"
+               help="integer giving the maximum fragment length for paired-end reads. 600 by default. minFragLength and maxFragLength are only applicable when isPairedEnd is TRUE. Note that when a fragment spans two or more exons, the observed fragment length might be much bigger than the nominal fragment length."
+               optional="False" value="600"/>
+        <param type="boolean" name="count_chimeric_fragments" label="countChimericFragments"
+               help="logical indicating whether a chimeric fragment, which has its two reads mapped to different chromosomes, should be counted or not. TRUE by default."
+               optional="False" checked="True" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="boolean" name="auto_sort" label="autosort"
+               help="logical specifying if the automatic read sorting is enabled. This option is only applicable for paired-end reads. If TRUE, reads will be automatically sorted by their names if reads from the same pair are found not to be located next to each other in the input. No read sorting will be performed if there are no such reads found."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+        <param type="integer" name="n_threads" label="nthreads"
+               help="integer giving the number of threads used for running this function. 1 by default."
+               optional="False" value="1" min="1"/>
+        <param type="integer" name="max_mop" label="maxMOp"
+               help="integer giving the maximum number of 'M' operations (matches or mis-matches) allowed in a CIGAR string. 10 by default. Both 'X' and '=' operations are treated as 'M' and adjacent 'M' operations are merged in the CIGAR string."
+               optional="False" value="10"/>
+        <param type="boolean" name="report_reads" label="reportReads"
+               help="logical indicating if read counting result for each read/fragment is saved to a file. If TRUE, read counting results for reads/fragments will be saved to a tab-delimited file that contains four columns including name of read/fragment, status(assigned or the reason if not assigned), name of target feature/meta-feature and number of hits if the read/fragment is counted multiple times. Name of the file is the same as name of the input read file except a suffix '.featureCounts' is added. Multiple files will be generated if there is more than one input read file."
+               optional="False" checked="False" truevalue="TRUE" falsevalue="FALSE"/>
+    </inputs>
+    <outputs>
+        <data name="report" format="html" label="tool name report" hidden="false"/>
+        <data name="sink_message" format="txt" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"
+              hidden="false"/>
+        <data name="feature_counts" format="rdata" label="feature counts" from_work_dir="feature_counts.RData"
+              hidden="false"/>
+        <data name="feature_counts_csv" format="tabular" label="feature counts (.txt)"
+              from_work_dir="feature_counts.txt"
+              hidden="false"/>
+    </outputs>
+    <help><![CDATA[some help text]]></help>
+    <citations>
+        <citation type="bibtex"><![CDATA[
+                    @article{allaire2016rmarkdown,
+                        title={rmarkdown: Dynamic Documents for R, 2016},
+                        author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff
+                        and Wickham, Hadley and Atkins, Aron and Hyndman, Rob},
+                        journal={R package version 0.9},
+                        volume={6},
+                        year={2016}
+                    ]]></citation>
+        <citation type="bibtex"><![CDATA[@article{liao2013featurecounts,
+  title={featureCounts: an efficient general purpose program for assigning sequence reads to genomic features},
+  author={Liao, Yang and Smyth, Gordon K and Shi, Wei},
+  journal={Bioinformatics},
+  volume={30},
+  number={7},
+  pages={923--930},
+  year={2013},
+  publisher={Oxford University Press}
+}]]></citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rmarkdown_feature_counts_render.R	Fri Dec 29 15:03:18 2017 -0500
@@ -0,0 +1,118 @@
+##============ Sink warnings and errors to a file ==============
+## use the sink() function to wrap all code within it.
+##==============================================================
+zz = file('warnings_and_errors.txt')
+sink(zz)
+sink(zz, type = 'message')
+
+  ##============== load packages ===============================
+  library(getopt)
+  library(rmarkdown)
+  library(htmltools)
+  library(dplyr)
+  library(Rsubread)
+  library(DT)
+  ##============================================================
+
+  ##---------below is the code for rendering .Rmd templates-----
+
+  ##=============STEP 1: handle command line arguments==========
+  ##
+  ##============================================================
+  # column 1: the long flag name
+  # column 2: the short flag alias. A SINGLE character string
+  # column 3: argument mask
+  #           0: no argument
+  #           1: argument required
+  #           2: argument is optional
+  # column 4: date type to which the flag's argument shall be cast.
+  #           possible values: logical, integer, double, complex, character.
+  #-------------------------------------------------------------
+  #++++++++++++++++++++ Best practice ++++++++++++++++++++++++++
+  # 1. short flag alias should match the flag in the command section in the XML file.
+  # 2. long flag name can be any legal R variable names
+  # 3. two names in args_list can have common string but one name should not be a part of another name.
+  #    for example, one name is "ECHO", if another name is "ECHO_XXX", it will cause problems.
+  #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+  args_list=list()
+  ##------- 1. input data ---------------------
+  args_list$e = c('echo', 'e', '1', 'logical')
+  args_list$a = c('input_bam_paths', 'a', '1', 'character')
+  args_list$N = c('input_bam_names', 'N', '1', 'character')
+  args_list$b = c('annot_inbuilt', 'b', '1', 'character')
+  args_list$c = c('annot_ext', 'c', '1', 'character')
+  args_list$f = c('isGTFAnnotationFile', 'f', '1', 'logical')
+  args_list$g = c('gtf_feature_type', 'g', '1', 'character')
+  args_list$h = c('gtf_attr_type', 'h', '1', 'character')
+  args_list$i = c('chr_aliases', 'i', '2', 'character')
+  args_list$j = c('use_meta_features', 'j', '1', 'logical')
+  args_list$k = c('allow_multi_overlap', 'k', '1', 'logical')
+  args_list$l = c('min_overlap', 'l', '1', 'integer')
+  args_list$m = c('largest_overlap', 'm', '1', 'logical')
+  args_list$n = c('read_extension_5', 'n', '1', 'integer')
+  args_list$o = c('read_extension_3', 'o', '1', 'integer')
+  args_list$p = c('read_2_pos', 'p', '1', 'character')
+  args_list$q = c('count_multi_mapping_reads', 'q', '1', 'logical')
+  args_list$u = c('fraction', 'u', '1', 'logical')
+  args_list$v= c('min_mqs', 'v', '1', 'integer')
+  args_list$w= c('split_only', 'w', '1', 'logical')
+  args_list$x= c('non_split_only', 'x', '1', 'logical')
+  args_list$y= c('primary_only', 'y', '1', 'logical')
+  args_list$z= c('ignore_dup', 'z', '1', 'logical')
+  args_list$A= c('strand_specific', 'A', '1', 'integer')
+  args_list$B= c('junc_counts', 'B', '1', 'logical')
+  args_list$C= c('genome', 'C', '1', 'character')
+  args_list$D= c('is_paired_end', 'D', '1', 'logical')
+  args_list$E= c('require_both_ends_mapped', 'E', '1', 'logical')
+  args_list$F= c('check_frag_length', 'F', '1', 'logical')
+  args_list$G= c('min_frag_length', 'G', '1', 'integer')
+  args_list$H= c('max_frag_length', 'H', '1', 'integer')
+  args_list$I= c('count_chimeric_fragments', 'I', '1', 'logical')
+  args_list$J= c('auto_sort', 'J', '1', 'logical')
+  args_list$K= c('n_threads', 'K', '1', 'integer')
+  args_list$L= c('max_mop', 'L', '1', 'integer')
+  args_list$M= c('report_reads', 'M', '1', 'logical')
+
+  ##--------2. output report and outputs --------------
+  args_list$REPORT_HTML = c('report_html', 'r', '1', 'character')
+  args_list$REPORT_DIR = c('report_dir', 'd', '1', 'character')
+  args_list$SINK_MESSAGE = c('sink_message', 's', '1', 'character')
+  ##--------3. .Rmd templates in the tool directory ----------
+  args_list$TOOL_TEMPLATE_RMD = c('tool_template_rmd', 't', '1', 'character')
+  ##-----------------------------------------------------------
+  opt = getopt(t(as.data.frame(args_list)))
+
+
+
+  ##=======STEP 2: create report directory (optional)==========
+  ##
+  ##===========================================================
+  dir.create(opt$report_dir)
+
+  ##=STEP 3: replace placeholders in .Rmd with argument values=
+  ##
+  ##===========================================================
+  #++ need to replace placeholders with args values one by one+
+  readLines(opt$tool_template_rmd) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('tool_template.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+
+  ##=============STEP 4: render .Rmd templates=================
+  ##
+  ##===========================================================
+  render('tool_template.Rmd', output_file = opt$report_html)
+
+
+  ##--------end of code rendering .Rmd templates----------------
+sink()
+##=========== End of sinking output=============================