# HG changeset patch # User lparsons # Date 1406126690 14400 # Node ID 1e66f05a23aa65bdb1a1821ed0913cd68d800496 # Parent 1b769e35cd8e5da9e3fd79cb2eeaf89350d0d487 Reupload tarball (all files were again deleted by toolshed). diff -r 1b769e35cd8e -r 1e66f05a23aa README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,24 @@ +== RSeQC Galaxy Wrapper == + +This is a Galaxy wrapper for the RSeQC RNA-Seq QC package. + +** Installation ** + +Installation from a tool shed provides the necessary tool dependencies, R, numpy, and RSeQC. + +Otherwise, make sure that R and the RSeQC scripts are in the path and run under the Galaxy environment. +Move the xml files to a subdirectory of your tools directory and add lines in tool_conf.xml to point to them. +Restart the Galaxy server. + +Requires Python 2.7 + +** Attribution ** + +The RSeQC package and associated documentation can be found at: http://rseqc.sourceforge.net/ + +The galaxy wrapper code was written by + Nilesh Kavthekar, School of Engineering and Applied Sciences, University of Pennsylvania, Class of 2016 +Modified by + Lance Parsons, Lewis-Sigler Institute for Integrative Genomics, Princeton University, + Bjorn Gruning, University of Freiburg, bjoern.gruening@gmail.com +The development of the wrapper code is housed on BitBucket at: https://bitbucket.org/lance_parsons/rseqc_galaxy_wrapper diff -r 1b769e35cd8e -r 1e66f05a23aa RPKM_count.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RPKM_count.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,132 @@ + + calculates raw count and RPKM values for transcript at exon, intron, and mRNA level + + numpy + rseqc + + + ln -s "${input}" "local_input.bam" && + ln -s "${input.metadata.bam_index}" "local_input.bam.bai" && + RPKM_count.py -i "local_input.bam" -o output -r $refgene + + #if str($strand_type.strand_specific) == "pair" + -d + #if str($strand_type.pair_type) == "sd" + '1++,1--,2+-,2-+' + #else + '1+-,1-+,2++,2--' 
+ #end if + #end if + + #if str($strand_type.strand_specific) == "single" + -d + #if str($strand_type.single_type) == "s" + '++,--' + #else + '+-,-+' + #end if + #end if + + #if $skiphits + -u + #end if + + #if $onlyexonic + -e + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +RPKM_count.py ++++++++++++++ + +Given a BAM file and reference gene model, this program will calculate the raw count and RPKM +values for transcript at exon, intron and mRNA level. For strand specific RNA-seq data, +program will assign read to its parental gene according to strand rule, if you don't know the +strand rule, run infer_experiment.py. Please note that chromosome ID, genome cooridinates +should be concordant between BAM and BED files. + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Reference gene model + Gene model in BED format. + +Strand sequencing type (default=none) + See Infer Experiment tool if uncertain. + +Options +++++++++++++++ + +Skip Multiple Hit Reads + Use Multiple hit reads or use only uniquely mapped reads. + +Only use exonic reads + Renders program only used exonic (UTR exons and CDS exons) reads, otherwise use all reads. 
+ +Sample Output +++++++++++++++ + +===== ======== ======== ===================== ===== =========== ============= ============= ======== ========= +chrom start end accession score gene strand tag count (+) tag count (-) RPKM (+) RPKM (-) +===== ======== ======== ===================== ===== =========== ============= ============= ======== ========= +chr1 29213722 29313959 NM_001166007_intron_1 0 '+' 431 4329 0.086 0.863 +chr1 29314417 29319841 NM_001166007_intron_2 0 '+' 31 1 0.114 0.004 +chr1 29320054 29323726 NM_001166007_intron_3 0 '+' 32 0 0.174 0.000 +chr1 29213602 29213722 NM_001166007_exon_1 0 '+' 164 0 27.321 0.000 +chr1 29313959 29314417 NM_001166007_exon_2 0 '+' 1699 4 74.158 0.175 +chr1 29319841 29320054 NM_001166007_exon_3 0 '+' 528 1 49.554 0.094 +===== ======== ======== ===================== ===== =========== ============= ============= ======== ========= + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. 
_RSeQC: http://rseqc.sourceforge.net/ + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa RPKM_saturation.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RPKM_saturation.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,160 @@ + + calculates raw count and RPKM values for transcript at exon, intron, and mRNA level + + R + numpy + rseqc + + RPKM_saturation.py -i $input -o output -r $refgene + + #if str($strand_type.strand_specific) == "pair" + -d + #if str($strand_type.pair_type) == "sd" + '1++,1--,2+-,2-+' + #else + '1+-,1-+,2++,2--' + #end if + #end if + + #if str($strand_type.strand_specific) == "single" + -d + #if str($strand_type.single_type) == "s" + '++,--' + #else + '+-,-+' + #end if + #end if + + -l $percentileFloor -u $percentileCeiling -s $percentileStep -c $rpkmCutoff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +RPKM_saturation.py +++++++++++++++++++ + +The precision of any sample statitics (RPKM) is affected by sample size (sequencing depth); +\'resampling\' or \'jackknifing\' is a method to estimate the precision of sample statistics by +using subsets of available data. This module will resample a series of subsets from total RNA +reads and then calculate RPKM value using each subset. By doing this we are able to check if +the current sequencing depth was saturated or not (or if the RPKM values were stable or not) +in terms of genes' expression estimation. If sequencing depth was saturated, the estimated +RPKM value will be stationary or reproducible. By default, this module will calculate 20 +RPKM values (using 5%, 10%, ... , 95%,100% of total reads) for each transcripts. + +In the output figure, Y axis is "Percent Relative Error" or "Percent Error" which is used +to measures how the RPKM estimated from subset of reads (i.e. RPKMobs) deviates from real +expression level (i.e. RPKMreal). However, in practice one cannot know the RPKMreal. As a +proxy, we use the RPKM estimated from total reads to approximate RPKMreal. 
+ +.. image:: http://rseqc.sourceforge.net/_images/RelativeError.png + :height: 80 px + :width: 400 px + :scale: 100 % + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Reference gene model + Gene model in BED format. + +Strand sequencing type (default=none) + See Infer Experiment tool if uncertain. + +Options +++++++++++++++ + +Skip Multiple Hit Reads + Use Multiple hit reads or use only uniquely mapped reads. + +Only use exonic reads + Restricts the program to exonic (UTR exons and CDS exons) reads; otherwise all reads are used. + +Output +++++++++++++++ + +1. output.eRPKM.xls: RPKM values for each transcript +2. output.rawCount.xls: Raw count for each transcript +3. output.saturation.r: R script to generate plot +4. output.saturation.pdf: + +.. image:: http://rseqc.sourceforge.net/_images/saturation.png + :height: 600 px + :width: 600 px + :scale: 80 % + +- All transcripts were sorted in ascending order according to expression level (RPKM). Then they are divided into 4 groups: + 1. Q1 (0-25%): Transcripts with expression level ranked below 25 percentile. + 2. Q2 (25-50%): Transcripts with expression level ranked between 25 percentile and 50 percentile. + 3. Q3 (50-75%): Transcripts with expression level ranked between 50 percentile and 75 percentile. + 4. Q4 (75-100%): Transcripts with expression level ranked above 75 percentile. +- A BAM/SAM file containing more than 100 million alignments will make the module very slow. 
+- Follow the example below to visualize a particular transcript (using R console):: + + pdf("xxx.pdf") #starts the graphics device driver for producing PDF graphics + x <- seq(5,100,5) #resampling percentage (5,10,15,...,100) + rpkm <- c(32.95,35.43,35.15,36.04,36.41,37.76,38.96,38.62,37.81,38.14,37.97,38.58,38.59,38.54,38.67, 38.67,38.87,38.68, 38.42, 38.23) #Paste RPKM values calculated from each subset + scatter.smooth(x,100*abs(rpkm-rpkm[length(rpkm)])/(rpkm[length(rpkm)]),type="p",ylab="Percent Relative Error",xlab="Resampling Percentage") + dev.off() #close graphical device + +.. image:: http://rseqc.sourceforge.net/_images/saturation_eg.png + :height: 600 px + :width: 600 px + :scale: 80 % + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. 
_RSeQC: http://rseqc.sourceforge.net/ + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa bam2wig.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bam2wig.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,153 @@ + + + converts all types of RNA-seq data from .bam to .wig + + + R + numpy + rseqc + + + tmp_input_name=\$(mktemp -u); + bai='.bai'; + + ln -s "${input}" \$tmp_input_name && + ln -s "${input.metadata.bam_index}" \$tmp_input_name\$bai && + bam2wig.py -i \$tmp_input_name -s $chromsize -o outfile + + #if str($strand_type.strand_specific) == "pair" + -d + #if str($strand_type.pair_type) == "sd" + '1++,1--,2+-,2-+' + #else + '1+-,1-+,2++,2--' + #end if + #end if + + #if str($strand_type.strand_specific) == "single" + -d + #if str($strand_type.single_type) == "s" + '++,--' + #else + '+-,-+' + #end if + #end if + + #if $wigsum.wigsum_type + -t $wigsum.totalwig + #end if + + #if $skipmultihits + -u + #end if + ; + rm "\$tmp_input_name\$bai"; + rm \$tmp_input_name + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + strand_type['strand_specific'] == 'none' + + + strand_type['strand_specific'] != 'none' + + + strand_type['strand_specific'] != 'none' + + + + + + + +bam2wig.py +++++++++++ + +Visualization is the most straightforward and effective way to QC your RNA-seq +data. For example, change of expression or new splicing can be easily checked +by visually comparing two RNA-seq tracks using genome browser such as UCSC_, +IGB_ and IGV_. `bam2wig.py` converts all types of RNA-seq data from BAM_ +format into wiggle_ format in one-stop. wiggle_ files can then be easily +converted into bigwig_. Bigwig is indexed, binary format of wiggle file, and +it's particular useful to display large, continuous dataset on genome +browser. + +Inputs +++++++++++++++ + +Input BAM file + Alignment file in BAM format (SAM is not supported). BAM file will be sorted and indexed using samTools. 
+ +Chromosome size file + Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. Chromosome names (such as "chr1") should be consistent between this file and BAM file. + +Specified wigsum (default=none) + Specified wigsum. A wigsum of 100000000 is equal to the coverage achieved by 1 million 100nt reads. Ignore this option to disable normalization. + +Skip multiple Hit reads + Skip multiple-hit reads and use only uniquely mapped reads. + +Strand-specific (default=none) + How read(s) were stranded during sequencing. If you are not sure about the strand rule, run infer_experiment.py + +Outputs +++++++++++++++ + +If RNA-seq is not strand specific, one wig file will be generated; if RNA-seq +is strand specific, two wig files corresponding to Forward and Reverse will be generated. + +----- + +About RSeQC ++++++++++++ + + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. _RSeQC: http://rseqc.sourceforge.net/ +.. _UCSC: http://genome.ucsc.edu/index.html +.. _IGB: http://bioviz.org/igb/ +.. _IGV: http://www.broadinstitute.org/igv/home +.. _BAM: http://genome.ucsc.edu/goldenPath/help/bam.html +.. _wiggle: http://genome.ucsc.edu/goldenPath/help/wiggle.html +.. 
_bigwig: http://genome.ucsc.edu/FAQ/FAQformat.html#format6.1 + + + diff -r 1b769e35cd8e -r 1e66f05a23aa bam_stat.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bam_stat.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,65 @@ + + + reads mapping statistics for a provided BAM or SAM file. + + + numpy + rseqc + s + + bam_stat.py -i $input -q $mapqual 2> $output + + + + + + + + + + + + + +bam_stat.py ++++++++++++ + +This program is used to calculate reads mapping statistics from provided BAM +file. This script determines "uniquely mapped reads" from `mapping quality`_, +which quantifies the probability that a read is misplaced (do NOT confuse this with +sequence quality, which measures the probability that a base call +was wrong). + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Minimum mapping quality + Minimum mapping quality for an alignment to be called “uniquely mapped” (default=30) + +Output +++++++++++++++ + +- Total Reads (Total records) = {Multiple mapped reads} + {Uniquely mapped} +- Uniquely mapped Reads = {read-1} + {read-2} (if paired end) +- Uniquely mapped Reads = {Reads map to '+'} + {Reads map to '-'} +- Uniquely mapped Reads = {Splice reads} + {Non-splice reads} + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. _RSeQC: http://rseqc.sourceforge.net/ +.. 
_`mapping quality`: http://genome.sph.umich.edu/wiki/Mapping_Quality_Scores + + + diff -r 1b769e35cd8e -r 1e66f05a23aa clipping_profile.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clipping_profile.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,61 @@ + + + estimates clipping profile of RNA-seq reads from BAM or SAM file + + + R + numpy + rseqc + + + clipping_profile.py -i $input -o output + + + + + + + + + + + + + +clipping_profile.py ++++++++++++++++++++ + +This program is used to estimate clipping profile of RNA-seq reads from BAM or SAM file. +Note that to use this function, CIGAR strings within SAM/BAM file should have 'S' operation +(This means your read aligner should support clipped mapping). + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + + +Sample Output +++++++++++++++ + +.. image:: http://rseqc.sourceforge.net/_images/clipping_good.png + :height: 600 px + :width: 600 px + :scale: 80 % + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. _RSeQC: http://rseqc.sourceforge.net/ + + + diff -r 1b769e35cd8e -r 1e66f05a23aa geneBody_coverage.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/geneBody_coverage.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,76 @@ + + + Read coverage over gene body. 
+ + + R + numpy + rseqc + + + geneBody_coverage.py -i $input -r $refgene -o output + + + + + + + + + + + + + + + +geneBody_coverage.py +++++++++++++++++++++ + +Read coverage over gene body. This module is used to check if reads coverage is uniform and +if there is any 5\'/3\' bias. This module scales all transcripts to 100 nt and calculates the +number of reads covering each nucleotide position. Finally, it generates a plot illustrating +the coverage profile along the gene body. NOTE: this module requires lots of memory for large +BAM files, because it load the entire BAM file into memory. We add another script +"geneBody_coverage2.py" into v2.3.1 which takes bigwig (instead of BAM) as input. +It only use 200M RAM, but users need to convert BAM into WIG, and then WIG into BigWig. + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Reference gene model + Gene Model in BED format. + + +Outputs +++++++++++++++ + +Read coverage over gene body. This module is used to check if reads coverage is uniform and if there is any 5’/3’ bias. This module scales all transcripts to 100 nt and calculates the number of reads covering each nucleotide position. Finally, it generates a plot illustrating the coverage profile along the gene body. NOTE: this module requires lots of memory for large BAM files, because it load the entire BAM file into memory. We add another script "geneBody_coverage2.py" into v2.3.1 which takes bigwig (instead of BAM) as input. It only use 200M RAM, but users need to convert BAM into WIG, and then WIG into BigWig. + +Example output: + .. image:: http://rseqc.sourceforge.net/_images/geneBody_coverage.png + :height: 600 px + :width: 600 px + :scale: 80 % + + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. 
"Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. _RSeQC: http://rseqc.sourceforge.net/ + + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa geneBody_coverage2.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/geneBody_coverage2.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,71 @@ + + + Read coverage over gene body + + + R + numpy + rseqc + + + geneBody_coverage2.py -i $input -r $refgene -o output + + + + + + + + + + + + + + + +geneBody_coverage2.py ++++++++++++++++++++++ + +Similar to geneBody_coverage.py. This module takes bigwig instead of BAM as input, and thus +requires much less memory. The BigWig file could be arbitrarily large. + + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Reference gene model + Gene Model in BED format. + + +Outputs +++++++++++++++ + +Read coverage over gene body. This module is used to check if reads coverage is uniform and if there is any 5’/3’ bias. This module scales all transcripts to 100 nt and calculates the number of reads covering each nucleotide position. Finally, it generates a plot illustrating the coverage profile along the gene body. NOTE: this module requires lots of memory for large BAM files, because it load the entire BAM file into memory. We add another script "geneBody_coverage2.py" into v2.3.1 which takes bigwig (instead of BAM) as input. It only use 200M RAM, but users need to convert BAM into WIG, and then WIG into BigWig. + +Example output: + .. 
image:: http://dldcc-web.brc.bcm.edu/lilab/liguow/RSeQC/figure/geneBody_coverage.png + :height: 600 px + :width: 600 px + :scale: 80 % + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. _RSeQC: http://rseqc.sourceforge.net/ + + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa infer_experiment.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/infer_experiment.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,141 @@ + + speculates how RNA-seq were configured + + numpy + rseqc + + + infer_experiment.py -i $input -r $refgene + #if $sample_size.boolean + -s $sample_size.size + #end if + + > $output + + + + + + + + + + + + + + + + + + + +infer_experiment.py ++++++++++++++++++++ + +This program is used to speculate how RNA-seq sequencing were configured, especially how +reads were stranded for strand-specific RNA-seq data, through comparing reads' mapping +information to the underneath gene model. + + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Reference gene model + Gene model in BED format. + +Number of usable sampled reads (default=200000) + Number of usable reads sampled from SAM/BAM file. More reads will give more accurate estimation, but make program little slower. 
+ +Outputs ++++++++ + +For pair-end RNA-seq, there are two different +ways to strand reads (such as Illumina ScriptSeq protocol): + +1. 1++,1--,2+-,2-+ + +* read1 mapped to '+' strand indicates parental gene on '+' strand +* read1 mapped to '-' strand indicates parental gene on '-' strand +* read2 mapped to '+' strand indicates parental gene on '-' strand +* read2 mapped to '-' strand indicates parental gene on '+' strand + +2. 1+-,1-+,2++,2-- + +* read1 mapped to '+' strand indicates parental gene on '-' strand +* read1 mapped to '-' strand indicates parental gene on '+' strand +* read2 mapped to '+' strand indicates parental gene on '+' strand +* read2 mapped to '-' strand indicates parental gene on '-' strand + +For single-end RNA-seq, there are also two different ways to strand reads: + +1. ++,-- + +* read mapped to '+' strand indicates parental gene on '+' strand +* read mapped to '-' strand indicates parental gene on '-' strand + +2. +-,-+ + +* read mapped to '+' strand indicates parental gene on '-' strand +* read mapped to '-' strand indicates parental gene on '+' strand + + +Example Output +++++++++++++++ + +**Example1** :: + + ========================================================= + This is PairEnd Data :: + + Fraction of reads explained by "1++,1--,2+-,2-+": 0.4992 + Fraction of reads explained by "1+-,1-+,2++,2--": 0.5008 + Fraction of reads explained by other combinations: 0.0000 + ========================================================= + +*Conclusion*: We can infer that this is NOT a strand specific because 50% of reads can be explained by "1++,1--,2+-,2-+", while the other 50% can be explained by "1+-,1-+,2++,2--". 
+ +**Example2** :: + + ============================================================ + This is PairEnd Data + + Fraction of reads explained by "1++,1--,2+-,2-+": 0.9644 :: + Fraction of reads explained by "1+-,1-+,2++,2--": 0.0356 + Fraction of reads explained by other combinations: 0.0000 + ============================================================ + +*Conclusion*: We can infer that this is a strand-specific RNA-seq data. strandness of read1 is consistent with that of gene model, while strandness of read2 is opposite to the strand of reference gene model. + +**Example3** :: + + ========================================================= + This is SingleEnd Data :: + + Fraction of reads explained by "++,--": 0.9840 :: + Fraction of reads explained by "+-,-+": 0.0160 + Fraction of reads explained by other combinations: 0.0000 + ========================================================= + +*Conclusion*: This is single-end, strand specific RNA-seq data. Strandness of reads are concordant with strandness of reference gene. + + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. 
_RSeQC: http://rseqc.sourceforge.net/ + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa inner_distance.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/inner_distance.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,123 @@ + + calculate the inner distance (or insert size) between two paired RNA reads + + R + numpy + rseqc + + + inner_distance.py -i $input -o output -r $refgene + + #if $bounds.hasLowerBound + -l $bounds.lowerBound + #end if + + #if $bounds2.hasUpperBound + -u $bounds2.upperBound + #end if + + #if $steps.step + -s $steps.stepSize + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +inner_distance.py ++++++++++++++++++ + +This module is used to calculate the inner distance (or insert size) between two paired RNA +reads. The distance is the mRNA length between two paired fragments. We first determine the +genomic (DNA) size between two paired reads: D_size = read2_start - read1_end, then + +* if two paired reads map to the same exon: inner distance = D_size +* if two paired reads map to different exons:inner distance = D_size - intron_size +* if two paired reads map non-exonic region (such as intron and intergenic region): inner distance = D_size +* The inner_distance might be a negative value if two fragments were overlapped. + +NOTE: Not all read pairs were used to estimate the inner distance distribution. Those low +quality, PCR duplication, multiple mapped reads were skipped. + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Reference gene model + Gene model in BED format. + +Estimated Upper/Lower Bounds (defaults=250 and -250) + Estimated upper/lower bounds of inner distance (bp). + +Step size (default=5) + Step size of histogram + + +Output +++++++++++++++ + +1. output.inner_distance.txt: + - first column is read ID + -second column is inner distance. Could be negative value if PE reads were overlapped or mapping error (e.g. 
Read1_start < Read2_start, while Read1_end >> Read2_end due to spliced mapping of read1) + - third column indicates how paired reads were mapped: PE_within_same_exon, PE_within_diff_exon, PE_reads_overlap +2. output.inner_distance_freq.txt: + - inner distance starts + - inner distance ends + - number of read pairs + - note the first 2 columns are left side half open interval +3. output.inner_distance_plot.r: R script to generate histogram +4. output.inner_distance_plot.pdf: histogram plot + +.. image:: http://rseqc.sourceforge.net/_images/inner_distance.png + :height: 600 px + :width: 600 px + :scale: 80 % + + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. 
_RSeQC: http://rseqc.sourceforge.net/ + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa junction_annotation.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/junction_annotation.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,101 @@ + + compares detected splice junctions to reference gene model + + R + numpy + rseqc + + + junction_annotation.py + -i $input -o output -r $refgene + #if $intron.hasIntron + -m $intron.min_Intron + #end if + + + + + + + + + + + + + + + + + + + + + + +junction_annotation.py +++++++++++++++++++++++ + +For a given alignment file (-i) in BAM or SAM format and a reference gene model (-r) in BED +format, this program will compare detected splice junctions to reference gene model. splicing +annotation is performed in two levels: splice event level and splice junction level. + +* splice event: An RNA read, especially long read, can be spliced 2 or more times, each time is called a splicing event; In this sense, 100 spliced reads can produce >= 100 splicing events. +* splice junction: multiple splicing events spanning the same intron can be consolidated into one splicing junction. + +All detected junctions can be grouped to 3 exclusive categories: + +1. Annotated: The junction is part of the gene model. Both splice sites, 5' splice site + (5'SS) and 3'splice site (3'SS) can be annotated by reference gene model. +2. complete_novel: Complete new junction. Neither of the two splice sites cannot be annotated by gene model +3. partial_novel: One of the splice site (5'SS or 3'SS) is new, while the other splice site is annotated (known) + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Reference gene model + Gene model in BED format. + +Minimum intron length (default=50) + Minimum intron length (bp). + + +Output +++++++++++++++ + +1. 
output.junc.anno.junction.xls: + - chrom ID + - start position of junction (coordinate is 0 based) + - end position of junction (coordinate is 1 based) + - number of splice events supporting this junction + - 'annotated', 'complete_novel' or 'partial_novel'. +2. output.anno.junction_plot.r: R script to generate pie chart +3. output.splice_junction.pdf: plot of splice junctions +4. output.splice_events.pdf: plot of splice events + +.. image:: http://rseqc.sourceforge.net/_images/junction.png + :height: 400 px + :width: 850 px + :scale: 80 % + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. 
_RSeQC: http://rseqc.sourceforge.net/ + + + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa junction_saturation.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/junction_saturation.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,99 @@ + + detects splice junctions from each subset and compares them to reference gene model + + R + numpy + rseqc + + junction_saturation.py -i $input -o output -r $refgene -m $intronSize -v $minSplice + + #if $percentiles.specifyPercentiles + -l $percentiles.lowBound -u $percentiles.upBound -s $percentiles.percentileStep + #end if + + + + + + + + + + + + + + + + + + + + + + + + + +junction_saturation.py +++++++++++++++++++++++ + +It's very important to check if current sequencing depth is deep enough to perform +alternative splicing analyses. For a well annotated organism, the number of expressed genes +in particular tissue is almost fixed so the number of splice junctions is also fixed. The fixed +splice junctions can be predetermined from reference gene model. All (annotated) splice +junctions should be rediscovered from a saturated RNA-seq data, otherwise, downstream +alternative splicing analysis is problematic because low abundance splice junctions are +missing. This module checks for saturation by resampling 5%, 10%, 15%, ..., 95% of total +alignments from BAM or SAM file, and then detects splice junctions from each subset and +compares them to reference gene model. + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Reference gene model + Gene model in BED format. + +Sampling Percentiles - Upper Bound, Lower Bound, Sampling Increment (defaults= 100, 5, and 5) + Sampling starts from the Lower Bound and increments to the Upper Bound at the rate of the Sampling Increment. + +Minimum intron length (default=50) + Minimum intron length (bp). + +Minimum coverage (default=1) + Minimum number of supporting reads to call a junction. + +Output +++++++++++++++ + +1. 
output.junctionSaturation_plot.r: R script to generate plot +2. output.junctionSaturation_plot.pdf + +.. image:: http://rseqc.sourceforge.net/_images/junction_saturation.png + :height: 600 px + :width: 600 px + :scale: 80 % + +In this example, current sequencing depth is almost saturated for "known junction" (red line) detection because the number of "known junction" reaches a plateau. In other words, nearly all "known junctions" (expressed in this particular tissue) have already been detected, and continued sequencing will not detect additional "known junction" and will only increase junction coverage (i.e. junction covered by more reads). However, the current sequencing depth is not saturated for novel junctions (green). + + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. _RSeQC: http://rseqc.sourceforge.net/ + + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa read_GC.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/read_GC.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,61 @@ + + determines GC% and read count + + R + numpy + rseqc + + + read_GC.py -i $input -o output + + + + + + + + + + + + + + +read_GC.py +++++++++++ + + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Output +++++++++++++++ + +1. 
output.GC.xls: Two column, plain text file, first column is GC%, second column is read count +2. output.GC_plot.r: R script to generate pdf file. +3. output.GC_plot.pdf: graphical output generated from R script. + +.. image:: http://rseqc.sourceforge.net/_images/read_gc.png + :height: 600 px + :width: 600 px + :scale: 80 % + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. _RSeQC: http://rseqc.sourceforge.net/ + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa read_NVC.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/read_NVC.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,77 @@ + + to check the nucleotide composition bias + + R + numpy + rseqc + + + read_NVC.py -i $input -o output $nx + + + + + + + + + + + + + + + +read_NVC.py ++++++++++++ + +This module is used to check the nucleotide composition bias. Due to random priming, certain +patterns are over represented at the beginning (5'end) of reads. This bias could be easily +examined by NVC (Nucleotide versus cycle) plot. NVC plot is generated by overlaying all +reads together, then calculating nucleotide composition for each position of read +(or each sequencing cycle). In ideal condition (genome is random and RNA-seq reads is +randomly sampled from genome), we expect A%=C%=G%=T%=25% at each position of reads. 
+ +NOTE: this program expects a fixed read length + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Include N,X in NVC plot + Plots N and X alongside A, T, C, and G in plot. + +Output +++++++++++++++ + +This module is used to check the nucleotide composition bias. Due to random priming, certain patterns are over represented at the beginning (5'end) of reads. This bias could be easily examined by NVC (Nucleotide versus cycle) plot. NVC plot is generated by overlaying all reads together, then calculating nucleotide composition for each position of read (or each sequencing cycle). In ideal condition (genome is random and RNA-seq reads are randomly sampled from genome), we expect A%=C%=G%=T%=25% at each position of reads. + + +1. output.NVC.xls: plain text file, each row is position of read (or sequencing cycle), each column is nucleotide (A,C,G,T,N,X) +2. output.NVC_plot.r: R script to generate NVC plot. +3. output.NVC_plot.pdf: NVC plot. + + +.. image:: http://rseqc.sourceforge.net/_images/NVC_plot.png + :height: 600 px + :width: 600 px + :scale: 80 % + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. 
_RSeQC: http://rseqc.sourceforge.net/ + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa read_distribution.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/read_distribution.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,90 @@ + + calculates how mapped reads were distributed over genome feature + + numpy + rseqc + + + read_distribution.py -i $input -r $refgene > $output + + + + + + + + + + + + + +read_distribution.py +++++++++++++++++++++ + +Provided a BAM/SAM file and reference gene model, this module will calculate how mapped +reads were distributed over genome feature (like CDS exon, 5'UTR exon, 3' UTR exon, Intron, +Intergenic regions). When genome features are overlapped (e.g. a region could be annotated +as both exon and intron by two different transcripts), they are prioritized as: +CDS exons > UTR exons > Introns > Intergenic regions, for example, if a read was mapped to +both CDS exon and intron, it will be assigned to CDS exons. + +* "Total Reads": This does NOT include those QC fail, duplicate and non-primary hit reads +* "Total Tags": reads spliced once will be counted as 2 tags, reads spliced twice will be counted as 3 tags, etc. And because of this, "Total Tags" >= "Total Reads" +* "Total Assigned Tags": number of tags that can be unambiguously assigned to the 10 groups (see below table). +* Tags assigned to "TSS_up_1kb" were also assigned to "TSS_up_5kb" and "TSS_up_10kb", tags assigned to "TSS_up_5kb" were also assigned to "TSS_up_10kb". Therefore, "Total Assigned Tags" = CDS_Exons + 5'UTR_Exons + 3'UTR_Exons + Introns + TSS_up_10kb + TES_down_10kb. +* When assigning tags to genome features, each tag is represented by its middle point. + +RSeQC cannot assign those reads that: + +* hit to intergenic regions that are beyond the region starting from TSS upstream 10Kb to TES downstream 10Kb. +* hit to regions covered by both 5'UTR and 3' UTR. This is possible when two head-to-tail transcripts are overlapped in UTR regions. 
+* hit to regions covered by both TSS upstream 10Kb and TES downstream 10Kb. + + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Reference gene model + Gene model in BED format. + +Sample Output +++++++++++++++ + +Output: + +=============== ============ =========== =========== +Group Total_bases Tag_count Tags/Kb +=============== ============ =========== =========== +CDS_Exons 33302033 20002271 600.63 +5'UTR_Exons 21717577 4408991 203.01 +3'UTR_Exons 15347845 3643326 237.38 +Introns 1132597354 6325392 5.58 +TSS_up_1kb 17957047 215331 11.99 +TSS_up_5kb 81621382 392296 4.81 +TSS_up_10kb 149730983 769231 5.14 +TES_down_1kb 18298543 266161 14.55 +TES_down_5kb 78900674 729997 9.25 +TES_down_10kb 140361190 896882 6.39 +=============== ============ =========== =========== + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. 
_RSeQC: http://rseqc.sourceforge.net/ + + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa read_duplication.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/read_duplication.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,71 @@ + + determines reads duplication rate with sequence-based and mapping-based strategies + + R + numpy + rseqc + + + read_duplication.py -i $input -o output -u $upLimit + + + + + + + + + + + + + + + + +read_duplication.py ++++++++++++++++++++ + +Two strategies were used to determine reads duplication rate: + +* Sequence based: reads with exactly the same sequence content are regarded as duplicated reads. +* Mapping based: reads mapped to the same genomic location are regarded as duplicated reads. For splice reads, reads mapped to the same starting position and splice the same way are regarded as duplicated reads. + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Upper Limit of Plotted Duplicated Times (default=500) + Only used for plotting. + +Output +++++++++++++++ + +1. output.dup.pos.DupRate.xls: Read duplication rate determined from mapping position of read. First column is "occurrence" or duplication times, second column is number of uniquely mapped reads. +2. output.dup.seq.DupRate.xls: Read duplication rate determined from sequence of read. First column is "occurrence" or duplication times, second column is number of uniquely mapped reads. +3. output.DupRate_plot.r: R script to generate pdf file +4. output.DupRate_plot.pdf: graphical output generated from R script + +.. image:: http://rseqc.sourceforge.net/_images/duplicate.png + :height: 600 px + :width: 600 px + :scale: 80 % + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. 
"Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. _RSeQC: http://rseqc.sourceforge.net/ + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa read_quality.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/read_quality.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,78 @@ + + determines Phred quality score + + R + numpy + rseqc + + + read_quality.py -i $input -o output -r $reduce + + + + + + + + + + + + + + + +read_quality.py ++++++++++++++++ + +According to SAM specification, if Q is the character to represent "base calling quality" +in SAM file, then Phred Quality Score = ord(Q) - 33. Here ord() is python function that +returns an integer representing the Unicode code point of the character when the argument +is a unicode object, for example, ord('a') returns 97. Phred quality score is widely used +to measure "reliability" of base-calling, for example, phred quality score of 20 means +there is 1/100 chance that the base-calling is wrong, phred quality score of 30 means there +is 1/1000 chance that the base-calling is wrong. In general: Phred quality score = -10xlog(10)P, +here P is probability that base-calling is wrong. + +Inputs +++++++++++++++ + +Input BAM/SAM file + Alignment file in BAM/SAM format. + +Ignore phred scores less than this number (default=1000) + To avoid making huge vector in R, nucleotide with certain phred score represented less than this number will be ignored. Increase this number save more memory while reduce precision. This option only applies to the 'boxplot'. 
+ +Output +++++++++++++++ + +1. output.qual.r +2. output.qual.boxplot.pdf + .. image:: http://rseqc.sourceforge.net/_images/36mer.qual.plot.png + :height: 600 px + :width: 600 px + :scale: 80 % +3. output.qual.heatmap.pdf + .. image:: http://rseqc.sourceforge.net/_images/36mer.qual.heatmap.png + :height: 600 px + :width: 600 px + :scale: 80 % + +Heatmap: uses different colors to represent nucleotide density ("blue"=low density,"orange"=median density,"red"=high density) + +----- + +About RSeQC ++++++++++++ + +The RSeQC_ package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. "Basic modules" quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while "RNA-seq specific modules" investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. + +The RSeQC package is licensed under the GNU GPL v3 license. + +.. image:: http://rseqc.sourceforge.net/_static/logo.png + +.. _RSeQC: http://rseqc.sourceforge.net/ + + + + diff -r 1b769e35cd8e -r 1e66f05a23aa tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Jul 23 10:44:50 2014 -0400 @@ -0,0 +1,13 @@ + + + + + + + + + + + + +