# HG changeset patch # User nilesh # Date 1373560062 14400 # Node ID 9a33e347a3dedac75f8c7e6298a8ff1aa72668db # Parent 7cac660d1c0aeaf4450ab6e4dbd0d8ce75ce033e Deleted selected files diff -r 7cac660d1c0a -r 9a33e347a3de bam2wig.xml --- a/bam2wig.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,121 +0,0 @@ - - - converts all types of RNA-seq data from .bam to .wig - - - R - samtools - rseqc - - - samtoolshelper.py /home/nilesh/RSeQC-2.3.3/scripts/bam2wig.py -i $input -s $chromsize -o outfile - - #if str($strand_type.strand_specific) == "pair" - -d - #if str($strand_type.pair_type) == "sd" - '1++,1--,2+-,2-+' - #else - '1+-,1-+,2++,2--' - #end if - #end if - - #if str($strand_type.strand_specific) == "single" - -d - #if str($strand_type.single_type) == "s" - '++,--' - #else - '+-,-+' - #end if - #end if - - #if $wigsum.wigsum_type - -t $wigsum.totalwig - #end if - - #if $skipmultihits - -u - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - strand_type['strand_specific'] == 'none' - - - strand_type['strand_specific'] != 'none' - - - strand_type['strand_specific'] != 'none' - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM file - Alignment file in BAM format (SAM is not supported). BAM file will be sorted and indexed using samTools. - -Chromosome size file - Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. Chromosome names (such as "chr1") should be consistent between this file and BAM file. - -Specified wigsum (default=none) - Specified wigsum. Wigsum of 100000000 equals to coverage achieved by 1 million 100nt reads. Ignore this option to disable normalization. - -Skip multiple Hit reads - skips multiple hit reads or only use uniquely mapped reads - -Strand-specific (default=none) - How read(s) were stranded during sequencing. If you are not sure about the strand rule, run infer_experiment.py - -Outputs -++++++++++++++ - -If RNA-seq is not strand specific, one wig file will be generated, if RNA-seq -is strand specific, two wig files corresponding to Forward and Reverse will be generated. - - - - \ No newline at end of file diff -r 7cac660d1c0a -r 9a33e347a3de bam_stat.xml --- a/bam_stat.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ - - - reads mapping statistics for a provided BAM or SAM file. - - - rseqc - s - - bam_stat.py -i $input -q $mapqual > $output - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Minimum mapping quality - Minimum mapping quality for an alignment to be called “uniquely mapped” (default=30) - -Output -++++++++++++++ - -- Total Reads (Total records) = {Multiple mapped reads} + {Uniquely mapped} -- Uniquely mapped Reads = {read-1} + {read-2} (if paired end) -- Uniquely mapped Reads = {Reads map to '+'} + {Reads map to '-'} -- Uniquely mapped Reads = {Splice reads} + {Non-splice reads} - - - - \ No newline at end of file diff -r 7cac660d1c0a -r 9a33e347a3de clipping_profile.xml --- a/clipping_profile.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ - - - estimates clipping profile of RNA-seq reads from BAM or SAM file - - - R - rseqc - - - clipping_profile.py -i $input -o output - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - - -Sample Output -++++++++++++++ - -.. image:: http://dldcc-web.brc.bcm.edu/lilab/liguow/RSeQC/figure/clipping_good.png - - - - \ No newline at end of file diff -r 7cac660d1c0a -r 9a33e347a3de geneBody_coverage.xml --- a/geneBody_coverage.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - - Read coverage over gene body. - - - R - rseqc - - - geneBody_coverage.py -i $input -r $refgene -o output - - - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Reference gene model - Gene Model in BED format. - - -Outputs -++++++++++++++ - -Read coverage over gene body. This module is used to check if reads coverage is uniform and if there is any 5’/3’ bias. This module scales all transcripts to 100 nt and calculates the number of reads covering each nucleotide position. Finally, it generates a plot illustrating the coverage profile along the gene body. NOTE: this module requires lots of memory for large BAM files, because it load the entire BAM file into memory. We add another script "geneBody_coverage2.py" into v2.3.1 which takes bigwig (instead of BAM) as input. It only use 200M RAM, but users need to convert BAM into WIG, and then WIG into BigWig. - -Example output: - .. image:: http://dldcc-web.brc.bcm.edu/lilab/liguow/RSeQC/figure/geneBody_coverage.png - - - - - \ No newline at end of file diff -r 7cac660d1c0a -r 9a33e347a3de geneBody_coverage2.xml --- a/geneBody_coverage2.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - - Read coverage over gene body. - - - R - rseqc - - - geneBody_coverage2.py -i $input -r $refgene -o output - - - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Reference gene model - Gene Model in BED format. - - -Outputs -++++++++++++++ - -Read coverage over gene body. This module is used to check if reads coverage is uniform and if there is any 5’/3’ bias. This module scales all transcripts to 100 nt and calculates the number of reads covering each nucleotide position. Finally, it generates a plot illustrating the coverage profile along the gene body. NOTE: this module requires lots of memory for large BAM files, because it load the entire BAM file into memory. We add another script "geneBody_coverage2.py" into v2.3.1 which takes bigwig (instead of BAM) as input. It only use 200M RAM, but users need to convert BAM into WIG, and then WIG into BigWig. - -Example output: - .. image:: http://dldcc-web.brc.bcm.edu/lilab/liguow/RSeQC/figure/geneBody_coverage.png - - - - - \ No newline at end of file diff -r 7cac660d1c0a -r 9a33e347a3de infer_experiment.xml --- a/infer_experiment.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ - - speculates how RNA-seq were configured - - rseqc - - infer_experiment.py -i $input -r $refgene - - #if $sample_size.boolean - -s $sample_size.size - #end if - - > $output - - - - - - - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Reference gene model - Gene model in BED format. - -Number of usable sampled reads (default=200000) - Number of usable reads sampled from SAM/BAM file. More reads will give more accurate estimation, but make program little slower. - - -Output -++++++++++++++ -This program is used to speculate how RNA-seq sequencing were configured, especially how reads were stranded for strand-specific RNA-seq data, through comparing reads' mapping information to the underneath gene model. Generally, strand specific RNA-seq data should be handled differently in both visualization and RPKM calculation. - -For pair-end RNA-seq, there are two different ways to strand reads: - -1) 1++,1--,2+-,2-+ - - read1 mapped to '+' strand indicates parental gene on '+' strand - - read1 mapped to '-' strand indicates parental gene on '-' strand - - read2 mapped to '+' strand indicates parental gene on '-' strand - - read2 mapped to '-' strand indicates parental gene on '+' strand -2) 1+-,1-+,2++,2-- - - read1 mapped to '+' strand indicates parental gene on '-' strand - - read1 mapped to '-' strand indicates parental gene on '+' strand - - read2 mapped to '+' strand indicates parental gene on '+' strand - - read2 mapped to '-' strand indicates parental gene on '-' strand - -For single-end RNA-seq, there are also two different ways to strand reads: - -1) ++,-- - -read mapped to '+' strand indicates parental gene on '+' strand - - read mapped to '-' strand indicates parental gene on '-' strand -2) +-,-+ - - read mapped to '+' strand indicates parental gene on '-' strand - - read mapped to '-' strand indicates parental gene on '+' strand - -Example Output -++++++++++++++ - -**Example1** :: - - ========================================================= - This is PairEnd Data :: - - Fraction of reads explained by "1++,1--,2+-,2-+": 0.4992 - Fraction of reads explained by "1+-,1-+,2++,2--": 0.5008 - Fraction of reads explained by other combinations: 0.0000 - ========================================================= - -*Conclusion*: We can infer that this is NOT a strand specific because 50% of reads can be explained by "1++,1--,2+-,2-+", while the other 50% can be explained by "1+-,1-+,2++,2--". - -**Example2** :: - - ============================================================ - This is PairEnd Data - - Fraction of reads explained by "1++,1--,2+-,2-+": 0.9644 :: - Fraction of reads explained by "1+-,1-+,2++,2--": 0.0356 - Fraction of reads explained by other combinations: 0.0000 - ============================================================ - -*Conclusion*: We can infer that this is a strand-specific RNA-seq data. strandness of read1 is consistent with that of gene model, while strandness of read2 is opposite to the strand of reference gene model. - -**Example3** :: - - ========================================================= - This is SingleEnd Data :: - - Fraction of reads explained by "++,--": 0.9840 :: - Fraction of reads explained by "+-,-+": 0.0160 - Fraction of reads explained by other combinations: 0.0000 - ========================================================= - -*Conclusion*: This is single-end, strand specific RNA-seq data. Strandness of reads are concordant with strandness of reference gene. - - \ No newline at end of file diff -r 7cac660d1c0a -r 9a33e347a3de inner_distance.xml --- a/inner_distance.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ - - calculate the inner distance (or insert size) between two paired RNA reads - - R - rseqc - - inner_distance.py -i $input -o output -r $refgene - - #if $bounds.hasLowerBound - -l $bounds.lowerBound - #end if - - #if $bounds2.hasUpperBound - -u $bounds2.upperBound - #end if - - #if $steps.step - -s $steps.stepSize - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Reference gene model - Gene model in BED format. - -Estimated Upper/Lower Bounds (defaults=250 and -250) - Estimated upper/lower bounds of inner distance (bp). - -Step size (default=5) - Step size of histogram - - -Output -++++++++++++++ - -1. output.inner_distance.txt: -- first column is read ID --second column is inner distance. Could be negative value if PE reads were overlapped or mapping error (e.g. Read1_start < Read2_start, while Read1_end >> Read2_end due to spliced mapping of read1) -- third column indicates how paired reads were mapped: PE_within_same_exon, PE_within_diff_exon,PE_reads_overlap -2. output..inner_distance_freq.txt: -- inner distance starts -- inner distance ends -- number of read pairs -- note the first 2 columns are left side half open interval -3. output.inner_distance_plot.r: R script to generate histogram -4. output.inner_distance_plot.pdf: histogram plot - -.. image:: http://dldcc-web.brc.bcm.edu/lilab/liguow/RSeQC/figure/inner_distance.png - - - diff -r 7cac660d1c0a -r 9a33e347a3de junction_annotation.xml --- a/junction_annotation.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - compares detected splice junctions to reference gene model - - R - rseqc - - junction_annotation.py -i $input -o output -r $refgene - - #if $intron.hasIntron - -m $intron.min_Intron - #end if - - - - - - - - - - - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Reference gene model - Gene model in BED format. - -Minimum intron length (default=50) - Minimum intron length (bp). - - -Output -++++++++++++++ - -1. output.junc.anno.junction.xls: -- chrom ID -- start position of junction (coordinate is 0 based) -- end position of junction (coordinate is 1 based) -- number of splice events supporting this junction -- 'annotated', 'complete_novel' or 'partial_novel'. -2. output.anno.junction_plot.r: R script to generate pie chart -3. output.splice_junction.pdf: plot of splice junctions -4. output.splice_events.pdf: plot of splice events -.. image:: http://dldcc-web.brc.bcm.edu/lilab/liguow/RSeQC/figure/junction.png - - - - - - diff -r 7cac660d1c0a -r 9a33e347a3de junction_saturation.xml --- a/junction_saturation.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ - - detects splice junctions from each subset and compares them to reference gene model - - R - rseqc - - junction_saturation.py -i $input -o output -r $refgene -m $intronSize -v $minSplice - - #if $percentiles.specifyPercentiles - -l $percentiles.lowBound -u $percentiles.upBound -s $percentiles.percentileStep - #end if - - - - - - - - - - - - - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Reference gene model - Gene model in BED format. - -Sampling Percentiles - Upper Bound, Lower Bound, Sampling Increment (defaults= 100, 5, and 5) - Sampling starts from the Lower Bound and increments to the Upper Bound at the rate of the Sampling Increment. - -Minimum intron length (default=50) - Minimum intron length (bp). - -Minimum coverage (default=1) - Minimum number of supportting reads to call a junction. - -Output -++++++++++++++ - -1. output.junctionSaturation_plot.r: R script to generate plot -2. output.junctionSaturation_plot.pdf - -.. image:: http://dldcc-web.brc.bcm.edu/lilab/liguow/RSeQC/figure/junction_saturation.png - -In this example, current sequencing depth is almost saturated for "known junction" (red line) detection because the number of "known junction" reaches a plateau. In other words, nearly all "known junctions" (expressed in this particular tissue) have already been detected, and continue sequencing will not detect additional "known junction" and will only increase junction coverage (i.e. junction covered by more reads). While current sequencing depth is not saturated for novel junctions (green). - - - - \ No newline at end of file diff -r 7cac660d1c0a -r 9a33e347a3de read_GC.xml --- a/read_GC.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ - - determines GC% and read count - - R - rseqc - - read_GC.py -i $input -o output - - - - - - - - - - - - .. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Output -++++++++++++++ - -1. output.GC.xls: Two column, plain text file, first column is GC%, second column is read count -2. output.GC_plot.r: R script to generate pdf file. -3. output.GC_plot.pdf: graphical output generated from R script. - -.. image:: http://dldcc-web.brc.bcm.edu/lilab/liguow/RSeQC/figure/read_gc.png - - - diff -r 7cac660d1c0a -r 9a33e347a3de read_distribution.xml --- a/read_distribution.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - calculates how mapped reads were distributed over genome feature - - rseqc - - read_distribution.py -i $input -r $refgene > $output - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Reference gene model - Gene model in BED format. - -Sample Output -++++++++++++++ - -:: - - Total Read: 44,826,454 :: - - Total Tags: 50,023,249 :: - - Total Assigned Tags: 36,057,402 :: - - Group Total_bases Tag_count Tags/Kb - CDS_Exons 33302033 20022538 601.24 - 5'UTR_Exons 21717577 4414913 203.29 - 3'UTR_Exons 15347845 3641689 237.28 - Introns 1132597354 6312099 5.57 - TSS_up_1kb 17957047 215220 11.99 - TSS_up_5kb 81621382 392192 4.81 - TSS_up_10kb 149730983 769210 5.14 - TES_down_1kb 18298543 266157 14.55 - TES_down_5kb 78900674 730072 9.25 - TES_down_10kb 140361190 896953 6.39 - -Note: -- "Total Reads": This does NOT include those QC fail,duplicate and non-primary hit reads -- "Total Tags": reads spliced once will be counted as 2 tags, reads spliced twice will be counted as 3 tags, etc. And because of this, "Total Fragments" >= "Total Reads" -- "Total Assigned Tags": number of tags that can be unambiguously assigned the 10 groups (above table). -- Tags assigned to "TSS_up_1kb" were also assigned to "TSS_up_5kb" and "TSS_up_10kb", tags assigned to "TSS_up_5kb" were also assigned to "TSS_up_10kb". Therefore, "Total Assigned Tags" = CDS_Exons + 5'UTR_Exons + 3'UTR_Exons + Introns + TSS_up_10kb + TES_down_10kb. -- When assigning tags to genome features, each tag is represented by its middle point. -- RSeQC cannot assign those reads that: 1) hit to intergenic regions that beyond region starting from TSS upstream 10Kb to TES downstream 10Kb. 2) hit to regions covered by both 5'UTR and 3' UTR. This is possible when two head-to-tail transcripts are overlapped in UTR regions. 3) hit to regions covered by both TSS upstream 10Kb and TES downstream 10Kb. - - - - diff -r 7cac660d1c0a -r 9a33e347a3de read_duplication.xml --- a/read_duplication.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ - - determines reads duplication rate with sequence-based and mapping-based strategies - - R - rseqc - - read_duplication.py -i $input -o output -u $upLimit - - - - - - - - - - - - -.. image:: https://code.google.com/p/rseqc/logo?cct=1336721062 - ------ - -About RSeQC -+++++++++++ - -The RSeQC package provides a number of useful modules that can comprehensively evaluate high throughput sequence data especially RNA-seq data. “Basic modules” quickly inspect sequence quality, nucleotide composition bias, PCR bias and GC bias, while “RNA-seq specific modules” investigate sequencing saturation status of both splicing junction detection and expression estimation, mapped reads clipping profile, mapped reads distribution, coverage uniformity over gene body, reproducibility, strand specificity and splice junction annotation. - -The RSeQC package is licensed under the GNU GPL v3 license. - -Inputs -++++++++++++++ - -Input BAM/SAM file - Alignment file in BAM/SAM format. - -Upper Limit of Plotted Duplicated Times (default=500) - Only used for plotting. - -Output -++++++++++++++ - -1. output.dup.pos.DupRate.xls: Read duplication rate determined from mapping position of read. First column is "occurrence" or duplication times, second column is number of uniquely mapped reads. -2. output.dup.seq.DupRate.xls: Read duplication rate determined from sequence of read. First column is "occurrence" or duplication times, second column is number of uniquely mapped reads. -3. output.DupRate_plot.r: R script to generate pdf file -4. output.DupRate_plot.pdf: graphical output generated from R script - -.. image:: http://dldcc-web.brc.bcm.edu/lilab/liguow/RSeQC/figure/duplicate.png - - - diff -r 7cac660d1c0a -r 9a33e347a3de samtoolshelper.py --- a/samtoolshelper.py Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ -import sys -import subprocess as sp -import os - -# Creates the sorted and indexed bam/bai files that are requried for both bam2wig and RSEQC_count -def samtools_sorted(bam): - sortedbam = bam + ".sorted" - indexedbam = ".".join([sortedbam,"bam.bai"]) - sp.call(['samtools', 'sort', '-m 1000000000', bam, sortedbam]) - sortedbam = sortedbam + '.bam' - sp.call(['samtools', 'index', sortedbam, indexedbam]) - return sortedbam - -def main(args): - args[2] = samtools_sorted(args[2]) - sp.call(args) - - -if __name__ == "__main__": - main(sys.argv[1:]) \ No newline at end of file diff -r 7cac660d1c0a -r 9a33e347a3de tool_dependencies.xml --- a/tool_dependencies.xml Thu Jul 11 12:25:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - - - - - http://CRAN.R-project.org/src/base/R-2/R-2.15.1.tar.gz - ./configure --prefix=$INSTALL_DIR - make - - $INSTALL_DIR/bin - - - - - You need a FORTRAN compiler or perhaps f2c in addition to a C compiler to build R. - - - - - - http://sourceforge.net/projects/samtools/files/samtools/0.1.18/samtools-0.1.18.tar.bz2 - sed -i .bak -e 's/-lcurses/-lncurses/g' Makefile - make - - samtools - $INSTALL_DIR/bin - - - misc/maq2sam-long - $INSTALL_DIR/bin - - - $INSTALL_DIR/bin - - - - Compiling SAMtools requires the ncurses and zlib development libraries. - - - - - http://sourceforge.net/projects/rseqc/files/RSeQC-2.3.7.tar.gz - python setup.py install --root - - . - $INSTALL_DIR/lib/rseqc - - - $INSTALL_DIR/bin - - - - - RSeQC version 2.3.7, documentation available at http://dldcc-web.brc.bcm.edu/lilab/liguow/CGI/rseqc/_build/html/index.html#. - - - -