# HG changeset patch # User victor # Date 1330963954 18000 # Node ID 4edac0183857ff164352b7304a14b5020fe9008a Initial commit from tarball version 1.17 diff -r 000000000000 -r 4edac0183857 rsem/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rsem/README Mon Mar 05 11:12:34 2012 -0500 @@ -0,0 +1,92 @@ +# RSEM Galaxy Wrapper # + +## Introduction ## + +RSEM (RNA-Seq by Expectation-Maximization) is a software package for the +estimation of gene and isoform abundances from RNA-Seq data. A key feature of +RSEM is its statistically-principled approach to the handling of RNA-Seq +reads that map to multiple genes and/or isoforms. In addition, RSEM is +well-suited to performing quantification with de novo transcriptome +assemblies, as it does not require a reference genome. + +## Installation ## + +Follow the [Galaxy Tool Shed +instructions](http://wiki.g2.bx.psu.edu/Tool_Shed) to add this wrapper from +the tool shed to your galaxy instance. Once the files are in the tools +directory you have to have RSEM references installed. This can be done by: + +1. Placing the file called `rsem_indices.loc` into the directory + `~/galaxy-dist/tool-data` This file tells the RSEM wrapper how to find the + reference(s). It is formatted according to galaxy's documentation with the + following tab-delimited format: + + unique_build_id dbkey display_name file_base_path + + For example, + + human_refseq_NM human_refseq_NM human_refseq_NM /opt/galaxy/references/human/1.1.2/NM_refseq_ref + +2. Downloaded a pre-built RSEM reference from the [RSEM website](http://deweylab.biostat.wisc.edu/rsem/). + +3. Place reference files into the `file_base_path` listed in the +`rsem_indices.loc` file + +If you would rather build your own reference files follow the instructions +below and then place resulting reference files into the `file_base_path` listed +in the `rsem_indices.loc` file. + +### Building a custom RSEM reference ### + +For instructions on how to build the RSEM reference files, first see the [RSEM +documentation](http://deweylab.biostat.wisc.edu/rsem/README.html). + +#### Example #### + +Suppose we have mouse RNA-Seq data and want to use the UCSC mm9 version of the +mouse genome. We have downloaded the UCSC Genes transcript annotations in GTF +format (as mm9.gtf) using the Table Browser and the knownIsoforms.txt file for +mm9 from the UCSC Downloads. We also have all chromosome files for mm9 in the +directory `/data/mm9`. We want to put the generated reference files under +`/opt/galaxy/references` with name `mouse_125`. We'll add poly(A) tails with +length 125. Please note that GTF files generated from UCSC's Table Browser do +not contain isoform-gene relationship information. For the UCSC Genes +annotation, this information can be obtained from the knownIsoforms.txt file. +Suppose we want to build Bowtie indices and Bowtie executables are found in +`/sw/bowtie`. + +To build the reference files, first run the command: + + rsem-prepare-reference --gtf mm9.gtf \ + --transcript-to-gene-map knownIsoforms.txt \ + --bowtie-path /sw/bowtie \ + /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \ + /opt/galaxy/references/mouse_125 + +To add this reference to your galaxy installation, add the following line to +the the `rsem_indices.loc` file: + + mouse_125 mouse_125 mouse_125 /opt/galaxy/references/mouse_125 + +Then restart galaxy and you should see the `mouse_125` reference listed in the +RSEM wrapper. + +## References ## + +* [RSEM website (stand alone package)](http://deweylab.biostat.wisc.edu/rsem/) + +* B. Li and C. Dewey (2011) [RSEM: accurate transcript quantification from + RNA-Seq data with or without a reference + genome](http://bioinformatics.oxfordjournals.org/content/26/4/493.abstract). + BMC Bioinformatics 12:323. + +* B. Li, V. Ruotti, R. Stewart, J. Thomson, and C. Dewey (2010) [RNA-Seq gene + expression estimation with read mapping + uncertainty](http://www.biomedcentral.com/1471-2105/12/323). Bioinformatics + 26(4): 493-500. + +## Contact information ## +* RSEM galaxy wrapper questions: ruotti@wisc.edu +* RSEM stand alone package questions: bli@cs.wisc.edu +* [RSEM announcements mailing list](http://groups.google.com/group/rsem-announce) +* [RSEM users mailing list](http://groups.google.com/group/rsem-users) diff -r 000000000000 -r 4edac0183857 rsem/rsem-1.1.17.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rsem/rsem-1.1.17.xml Mon Mar 05 11:12:34 2012 -0500 @@ -0,0 +1,572 @@ + + RNA-Seq by Expectation-Maximization + + +## DEFAULT PARAMETERS +rsem-wrapper-1.1.17.pl --calc-ci $useci.ci --fragment-length-mean $fraglenmean --fragment-length-min +$fraglenmin --fragment-length-sd $fraglensd --fragment-length-max $fraglenmax --bowtie-e +$bowtie_e --bowtie-m $bowtie_m + + #if $input.format=="fastq" + ## IF FASTQ AND SINGLE END READS (DEFAULTS) + #if $input.fastqmatepair.matepair=="single" #rsem-wrapper-1.1.17.pl --bam_genome $bam_genome --bamtype $bamtype + --seed-length $seedlength $input.fastq_select --estimate-rspd $rspd --forward-prob + $fprob -p $cpus --bowtie-n $bowtie_mis --output-genome-bam --single_fastq $singlefastq + --output $output --isoformfile $isoforms --bamfile $bam_res --log $log + --sampling-for-bam $sampling_for_bam --reference ${index.fields.path} + #end if + ## IF FASTQ AND PAIRED END READS (DEFAULTS) + #if $input.fastqmatepair.matepair=="paired" #rsem-wrapper-1.1.17.pl --bam_genome $bam_genome --bamtype $bamtype + --paired-end --seed-length $seedlength --estimate-rspd $rspd $input.fastq_select --forward-prob $fprob -p $cpus + --bowtie-n $bowtie_mis --output-genome-bam --fastq1 $fastq1 --fastq2 $fastq2 --output + $output --isoformfile $isoforms --bamfile $bam_res --log $log --sampling-for-bam + $sampling_for_bam --reference ${index.fields.path} + #end if + #end if + #if $input.format=="fasta" + ## IF FASTA AND SINGLE END READS (DEFAULTS) + #if $input.fastamatepair.matepair=="single" #rsem-wrapper-1.1.17.pl --bam_genome $bam_genome --bamtype $bamtype + --no-qualities --seed-length $seedlength --estimate-rspd $rspd --forward-prob $fprob -p $cpus --bowtie-n $bowtie_mis + --output-genome-bam --single_fasta $single_fasta --output $output --isoformfile + $isoforms --bamfile $bam_res --log $log --sampling-for-bam $sampling_for_bam --reference + ${index.fields.path} + #end if + ## IF FASTA AND PAIRED END READS (DEFAULTS) + #if $input.fastamatepair.matepair=="paired" #rsem-wrapper-1.1.17.pl --bam_genome $bam_genome --bamtype $bamtype + --no-qualities --paired-end --seed-length $seedlength --estimate-rspd $rspd --forward-prob $fprob -p $cpus + --bowtie-n $bowtie_mis --output-genome-bam --fasta1 $fasta1 --fasta2 $fasta2 --output + $output --isoformfile $isoforms --bamfile $bam_res --log $log --sampling-for-bam + $sampling_for_bam --reference ${index.fields.path} + #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + bamtype == "yes" + + + + + + + +RSEM HOME PAGE - http://deweylab.biostat.wisc.edu/rsem/ + +NAME + rsem-calculate-expression + +SYNOPSIS + rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] --paired-end upstream_read_file/s downstream_read_file/s reference_name sample_name + rsem-calculate-expression [options] --sam/--bam [--paired-end] input reference_name sample_name + +ARGUMENTS + upstream_read_files/s + Comma-separated list of files containing single-end reads or + upstream reads for paired-end data. By default, these files are + assumed to be in FASTQ format. If the --no-qualities option is + specified, then FASTA format is expected. + + downstream_read_file/s + Comma-separated list of files containing downstream reads which are + paired with the upstream reads. By default, these files are assumed + to be in FASTQ format. If the --no-qualities option is specified, + then FASTA format is expected. + + input + SAM/BAM formatted input file. If "-" is specified for the filename, + SAM/BAM input is instead assumed to come from standard input. RSEM + requires all alignments of the same read group together. For + paired-end reads, RSEM also requires the two mates of any alignment + be adjacent. See Description section for how to make input file obey + RSEM's requirements. + + reference_name + The name of the reference used. The user must have run + 'rsem-prepare-reference' with this reference_name before running + this program. + + sample_name + The name of the sample analyzed. All output files are prefixed by + this name (e.g., sample_name.genes.results) + +OPTIONS + + --paired-end + Input reads are paired-end reads. (Default: off) + + --no-qualities + Input reads do not contain quality scores. (Default: off) + + --strand-specific + The RNA-Seq protocol used to generate the reads is strand specific, + i.e., all (upstream) reads are derived from the forward strand. This + option is equivalent to --forward-prob=1.0. With this option set, if + RSEM runs the Bowtie aligner, the '--norc' Bowtie option will be + used, which disables alignment to the reverse strand of transcripts. + (Default: off) + + --sam + Input file is in SAM format. (Default: off) + + --bam + Input file is in BAM format. (Default: off) + + --sam-header-info [file] + RSEM reads header information from input by default. If this option + is on, header information is read from the specified file. For the + format of the file, please see SAM official website. (Default: "") + + -p/--num-threads [int] + Number of threads to use. Both Bowtie and expression estimation will + use this many threads. (Default: 1) + + --no-bam-output + Do not output any BAM file. (Default: off) + + --output-genome-bam + Generate a BAM file, 'sample_name.genome.bam', with alignments + mapped to genomic coordinates and annotated with their posterior + probabilities. In addition, RSEM will call samtools (included in + RSEM package) to sort and index the bam file. + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' will be generated. (Default: + off) + + --sampling-for-bam + When RSEM generates a BAM file, instead of outputing all alignments + a read has with their posterior probabilities, one alignment is + sampled and outputed according to the posterior probabilities. If + the sampling result is that the read comes from the "noise" + transcript, nothing is outputed. (Default: off) + + --calc-ci + Calculate 95% credibility intervals and posterior mean estimates. + (Default: off) + + --seed-length [int] + Seed length used by the read aligner. Providing the correct value is + important for RSEM. If RSEM runs Bowtie, it uses this value for + Bowtie's seed length parameter. Any read with its or at least one of + its mates' (for paired-end reads) length less than this value will + be ignored. If the references are not added poly(A) tails, the + minimum allowed value is 5, otherwise, the minimum allowed value is + 25. Note that this script will only check if the value less or equal than + 5 and give a warning message if the value less than 25 but greter or equal than + 5. (Default: 25) + + --tag [string] + The name of the optional field used in the SAM input for identifying + a read with too many valid alignments. The field should have the + format [tagName]:i:[value], where a [value] bigger than 0 indicates + a read with too many alignments. (Default: "") + + --bowtie-path [path] + The path to the bowtie executables. (Default: the path to the bowtie + executables is assumed to be in the user's PATH environment + variable) + + --bowtie-n [int] + (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, + Default: 2) + + --bowtie-e [int] + (Bowtie parameter) max sum of mismatch quality scores across the + alignment. (Default: 99999999) + + --bowtie-m [int] + (Bowtie parameter) suppress all alignments for a read if greater then [int] + valid alignments exist. (Default: 200) + + --bowtie-chunkmbs [int] + (Bowtie parameter) memory allocated for best first alignment + calculation (Default: 0 - use bowtie's default) + + --phred33-quals + Input quality scores are encoded as Phred+33. (Default: on) + + --phred64-quals + Input quality scores are encoded as Phred+64 (default for GA + Pipeline ver. less than 1.3). (Default: off) + + --solexa-quals + Input quality scores are solexa encoded (from GA Pipeline ver. less + than 1.3). (Default: off) + + --forward-prob [double] + Probability of generating a read from the forward strand of a + transcript. Set to 1 for a strand-specific protocol where all + (upstream) reads are derived from the forward strand, 0 for a + strand-specific protocol where all (upstream) read are derived from + the reverse strand, or 0.5 for a non-strand-specific protocol. + (Default: 0.5) + + --fragment-length-min [int] + Minimum read/insert length allowed. This is also the value for the + bowtie -I option. (Default: 1) + + --fragment-length-max [int] + Maximum read/insert length allowed. This is also the value for the + bowtie -X option. (Default: 1000) + + --fragment-length-mean [double] + (single-end data only) The mean of the fragment length distribution, + which is assumed to be a Gaussian. (Default: -1, which disables use + of the fragment length distribution) + + --fragment-length-sd [double] + (single-end data only) The standard deviation of the fragment length + distribution, which is assumed to be a Gaussian. (Default: 0, which + assumes that all fragments are of the same length, given by the + rounded value of --fragment-length-mean) + + --estimate-rspd + Set this option if you want to estimate the read start position + distribution (RSPD) from data. Otherwise, RSEM will use a uniform + RSPD. (Default: off) + + --num-rspd-bins [int] + Number of bins in the RSPD. Only relevant when '--estimate-rspd' is + specified. Use of the default setting is recommended. (Default: 20) + + --ci-memory [int] + Maximum size (in memory, MB) of the auxiliary buffer used for + computing credibility intervals (CI). Set it larger for a faster CI + calculation. However, leaving 2 GB memory free for other usage is + recommended. (Default: 1024) + + --keep-intermediate-files + Keep temporary files generated by RSEM. RSEM creates a temporary + directory, 'sample_name.temp', into which it puts all intermediate + output files. If this directory already exists, RSEM overwrites all + files generated by previous RSEM runs inside of it. By default, + after RSEM finishes, the temporary directory is deleted. Set this + option to prevent the deletion of this directory and the + intermediate files inside of it. (Default: off) + + --time + Output time consumed by each step of RSEM to 'sample_name.time'. + (Default: off) + + -q/--quiet + Suppress the output of logging information. (Default: off) + + -h/--help + Show help information. + +DESCRIPTION + In its default mode, this program aligns input reads against a reference + transcriptome with Bowtie and calculates expression values using the + alignments. RSEM assumes the data are single-end reads with quality + scores, unless the '--paired-end' or '--no-qualities' options are + specified. Users may use an alternative aligner by specifying one of the + --sam and --bam options, and providing an alignment file in the + specified format. However, users should make sure that they align + against the indices generated by 'rsem-prepare-reference' and the + alignment file satisfies the requirements mentioned in ARGUMENTS + section. + + One simple way to make the alignment file satisfying RSEM's requirements + (assuming the aligner used put mates in a paired-end read adjacent) is + to use 'convert-sam-for-rsem' script. This script only accept SAM format + files as input. If a BAM format file is obtained, please use samtools to + convert it to a SAM file first. For example, if '/ref/mouse_125' is the + 'reference_name' and the SAM file is named 'input.sam', you can run the + following command: + + convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam + + For details, please refer to 'convert-sam-for-rsem's documentation page. + + The SAM/BAM format RSEM uses is v1.4. However, it is compatible with old + SAM/BAM format. However, RSEM cannot recognize 0x100 in the FLAG field. + In addition, RSEM requires SEQ and QUAL are not '*'. + + The user must run 'rsem-prepare-reference' with the appropriate + reference before using this program. + + For single-end data, it is strongly recommended that the user provide + the fragment length distribution parameters (--fragment-length-mean and + --fragment-length-sd). For paired-end data, RSEM will automatically + learn a fragment length distribution from the data. + + Please note that some of the default values for the Bowtie parameters + are not the same as those defined for Bowtie itself. + + The temporary directory and all intermediate files will be removed when + RSEM finishes unless '--keep-intermediate-files' is specified. + + With the '--calc-ci' option, 95% credibility intervals and posterior + mean estimates will be calculated in addition to maximum likelihood + estimates. + +OUTPUT + sample_name.genes.results + File containing gene level expression estimates. The format of each + line in this file is: + + gene_id expected_counts tau_value [pmc_value tau_pme_value + tau_ci_lower_bound tau_ci_upper_bound] transcript_id_list + + Fields are separated by the tab character. Fields within "[]" are + only presented if '--calc-ci' is set. pme stands for posterior mean + estimation. pmc stands for posterior mean counts. ci_lower_bound(l) + means the lower bound of the credibility intervals, + ci_upper_bound(u) means the upper bound of the credibility + intervals. So the credibility interval is [l, u]. + 'transcript_id_list' is a space-separated list of transcript_ids + belonging to the gene. If no gene information is provided, this file + has the same content as 'sample_name.isoforms.results'. + + sample_name.isoforms.results + File containing isoform level expression values. The format of each + line in this file is: + + transcript_id expected_counts tau_value [pmc_value tau_pme_value + tau_ci_lower_bound tau_ci_upper_bound] gene_id + + Fields are separated by the tab character. 'gene_id' is the gene_id + of the gene which this transcript belongs to. If no gene information + is provided, 'gene_id' and 'transcript_id' are the same. + + sample_name.transcript.bam, sample_name.transcript.sorted.bam and + sample_name.transcript.sorted.bam.bai + Only generated when --no-bam-output is not specified. + + 'sample_name.transcript.bam' is a BAM-formatted file of read + alignments in transcript coordinates. The MAPQ field of each + alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), + where w is the posterior probability of that alignment being the + true mapping of a read. In addition, RSEM pads a new tag ZW:f:value, + where value is a single precision floating number representing the + posterior probability. + + 'sample_name.transcript.sorted.bam' and + 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). + + sample_name.genome.bam, sample_name.genome.sorted.bam and + sample_name.genome.sorted.bam.bai + Only generated when --no-bam-output is not specified and + --output-genome-bam is specified. + + 'sample_name.genome.bam' is a BAM-formatted file of read alignments + in genomic coordinates. Alignments of reads that have identical + genomic coordinates (i.e., alignments to different isoforms that + share the same genomic region) are collapsed into one alignment. The + MAPQ field of each alignment is set to min(100, floor(-10 * + log10(1.0 - w) + 0.5)), where w is the posterior probability of that + alignment being the true mapping of a read. In addition, RSEM pads a + new tag ZW:f:value, where value is a single precision floating + number representing the posterior probability. If an alignment is + spliced, a XS:A:value tag is also added, where value is either '+' + or '-' indicating the strand of the transcript it aligns to. + + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). + + sample_name.sam.gz + Only generated when the input files are raw reads instead of SAM/BAM + format files + + It is the gzipped SAM output produced by bowtie aligner. + + sample_name.time + Only generated when --time is specified. + + It contains time (in seconds) consumed by aligning reads, estimating + expression levels and calculating credibility intervals. + + sample_name.stat + This is a folder instead of a file. All model related statistics are + stored in this folder. Use 'rsem-plot-model' can generate plots + using this folder. + +EXAMPLES + Assume the path to the bowtie executables is in the user's PATH + environment variable. Reference files are under '/ref' with name + 'mouse_125'. + + 1) '/data/mmliver.fq', single-end reads with quality scores. Quality + scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 + threads and generate a genome BAM file: + + rsem-calculate-expression --phred64-quals \ + -p 8 \ + --output-genome-bam \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with + quality scores. Quality scores are in SANGER format. We want to use 8 + threads and do not generate a genome BAM file: + + rsem-calculate-expression -p 8 \ + --paired-end \ + /data/mmliver_1.fq \ + /data/mmliver_2.fq \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 3) '/data/mmliver.fa', single-end reads without quality scores. We want + to use 8 threads: + + rsem-calculate-expression -p 8 \ + --no-qualities \ + /data/mmliver.fa \ + /ref/mouse_125 \ + mmliver_single_without_quals + + 4) Data are the same as 1). We want to take a fragment length + distribution into consideration. We set the fragment length mean to 150 + and the standard deviation to 35. In addition to a BAM file, we also + want to generate credibility intervals. We allow RSEM to use 1GB of + memory for CI calculation: + + rsem-calculate-expression --bowtie-path /sw/bowtie \ + --phred64-quals \ + --fragment-length-mean 150.0 \ + --fragment-length-sd 35.0 \ + -p 8 \ + --output-genome-bam \ + --calc-ci \ + --ci-memory 1024 \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality + scores. We want to use 8 threads: + + rsem-calculate-expression --paired-end \ + --bam \ + -p 8 \ + /data/mmliver_paired_end_quals.bam \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + diff -r 000000000000 -r 4edac0183857 rsem/rsem-wrapper-1.1.17.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rsem/rsem-wrapper-1.1.17.pl Mon Mar 05 11:12:34 2012 -0500 @@ -0,0 +1,238 @@ +#!/usr/bin/perl + + +use Data::Dumper; +use Getopt::Long; +use Pod::Usage; + + + +#pod2usage(-verbose => 1) if ($help == 1); +#if (@ARGV == 0) { +# pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2); +#} + +my $rsem_version = "/opt/rsem-1.1.17"; +my $minL = 1; +my $maxL = 1000; +my $NMB = 1024; + +# Extra file output #beta +# --isoformfile $isoforms +# --thetafile $theta +# --cntfile $cnt +# --modelfile $model +# --bamfile $bam_res + +GetOptions( + "log=s" => \$log, + "bam_genome=s" => \$bam_genome, + "bamtype=s" => \$bamtype, + "isoformfile=s" => \$isoforms, + "reference=s" => \$dbref, + "sampling-for-bam=s" => \$samplingbam, + "thetafile=s" => \$theta, + "cntfile=s" => \$cnt, + "modelfile=s" => \$model, + "bamfile=s" => \$bamfile, + "output=s" => \$output, + "single_fasta=s" => \$single_fasta, + "fasta1=s" => \$fasta1, + "fasta2=s" => \$fasta2, + "single_fastq=s" => \$single_fastq, + "fastq1=s" => \$fastq1, + "fastq2=s" => \$fastq2, + "no-qualities" => \$no_qual, + "paired-end" => \$paired_end, + "sam" => \$is_sam, + "bam" => \$is_bam, + "sam-header-info=s" => \$fn_list, + "tag=s" => \$tagName, + "seed-length=i" => \$L, + "bowtie-path=s" => \$bowtie_path, + "bowtie-n=i" => \$C, + "bowtie-e=i" => \$E, + "bowtie-m=i" => \$maxHits, + "phred33-quals" => \$phred33, + "phred64-quals" => \$phred64, + "solexa-quals" => \$solexa, + "forward-prob=f" => \$probF, + "fragment-length-min=i" => \$minL, + "fragment-length-max=i" => \$maxL, + "fragment-length-mean=f" => \$mean, + "fragment-length-sd=f" => \$sd, + "estimate-rspd=s" => \$estRSPD, + "num-rspd-bins=i" => \$B, + "p|num-threads=i" => \$nThreads, + "output-genome-bam" => \$genBamF, + "calc-ci=s" => \$calcCI, + "ci-memory=i" => \$NMB, + "time" => \$mTime, + "q|quiet" => \$quiet, +) or pod2usage( -exitval => 2, -verbose => 2 ); + +#check parameters and options + +if ($is_sam || $is_bam) { + pod2usage(-msg => "from rsem-wrapper->Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 4); + pod2usage(-msg => "--sam and --bam cannot be active at the same time!", -exitval => 2, -verbose => 2) if ($is_sam == 1&& $is_bam == 1); + pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m, --phred33-quals, --phred64-quals or --solexa-quals cannot be set if input is SAM/BAM format!", -exitval => 2, -verbose => 2) if ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200 || $phred33 || $phred64 || $solexa); +} +#else { +# pod2usage(-msg => "from rsem-wraper->Invalid number of arguments!", -exitval => 2, -verbose => 2) +# if (!$paired_end && scalar(@ARGV) != 1 || $paired_end && scalar(@ARGV) != 1); +# pod2usage(-msg => "Only one of --phred33-quals --phred64-quals/--solexa1.3-quals --solexa-suqls can be active!", -exitval => 2, -verbose => 2) if ($phred33 + $phred64 + $solexa > 1); +# podwusage(-msg => "--sam , --bam or --sam-header-info cannot be set if use bowtie aligner to produce alignments!", -exitval => 2, -verbose => 2) if ($is_sam || $is_bam || $fn_list ne ""); +#} + +pod2usage(-msg => "Forward probability should be in [0, 1]!", -exitval => 2, -verbose => 2) if ($probF < 0 || $probF > 1); +pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -verbose => 2) if ($minL < 1); +pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL); +pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1); +pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1); + +# IO Redirection to log file +use IO::Handle; +open OUTPUT, '>', $log or die "cant open file $log $!\n";; +open ERROR, '>>', $log or die "cant open file $log $!\n"; +STDOUT->fdopen( \*OUTPUT, 'w' ) or die "cant open file $!\n"; +STDERR->fdopen( \*ERROR, 'w' ) or die "cant open file $!\n"; +# + +my @options; + +# generates new output called sample_name.genome.bam +# with alignments +# mapped to genomic coordinates and annotated with their posterior +# probabilities. In addition, RSEM will call samtools (included in +# RSEM package) to sort and index the bam file. +# 'sample_name.genome.sorted.bam' and +# 'sample_name.genome.sorted.bam.bai' will be generated. (Default: off) + +if ($bamtype eq "yes") { + my $bam_genome_par = "--output-genome-bam"; + push @options, $bam_genome_par; +} +if ($samplingbam eq "yes") { + my $samplingbam = "--sampling-for-bam"; + push @options, $samplingbam; +} +if ($estRSPD eq "yes") { + my $rspd = "--estimate-rspd"; + push @options, $rspd; +} +$probF = "--forward-prob $probF"; +push @options, $probF; + +if ($calcCI eq "yes") { + my $calcCI = "--calc-ci"; + push @options, $calcCI; + my $cimem = "--ci-memory $NMB"; + push @options, $cimem; +} +if ($tagName) { + my $tagName = "--tag $tagName"; + push @options, $tagName; +} +if ($L) { + my $L = "--seed-length $L"; + push @options, $L; +} +if ($C) { + my $C = "--bowtie-n $C"; + push @options, $C; +} +if ($E) { + my $E = "--bowtie-e $E"; + push @options, $E; +} +if ($maxHits) { + my $maxHits = "--bowtie-m $maxHits"; + push @options, $maxHits; +} +if ($minL != 1) { + my $minL = "--fragment-length-min $minL"; + push @options, $minL; +} +if ($maxL != 1000) { + my $maxL = "--fragment-length-max $maxL"; + push @options, $maxL; +} +if ($mean) { + my $mean = "--fragment-length-mean $mean"; + push @options, $mean; +} +if ($sd) { + my $sd = "--fragment-length-sd $sd"; + push @options, $sd; +} +my $options= join(" ", @options); + +#BUILD COMMAND BASED ON PARSED OPTIONS +if ($no_qual) { + #reads are in fasta file format + if ($paired_end) { # reads are in paired end + my $cmd = "$rsem_version/rsem-calculate-expression --quiet --no-qualities --paired-end -p $nThreads $options $fasta1 $fasta2 $dbref $output"; + print "RSEM Parameters used by Galaxy:\n$cmd\n"; + system($cmd); + } + #run single end with one fasta file + else { + my $cmd = "$rsem_version/rsem-calculate-expression --quiet --no-qualities -p $nThreads $options $single_fasta $dbref $output"; + print "RSEM Parameters used by Galaxy:\n$cmd\n"; + system($cmd); + } +} +else { + # reads are in fastq file format + # type of fastq file? + my $fastqtype; + if ($phred33) { + $fastqtype = "--phred33-quals"; + } + elsif ($phred64) { + $fastqtype = "--phred64-quals"; + } + elsif ($solexa) { + $fastqtype = "--solexa-quals"; + } + if ($paired_end) { + #reads in paired end + #run paired end with two fasq files + my $cmd = "$rsem_version/rsem-calculate-expression --quiet --paired-end -p $nThreads $options $fastqtype $fastq1 $fastq2 $dbref $output"; + print "RSEM Parameters used by Galaxy:\n$cmd\n"; + system($cmd); + } + else { + my $cmd = "$rsem_version/rsem-calculate-expression --quiet -p $nThreads $options $fastqtype $single_fastq $dbref $output"; + print "RSEM Parameters used by Galaxy:\n$cmd\n"; + system($cmd); + } +} + + #Rename files for galaxy +my $mv_genes = "mv $output.genes.results $output"; +my $mv_isoforms = "mv $output.isoforms.results $isoforms"; + +#print "bamtype-parameter=$bamtype\n"; +my $mv_bam_transcript; +my $mv_bam_genome; +if ($bamtype eq "yes") { + $mv_bam_genome = "mv $output.genome.sorted.bam $bam_genome"; + system($mv_bam_genome); +} + +$mv_bam_transcript = "mv $output.transcript.sorted.bam $bamfile"; + +my @rsem_dir = split(/\//, $output); +my $short_output = $rsem_dir[-1]; +my $mv_theta = "mv $output.stat/$short_output.theta $theta"; +my $mv_cnt = "mv $output.stat/$short_output.cnt $cnt"; +my $mv_model = "mv $output.stat/$short_output.model $model"; +system($mv_genes); +system($mv_isoforms); +system($mv_bam_transcript); +#system($mv_theta); +#system($mv_cnt); +#system($mv_model); +#print "LOG $mv\n"; diff -r 000000000000 -r 4edac0183857 rsem/rsem_indices.loc.example --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rsem/rsem_indices.loc.example Mon Mar 05 11:12:34 2012 -0500 @@ -0,0 +1,39 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Bowtie indexed sequences data files. You will +#need to create these data files and then create a bowtie_indices.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The bowtie_indices.loc +#file has this format (longer white space characters are TAB characters): +# +# +# +#So, for example, if you had hg18 indexed stored in +#/depot/data2/galaxy/bowtie/hg18/, +#then the bowtie_indices.loc entry would look like this: +# +#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18 +human_refseq_NM human_refseq_NM human_refseq_NM /opt/galaxy/references/human/1.1.2/NM_refseq_ref + +# +#and your /depot/data2/galaxy/bowtie/hg18/ directory +#would contain hg18.*.ebwt files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt +#...etc... +# +#Your bowtie_indices.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon +#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full +#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19 +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +#