rsem: rsem-bwt2.xml comparison

comparison rsem-bwt2.xml @ 0:e5e836936d60 draft

planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit d84a0359354698a4b29df12ab581c2618bffcf80

author	artbio
date	Sat, 31 Mar 2018 21:30:07 -0400
parents
children	49795544dac7

comparison

equal deleted inserted replaced

--1:000000000000
+:e5e836936d60
+<tool id="rsembowtie2" name="RSEM-Bowtie2" version="0.4.0">
+<description></description>
+<macros>
+<import>macros.xml</import>
+</macros>
+<requirements>
+<requirement type="package" version="1.3.0">rsem</requirement>
+<requirement type="package" version="2.3.4">bowtie2</requirement>
+</requirements>
+<!-- Declares that any exit code of 1 or higher is reported at "warning" level
+     with the description "Tool exception".
+     NOTE(review): the <command> element that follows also sets
+     detect_errors="exit_code"; confirm which of the two error-handling
+     declarations Galaxy actually applies, and whether a failing RSEM run is
+     meant to be a warning or a job failure. -->
+<stdio>
+<exit_code range="1:" level="warning" description="Tool exception" />
+</stdio>
+<command detect_errors="exit_code"><![CDATA[
+## ---- Step 1 (optional): build an RSEM reference with Bowtie2 indices ----
+#if $job.select_job == "index":
+## The primary output dataset only holds the reference name; the reference
+## files themselves are written into the dataset's extra-files directory.
+echo ${job.reference_name} " " | tee $reference_file &&
+mkdir $reference_file.files_path &&
+rsem-prepare-reference
+## poly(A) tail handling
+#if $job.polya.polya_use == 'add':
+#if $job.polya.polya_length:
+--polyA-length $job.polya.polya_length
+#end if
+#elif $job.polya.polya_use == 'subset':
+## no_polya_subset is an optional dataset: pass the flag only when a file
+## was supplied, otherwise the option would be emitted without a usable value.
+#if $job.polya.no_polya_subset:
+--no-polyA-subset $job.polya.no_polya_subset
+#end if
+#if $job.polya.polya_length:
+--polyA-length $job.polya.polya_length
+#end if
+#elif $job.polya.polya_use == 'none':
+--no-polyA
+#end if
+## Boolean param: expands to "--no-ntog" when checked, to nothing otherwise
+$job.ntog
+#if $job.transcript_to_gene_map:
+--transcript-to-gene-map $job.transcript_to_gene_map
+#end if
+--bowtie2
+## Reference sequences: transcript fasta alone, or genome fasta plus GTF
+#if $job.self_reference.ref_type == 'transcripts':
+$job.self_reference.reference_fasta_file
+#else:
+--gtf $job.self_reference.gtf
+$job.self_reference.reference_fasta_file
+#end if
+## Positional reference_name argument (path prefix of the files to create)
+${reference_file.files_path}/${job.reference_name}
+## stdout of rsem-prepare-reference is kept next to the reference files
+> ${reference_file.files_path}/${job.reference_name}.log
+#end if
+## Chain the two steps with "&&" only when both are requested in this run
+#if $job.select_job == "index" and $run_rsem.select == "Yes":
+&&
+#end if
+#if $run_rsem.select == "Yes":
+## uncompress fastq.gz or fastqsanger.gz if needed
+## Stage FASTQ reads under fixed working-directory names: gzip-compressed
+## datasets are decompressed, uncompressed ones are symlinked.
+## The "fastq" conditional only exists when the input format is fastq, so the
+## whole section is skipped for fasta and sam/bam input.
+#if $run_rsem.input.format == "fastq":
+#if $run_rsem.input.fastq.matepair=="single":
+#if $run_rsem.input.fastq.singlefastq.is_of_type('fastq.gz') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger.gz'):
+gunzip < '$run_rsem.input.fastq.singlefastq' > uncomp_single.fastq &&
+#elif $run_rsem.input.fastq.singlefastq.is_of_type('fastq') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger'):
+ln -f -s '$run_rsem.input.fastq.singlefastq' 'uncomp_single.fastq' &&
+#end if
+#elif $run_rsem.input.fastq.matepair=="paired":
+## Each mate is handled according to its own datatype, so a compressed and
+## an uncompressed file can be combined in one pair.
+#if $run_rsem.input.fastq.fastq1.is_of_type('fastq.gz') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger.gz'):
+gunzip < '$run_rsem.input.fastq.fastq1' > uncomp_pair1.fastq &&
+#elif $run_rsem.input.fastq.fastq1.is_of_type('fastq') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger'):
+ln -f -s '$run_rsem.input.fastq.fastq1' 'uncomp_pair1.fastq' &&
+#end if
+#if $run_rsem.input.fastq.fastq2.is_of_type('fastq.gz') or $run_rsem.input.fastq.fastq2.is_of_type('fastqsanger.gz'):
+gunzip < '$run_rsem.input.fastq.fastq2' > uncomp_pair2.fastq &&
+#elif $run_rsem.input.fastq.fastq2.is_of_type('fastq') or $run_rsem.input.fastq.fastq2.is_of_type('fastqsanger'):
+ln -f -s '$run_rsem.input.fastq.fastq2' 'uncomp_pair2.fastq' &&
+#end if
+#end if
+#end if
+## ---- Step 2: expression estimation; the options below are appended in order ----
+rsem-calculate-expression
+## --tag string
+## NOTE(review): seedlength, forward_prob and the rsem_options conditional are
+## not declared in this file's visible <inputs>; presumably they come from the
+## "rsem_options" macro imported from macros.xml - confirm.
+#if $run_rsem.seedlength:
+--seed-length $run_rsem.seedlength
+#end if
+--forward-prob $run_rsem.forward_prob
+## The remaining tuning options are only emitted when the full parameter set is selected
+#if $run_rsem.rsem_options.fullparams == 'fullset':
+## Fragment info
+#if $run_rsem.rsem_options.fragment_length_mean:
+--fragment-length-mean $run_rsem.rsem_options.fragment_length_mean
+#end if
+#if $run_rsem.rsem_options.fragment_length_min:
+--fragment-length-min $run_rsem.rsem_options.fragment_length_min
+#end if
+#if $run_rsem.rsem_options.fragment_length_sd:
+--fragment-length-sd $run_rsem.rsem_options.fragment_length_sd
+#end if
+#if $run_rsem.rsem_options.fragment_length_max:
+--fragment-length-max $run_rsem.rsem_options.fragment_length_max
+#end if
+## RSPD
+#if $run_rsem.rsem_options.rspd.estimate == 'yes':
+--estimate-rspd
+#if $run_rsem.rsem_options.rspd.num_rspd_bins:
+--num-rspd-bins $run_rsem.rsem_options.rspd.num_rspd_bins
+#end if
+#end if
+## Calculate 95% credibility intervals and posterior mean estimates.
+#if $run_rsem.rsem_options.useci.ci == 'yes':
+--calc-ci
+#if $run_rsem.rsem_options.useci.cimem:
+--ci-memory $run_rsem.rsem_options.useci.cimem
+#end if
+#end if
+#end if
+## The backslash keeps Cheetah from expanding the variable, so the shell
+## resolves GALAXY_SLOTS at run time and falls back to 4 when it is unset.
+--num-threads \${GALAXY_SLOTS:-4}
+--bowtie2
+## Bowtie2 alignment parameters. The bowtie2_options macro is expanded for both
+## the fastq and the fasta input cases, so the user's choices are honoured for
+## every read-based input; only sam/bam input (already aligned) has none.
+## The format test comes first so that bowtie2_options is never looked up
+## when the input is sam/bam.
+#if $run_rsem.input.format != 'sam' and $run_rsem.input.bowtie2_options.fullparams == 'fullset':
+## Bowtie params
+#if $run_rsem.input.bowtie2_options.bowtie2_mismatch_rate:
+--bowtie2-mismatch-rate $run_rsem.input.bowtie2_options.bowtie2_mismatch_rate
+#end if
+#if $run_rsem.input.bowtie2_options.bowtie2_k:
+--bowtie2-k $run_rsem.input.bowtie2_options.bowtie2_k
+#end if
+#if $run_rsem.input.bowtie2_options.bowtie2_sensitivity_level:
+--bowtie2-sensitivity-level $run_rsem.input.bowtie2_options.bowtie2_sensitivity_level
+#end if
+#end if
+## Outputs
+## 'none'    : no BAM at all
+## 'default' : coordinate-sorted transcript BAM
+## 'both'    : coordinate-sorted transcript BAM plus genome BAM
+#if $run_rsem.rsem_outputs.result_bams == 'none':
+--no-bam-output
+#else
+--sort-bam-by-coordinate
+#if $run_rsem.rsem_outputs.result_bams == 'both':
+--output-genome-bam
+#end if
+## sampling_for_bam is offered for 'default' as well as 'both', so it is
+## passed whenever any BAM file is produced.
+$run_rsem.rsem_outputs.sampling_for_bam
+#end if
+## Input data
+## Emits the read-file arguments, in the order given here, for the selected input format.
+#if $run_rsem.input.format=="fastq"
+## Quality-encoding flag chosen by the user (the option values are the flags themselves)
+$run_rsem.input.fastq_select
+## FASTQ reads are referenced by the fixed names used in the staging section above
+#if $run_rsem.input.fastq.matepair=="single":
+uncomp_single.fastq
+#elif $run_rsem.input.fastq.matepair=="paired":
+--paired-end
+uncomp_pair1.fastq
+uncomp_pair2.fastq
+#end if
+#elif $run_rsem.input.format=="fasta"
+## FASTA reads carry no quality scores and are passed as dataset paths directly
+--no-qualities
+#if $run_rsem.input.fasta.matepair=="single":
+$run_rsem.input.fasta.singlefasta
+#elif $run_rsem.input.fasta.matepair=="paired":
+--paired-end
+$run_rsem.input.fasta.fasta1
+$run_rsem.input.fasta.fasta2
+#end if
+#elif $run_rsem.input.format=="sam"
+## Pre-aligned input: the library type is a plain param here, not a nested conditional
+#if $run_rsem.input.matepair=="paired":
+--paired-end
+#end if
+## NOTE(review): the rsem_sam param is declared with format="rsem_sam"; confirm
+## that _extension can actually be 'sam' or 'bam' for such a dataset, otherwise
+## neither flag is emitted.
+#if $run_rsem.input.rsem_sam._extension == 'sam':
+--sam
+#elif $run_rsem.input.rsem_sam._extension == 'bam':
+--bam
+#end if
+$run_rsem.input.rsem_sam
+#end if
+## RSEM reference
+## 'history': prefix inside the extra-files directory of an existing rsem_ref dataset
+## 'self'   : prefix of the reference written by the build step of this same run
+## NOTE(review): 'self' reads job.reference_name, which is only declared when
+## select_job is "index" - confirm the two selectors cannot disagree.
+#if $run_rsem.reference.refSrc == 'history':
+${run_rsem.reference.rsem_ref.extra_files_path}/${run_rsem.reference.rsem_ref.metadata.reference_name}
+#elif $run_rsem.reference.refSrc == 'self':
+${reference_file.files_path}/${job.reference_name}
+#end if
+## sample_name: use a hard coded name so we can pull out galaxy outputs
+rsem_output
+## direct output into logfile
+> $log
+#end if
+]]></command>
+<inputs>
+<conditional name="job">
+<param name="select_job" type="select" label="rsem reference">
+<option value="index">Build rsem reference</option>
+<option value="no-index" selected="true">rsem reference available from history</option>
+</param>
+<when value="index">
+<conditional name="self_reference">
+<param name="ref_type" type="select" label="Reference transcript source">
+<option value="transcripts">transcript fasta</option>
+<option value="genomic">reference genome and gtf</option>
+</param>
+<when value="transcripts">
+<param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file"
+help="The files should contain the sequences of transcripts."/>
+</when>
+<when value="genomic">
+<param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file"
+help="The file should contain the sequence of an entire genome."/>
+<param name="gtf" type="data" format="gtf" label="gtf"
+help="extract transcript reference sequences using the gene annotations specified in this GTF" />
+</when>
+</conditional>
+<param name="transcript_to_gene_map" type="data" format="tabular" optional="true" label="Map of gene ids to transcript (isoform) ids" >
+<help>
+Each line should be of the form: gene_id transcript_id (with the two fields separated by a tab character)
+The map can be obtained from the UCSC table browser
+group: Genes and Gene Prediction Tracks
+table: knownIsoforms
+Without a map:
+If a reference genome and gtf is used, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file.
+Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene.
+</help>
+</param>
+<param name="reference_name" type="text" value="rsem_ref_name" label="reference name">
+<help>A one word name for this RSEM reference containing only letters, digits, and underscore characters</help>
+<validator type="regex" message="Use only letters, digits, and underscore characters">^\w+$</validator>
+</param>
+<!-- poly(A) tail policy for the reference: add to all transcripts, add to all
+     but a listed subset, or add to none. -->
+<conditional name="polya">
+<param name="polya_use" type="select" label="PolyA ">
+<option value="add" selected="true">Add poly(A) tails to all transcripts</option>
+<option value="subset">Exclude poly(A) tails from selected transcripts</option>
+<option value="none">Do not add poly(A) tails to any transcripts</option>
+</param>
+<when value="add">
+<param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)">
+<validator type="in_range" message="must be positive " min="1"/>
+</param>
+</when>
+<when value="subset">
+<!-- Label typo fixed: "should should" is now "should" -->
+<param name="no_polya_subset" type="data" format="tabular" optional="true" label="List of transcript IDs (one per line) that should not have polyA tails added."/>
+<param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)">
+<validator type="in_range" message="must be positive " min="1"/>
+</param>
+</when>
+<when value="none"/>
+</conditional>
+<param name="ntog" type="boolean" truevalue="--no-ntog" falsevalue="" checked="false" label="Disable the conversion of 'N' characters to 'G' characters in the reference sequences" help="Bowtie uses the automatic N to G conversion to align against all positions in the reference."/>
+</when>
+<when value="no-index">
+</when>
+</conditional>
+<conditional name="run_rsem">
+<!-- Chooses whether this run only builds a reference ("No") or also runs
+     rsem-calculate-expression ("Yes"). Option label typo fixed: "latter" is now "later". -->
+<param name="select" type="select" label="calculate expression with rsem">
+<option value="No">Just build rsem reference for later rsem profiling</option>
+<option value="Yes" selected="true">profile expression with rsem</option>
+</param>
+<when value="Yes">
+<param name="sample" type="text" value="rsem_sample" label="Sample name" />
+<conditional name="reference">
+<param name="refSrc" type="select" label="RSEM Reference Source">
+<option value="history">From your history</option>
+<option value="self">Prepare RSEM Reference with this tool</option>
+</param>
+<when value="history">
+<param name="rsem_ref" type="data" format="rsem_ref" label="RSEM reference" />
+</when>
+<when value="self">
+</when>
+</conditional>
+<conditional name="input">
+<param name="format" type="select" label="RSEM Input file type">
+<option value="fastq">FASTQ</option>
+<option value="fasta">FASTA</option>
+<option value="sam">SAM/BAM</option>
+</param>
+<when value="fastq">
+<param name="fastq_select" size="15" type="select" label="FASTQ type" >
+<option value="--phred33-quals" selected="true">phred33 qualities (default for sanger)</option>
+<option value="--solexa-quals">solexa qualities</option>
+<option value="--phred64-quals">phred64 qualities</option>
+</param>
+<conditional name="fastq">
+<param name="matepair" type="select" label="Library type">
+<option value="single">Single End Reads</option>
+<option value="paired">Paired End Reads</option>
+</param>
+<when value="single">
+<param name="singlefastq" type="data" format="fastq,fastq.gz" label="FASTQ file" />
+</when>
+<when value="paired">
+<param name="fastq1" type="data" format="fastq,fastq.gz" label="Read 1 fastq file" />
+<param name="fastq2" type="data" format="fastq,fastq.gz" label="Read 2 fastq file" />
+</when>
+</conditional>
+<expand macro="bowtie2_options"/>
+</when>
+<when value="fasta">
+<conditional name="fasta">
+<param name="matepair" type="select" label="Library Type">
+<option value="single">Single End Reads</option>
+<option value="paired">Paired End Reads</option>
+</param>
+<when value="single">
+<param name="singlefasta" type="data" format="fasta" label="fasta file" />
+</when>
+<when value="paired">
+<param name="fasta1" type="data" format="fasta" label="Read 1 fasta file" />
+<param name="fasta2" type="data" format="fasta" label="Read 2 fasta file" />
+</when>
+</conditional>
+<expand macro="bowtie2_options"/>
+</when>
+<when value="sam">
+<!-- convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam -->
+<param name="matepair" type="select" label="Library Type">
+<option value="single">Single End Reads</option>
+<option value="paired">Paired End Reads</option>
+</param>
+<param name="rsem_sam" type="data" format="rsem_sam" label="RSEM formatted SAM file" />
+</when>
+</conditional>
+<expand macro="rsem_options"/>
+<conditional name="rsem_outputs">
+<param name="result_bams" type="select" label="Create bam results files"
+help="In addition to the transcript-coordinate-based BAM file output, also output a BAM file with the read alignments in genomic coordinates" >
+<option value="none">No BAM results files</option>
+<option value="default" selected="true">Transcript BAM results file</option>
+<option value="both">Transcript and genome BAM results files</option>
+</param>
+<when value="none"/>
+<when value="default">
+<expand macro="sampling_for_bam"/>
+</when>
+<when value="both">
+<expand macro="sampling_for_bam"/>
+</when>
+</conditional>
+</when>
+<when value="No">
+</when>
+</conditional>
+</inputs>
+<outputs>
+<!-- Reference dataset: created only when a reference is built in this run -->
+<data name="reference_file" format="rsem_ref" label="RSEM ${job.reference_name} reference">
+<filter>job['select_job'] == "index"</filter>
+</data>
+<!-- Expression tables, collected from the fixed "rsem_output" sample prefix -->
+<data name="gene_abundances" format="tabular" from_work_dir="rsem_output.genes.results" label="${run_rsem.sample}.gene_abundances">
+<filter>run_rsem['select'] == "Yes"</filter>
+</data>
+<data name="isoform_abundances" format="tabular" from_work_dir="rsem_output.isoforms.results" label="${run_rsem.sample}.isoform_abundances">
+<filter>run_rsem['select'] == "Yes"</filter>
+</data>
+<!-- BAM files: transcript BAM unless BAM output is disabled, genome BAM only for "both" -->
+<data name="transcript_sorted_bam" format="bam" from_work_dir="rsem_output.transcript.sorted.bam" label="${run_rsem.sample}.transcript.bam">
+<filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] != "none"</filter>
+</data>
+<data name="genome_sorted_bam" format="bam" from_work_dir="rsem_output.genome.sorted.bam" label="${run_rsem.sample}.genome.bam">
+<filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] == "both"</filter>
+</data>
+<!-- Log: receives the redirected stdout of rsem-calculate-expression -->
+<data name="log" format="txt" label="${run_rsem.sample}.rsem_log">
+<filter>run_rsem['select'] == "Yes"</filter>
+</data>
+</outputs>
+<tests>
+<!-- Test 1: build a reference from genome fasta + GTF, then profile
+     single-end uncompressed FASTQ reads with BAM output disabled. -->
+<test>
+<param name="select_job" value="index"/>
+<param name="ref_type" value="genomic"/>
+<param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
+<param name="gtf" value="ref.gtf" ftype="gtf"/>
+<param name="reference_name" value="ref"/>
+<param name="select" value="Yes"/>
+<param name="sample" value="rsem_sample"/>
+<param name="refSrc" value="self"/>
+<param name="format" value="fastq"/>
+<param name="matepair" value="single"/>
+<param name="singlefastq" value="test.fq" ftype="fastqsanger"/>
+<param name="result_bams" value="none"/>
+<output name="reference_file">
+<assert_contents>
+<has_text text="ref" />
+</assert_contents>
+</output>
+<output name="gene_abundances" value="gene_abundances.tab2"/>
+<output name="isoform_abundances" value="isoform_abundances.tab2" />
+<output name="log">
+<assert_contents>
+<has_text text="Expression Results are written" />
+</assert_contents>
+</output>
+</test>
+<!-- Test 2: same as test 1 but with gzip-compressed reads (fastqsanger.gz),
+     exercising the gunzip staging branch; expected tables are the same files. -->
+<test>
+<param name="select_job" value="index"/>
+<param name="ref_type" value="genomic"/>
+<param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
+<param name="gtf" value="ref.gtf" ftype="gtf"/>
+<param name="reference_name" value="ref"/>
+<param name="select" value="Yes"/>
+<param name="sample" value="rsem_sample"/>
+<param name="refSrc" value="self"/>
+<param name="format" value="fastq"/>
+<param name="matepair" value="single"/>
+<param name="singlefastq" value="test.fastq.gz" ftype="fastqsanger.gz"/>
+<param name="result_bams" value="none"/>
+<output name="reference_file">
+<assert_contents>
+<has_text text="ref" />
+</assert_contents>
+</output>
+<output name="gene_abundances" value="gene_abundances.tab2"/>
+<output name="isoform_abundances" value="isoform_abundances.tab2" />
+<output name="log">
+<assert_contents>
+<has_text text="Expression Results are written" />
+</assert_contents>
+</output>
+</test>
+<!-- Test 3: reference build only (no expression step); checks that the
+     reference dataset contains the reference name. -->
+<test>
+<param name="select_job" value="index"/>
+<param name="ref_type" value="genomic"/>
+<param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
+<param name="gtf" value="ref.gtf" ftype="gtf"/>
+<param name="reference_name" value="ref"/>
+<param name="select" value="No"/>
+<output name="reference_file">
+<assert_contents>
+<has_text text="ref" />
+</assert_contents>
+</output>
+</test>
+<!-- Test 4: NOTE(review): parameters and assertions are identical to test 3,
+     so this adds no coverage. Consider replacing it with an untested case
+     (e.g. ref_type "transcripts", paired-end reads, or fasta/sam input). -->
+<test>
+<param name="select_job" value="index"/>
+<param name="ref_type" value="genomic"/>
+<param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
+<param name="gtf" value="ref.gtf" ftype="gtf"/>
+<param name="reference_name" value="ref"/>
+<param name="select" value="No"/>
+<output name="reference_file">
+<assert_contents>
+<has_text text="ref" />
+</assert_contents>
+</output>
+</test>
+</tests>
+<help>
+.. class:: infomark
+RSEM HOME PAGE - http://deweylab.biostat.wisc.edu/rsem/
+NAME
+rsem-prepare-reference
+SYNOPSIS
+rsem-prepare-reference [options] reference_fasta_file(s) reference_name
+DESCRIPTION
+The rsem-prepare-reference program extracts/preprocesses the reference sequences and builds Bowtie indices using default parameters.
+This program is used in conjunction with the 'rsem-calculate-expression' program.
+INPUTS
+A fasta file of transcripts
+or
+A genome sequence fasta file and a GTF gene annotation file.  (When using UCSC data, include the related knownIsoforms.txt)
+---
+NAME
+rsem-calculate-expression - Estimate gene and isoform expression from
+RNA-Seq data.
+SYNOPSIS
+rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name
+rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name
+rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name
+ARGUMENTS
+upstream_read_file(s)
+Comma-separated list of files containing single-end reads or
+upstream reads for paired-end data. By default, these files are
+assumed to be in FASTQ format. If the --no-qualities option is
+specified, then FASTA format is expected.
+downstream_read_file(s)
+Comma-separated list of files containing downstream reads which are
+paired with the upstream reads. By default, these files are assumed
+to be in FASTQ format. If the --no-qualities option is specified,
+then FASTA format is expected.
+input
+SAM/BAM/CRAM formatted input file. If "-" is specified for the
+filename, the input is instead assumed to come from standard input.
+RSEM requires all alignments of the same read group together. For
+paired-end reads, RSEM also requires the two mates of any alignment
+be adjacent. In addition, RSEM does not allow the SEQ and QUAL
+fields to be empty. See Description section for how to make input
+file obey RSEM's requirements.
+reference_name
+The name of the reference used. The user must have run
+'rsem-prepare-reference' with this reference_name before running
+this program.
+sample_name
+The name of the sample analyzed. All output files are prefixed by
+this name (e.g., sample_name.genes.results)
+BASIC OPTIONS
+--paired-end
+Input reads are paired-end reads. (Default: off)
+--no-qualities
+Input reads do not contain quality scores. (Default: off)
+--strandedness &lt;none|forward|reverse&gt;
+This option defines the strandedness of the RNA-Seq reads. It
+recognizes three values: 'none', 'forward', and 'reverse'. 'none'
+refers to non-strand-specific protocols. 'forward' means all
+(upstream) reads are derived from the forward strand. 'reverse'
+means all (upstream) reads are derived from the reverse strand. If
+'forward'/'reverse' is set, the '--norc'/'--nofw' Bowtie/Bowtie 2
+option will also be enabled to avoid aligning reads to the opposite
+strand. For Illumina TruSeq Stranded protocols, please use
+'reverse'. (Default: 'none')
+-p/--num-threads &lt;int&gt;
+Number of threads to use. Both Bowtie/Bowtie2, expression estimation
+and 'samtools sort' will use this many threads. (Default: 1)
+--alignments
+Input file contains alignments in SAM/BAM/CRAM format. The exact
+file format will be determined automatically. (Default: off)
+--fai &lt;file&gt;
+If the header section of input alignment file does not contain
+reference sequence information, this option should be turned on.
+&lt;file&gt; is a FAI format file containing each reference sequence's
+name and length. Please refer to the SAM official website for the
+details of FAI format. (Default: off)
+--bowtie2
+Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM
+does not handle indel, local and discordant alignments, the Bowtie2
+parameters are set in a way to avoid those alignments. In
+particular, we use options '--sensitive --dpad 0 --gbar 99999999
+--mp 1,1 --np 1 --score-min L,0,-0.1' by default. The last parameter
+of '--score-min', '-0.1', is the negative of maximum mismatch rate.
+This rate can be set by option '--bowtie2-mismatch-rate'. If reads
+are paired-end, we additionally use options '--no-mixed' and
+'--no-discordant'. (Default: off)
+--star
+Use STAR to align reads. Alignment parameters are from ENCODE3's
+STAR-RSEM pipeline. To save computational time and memory resources,
+STAR's Output BAM file is unsorted. It is stored in RSEM's temporary
+directory with name as 'sample_name.bam'. Each STAR job will have
+its own private copy of the genome in memory. (Default: off)
+--append-names
+If gene_name/transcript_name is available, append it to the end of
+gene_id/transcript_id (separated by '_') in files
+'sample_name.isoforms.results' and 'sample_name.genes.results'.
+(Default: off)
+--seed &lt;uint32&gt;
+Set the seed for the random number generators used in calculating
+posterior mean estimates and credibility intervals. The seed must be
+a non-negative 32 bit integer. (Default: off)
+--single-cell-prior
+By default, RSEM uses Dirichlet(1) as the prior to calculate
+posterior mean estimates and credibility intervals. However, far
+fewer genes are expressed in single cell RNA-Seq data. Thus, if you
+want to compute posterior mean estimates and/or credibility
+intervals and you have single-cell RNA-Seq data, you are recommended
+to turn on this option. Then RSEM will use Dirichlet(0.1) as the
+prior, which encourages sparsity of the expression levels.
+(Default: off)
+--calc-pme
+Run RSEM's collapsed Gibbs sampler to calculate posterior mean
+estimates. (Default: off)
+--calc-ci
+Calculate 95% credibility intervals and posterior mean estimates.
+The credibility level can be changed by setting
+'--ci-credibility-level'. (Default: off)
+-q/--quiet
+Suppress the output of logging information. (Default: off)
+-h/--help
+Show help information.
+--version
+Show version information.
+OUTPUT OPTIONS
+--sort-bam-by-read-name
+Sort BAM file aligned under transcript coordinate by read name.
+Setting this option on will produce deterministic maximum likelihood
+estimations from independent runs. Note that sorting will take long
+time and lots of memory. (Default: off)
+--no-bam-output
+Do not output any BAM file. (Default: off)
+--sampling-for-bam
+When RSEM generates a BAM file, instead of outputting all alignments
+a read has with their posterior probabilities, one alignment is
+sampled according to the posterior probabilities. The sampling
+procedure includes the alignment to the "noise" transcript, which
+does not appear in the BAM file. Only the sampled alignment has a
+weight of 1. All other alignments have weight 0. If the "noise"
+transcript is sampled, all alignments appearing in the BAM file
+should have weight 0. (Default: off)
+--output-genome-bam
+Generate a BAM file, 'sample_name.genome.bam', with alignments
+mapped to genomic coordinates and annotated with their posterior
+probabilities. In addition, RSEM will call samtools (included in
+RSEM package) to sort and index the bam file.
+'sample_name.genome.sorted.bam' and
+'sample_name.genome.sorted.bam.bai' will be generated. (Default:
+off)
+--sort-bam-by-coordinate
+Sort RSEM generated transcript and genome BAM files by coordinates
+and build associated indices. (Default: off)
+--sort-bam-memory-per-thread &lt;string&gt;
+Set the maximum memory per thread that can be used by 'samtools
+sort'. &lt;string&gt; represents the memory and accepts suffixes 'K/M/G'.
+RSEM will pass &lt;string&gt; to the '-m' option of 'samtools sort'. Note
+that the default used here is different from the default used by
+samtools. (Default: 1G)
+ALIGNER OPTIONS
+--seed-length &lt;int&gt;
+Seed length used by the read aligner. Providing the correct value is
+important for RSEM. If RSEM runs Bowtie, it uses this value for
+Bowtie's seed length parameter. Any read with its or at least one of
+its mates' (for paired-end reads) length less than this value will
+be ignored. If poly(A) tails are not added to the references, the
+minimum allowed value is 5; otherwise, the minimum allowed value is
+25. Note that this script will only check if the value &gt;= 5 and give
+a warning message if the value &lt; 25 but &gt;= 5. (Default: 25)
+--phred33-quals
+Input quality scores are encoded as Phred+33. (Default: on)
+--phred64-quals
+Input quality scores are encoded as Phred+64 (default for GA
+Pipeline ver. &gt;= 1.3). (Default: off)
+--solexa-quals
+Input quality scores are solexa encoded (from GA Pipeline ver. &lt;
+1.3). (Default: off)
+--bowtie-path &lt;path&gt;
+The path to the Bowtie executables. (Default: the path to the Bowtie
+executables is assumed to be in the user's PATH environment
+variable)
+--bowtie-n &lt;int&gt;
+(Bowtie parameter) max # of mismatches in the seed. (Range: 0-3,
+Default: 2)
+--bowtie-e &lt;int&gt;
+(Bowtie parameter) max sum of mismatch quality scores across the
+alignment. (Default: 99999999)
+--bowtie-m &lt;int&gt;
+(Bowtie parameter) suppress all alignments for a read if &gt; &lt;int&gt;
+valid alignments exist. (Default: 200)
+--bowtie-chunkmbs &lt;int&gt;
+(Bowtie parameter) memory allocated for best first alignment
+calculation (Default: 0 - use Bowtie's default)
+--bowtie2-path &lt;path&gt;
+(Bowtie 2 parameter) The path to the Bowtie 2 executables. (Default:
+the path to the Bowtie 2 executables is assumed to be in the user's
+PATH environment variable)
+--bowtie2-mismatch-rate &lt;double&gt;
+(Bowtie 2 parameter) The maximum mismatch rate allowed. (Default:
+0.1)
+--bowtie2-k &lt;int&gt;
+(Bowtie 2 parameter) Find up to &lt;int&gt; alignments per read. (Default:
+200)
+--bowtie2-sensitivity-level &lt;string&gt;
+(Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end
+mode. This option controls how hard Bowtie 2 tries to find
+alignments. &lt;string&gt; must be one of "very_fast", "fast", "sensitive"
+and "very_sensitive". The four candidates correspond to Bowtie 2's
+"--very-fast", "--fast", "--sensitive" and "--very-sensitive"
+options. (Default: "sensitive" - use Bowtie 2's default)
+--star-path &lt;path&gt;
+The path to STAR's executable. (Default: the path to STAR executable
+is assumed to be in user's PATH environment variable)
+--star-gzipped-read-file
+(STAR parameter) Input read file(s) is compressed by gzip. (Default:
+off)
+--star-bzipped-read-file
+(STAR parameter) Input read file(s) is compressed by bzip2.
+(Default: off)
+--star-output-genome-bam
+(STAR parameter) Save the BAM file from STAR alignment under genomic
+coordinate to 'sample_name.STAR.genome.bam'. This file is NOT sorted
+by genomic coordinate. In this file, according to STAR's manual,
+'paired ends of an alignment are always adjacent, and multiple
+alignments of a read are adjacent as well'. (Default: off)
+ADVANCED OPTIONS
+--tag &lt;string&gt;
+The name of the optional field used in the SAM input for identifying
+a read with too many valid alignments. The field should have the
+format &lt;tagName&gt;:i:&lt;value&gt;, where a &lt;value&gt; bigger than 0 indicates
+a read with too many alignments. (Default: "")
+--fragment-length-min &lt;int&gt;
+Minimum read/insert length allowed. This is also the value for the
+Bowtie/Bowtie2 -I option. (Default: 1)
+--fragment-length-max &lt;int&gt;
+Maximum read/insert length allowed. This is also the value for the
+Bowtie/Bowtie 2 -X option. (Default: 1000)
+--fragment-length-mean &lt;double&gt;
+(single-end data only) The mean of the fragment length distribution,
+which is assumed to be a Gaussian. (Default: -1, which disables use
+of the fragment length distribution)
+--fragment-length-sd &lt;double&gt;
+(single-end data only) The standard deviation of the fragment length
+distribution, which is assumed to be a Gaussian. (Default: 0, which
+assumes that all fragments are of the same length, given by the
+rounded value of --fragment-length-mean)
+--estimate-rspd
+Set this option if you want to estimate the read start position
+distribution (RSPD) from data. Otherwise, RSEM will use a uniform
+RSPD. (Default: off)
+--num-rspd-bins &lt;int&gt;
+Number of bins in the RSPD. Only relevant when '--estimate-rspd' is
+specified. Use of the default setting is recommended. (Default: 20)
+--gibbs-burnin &lt;int&gt;
+The number of burn-in rounds for RSEM's Gibbs sampler. Each round
+passes over the entire data set once. If RSEM can use multiple
+threads, multiple Gibbs samplers will start at the same time and all
+samplers share the same burn-in number. (Default: 200)
+--gibbs-number-of-samples &lt;int&gt;
+The total number of count vectors RSEM will collect from its Gibbs
+samplers. (Default: 1000)
+--gibbs-sampling-gap &lt;int&gt;
+The number of rounds between two successive count vectors RSEM
+collects. If the count vector after round N is collected, the count
+vector after round N + &lt;int&gt; will also be collected. (Default: 1)
+--ci-credibility-level &lt;double&gt;
+The credibility level for credibility intervals. (Default: 0.95)
+--ci-memory &lt;int&gt;
+Maximum size (in memory, MB) of the auxiliary buffer used for
+computing credibility intervals (CI). (Default: 1024)
+--ci-number-of-samples-per-count-vector &lt;int&gt;
+The number of read generating probability vectors sampled per
+sampled count vector. The credibility intervals are calculated by
+first sampling P(C | D) and then sampling P(Theta | C) for each
+sampled count vector. This option controls how many Theta vectors
+are sampled per sampled count vector. (Default: 50)
+--keep-intermediate-files
+Keep temporary files generated by RSEM. RSEM creates a temporary
+directory, 'sample_name.temp', into which it puts all intermediate
+output files. If this directory already exists, RSEM overwrites all
+files generated by previous RSEM runs inside of it. By default,
+after RSEM finishes, the temporary directory is deleted. Set this
+option to prevent the deletion of this directory and the
+intermediate files inside of it. (Default: off)
+--temporary-folder &lt;string&gt;
+Set where to put the temporary files generated by RSEM. If the
+folder specified does not exist, RSEM will try to create it.
+(Default: sample_name.temp)
+--time
+Output time consumed by each step of RSEM to 'sample_name.time'.
+(Default: off)
+PRIOR-ENHANCED RSEM OPTIONS
+--run-pRSEM
+Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e.
+isoform's initial pseudo-count for RSEM's Gibbs sampling, will be
+learned from input RNA-seq data and an external data set. When pRSEM
+needs and only needs ChIP-seq peak information to partition isoforms
+(e.g. in pRSEM's default partition model), either ChIP-seq peak file
+(with the '--chipseq-peak-file' option) or ChIP-seq FASTQ files for
+target and input and the path for Bowtie executables are required
+(with the '--chipseq-target-read-files &lt;string&gt;',
+'--chipseq-control-read-files &lt;string&gt;', and '--bowtie-path &lt;path&gt;'
+options), otherwise, ChIP-seq FASTQ files for target and control and
+the path to Bowtie executables are required. (Default: off)
+--chipseq-peak-file &lt;string&gt;
+Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e.
+BED6+4, format. This file is used when running prior-enhanced RSEM
+in the default two-partition model. It partitions isoforms by
+whether they have ChIP-seq overlapping with their transcription
+start site region or not. Each partition will have its own prior
+parameter learned from a training set. This file can be either
+gzipped or ungzipped. (Default: "")
+--chipseq-target-read-files &lt;string&gt;
+Comma-separated full path of FASTQ read file(s) for ChIP-seq target.
+This option is used when running prior-enhanced RSEM. It provides
+information to calculate ChIP-seq peaks and signals. The file(s) can
+be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The
+options '--bowtie-path &lt;path&gt;' and '--chipseq-control-read-files
+&lt;string&gt;' must be defined when this option is specified. (Default:
+"")
+--chipseq-control-read-files &lt;string&gt;
+Comma-separated full path of FASTQ read file(s) for ChIP-seq control.
+This option is used when running prior-enhanced RSEM. It provides
+information to call ChIP-seq peaks. The file(s) can be either
+ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options
+'--bowtie-path &lt;path&gt;' and '--chipseq-target-read-files &lt;string&gt;'
+must be defined when this option is specified. (Default: "")
+--chipseq-read-files-multi-targets &lt;string&gt;
+Comma-separated full path of FASTQ read files for multiple ChIP-seq
+targets. This option is used when running prior-enhanced RSEM, where
+prior is learned from multiple complementary data sets. It provides
+information to calculate ChIP-seq signals. All files can be either
+ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this
+option is specified, the option '--bowtie-path &lt;path&gt;' must be
+defined and the option '--partition-model &lt;string&gt;' will be set to
+'cmb_lgt' automatically. (Default: "")
+--chipseq-bed-files-multi-targets &lt;string&gt;
+Comma-separated full path of BED files for multiple ChIP-seq
+targets. This option is used when running prior-enhanced RSEM, where
+prior is learned from multiple complementary data sets. It provides
+information of ChIP-seq signals and must have at least the first six
+BED columns. All files can be either ungzipped or gzipped with a
+suffix '.gz' or '.gzip'. When this option is specified, the option
+'--partition-model &lt;string&gt;' will be set to 'cmb_lgt' automatically.
+(Default: "")
+--cap-stacked-chipseq-reads
+Keep a maximum number of ChIP-seq reads that aligned to the same
+genomic interval. This option is used when running prior-enhanced
+RSEM, where prior is learned from multiple complementary data sets.
+This option is only in use when either
+'--chipseq-read-files-multi-targets &lt;string&gt;' or
+'--chipseq-bed-files-multi-targets &lt;string&gt;' is specified. (Default:
+off)
+--n-max-stacked-chipseq-reads &lt;int&gt;
+The maximum number of stacked ChIP-seq reads to keep. This option is
+used when running prior-enhanced RSEM, where prior is learned from
+multiple complementary data sets. This option is only in use when
+the option '--cap-stacked-chipseq-reads' is set. (Default: 5)
+--partition-model &lt;string&gt;
+A keyword to specify the partition model used by prior-enhanced
+RSEM. It must be one of the following keywords:
+- pk
+Partitioned by whether an isoform has a ChIP-seq peak overlapping
+with its transcription start site (TSS) region. The TSS region is
+defined as [TSS-500bp, TSS+500bp]. For simplicity, we refer to this
+type of peak as 'TSS peak' when explaining other keywords.
+- pk_lgtnopk
+First partitioned by TSS peak. Then, for isoforms in the 'no TSS
+peak' set, a logistic model is employed to further classify them
+into two partitions.
+- lm3, lm4, lm5, or lm6
+Based on their ChIP-seq signals, isoforms are classified into 3,
+4, 5, or 6 partitions by a linear regression model.
+- nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk
+First partitioned by TSS peak. Then, for isoforms in the 'with TSS
+peak' set, a linear regression model is employed to further
+classify them into 2, 3, 4, or 5 partitions.
+- pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk
+First partitioned by TSS peak. Then, for isoforms in the 'no TSS
+peak' set, a linear regression model is employed to further
+classify them into 2, 3, 4, or 5 partitions.
+- cmb_lgt
+Using a logistic regression to combine TSS signals from multiple
+complementary data sets and partition training set isoform into
+'expressed' and 'not expressed'. This partition model is only in
+use when either '--chipseq-read-files-multi-targets &lt;string&gt;' or
+'--chipseq-bed-files-multi-targets &lt;string&gt;' is specified.
+Parameters for all the above models are learned from a training set.
+For detailed explanations, please see prior-enhanced RSEM's paper.
+(Default: 'pk')
+DEPRECATED OPTIONS
+The options in this section are deprecated. They are here only for
+compatibility reasons and may be removed in future releases.
+--sam
+Inputs are alignments in SAM format. (Default: off)
+--bam
+Inputs are alignments in BAM format. (Default: off)
+--strand-specific
+Equivalent to '--strandedness forward'. (Default: off)
+--forward-prob &lt;double&gt;
+Probability of generating a read from the forward strand of a
+transcript. Set to 1 for a strand-specific protocol where all
+(upstream) reads are derived from the forward strand, 0 for a
+strand-specific protocol where all (upstream) reads are derived from
+the reverse strand, or 0.5 for a non-strand-specific protocol.
+(Default: off)
+DESCRIPTION
+In its default mode, this program aligns input reads against a reference
+transcriptome with Bowtie and calculates expression values using the
+alignments. RSEM assumes the data are single-end reads with quality
+scores, unless the '--paired-end' or '--no-qualities' options are
+specified. Alternatively, users can use STAR to align reads using the
+'--star' option. RSEM has provided options in 'rsem-prepare-reference'
+to prepare STAR's genome indices. Users may use an alternative aligner
+by specifying '--alignments', and providing an alignment file in
+SAM/BAM/CRAM format. However, users should make sure that they align
+against the indices generated by 'rsem-prepare-reference' and the
+alignment file satisfies the requirements mentioned in ARGUMENTS
+section.
+One simple way to make the alignment file satisfy RSEM's requirements
+is to use the 'convert-sam-for-rsem' script. This script accepts
+SAM/BAM/CRAM files as input and outputs a BAM file. For example, type
+the following command to convert a SAM file, 'input.sam', to a
+ready-for-use BAM file, 'input_for_rsem.bam':
+convert-sam-for-rsem input.sam input_for_rsem
+For details, please refer to 'convert-sam-for-rsem's documentation page.
+NOTES
+1. Users must run 'rsem-prepare-reference' with the appropriate
+reference before using this program.
+2. For single-end data, it is strongly recommended that the user provide
+the fragment length distribution parameters (--fragment-length-mean and
+--fragment-length-sd). For paired-end data, RSEM will automatically
+learn a fragment length distribution from the data.
+3. Some aligner parameters have default values different from their
+original settings.
+4. With the '--calc-pme' option, posterior mean estimates will be
+calculated in addition to maximum likelihood estimates.
+5. With the '--calc-ci' option, 95% credibility intervals and posterior
+mean estimates will be calculated in addition to maximum likelihood
+estimates.
+6. The temporary directory and all intermediate files will be removed
+when RSEM finishes unless '--keep-intermediate-files' is specified.
+With the '--run-pRSEM' option and associated options (see section
+'PRIOR-ENHANCED RSEM OPTIONS' above for details), prior-enhanced RSEM
+will be running. Prior parameters will be learned from supplied external
+data set(s) and assigned as initial pseudo-counts for isoforms in the
+corresponding partition for Gibbs sampling.
+OUTPUT
+sample_name.isoforms.results
+File containing isoform level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+transcript_id gene_id length effective_length expected_count TPM
+FPKM IsoPct [posterior_mean_count
+posterior_standard_deviation_of_count pme_TPM pme_FPKM
+IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound
+TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound
+FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
+'transcript_id' is the transcript name of this transcript. 'gene_id'
+is the gene name of the gene which this transcript belongs to
+(denote this gene as its parent gene). If no gene information is
+provided, 'gene_id' and 'transcript_id' are the same.
+'length' is this transcript's sequence length (poly(A) tail is not
+counted). 'effective_length' counts only the positions that can
+generate a valid fragment. If no poly(A) tail is added,
+'effective_length' is equal to transcript length - mean fragment
+length + 1. If one transcript's effective length is less than 1,
+both this transcript's effective length and its abundance estimates
+are set to 0.
+'expected_count' is the sum of the posterior probability of each
+read coming from this transcript over all reads. Because 1) each read
+aligning to this transcript has a probability of being generated
+from background noise; 2) RSEM may filter some alignable low quality
+reads, the sum of expected counts for all transcripts is generally
+less than the total number of reads aligned.
+'TPM' stands for Transcripts Per Million. It is a relative measure
+of transcript abundance. The sum of all transcripts' TPM is 1
+million. 'FPKM' stands for Fragments Per Kilobase of transcript per
+Million mapped reads. It is another relative measure of transcript
+abundance. If we define l_bar to be the mean transcript length in a
+sample, which can be calculated as
+l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through
+every transcript),
+the following equation holds:
+FPKM_i = 10^3 / l_bar * TPM_i.
+We can see that the sum of FPKM is not a constant across samples.
+'IsoPct' stands for isoform percentage. It is the percentage of this
+transcript's abundance over its parent gene's abundance. If its
+parent gene has only one isoform or the gene information is not
+provided, this field will be set to 100.
+'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean
+estimates calculated by RSEM's Gibbs sampler.
+'posterior_standard_deviation_of_count' is the posterior standard
+deviation of counts. 'IsoPct_from_pme_TPM' is the isoform percentage
+calculated from 'pme_TPM' values.
+'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound'
+and 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95%
+credibility intervals for TPM and FPKM values. The bounds are
+inclusive (i.e. [l, u]).
+'TPM_coefficient_of_quartile_variation' and
+'FPKM_coefficient_of_quartile_variation' are coefficients of
+quartile variation (CQV) for TPM and FPKM values. CQV is a robust
+way of measuring the ratio between the standard deviation and the
+mean. It is defined as
+CQV := (Q3 - Q1) / (Q3 + Q1),
+where Q1 and Q3 are the first and third quartiles.
+sample_name.genes.results
+File containing gene level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+gene_id transcript_id(s) length effective_length expected_count TPM
+FPKM [posterior_mean_count posterior_standard_deviation_of_count
+pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound
+TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound
+FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
+'transcript_id(s)' is a comma-separated list of transcript_ids
+belonging to this gene. If no gene information is provided,
+'gene_id' and 'transcript_id(s)' are identical (the
+'transcript_id').
+A gene's 'length' and 'effective_length' are defined as the weighted
+average of its transcripts' lengths and effective lengths (weighted
+by 'IsoPct'). A gene's abundance estimates are just the sum of its
+transcripts' abundance estimates.
+sample_name.alleles.results
+Only generated when the RSEM references are built with
+allele-specific transcripts.
+This file contains allele level expression estimates for
+allele-specific expression calculation. The first line contains
+column names separated by the tab character. The format of each line
+in the rest of this file is:
+allele_id transcript_id gene_id length effective_length
+expected_count TPM FPKM AlleleIsoPct AlleleGenePct
+[posterior_mean_count posterior_standard_deviation_of_count pme_TPM
+pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM
+TPM_ci_lower_bound TPM_ci_upper_bound
+TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound
+FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
+'allele_id' is the allele-specific name of this allele-specific
+transcript.
+'AlleleIsoPct' stands for allele-specific percentage on isoform
+level. It is the percentage of this allele-specific transcript's
+abundance over its parent transcript's abundance. If its parent
+transcript has only one allele variant form, this field will be set
+to 100.
+'AlleleGenePct' stands for allele-specific percentage on gene level.
+It is the percentage of this allele-specific transcript's abundance
+over its parent gene's abundance.
+'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have
+similar meanings. They are calculated based on posterior mean
+estimates.
+Please note that if this file is present, the fields 'length' and
+'effective_length' in 'sample_name.isoforms.results' should be
+interpreted similarly as the corresponding definitions in
+'sample_name.genes.results'.
+sample_name.transcript.bam
+Only generated when --no-bam-output is not specified.
+'sample_name.transcript.bam' is a BAM-formatted file of read
+alignments in transcript coordinates. The MAPQ field of each
+alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)),
+where w is the posterior probability of that alignment being the
+true mapping of a read. In addition, RSEM pads a new tag ZW:f:value,
+where value is a single precision floating number representing the
+posterior probability. Because this file contains all alignment
+lines produced by bowtie or user-specified aligners, it can also be
+used as a replacement of the aligner generated BAM/SAM file.
+sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai
+Only generated when --no-bam-output is not specified and --sort-bam-by-coordinate is specified.
+'sample_name.transcript.sorted.bam' and
+'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
+indices generated by samtools (included in RSEM package).
+sample_name.genome.bam
+Only generated when --no-bam-output is not specified and
+--output-genome-bam is specified.
+'sample_name.genome.bam' is a BAM-formatted file of read alignments
+in genomic coordinates. Alignments of reads that have identical
+genomic coordinates (i.e., alignments to different isoforms that
+share the same genomic region) are collapsed into one alignment. The
+MAPQ field of each alignment is set to min(100, floor(-10 *
+log10(1.0 - w) + 0.5)), where w is the posterior probability of that
+alignment being the true mapping of a read. In addition, RSEM pads a
+new tag ZW:f:value, where value is a single precision floating
+number representing the posterior probability. If an alignment is
+spliced, a XS:A:value tag is also added, where value is either '+'
+or '-' indicating the strand of the transcript it aligns to.
+sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai
+Only generated when --no-bam-output is not specified, and
+--sort-bam-by-coordinate and --output-genome-bam are specified.
+'sample_name.genome.sorted.bam' and
+'sample_name.genome.sorted.bam.bai' are the sorted BAM file and
+indices generated by samtools (included in RSEM package).
+sample_name.time
+Only generated when --time is specified.
+It contains time (in seconds) consumed by aligning reads, estimating
+expression levels and calculating credibility intervals.
+sample_name.stat
+This is a folder instead of a file. All model related statistics are
+stored in this folder. 'rsem-plot-model' can generate plots
+using this folder.
+'sample_name.stat/sample_name.cnt' contains alignment statistics.
+The format and meanings of each field are described in
+'cnt_file_description.txt' under RSEM directory.
+'sample_name.stat/sample_name.model' stores RNA-Seq model parameters
+learned from the data. The format and meanings of each field of this
+file are described in 'model_file_description.txt' under RSEM
+directory.
+The following four output files will be generated only by
+prior-enhanced RSEM
+- 'sample_name.stat/sample_name_prsem.all_tr_features'
+It stores isoform features for deriving and assigning pRSEM prior.
+The first line is a header and the rest is one isoform per line.
+The description for each column is:
+* trid: transcript ID from input annotation
+* geneid: gene ID from input annotation
+* chrom: isoform's chromosome name
+* strand: isoform's strand name
+* start: isoform's end with the lowest genomic loci
+* end: isoform's end with the highest genomic loci
+* tss_mpp: average mappability of [TSS-500bp, TSS+500bp], where
+TSS is isoform's transcription start site, i.e. 5'-end
+* body_mpp: average mappability of (TSS+500bp, TES-500bp), where
+TES is isoform's transcription end site, i.e. 3'-end
+* tes_mpp: average mappability of [TES-500bp, TES+500bp]
+* pme_count: isoform's fragment or read count from RSEM's
+posterior mean estimates
+* tss: isoform's TSS loci
+* tss_pk: equal to 1 if isoform's [TSS-500bp, TSS+500bp] region
+overlaps with a RNA Pol II peak; 0 otherwise
+* is_training: equal to 1 if isoform is in the training set where
+Pol II prior is learned; 0 otherwise
+- 'sample_name.stat/sample_name_prsem.all_tr_prior'
+It stores prior parameters for every isoform. This file does not
+have a header. Each line contains a prior parameter and an
+isoform's transcript ID delimited by " # ".
+- 'sample_name.stat/sample_name_uniform_prior_1.isoforms.results'
+RSEM's posterior mean estimates on the isoform level with an
+initial pseudo-count of one for every isoform. It is in the same
+format as the 'sample_name.isoforms.results'.
+- 'sample_name.stat/sample_name_uniform_prior_1.genes.results'
+RSEM's posterior mean estimates on the gene level with an initial
+pseudo-count of one for every isoform. It is in the same format as
+the 'sample_name.genes.results'.
+When learning prior from multiple external data sets in
+prior-enhanced RSEM, two additional output files will be generated.
+- 'sample_name.stat/sample_name.pval_LL'
+It stores a p-value and a log-likelihood. The p-value indicates
+whether the combination of multiple complementary data sets is
+informative for RNA-seq quantification. The log-likelihood shows
+how well pRSEM's Dirichlet-multinomial model fits the read counts
+of partitioned training set isoforms.
+- 'sample_name.stat/sample_name.lgt_mdl.RData'
+It stores an R object named 'glmmdl', which is a logistic
+regression model on the training set isoforms and multiple
+external data sets.
+In addition, extra columns will be added to
+'sample_name.stat/all_tr_features'
+* is_expr: equal to 1 if isoform has an abundance &gt;= 1 TPM and a
+non-zero read count from RSEM's posterior mean estimates; 0
+otherwise
+* "$external_data_set_basename": log10 of external data's signal at
+[TSS-500, TSS+500]. Signal is the number of reads aligned within
+that interval and normalized to RPKM by read depth and interval
+length. It will be set to -4 if no read aligned to that interval.
+There are multiple columns like this one, where each represents an
+external data set.
+* prd_expr_prob: predicted probability from logistic regression
+model on whether this isoform is expressed or not. A probability
+higher than 0.5 is considered as expressed
+* partition: group index, to which this isoform is partitioned
+* prior: prior parameter for this isoform
+EXAMPLES
+Assume the path to the bowtie executables is in the user's PATH
+environment variable. Reference files are under '/ref' with name
+'mouse_125'.
+1) '/data/mmliver.fq', single-end reads with quality scores. Quality
+scores are encoded as for 'GA pipeline version &gt;= 1.3'. We want to use 8
+threads and generate a genome BAM file. In addition, we want to append
+gene/transcript names to the result files:
+rsem-calculate-expression --phred64-quals \
+-p 8 \
+--append-names \
+--output-genome-bam \
+/data/mmliver.fq \
+/ref/mouse_125 \
+mmliver_single_quals
+2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', stranded paired-end
+reads with quality scores. Suppose the library is prepared using TruSeq
+Stranded Kit, which means the first mate should map to the reverse
+strand. Quality scores are in SANGER format. We want to use 8 threads
+and do not generate a genome BAM file:
+rsem-calculate-expression -p 8 \
+--paired-end \
+--strandedness reverse \
+/data/mmliver_1.fq \
+/data/mmliver_2.fq \
+/ref/mouse_125 \
+mmliver_paired_end_quals
+3) '/data/mmliver.fa', single-end reads without quality scores. We want
+to use 8 threads:
+rsem-calculate-expression -p 8 \
+--no-qualities \
+/data/mmliver.fa \
+/ref/mouse_125 \
+mmliver_single_without_quals
+4) Data are the same as 1). This time we assume the bowtie executables
+are under '/sw/bowtie'. We want to take a fragment length distribution
+into consideration. We set the fragment length mean to 150 and the
+standard deviation to 35. In addition to a BAM file, we also want to
+generate credibility intervals. We allow RSEM to use 1GB of memory for
+CI calculation:
+rsem-calculate-expression --bowtie-path /sw/bowtie \
+--phred64-quals \
+--fragment-length-mean 150.0 \
+--fragment-length-sd 35.0 \
+-p 8 \
+--output-genome-bam \
+--calc-ci \
+--ci-memory 1024 \
+/data/mmliver.fq \
+/ref/mouse_125 \
+mmliver_single_quals
+5) '/data/mmliver_paired_end_quals.bam', BAM-formatted alignments for
+paired-end reads with quality scores. We want to use 8 threads:
+rsem-calculate-expression --paired-end \
+--alignments \
+-p 8 \
+/data/mmliver_paired_end_quals.bam \
+/ref/mouse_125 \
+mmliver_paired_end_quals
+6) '/data/mmliver_1.fq.gz' and '/data/mmliver_2.fq.gz', paired-end reads
+with quality scores and read files are compressed by gzip. We want to
+use STAR to align reads and assume STAR executable is '/sw/STAR'.
+Suppose we want to use 8 threads and do not generate a genome BAM file:
+rsem-calculate-expression --paired-end \
+--star \
+--star-path /sw/STAR \
+--gzipped-read-file \
+--paired-end \
+-p 8 \
+/data/mmliver_1.fq.gz \
+/data/mmliver_2.fq.gz \
+/ref/mouse_125 \
+mmliver_paired_end_quals
+7) In the above example, suppose we want to run prior-enhanced RSEM
+instead. Assuming we want to learn priors from a ChIP-seq peak file
+'/data/mmliver.narrowPeak.gz':
+rsem-calculate-expression --star \
+--star-path /sw/STAR \
+--gzipped-read-file \
+--paired-end \
+--calc-pme \
+--run-pRSEM \
+--chipseq-peak-file /data/mmliver.narrowPeak.gz \
+-p 8 \
+/data/mmliver_1.fq.gz \
+/data/mmliver_2.fq.gz \
+/ref/mouse_125 \
+mmliver_paired_end_quals
+8) Similar to the example in 7), suppose we want to use the partition
+model 'pk_lm2nopk' (partitioning isoforms by Pol II TSS peak first and
+then partitioning 'no TSS peak' isoforms into two bins by a linear
+regression model), and we want to partition isoforms by RNA Pol II's
+ChIP-seq read files '/data/mmliver_PolIIRep1.fq.gz' and
+'/data/mmliver_PolIIRep2.fq.gz', and the control ChIP-seq read files
+'/data/mmliver_ChIPseqCtrl.fq.gz'. Also, assuming Bowtie's executables
+are under '/sw/bowtie/':
+rsem-calculate-expression --star \
+--star-path /sw/STAR \
+--gzipped-read-file \
+--paired-end \
+--calc-pme \
+--run-pRSEM \
+--chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \
+--chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \
+--partition-model pk_lm2nopk \
+--bowtie-path /sw/bowtie \
+-p 8 \
+/data/mmliver_1.fq.gz \
+/data/mmliver_2.fq.gz \
+/ref/mouse_125 \
+mmliver_paired_end_quals
+9) Similar to the example in 8), suppose we want to derive prior from
+four histone modification ChIP-seq read data sets:
+'/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz',
+'/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming
+Bowtie's executables are under '/sw/bowtie/':
+rsem-calculate-expression --star \
+--star-path /sw/STAR \
+--gzipped-read-file \
+--paired-end \
+--calc-pme \
+--run-pRSEM \
+--partition-model cmb_lgt \
+--chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \
+--bowtie-path /sw/bowtie \
+-p 8 \
+/data/mmliver_1.fq.gz \
+/data/mmliver_2.fq.gz \
+/ref/mouse_125 \
+mmliver_paired_end_quals
+</help>
+<citations>
+<citation type="doi">10.1186/1471-2105-12-323</citation>
+</citations>
+</tool>

Mercurial > repos > artbio > rsem

comparison rsem-bwt2.xml @ 0:e5e836936d60 draft