comparison rsem-bwt2.xml @ 0:e5e836936d60 draft

planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit d84a0359354698a4b29df12ab581c2618bffcf80
author artbio
date Sat, 31 Mar 2018 21:30:07 -0400
parents
children 49795544dac7
comparison
equal deleted inserted replaced
-1:000000000000 0:e5e836936d60
1 <tool id="rsembowtie2" name="RSEM-Bowtie2" version="0.4.0">
2 <description></description>
3 <macros> <!-- macros expanded further down (bowtie2_options, rsem_options, sampling_for_bam) are presumably defined in this import; confirm against macros.xml -->
4 <import>macros.xml</import>
5 </macros>
6 <requirements> <!-- packages the command relies on, each pinned to one version -->
7 <requirement type="package" version="1.3.0">rsem</requirement>
8 <requirement type="package" version="2.3.4">bowtie2</requirement>
9 </requirements>
10 <stdio> <!-- any exit code >= 1 is a failure, consistent with detect_errors="exit_code" on the command -->
11 <exit_code range="1:" level="fatal" description="Tool exception" />
12 </stdio>
13 <!-- Cheetah-templated shell command: an optional rsem-prepare-reference stage and an optional rsem-calculate-expression stage, joined with && when both are requested. --> <command detect_errors="exit_code"><![CDATA[
14 #if $job.select_job == "index":
15 echo ${job.reference_name} " " | tee '$reference_file' &&
16 mkdir '$reference_file.files_path' &&
17 rsem-prepare-reference
18 #if $job.polya.polya_use == 'add':
19 #if $job.polya.polya_length:
20 --polyA-length $job.polya.polya_length
21 #end if
22 #elif $job.polya.polya_use == 'subset':
23 --no-polyA-subset '$job.polya.no_polya_subset'
24 #if $job.polya.polya_length:
25 --polyA-length $job.polya.polya_length
26 #end if
27 #elif $job.polya.polya_use == 'none':
28 --no-polyA
29 #end if
30 $job.ntog
31 #if $job.transcript_to_gene_map:
32 --transcript-to-gene-map '$job.transcript_to_gene_map'
33 #end if
34 --bowtie2
35 #if $job.self_reference.ref_type == 'transcripts':
36 '$job.self_reference.reference_fasta_file'
37 #else:
38 --gtf '$job.self_reference.gtf'
39 '$job.self_reference.reference_fasta_file'
40 #end if
41 '${reference_file.files_path}/${job.reference_name}'
42 > '${reference_file.files_path}/${job.reference_name}.log'
43 #end if
44 ## join the two stages with && only when both are requested
45 #if $job.select_job == "index" and $run_rsem.select == "Yes":
46 &&
47 #end if
48 ## expression stage
49 #if $run_rsem.select == "Yes":
50 ## stage FASTQ reads under fixed names (gunzip compressed inputs, symlink plain ones); the fastq group only exists for FASTQ input, so test the format first
51 #if $run_rsem.input.format == "fastq" and $run_rsem.input.fastq.matepair=="single":
52 #if $run_rsem.input.fastq.singlefastq.is_of_type('fastq.gz') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger.gz'):
53 gunzip < '$run_rsem.input.fastq.singlefastq' > uncomp_single.fastq &&
54 #elif $run_rsem.input.fastq.singlefastq.is_of_type('fastq') or $run_rsem.input.fastq.singlefastq.is_of_type('fastqsanger'):
55 ln -f -s '$run_rsem.input.fastq.singlefastq' 'uncomp_single.fastq' &&
56 #end if
57 #elif $run_rsem.input.format == "fastq" and $run_rsem.input.fastq.matepair=="paired":
58 #if $run_rsem.input.fastq.fastq1.is_of_type('fastq.gz') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger.gz'):
59 gunzip < '$run_rsem.input.fastq.fastq1' > uncomp_pair1.fastq &&
60 gunzip < '$run_rsem.input.fastq.fastq2' > uncomp_pair2.fastq &&
61 #elif $run_rsem.input.fastq.fastq1.is_of_type('fastq') or $run_rsem.input.fastq.fastq1.is_of_type('fastqsanger'):
62 ln -f -s '$run_rsem.input.fastq.fastq1' 'uncomp_pair1.fastq' &&
63 ln -f -s '$run_rsem.input.fastq.fastq2' 'uncomp_pair2.fastq' &&
64 #end if
65 #end if
66 rsem-calculate-expression
67 ## --tag string
68 #if $run_rsem.seedlength:
69 --seed-length $run_rsem.seedlength
70 #end if
71 --forward-prob $run_rsem.forward_prob
72 #if $run_rsem.rsem_options.fullparams == 'fullset':
73 ## Fragment info
74 #if $run_rsem.rsem_options.fragment_length_mean:
75 --fragment-length-mean $run_rsem.rsem_options.fragment_length_mean
76 #end if
77 #if $run_rsem.rsem_options.fragment_length_min:
78 --fragment-length-min $run_rsem.rsem_options.fragment_length_min
79 #end if
80 #if $run_rsem.rsem_options.fragment_length_sd:
81 --fragment-length-sd $run_rsem.rsem_options.fragment_length_sd
82 #end if
83 #if $run_rsem.rsem_options.fragment_length_max:
84 --fragment-length-max $run_rsem.rsem_options.fragment_length_max
85 #end if
86 ## RSPD
87 #if $run_rsem.rsem_options.rspd.estimate == 'yes':
88 --estimate-rspd
89 #if $run_rsem.rsem_options.rspd.num_rspd_bins:
90 --num-rspd-bins $run_rsem.rsem_options.rspd.num_rspd_bins
91 #end if
92 #end if
93 ## Calculate 95% credibility intervals and posterior mean estimates.
94 #if $run_rsem.rsem_options.useci.ci == 'yes':
95 --calc-ci
96 #if $run_rsem.rsem_options.useci.cimem:
97 --ci-memory $run_rsem.rsem_options.useci.cimem
98 #end if
99 #end if
100 #end if
101 --num-threads \${GALAXY_SLOTS:-4}
102 --bowtie2
103 #if $run_rsem.input.format != 'sam' and $run_rsem.input.bowtie2_options.fullparams == 'fullset':
104 ## Bowtie 2 params; the bowtie2_options group is offered for both FASTQ and FASTA reads, so honour it for both
105 #if $run_rsem.input.bowtie2_options.bowtie2_mismatch_rate:
106 --bowtie2-mismatch-rate $run_rsem.input.bowtie2_options.bowtie2_mismatch_rate
107 #end if
108 #if $run_rsem.input.bowtie2_options.bowtie2_k:
109 --bowtie2-k $run_rsem.input.bowtie2_options.bowtie2_k
110 #end if
111 #if $run_rsem.input.bowtie2_options.bowtie2_sensitivity_level:
112 --bowtie2-sensitivity-level $run_rsem.input.bowtie2_options.bowtie2_sensitivity_level
113 #end if
114 #end if
115 ## Outputs: the sampling option is offered for both 'default' and 'both', so emit it whenever BAM output is kept
116 #if $run_rsem.rsem_outputs.result_bams == 'none':
117 --no-bam-output
118 #else
119 --sort-bam-by-coordinate
120 $run_rsem.rsem_outputs.sampling_for_bam
121 #if $run_rsem.rsem_outputs.result_bams == 'both':
122 --output-genome-bam
123 #end if
124 #end if
125 ## Input data
126 #if $run_rsem.input.format=="fastq"
127 $run_rsem.input.fastq_select
128 #if $run_rsem.input.fastq.matepair=="single":
129 uncomp_single.fastq
130 #elif $run_rsem.input.fastq.matepair=="paired":
131 --paired-end
132 uncomp_pair1.fastq
133 uncomp_pair2.fastq
134 #end if
135 #elif $run_rsem.input.format=="fasta"
136 --no-qualities
137 #if $run_rsem.input.fasta.matepair=="single":
138 '$run_rsem.input.fasta.singlefasta'
139 #elif $run_rsem.input.fasta.matepair=="paired":
140 --paired-end
141 '$run_rsem.input.fasta.fasta1'
142 '$run_rsem.input.fasta.fasta2'
143 #end if
144 #elif $run_rsem.input.format=="sam"
145 #if $run_rsem.input.matepair=="paired":
146 --paired-end
147 #end if
148 ## Always declare the input as alignments and let RSEM detect
149 ## SAM/BAM itself: the former test on _extension emitted no flag
150 ## unless the extension was exactly 'sam' or 'bam', while the
151 ## input parameter is declared with format rsem_sam.
152 --alignments
153 '$run_rsem.input.rsem_sam'
154 #end if
155 ## RSEM reference
156 #if $run_rsem.reference.refSrc == 'history':
157 '${run_rsem.reference.rsem_ref.extra_files_path}/${run_rsem.reference.rsem_ref.metadata.reference_name}'
158 #elif $run_rsem.reference.refSrc == 'self':
159 '${reference_file.files_path}/${job.reference_name}'
160 #end if
161 ## sample_name: use a hard coded name so we can pull out galaxy outputs
162 rsem_output
163 ## direct output into logfile
164 > '$log'
165 #end if
166 ]]></command>
167
168 <inputs>
169 <conditional name="job"> <!-- selects whether this run builds an RSEM reference ("index") or not ("no-index") -->
170 <param name="select_job" type="select" label="rsem reference">
171 <option value="index">Build rsem reference</option>
172 <option value="no-index" selected="true">rsem reference available from history</option>
173 </param>
174 <when value="index">
175 <conditional name="self_reference"> <!-- transcript FASTA alone, or genome FASTA plus GTF annotation -->
176 <param name="ref_type" type="select" label="Reference transcript source">
177 <option value="transcripts">transcript fasta</option>
178 <option value="genomic">reference genome and gtf</option>
179 </param>
180 <when value="transcripts">
181 <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file"
182 help="The files should contain the sequences of transcripts."/>
183 </when>
184 <when value="genomic">
185 <param name="reference_fasta_file" type="data" format="fasta" label="reference fasta file"
186 help="The file should contain the sequence of an entire genome."/>
187 <param name="gtf" type="data" format="gtf" label="gtf"
188 help="extract transcript reference sequences using the gene annotations specified in this GTF" />
189 </when>
190 </conditional>
191 <param name="transcript_to_gene_map" type="data" format="tabular" optional="true" label="Map of gene ids to transcript (isoform) ids" >
192 <help>
193 Each line should be of the form: gene_id transcript_id ( with the two fields separated by a tab character )
194 The map can be obtained from the UCSC table browser
195 group: Genes and Gene Prediction Tracks
196 table: knownIsoforms
197 Without a map:
198 If a reference genome and gtf is used, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file.
199 Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene.
200 </help>
201 </param>
202 <param name="reference_name" type="text" value="rsem_ref_name" label="reference name"> <!-- used by the command as the file prefix inside the reference output directory -->
203 <help>A one word name for this RSEM reference containing only letters, digits, and underscore characters</help>
204 <validator type="regex" message="Use only letters, digits, and underscore characters">^\w+$</validator>
205 </param>
206 <conditional name="polya">
207 <param name="polya_use" type="select" label="PolyA ">
208 <option value="add" selected="true">Add poly(A) tails to all transcripts</option>
209 <option value="subset">Exclude poly(A) tails from selected transcripts</option>
210 <option value="none">Do not add poly(A) tails to any transcripts</option>
211 </param>
212 <when value="add">
213 <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)">
214 <validator type="in_range" message="must be positive " min="1"/>
215 </param>
216 </when>
217 <when value="subset"> <!-- the command always passes no_polya_subset to --no-polyA-subset in this branch, so the dataset is required -->
218 <param name="no_polya_subset" type="data" format="tabular" label="List of transcript IDs (one per line) that should not have polyA tails added."/>
219 <param name="polya_length" type="integer" value="125" optional="true" label="The length of the poly(A) tails to be added. (Default: 125)">
220 <validator type="in_range" message="must be positive " min="1"/>
221 </param>
222 </when>
223 <when value="none"/>
224 </conditional>
225 <param name="ntog" type="boolean" truevalue="--no-ntog" falsevalue="" checked="false" label="Disable the conversion of 'N' characters to 'G' characters in the reference sequences" help="Bowtie uses the automatic N to G conversion to align against all positions in the reference."/>
226 </when>
227 <when value="no-index"> <!-- no reference-building parameters in this mode -->
228 </when>
229 </conditional>
230
231 <conditional name="run_rsem"> <!-- selects whether rsem-calculate-expression is run ("Yes") or skipped ("No") -->
232 <param name="select" type="select" label="calculate expression with rsem">
233 <option value="No">Just build rsem reference for later rsem profiling</option>
234 <option value="Yes" selected="true">profile expression with rsem</option>
235 </param>
236 <when value="Yes">
237 <param name="sample" type="text" value="rsem_sample" label="Sample name" /> <!-- appears in the output dataset labels; the command itself writes files under the fixed prefix rsem_output -->
238 <conditional name="reference"> <!-- reference from a history dataset, or the one built by this same run -->
239 <param name="refSrc" type="select" label="RSEM Reference Source">
240 <option value="history">From your history</option>
241 <option value="self">Prepare RSEM Reference with this tool</option>
242 </param>
243 <when value="history">
244 <param name="rsem_ref" type="data" format="rsem_ref" label="RSEM reference" />
245 </when>
246 <when value="self">
247 </when>
248 </conditional>
249 <conditional name="input"> <!-- reads as FASTQ or FASTA (aligned with Bowtie 2), or pre-computed alignments -->
250 <param name="format" type="select" label="RSEM Input file type">
251 <option value="fastq">FASTQ</option>
252 <option value="fasta">FASTA</option>
253 <option value="sam">SAM/BAM</option>
254 </param>
255 <when value="fastq">
256 <param name="fastq_select" size="15" type="select" label="FASTQ type" > <!-- option values are the literal RSEM quality-encoding flags inserted into the command -->
257 <option value="--phred33-quals" selected="true">phred33 qualities (default for sanger)</option>
258 <option value="--solexa-quals">solexa qualities</option>
259 <option value="--phred64-quals">phred64 qualities</option>
260 </param>
261 <conditional name="fastq">
262 <param name="matepair" type="select" label="Library type">
263 <option value="single">Single End Reads</option>
264 <option value="paired">Paired End Reads</option>
265 </param>
266 <when value="single">
267 <param name="singlefastq" type="data" format="fastq,fastq.gz" label="FASTQ file" />
268 </when>
269 <when value="paired">
270 <param name="fastq1" type="data" format="fastq,fastq.gz" label="Read 1 fastq file" />
271 <param name="fastq2" type="data" format="fastq,fastq.gz" label="Read 2 fastq file" />
272 </when>
273 </conditional>
274 <expand macro="bowtie2_options"/>
275 </when>
276 <when value="fasta">
277 <conditional name="fasta">
278 <param name="matepair" type="select" label="Library Type">
279 <option value="single">Single End Reads</option>
280 <option value="paired">Paired End Reads</option>
281 </param>
282 <when value="single">
283 <param name="singlefasta" type="data" format="fasta" label="fasta file" />
284 </when>
285 <when value="paired">
286 <param name="fasta1" type="data" format="fasta" label="Read 1 fasta file" />
287 <param name="fasta2" type="data" format="fasta" label="Read 2 fasta file" />
288 </when>
289 </conditional>
290 <expand macro="bowtie2_options"/>
291 </when>
292 <when value="sam">
293 <!-- convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam -->
294 <param name="matepair" type="select" label="Library Type">
295 <option value="single">Single End Reads</option>
296 <option value="paired">Paired End Reads</option>
297 </param>
298 <param name="rsem_sam" type="data" format="rsem_sam" label="RSEM formatted SAM file" />
299 </when>
300 </conditional>
301 <expand macro="rsem_options"/>
302 <conditional name="rsem_outputs"> <!-- drives both the BAM flags in the command and the output filters -->
303 <param name="result_bams" type="select" label="Create bam results files"
304 help="In addition to the transcript-coordinate-based BAM file output, also output a BAM file with the read alignments in genomic coordinates" >
305 <option value="none">No BAM results files</option>
306 <option value="default" selected="true">Transcript BAM results file</option>
307 <option value="both">Transcript and genome BAM results files</option>
308 </param>
309 <when value="none"/>
310 <when value="default">
311 <expand macro="sampling_for_bam"/>
312 </when>
313 <when value="both">
314 <expand macro="sampling_for_bam"/>
315 </when>
316 </conditional>
317 </when>
318 <when value="No"> <!-- no expression parameters when only a reference is built -->
319 </when>
320 </conditional>
321 </inputs>
322
323 <outputs> <!-- each dataset is kept or dropped by a filter on the two top-level selects -->
324 <data name="reference_file" format="rsem_ref" label="RSEM ${job.reference_name} reference"> <!-- produced only when a reference is built -->
325 <filter>job['select_job'] == "index"</filter>
326 </data>
327 <data name="gene_abundances" format="tabular" from_work_dir="rsem_output.genes.results" label="${run_rsem.sample}.gene_abundances">
328 <filter>run_rsem['select'] == "Yes"</filter>
329 </data>
330 <data name="isoform_abundances" format="tabular" from_work_dir="rsem_output.isoforms.results" label="${run_rsem.sample}.isoform_abundances">
331 <filter>run_rsem['select'] == "Yes"</filter>
332 </data>
333 <data name="transcript_sorted_bam" format="bam" from_work_dir="rsem_output.transcript.sorted.bam" label="${run_rsem.sample}.transcript.bam">
334 <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] != "none"</filter>
335 </data>
336 <data name="genome_sorted_bam" format="bam" from_work_dir="rsem_output.genome.sorted.bam" label="${run_rsem.sample}.genome.bam">
337 <filter>run_rsem['select'] == "Yes" and run_rsem['rsem_outputs']['result_bams'] == "both"</filter>
338 </data>
339 <data name="log" format="txt" label="${run_rsem.sample}.rsem_log"> <!-- receives the redirected standard output of rsem-calculate-expression -->
340 <filter>run_rsem['select'] == "Yes"</filter>
341 </data>
342 </outputs>
343
344 <tests>
345 <test> <!-- build a reference from genome FASTA + GTF, then profile an uncompressed single-end FASTQ with no BAM output -->
346 <param name="select_job" value="index"/>
347 <param name="ref_type" value="genomic"/>
348 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
349 <param name="gtf" value="ref.gtf" ftype="gtf"/>
350 <param name="reference_name" value="ref"/>
351 <param name="select" value="Yes"/>
352 <param name="sample" value="rsem_sample"/>
353 <param name="refSrc" value="self"/>
354 <param name="format" value="fastq"/>
355 <param name="matepair" value="single"/>
356 <param name="singlefastq" value="test.fq" ftype="fastqsanger"/>
357 <param name="result_bams" value="none"/>
358 <output name="reference_file">
359 <assert_contents>
360 <has_text text="ref" />
361 </assert_contents>
362 </output>
363 <output name="gene_abundances" value="gene_abundances.tab2"/>
364 <output name="isoform_abundances" value="isoform_abundances.tab2" />
365 <output name="log">
366 <assert_contents>
367 <has_text text="Expression Results are written" />
368 </assert_contents>
369 </output>
370 </test>
371 <test> <!-- same as the first test, but with a gzip-compressed FASTQ input to exercise the gunzip branch -->
372 <param name="select_job" value="index"/>
373 <param name="ref_type" value="genomic"/>
374 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
375 <param name="gtf" value="ref.gtf" ftype="gtf"/>
376 <param name="reference_name" value="ref"/>
377 <param name="select" value="Yes"/>
378 <param name="sample" value="rsem_sample"/>
379 <param name="refSrc" value="self"/>
380 <param name="format" value="fastq"/>
381 <param name="matepair" value="single"/>
382 <param name="singlefastq" value="test.fastq.gz" ftype="fastqsanger.gz"/>
383 <param name="result_bams" value="none"/>
384 <output name="reference_file">
385 <assert_contents>
386 <has_text text="ref" />
387 </assert_contents>
388 </output>
389 <output name="gene_abundances" value="gene_abundances.tab2"/>
390 <output name="isoform_abundances" value="isoform_abundances.tab2" />
391 <output name="log">
392 <assert_contents>
393 <has_text text="Expression Results are written" />
394 </assert_contents>
395 </output>
396 </test>
397 <test> <!-- reference build only; the single output must contain the reference name -->
398 <param name="select_job" value="index"/>
399 <param name="ref_type" value="genomic"/>
400 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
401 <param name="gtf" value="ref.gtf" ftype="gtf"/>
402 <param name="reference_name" value="ref"/>
403 <param name="select" value="No"/>
404 <output name="reference_file">
405 <assert_contents>
406 <has_text text="ref" />
407 </assert_contents>
408 </output>
409 </test>
410 <test> <!-- NOTE(review): identical to the previous test; paired-end, FASTA and SAM inputs have no test here -->
411 <param name="select_job" value="index"/>
412 <param name="ref_type" value="genomic"/>
413 <param name="reference_fasta_file" value="ref.fasta" ftype="fasta"/>
414 <param name="gtf" value="ref.gtf" ftype="gtf"/>
415 <param name="reference_name" value="ref"/>
416 <param name="select" value="No"/>
417 <output name="reference_file">
418 <assert_contents>
419 <has_text text="ref" />
420 </assert_contents>
421 </output>
422 </test>
423 </tests>
424
425 <help>
426 .. class:: infomark
427
428 RSEM HOME PAGE - http://deweylab.biostat.wisc.edu/rsem/
429
430 NAME
431 rsem-prepare-reference
432
433 SYNOPSIS
434 rsem-prepare-reference [options] reference_fasta_file(s) reference_name
435
436 DESCRIPTION
437 The rsem-prepare-reference program extracts/preprocesses the reference sequences and builds Bowtie 2 indices using default parameters.
438 This program is used in conjunction with the 'rsem-calculate-expression' program.
439
440 INPUTS
441 A fasta file of transcripts
442 or
443 A genome sequence fasta file and a GTF gene annotation file. (When using UCSC data, include the related knownIsoforms.txt)
444
445 ---
446
447 NAME
448 rsem-calculate-expression - Estimate gene and isoform expression from
449 RNA-Seq data.
450
451 SYNOPSIS
452 rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name
453 rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name
454 rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name
455
456 ARGUMENTS
457 upstream_read_files(s)
458 Comma-separated list of files containing single-end reads or
459 upstream reads for paired-end data. By default, these files are
460 assumed to be in FASTQ format. If the --no-qualities option is
461 specified, then FASTA format is expected.
462
463 downstream_read_file(s)
464 Comma-separated list of files containing downstream reads which are
465 paired with the upstream reads. By default, these files are assumed
466 to be in FASTQ format. If the --no-qualities option is specified,
467 then FASTA format is expected.
468
469 input
470 SAM/BAM/CRAM formatted input file. If "-" is specified for the
471 filename, the input is instead assumed to come from standard input.
472 RSEM requires all alignments of the same read group together. For
473 paired-end reads, RSEM also requires the two mates of any alignment
474 be adjacent. In addition, RSEM does not allow the SEQ and QUAL
475 fields to be empty. See Description section for how to make input
476 file obey RSEM's requirements.
477
478 reference_name
479 The name of the reference used. The user must have run
480 'rsem-prepare-reference' with this reference_name before running
481 this program.
482
483 sample_name
484 The name of the sample analyzed. All output files are prefixed by
485 this name (e.g., sample_name.genes.results)
486
487 BASIC OPTIONS
488 --paired-end
489 Input reads are paired-end reads. (Default: off)
490
491 --no-qualities
492 Input reads do not contain quality scores. (Default: off)
493
494 --strandedness &lt;none|forward|reverse&gt;
495 This option defines the strandedness of the RNA-Seq reads. It
496 recognizes three values: 'none', 'forward', and 'reverse'. 'none'
497 refers to non-strand-specific protocols. 'forward' means all
498 (upstream) reads are derived from the forward strand. 'reverse'
499 means all (upstream) reads are derived from the reverse strand. If
500 'forward'/'reverse' is set, the '--norc'/'--nofw' Bowtie/Bowtie 2
501 option will also be enabled to avoid aligning reads to the opposite
502 strand. For Illumina TruSeq Stranded protocols, please use
503 'reverse'. (Default: 'none')
504
505 -p/--num-threads &lt;int&gt;
506 Number of threads to use. Both Bowtie/Bowtie2, expression estimation
507 and 'samtools sort' will use this many threads. (Default: 1)
508
509 --alignments
510 Input file contains alignments in SAM/BAM/CRAM format. The exact
511 file format will be determined automatically. (Default: off)
512
513 --fai &lt;file&gt;
514 If the header section of input alignment file does not contain
515 reference sequence information, this option should be turned on.
516 &lt;file&gt; is a FAI format file containing each reference sequence's
517 name and length. Please refer to the SAM official website for the
518 details of FAI format. (Default: off)
519
520 --bowtie2
521 Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM
522 does not handle indel, local and discordant alignments, the Bowtie2
523 parameters are set in a way to avoid those alignments. In
524 particular, we use options '--sensitive --dpad 0 --gbar 99999999
525 --mp 1,1 --np 1 --score-min L,0,-0.1' by default. The last parameter
526 of '--score-min', '-0.1', is the negative of maximum mismatch rate.
527 This rate can be set by option '--bowtie2-mismatch-rate'. If reads
528 are paired-end, we additionally use options '--no-mixed' and
529 '--no-discordant'. (Default: off)
530
531 --star
532 Use STAR to align reads. Alignment parameters are from ENCODE3's
533 STAR-RSEM pipeline. To save computational time and memory resources,
534 STAR's Output BAM file is unsorted. It is stored in RSEM's temporary
535 directory with name as 'sample_name.bam'. Each STAR job will have
536 its own private copy of the genome in memory. (Default: off)
537
538 --append-names
539 If gene_name/transcript_name is available, append it to the end of
540 gene_id/transcript_id (separated by '_') in files
541 'sample_name.isoforms.results' and 'sample_name.genes.results'.
542 (Default: off)
543
544 --seed &lt;uint32&gt;
545 Set the seed for the random number generators used in calculating
546 posterior mean estimates and credibility intervals. The seed must be
547 a non-negative 32 bit integer. (Default: off)
548
549 --single-cell-prior
550 By default, RSEM uses Dirichlet(1) as the prior to calculate
551 posterior mean estimates and credibility intervals. However, far
552 fewer genes are expressed in single-cell RNA-Seq data. Thus, if you
553 want to compute posterior mean estimates and/or credibility
554 intervals and you have single-cell RNA-Seq data, you are recommended
555 to turn on this option. Then RSEM will use Dirichlet(0.1) as the
556 prior, which encourages sparsity of the expression levels.
557 (Default: off)
558
559 --calc-pme
560 Run RSEM's collapsed Gibbs sampler to calculate posterior mean
561 estimates. (Default: off)
562
563 --calc-ci
564 Calculate 95% credibility intervals and posterior mean estimates.
565 The credibility level can be changed by setting
566 '--ci-credibility-level'. (Default: off)
567
568 -q/--quiet
569 Suppress the output of logging information. (Default: off)
570
571 -h/--help
572 Show help information.
573
574 --version
575 Show version information.
576
577 OUTPUT OPTIONS
578 --sort-bam-by-read-name
579 Sort the BAM file aligned under transcript coordinates by read name.
580 Setting this option on will produce deterministic maximum likelihood
581 estimations from independent runs. Note that sorting will take long
582 time and lots of memory. (Default: off)
583
584 --no-bam-output
585 Do not output any BAM file. (Default: off)
586
587 --sampling-for-bam
588 When RSEM generates a BAM file, instead of outputting all alignments
589 a read has with their posterior probabilities, one alignment is
590 sampled according to the posterior probabilities. The sampling
591 procedure includes the alignment to the "noise" transcript, which
592 does not appear in the BAM file. Only the sampled alignment has a
593 weight of 1. All other alignments have weight 0. If the "noise"
594 transcript is sampled, all alignments appeared in the BAM file
595 should have weight 0. (Default: off)
596
597 --output-genome-bam
598 Generate a BAM file, 'sample_name.genome.bam', with alignments
599 mapped to genomic coordinates and annotated with their posterior
600 probabilities. In addition, RSEM will call samtools (included in
601 RSEM package) to sort and index the bam file.
602 'sample_name.genome.sorted.bam' and
603 'sample_name.genome.sorted.bam.bai' will be generated. (Default:
604 off)
605
606 --sort-bam-by-coordinate
607 Sort RSEM generated transcript and genome BAM files by coordinates
608 and build associated indices. (Default: off)
609
610 --sort-bam-memory-per-thread &lt;string&gt;
611 Set the maximum memory per thread that can be used by 'samtools
612 sort'. &lt;string&gt; represents the memory and accepts suffixes 'K/M/G'.
613 RSEM will pass &lt;string&gt; to the '-m' option of 'samtools sort'. Note
614 that the default used here is different from the default used by
615 samtools. (Default: 1G)
616
617 ALIGNER OPTIONS
618 --seed-length &lt;int&gt;
619 Seed length used by the read aligner. Providing the correct value is
620 important for RSEM. If RSEM runs Bowtie, it uses this value for
621 Bowtie's seed length parameter. Any read with its or at least one of
622 its mates' (for paired-end reads) length less than this value will
623 be ignored. If poly(A) tails are not added to the references, the
624 minimum allowed value is 5; otherwise, the minimum allowed value is
625 25. Note that this script will only check if the value &gt;= 5 and give
626 a warning message if the value &lt; 25 but &gt;= 5. (Default: 25)
627
628 --phred33-quals
629 Input quality scores are encoded as Phred+33. (Default: on)
630
631 --phred64-quals
632 Input quality scores are encoded as Phred+64 (default for GA
633 Pipeline ver. &gt;= 1.3). (Default: off)
634
635 --solexa-quals
636 Input quality scores are solexa encoded (from GA Pipeline ver. &lt;
637 1.3). (Default: off)
638
639 --bowtie-path &lt;path&gt;
640 The path to the Bowtie executables. (Default: the path to the Bowtie
641 executables is assumed to be in the user's PATH environment
642 variable)
643
644 --bowtie-n &lt;int&gt;
645 (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3,
646 Default: 2)
647
648 --bowtie-e &lt;int&gt;
649 (Bowtie parameter) max sum of mismatch quality scores across the
650 alignment. (Default: 99999999)
651
652 --bowtie-m &lt;int&gt;
653 (Bowtie parameter) suppress all alignments for a read if &gt; &lt;int&gt;
654 valid alignments exist. (Default: 200)
655
656 --bowtie-chunkmbs &lt;int&gt;
657 (Bowtie parameter) memory allocated for best first alignment
658 calculation (Default: 0 - use Bowtie's default)
659
660 --bowtie2-path &lt;path&gt;
661 (Bowtie 2 parameter) The path to the Bowtie 2 executables. (Default:
662 the path to the Bowtie 2 executables is assumed to be in the user's
663 PATH environment variable)
664
665 --bowtie2-mismatch-rate &lt;double&gt;
666 (Bowtie 2 parameter) The maximum mismatch rate allowed. (Default:
667 0.1)
668
669 --bowtie2-k &lt;int&gt;
670 (Bowtie 2 parameter) Find up to &lt;int&gt; alignments per read. (Default:
671 200)
672
673 --bowtie2-sensitivity-level &lt;string&gt;
674 (Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end
675 mode. This option controls how hard Bowtie 2 tries to find
676 alignments. &lt;string&gt; must be one of "very_fast", "fast", "sensitive"
677 and "very_sensitive". The four candidates correspond to Bowtie 2's
678 "--very-fast", "--fast", "--sensitive" and "--very-sensitive"
679 options. (Default: "sensitive" - use Bowtie 2's default)
680
681 --star-path &lt;path&gt;
682 The path to STAR's executable. (Default: the path to STAR executable
683 is assumed to be in user's PATH environment variable)
684
685 --star-gzipped-read-file
686 (STAR parameter) Input read file(s) is compressed by gzip. (Default:
687 off)
688
689 --star-bzipped-read-file
690 (STAR parameter) Input read file(s) is compressed by bzip2.
691 (Default: off)
692
693 --star-output-genome-bam
694 (STAR parameter) Save the BAM file from STAR alignment under genomic
695 coordinate to 'sample_name.STAR.genome.bam'. This file is NOT sorted
696 by genomic coordinate. In this file, according to STAR's manual,
697 'paired ends of an alignment are always adjacent, and multiple
698 alignments of a read are adjacent as well'. (Default: off)
699
700 ADVANCED OPTIONS
701 --tag &lt;string&gt;
702 The name of the optional field used in the SAM input for identifying
703 a read with too many valid alignments. The field should have the
704 format &lt;tagName&gt;:i:&lt;value&gt;, where a &lt;value&gt; bigger than 0 indicates
705 a read with too many alignments. (Default: "")
706
707 --fragment-length-min &lt;int&gt;
708 Minimum read/insert length allowed. This is also the value for the
709 Bowtie/Bowtie2 -I option. (Default: 1)
710
711 --fragment-length-max &lt;int&gt;
712 Maximum read/insert length allowed. This is also the value for the
713 Bowtie/Bowtie 2 -X option. (Default: 1000)
714
715 --fragment-length-mean &lt;double&gt;
716 (single-end data only) The mean of the fragment length distribution,
717 which is assumed to be a Gaussian. (Default: -1, which disables use
718 of the fragment length distribution)
719
720 --fragment-length-sd &lt;double&gt;
721 (single-end data only) The standard deviation of the fragment length
722 distribution, which is assumed to be a Gaussian. (Default: 0, which
723 assumes that all fragments are of the same length, given by the
724 rounded value of --fragment-length-mean)
725
726 --estimate-rspd
727 Set this option if you want to estimate the read start position
728 distribution (RSPD) from data. Otherwise, RSEM will use a uniform
729 RSPD. (Default: off)
730
731 --num-rspd-bins &lt;int&gt;
732 Number of bins in the RSPD. Only relevant when '--estimate-rspd' is
733 specified. Use of the default setting is recommended. (Default: 20)
734
735 --gibbs-burnin &lt;int&gt;
736 The number of burn-in rounds for RSEM's Gibbs sampler. Each round
737 passes over the entire data set once. If RSEM can use multiple
738 threads, multiple Gibbs samplers will start at the same time and all
739 samplers share the same burn-in number. (Default: 200)
740
741 --gibbs-number-of-samples &lt;int&gt;
742 The total number of count vectors RSEM will collect from its Gibbs
743 samplers. (Default: 1000)
744
745 --gibbs-sampling-gap &lt;int&gt;
746 The number of rounds between two successive count vectors RSEM
747 collects. If the count vector after round N is collected, the count
748 vector after round N + &lt;int&gt; will also be collected. (Default: 1)
749
750 --ci-credibility-level &lt;double&gt;
751 The credibility level for credibility intervals. (Default: 0.95)
752
753 --ci-memory &lt;int&gt;
754 Maximum size (in memory, MB) of the auxiliary buffer used for
755 computing credibility intervals (CI). (Default: 1024)
756
757 --ci-number-of-samples-per-count-vector &lt;int&gt;
758 The number of read generating probability vectors sampled per
759 sampled count vector. The credibility intervals are calculated by
760 first sampling P(C | D) and then sampling P(Theta | C) for each
761 sampled count vector. This option controls how many Theta vectors
762 are sampled per sampled count vector. (Default: 50)
763
764 --keep-intermediate-files
765 Keep temporary files generated by RSEM. RSEM creates a temporary
766 directory, 'sample_name.temp', into which it puts all intermediate
767 output files. If this directory already exists, RSEM overwrites all
768 files generated by previous RSEM runs inside of it. By default,
769 after RSEM finishes, the temporary directory is deleted. Set this
770 option to prevent the deletion of this directory and the
771 intermediate files inside of it. (Default: off)
772
773 --temporary-folder &lt;string&gt;
774 Set where to put the temporary files generated by RSEM. If the
775 folder specified does not exist, RSEM will try to create it.
776 (Default: sample_name.temp)
777
778 --time
779 Output time consumed by each step of RSEM to 'sample_name.time'.
780 (Default: off)
781
782 PRIOR-ENHANCED RSEM OPTIONS
783 --run-pRSEM
784 Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e.
785 isoform's initial pseudo-count for RSEM's Gibbs sampling, will be
786 learned from input RNA-seq data and an external data set. When pRSEM
787 needs and only needs ChIP-seq peak information to partition isoforms
788 (e.g. in pRSEM's default partition model), either ChIP-seq peak file
789 (with the '--chipseq-peak-file' option) or ChIP-seq FASTQ files for
790 target and input and the path for Bowtie executables are required
791 (with the '--chipseq-target-read-files &lt;string&gt;',
792 '--chipseq-control-read-files &lt;string&gt;', and '--bowtie-path &lt;path&gt;'
793 options), otherwise, ChIP-seq FASTQ files for target and control and
794 the path to Bowtie executables are required. (Default: off)
795
796 --chipseq-peak-file &lt;string&gt;
797 Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e.
798 BED6+4, format. This file is used when running prior-enhanced RSEM
799 in the default two-partition model. It partitions isoforms by
800 whether they have ChIP-seq overlapping with their transcription
801 start site region or not. Each partition will have its own prior
802 parameter learned from a training set. This file can be either
803 gzipped or ungzipped. (Default: "")
804
805 --chipseq-target-read-files &lt;string&gt;
806 Comma-separated full path of FASTQ read file(s) for ChIP-seq target.
807 This option is used when running prior-enhanced RSEM. It provides
808 information to calculate ChIP-seq peaks and signals. The file(s) can
809 be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The
810 options '--bowtie-path &lt;path&gt;' and '--chipseq-control-read-files
811 &lt;string&gt;' must be defined when this option is specified. (Default:
812 "")
813
814 --chipseq-control-read-files &lt;string&gt;
815 Comma-separated full path of FASTQ read file(s) for ChIP-seq control.
816 This option is used when running prior-enhanced RSEM. It provides
817 information to call ChIP-seq peaks. The file(s) can be either
818 ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options
819 '--bowtie-path &lt;path&gt;' and '--chipseq-target-read-files &lt;string&gt;'
820 must be defined when this option is specified. (Default: "")
821
822 --chipseq-read-files-multi-targets &lt;string&gt;
823 Comma-separated full path of FASTQ read files for multiple ChIP-seq
824 targets. This option is used when running prior-enhanced RSEM, where
825 prior is learned from multiple complementary data sets. It provides
826 information to calculate ChIP-seq signals. All files can be either
827 ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this
828 option is specified, the option '--bowtie-path &lt;path&gt;' must be
829 defined and the option '--partition-model &lt;string&gt;' will be set to
830 'cmb_lgt' automatically. (Default: "")
831
832 --chipseq-bed-files-multi-targets &lt;string&gt;
833 Comma-separated full path of BED files for multiple ChIP-seq
834 targets. This option is used when running prior-enhanced RSEM, where
835 prior is learned from multiple complementary data sets. It provides
836 information of ChIP-seq signals and must have at least the first six
837 BED columns. All files can be either ungzipped or gzipped with a
838 suffix '.gz' or '.gzip'. When this option is specified, the option
839 '--partition-model &lt;string&gt;' will be set to 'cmb_lgt' automatically.
840 (Default: "")
841
842 --cap-stacked-chipseq-reads
843 Keep a maximum number of ChIP-seq reads that aligned to the same
844 genomic interval. This option is used when running prior-enhanced
845 RSEM, where prior is learned from multiple complementary data sets.
846 This option is only in use when either
847 '--chipseq-read-files-multi-targets &lt;string&gt;' or
848 '--chipseq-bed-files-multi-targets &lt;string&gt;' is specified. (Default:
849 off)
850
851 --n-max-stacked-chipseq-reads &lt;int&gt;
852 The maximum number of stacked ChIP-seq reads to keep. This option is
853 used when running prior-enhanced RSEM, where prior is learned from
854 multiple complementary data sets. This option is only in use when
855 the option '--cap-stacked-chipseq-reads' is set. (Default: 5)
856
857 --partition-model &lt;string&gt;
858 A keyword to specify the partition model used by prior-enhanced
859 RSEM. It must be one of the following keywords:
860
861 - pk
862 Partitioned by whether an isoform has a ChIP-seq peak overlapping
863 with its transcription start site (TSS) region. The TSS region is
864 defined as [TSS-500bp, TSS+500bp]. For simplicity, we refer to this
865 type of peak as 'TSS peak' when explaining other keywords.
866
867 - pk_lgtnopk
868 First partitioned by TSS peak. Then, for isoforms in the 'no TSS
869 peak' set, a logistic model is employed to further classify them
870 into two partitions.
871
872 - lm3, lm4, lm5, or lm6
873 Based on their ChIP-seq signals, isoforms are classified into 3,
874 4, 5, or 6 partitions by a linear regression model.
875
876 - nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk
877 First partitioned by TSS peak. Then, for isoforms in the 'with TSS
878 peak' set, a linear regression model is employed to further
879 classify them into 2, 3, 4, or 5 partitions.
880
881 - pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk
882 First partitioned by TSS peak. Then, for isoforms in the 'no TSS
883 peak' set, a linear regression model is employed to further
884 classify them into 2, 3, 4, or 5 partitions.
885
886 - cmb_lgt
887 Using a logistic regression to combine TSS signals from multiple
888 complementary data sets and partition training set isoform into
889 'expressed' and 'not expressed'. This partition model is only in
890 use when either '--chipseq-read-files-multi-targets &lt;string&gt;' or
891 '--chipseq-bed-files-multi-targets &lt;string&gt;' is specified.
892
893 Parameters for all the above models are learned from a training set.
894 For detailed explanations, please see prior-enhanced RSEM's paper.
895 (Default: 'pk')
896
897 DEPRECATED OPTIONS
898 The options in this section are deprecated. They are here only for
899 compatibility reasons and may be removed in future releases.
900
901 --sam
902 Inputs are alignments in SAM format. (Default: off)
903
904 --bam
905 Inputs are alignments in BAM format. (Default: off)
906
907 --strand-specific
908 Equivalent to '--strandedness forward'. (Default: off)
909
910 --forward-prob &lt;double&gt;
911 Probability of generating a read from the forward strand of a
912 transcript. Set to 1 for a strand-specific protocol where all
913 (upstream) reads are derived from the forward strand, 0 for a
914 strand-specific protocol where all (upstream) reads are derived from
915 the reverse strand, or 0.5 for a non-strand-specific protocol.
916 (Default: off)
917
918 DESCRIPTION
919 In its default mode, this program aligns input reads against a reference
920 transcriptome with Bowtie and calculates expression values using the
921 alignments. RSEM assumes the data are single-end reads with quality
922 scores, unless the '--paired-end' or '--no-qualities' options are
923 specified. Alternatively, users can use STAR to align reads using the
924 '--star' option. RSEM has provided options in 'rsem-prepare-reference'
925 to prepare STAR's genome indices. Users may use an alternative aligner
926 by specifying '--alignments', and providing an alignment file in
927 SAM/BAM/CRAM format. However, users should make sure that they align
928 against the indices generated by 'rsem-prepare-reference' and the
929 alignment file satisfies the requirements mentioned in ARGUMENTS
930 section.
931
932 One simple way to make the alignment file satisfy RSEM's requirements
933 is to use the 'convert-sam-for-rsem' script. This script accepts
934 SAM/BAM/CRAM files as input and outputs a BAM file. For example, type
935 the following command to convert a SAM file, 'input.sam', to a
936 ready-for-use BAM file, 'input_for_rsem.bam':
937
938 convert-sam-for-rsem input.sam input_for_rsem
939
940 For details, please refer to 'convert-sam-for-rsem's documentation page.
941
942 NOTES
943 1. Users must run 'rsem-prepare-reference' with the appropriate
944 reference before using this program.
945
946 2. For single-end data, it is strongly recommended that the user provide
947 the fragment length distribution parameters (--fragment-length-mean and
948 --fragment-length-sd). For paired-end data, RSEM will automatically
949 learn a fragment length distribution from the data.
950
951 3. Some aligner parameters have default values different from their
952 original settings.
953
954 4. With the '--calc-pme' option, posterior mean estimates will be
955 calculated in addition to maximum likelihood estimates.
956
957 5. With the '--calc-ci' option, 95% credibility intervals and posterior
958 mean estimates will be calculated in addition to maximum likelihood
959 estimates.
960
961 6. The temporary directory and all intermediate files will be removed
962 when RSEM finishes unless '--keep-intermediate-files' is specified.
963
964 With the '--run-pRSEM' option and associated options (see section
965 'PRIOR-ENHANCED RSEM OPTIONS' above for details), prior-enhanced RSEM
966 will be running. Prior parameters will be learned from supplied external
967 data set(s) and assigned as initial pseudo-counts for isoforms in the
968 corresponding partition for Gibbs sampling.
969
970 OUTPUT
971 sample_name.isoforms.results
972 File containing isoform level expression estimates. The first line
973 contains column names separated by the tab character. The format of
974 each line in the rest of this file is:
975
976 transcript_id gene_id length effective_length expected_count TPM
977 FPKM IsoPct [posterior_mean_count
978 posterior_standard_deviation_of_count pme_TPM pme_FPKM
979 IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound
980 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound
981 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
982
983 Fields are separated by the tab character. Fields within "[]" are
984 optional. They will not be presented if neither '--calc-pme' nor
985 '--calc-ci' is set.
986
987 'transcript_id' is the transcript name of this transcript. 'gene_id'
988 is the gene name of the gene which this transcript belongs to
989 (denote this gene as its parent gene). If no gene information is
990 provided, 'gene_id' and 'transcript_id' are the same.
991
992 'length' is this transcript's sequence length (poly(A) tail is not
993 counted). 'effective_length' counts only the positions that can
994 generate a valid fragment. If no poly(A) tail is added,
995 'effective_length' is equal to transcript length - mean fragment
996 length + 1. If one transcript's effective length is less than 1,
997 this transcript's both effective length and abundance estimates are
998 set to 0.
999
1000 'expected_count' is the sum of the posterior probability of each
1001 read coming from this transcript over all reads. Because 1) each read
1002 aligning to this transcript has a probability of being generated
1003 from background noise; 2) RSEM may filter some alignable low quality
1004 reads, the sum of expected counts for all transcripts is generally
1005 less than the total number of reads aligned.
1006
1007 'TPM' stands for Transcripts Per Million. It is a relative measure
1008 of transcript abundance. The sum of all transcripts' TPM is 1
1009 million. 'FPKM' stands for Fragments Per Kilobase of transcript per
1010 Million mapped reads. It is another relative measure of transcript
1011 abundance. If we define l_bar to be the mean transcript length in a
1012 sample, which can be calculated as
1013
1014 l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through
1015 every transcript),
1016
1017 the following equation holds:
1018
1019 FPKM_i = 10^3 / l_bar * TPM_i.
1020
1021 We can see that the sum of FPKM is not a constant across samples.
1022
1023 'IsoPct' stands for isoform percentage. It is the percentage of this
1024 transcript's abundance over its parent gene's abundance. If its
1025 parent gene has only one isoform or the gene information is not
1026 provided, this field will be set to 100.
1027
1028 'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean
1029 estimates calculated by RSEM's Gibbs sampler.
1030 'posterior_standard_deviation_of_count' is the posterior standard
1031 deviation of counts. 'IsoPct_from_pme_TPM' is the isoform percentage
1032 calculated from 'pme_TPM' values.
1033
1034 'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound'
1035 and 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95%
1036 credibility intervals for TPM and FPKM values. The bounds are
1037 inclusive (i.e. [l, u]).
1038
1039 'TPM_coefficient_of_quartile_variation' and
1040 'FPKM_coefficient_of_quartile_variation' are coefficients of
1041 quartile variation (CQV) for TPM and FPKM values. CQV is a robust
1042 way of measuring the ratio between the standard deviation and the
1043 mean. It is defined as
1044
1045 CQV := (Q3 - Q1) / (Q3 + Q1),
1046
1047 where Q1 and Q3 are the first and third quartiles.
1048
1049 sample_name.genes.results
1050 File containing gene level expression estimates. The first line
1051 contains column names separated by the tab character. The format of
1052 each line in the rest of this file is:
1053
1054 gene_id transcript_id(s) length effective_length expected_count TPM
1055 FPKM [posterior_mean_count posterior_standard_deviation_of_count
1056 pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound
1057 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound
1058 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
1059
1060 Fields are separated by the tab character. Fields within "[]" are
1061 optional. They will not be presented if neither '--calc-pme' nor
1062 '--calc-ci' is set.
1063
1064 'transcript_id(s)' is a comma-separated list of transcript_ids
1065 belonging to this gene. If no gene information is provided,
1066 'gene_id' and 'transcript_id(s)' are identical (the
1067 'transcript_id').
1068
1069 A gene's 'length' and 'effective_length' are defined as the weighted
1070 average of its transcripts' lengths and effective lengths (weighted
1071 by 'IsoPct'). A gene's abundance estimates are just the sum of its
1072 transcripts' abundance estimates.
1073
1074 sample_name.alleles.results
1075 Only generated when the RSEM references are built with
1076 allele-specific transcripts.
1077
1078 This file contains allele level expression estimates for
1079 allele-specific expression calculation. The first line contains
1080 column names separated by the tab character. The format of each line
1081 in the rest of this file is:
1082
1083 allele_id transcript_id gene_id length effective_length
1084 expected_count TPM FPKM AlleleIsoPct AlleleGenePct
1085 [posterior_mean_count posterior_standard_deviation_of_count pme_TPM
1086 pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM
1087 TPM_ci_lower_bound TPM_ci_upper_bound
1088 TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound
1089 FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
1090
1091 Fields are separated by the tab character. Fields within "[]" are
1092 optional. They will not be presented if neither '--calc-pme' nor
1093 '--calc-ci' is set.
1094
1095 'allele_id' is the allele-specific name of this allele-specific
1096 transcript.
1097
1098 'AlleleIsoPct' stands for allele-specific percentage on isoform
1099 level. It is the percentage of this allele-specific transcript's
1100 abundance over its parent transcript's abundance. If its parent
1101 transcript has only one allele variant form, this field will be set
1102 to 100.
1103
1104 'AlleleGenePct' stands for allele-specific percentage on gene level.
1105 It is the percentage of this allele-specific transcript's abundance
1106 over its parent gene's abundance.
1107
1108 'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have
1109 similar meanings. They are calculated based on posterior mean
1110 estimates.
1111
1112 Please note that if this file is present, the fields 'length' and
1113 'effective_length' in 'sample_name.isoforms.results' should be
1114 interpreted similarly as the corresponding definitions in
1115 'sample_name.genes.results'.
1116
1117 sample_name.transcript.bam
1118 Only generated when --no-bam-output is not specified.
1119
1120 'sample_name.transcript.bam' is a BAM-formatted file of read
1121 alignments in transcript coordinates. The MAPQ field of each
1122 alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)),
1123 where w is the posterior probability of that alignment being the
1124 true mapping of a read. In addition, RSEM pads a new tag ZW:f:value,
1125 where value is a single precision floating number representing the
1126 posterior probability. Because this file contains all alignment
1127 lines produced by bowtie or user-specified aligners, it can also be
1128 used as a replacement of the aligner generated BAM/SAM file.
1129
1130 sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai
1131 Only generated when --no-bam-output is not specified and --sort-bam-by-coordinate is specified.
1132
1133 'sample_name.transcript.sorted.bam' and
1134 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
1135 indices generated by samtools (included in RSEM package).
1136
1137 sample_name.genome.bam
1138 Only generated when --no-bam-output is not specified and
1139 --output-genome-bam is specified.
1140
1141 'sample_name.genome.bam' is a BAM-formatted file of read alignments
1142 in genomic coordinates. Alignments of reads that have identical
1143 genomic coordinates (i.e., alignments to different isoforms that
1144 share the same genomic region) are collapsed into one alignment. The
1145 MAPQ field of each alignment is set to min(100, floor(-10 *
1146 log10(1.0 - w) + 0.5)), where w is the posterior probability of that
1147 alignment being the true mapping of a read. In addition, RSEM pads a
1148 new tag ZW:f:value, where value is a single precision floating
1149 number representing the posterior probability. If an alignment is
1150 spliced, a XS:A:value tag is also added, where value is either '+'
1151 or '-' indicating the strand of the transcript it aligns to.
1152
1153 sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai
1154 Only generated when --no-bam-output is not specified, and
1155 --sort-bam-by-coordinate and --output-genome-bam are specified.
1156
1157 'sample_name.genome.sorted.bam' and
1158 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and
1159 indices generated by samtools (included in RSEM package).
1160
1161 sample_name.time
1162 Only generated when --time is specified.
1163
1164 It contains time (in seconds) consumed by aligning reads, estimating
1165 expression levels and calculating credibility intervals.
1166
1167 sample_name.stat
1168 This is a folder instead of a file. All model related statistics are
1169 stored in this folder. 'rsem-plot-model' can generate plots
1170 using this folder.
1171
1172 'sample_name.stat/sample_name.cnt' contains alignment statistics.
1173 The format and meanings of each field are described in
1174 'cnt_file_description.txt' under RSEM directory.
1175
1176 'sample_name.stat/sample_name.model' stores RNA-Seq model parameters
1177 learned from the data. The format and meanings of each field of this
1178 file are described in 'model_file_description.txt' under RSEM
1179 directory.
1180
1181 The following four output files will be generated only by
1182 prior-enhanced RSEM
1183
1184 - 'sample_name.stat/sample_name_prsem.all_tr_features'
1185 It stores isoform features for deriving and assigning pRSEM prior.
1186 The first line is a header and the rest is one isoform per line.
1187 The description for each column is:
1188
1189 * trid: transcript ID from input annotation
1190
1191 * geneid: gene ID from input annotation
1192
1193 * chrom: isoform's chromosome name
1194
1195 * strand: isoform's strand name
1196
1197 * start: isoform's end with the lowest genomic loci
1198
1199 * end: isoform's end with the highest genomic loci
1200
1201 * tss_mpp: average mappability of [TSS-500bp, TSS+500bp], where
1202 TSS is isoform's transcription start site, i.e. 5'-end
1203
1204 * body_mpp: average mappability of (TSS+500bp, TES-500bp), where
1205 TES is isoform's transcription end site, i.e. 3'-end
1206
1207 * tes_mpp: average mappability of [TES-500bp, TES+500bp]
1208
1209 * pme_count: isoform's fragment or read count from RSEM's
1210 posterior mean estimates
1211
1212 * tss: isoform's TSS loci
1213
1214 * tss_pk: equal to 1 if isoform's [TSS-500bp, TSS+500bp] region
1215 overlaps with a RNA Pol II peak; 0 otherwise
1216
1217 * is_training: equal to 1 if isoform is in the training set where
1218 Pol II prior is learned; 0 otherwise
1219
1220 - 'sample_name.stat/sample_name_prsem.all_tr_prior'
1221 It stores prior parameters for every isoform. This file does not
1222 have a header. Each line contains a prior parameter and an
1223 isoform's transcript ID delimited by " # ".
1224
1225 - 'sample_name.stat/sample_name_uniform_prior_1.isoforms.results'
1226 RSEM's posterior mean estimates on the isoform level with an
1227 initial pseudo-count of one for every isoform. It is in the same
1228 format as the 'sample_name.isoforms.results'.
1229
1230 - 'sample_name.stat/sample_name_uniform_prior_1.genes.results'
1231 RSEM's posterior mean estimates on the gene level with an initial
1232 pseudo-count of one for every isoform. It is in the same format as
1233 the 'sample_name.genes.results'.
1234
1235 When learning prior from multiple external data sets in
1236 prior-enhanced RSEM, two additional output files will be generated.
1237
1238 - 'sample_name.stat/sample_name.pval_LL'
1239 It stores a p-value and a log-likelihood. The p-value indicates
1240 whether the combination of multiple complementary data sets is
1241 informative for RNA-seq quantification. The log-likelihood shows
1242 how well pRSEM's Dirichlet-multinomial model fits the read counts
1243 of partitioned training set isoforms.
1244
1245 - 'sample_name.stat/sample_name.lgt_mdl.RData'
1246 It stores an R object named 'glmmdl', which is a logistic
1247 regression model on the training set isoforms and multiple
1248 external data sets.
1249
1250 In addition, extra columns will be added to
1251 'sample_name.stat/all_tr_features'
1252
1253 * is_expr: equal to 1 if isoform has an abundance &gt;= 1 TPM and a
1254 non-zero read count from RSEM's posterior mean estimates; 0
1255 otherwise
1256
1257 * "$external_data_set_basename": log10 of external data's signal at
1258 [TSS-500, TSS+500]. Signal is the number of reads aligned within
1259 that interval and normalized to RPKM by read depth and interval
1260 length. It will be set to -4 if no read aligned to that interval.
1261
1262 There are multiple columns like this one, where each represents an
1263 external data set.
1264
1265 * prd_expr_prob: predicted probability from logistic regression
1266 model on whether this isoform is expressed or not. A probability
1267 higher than 0.5 is considered as expressed
1268
1269 * partition: group index, to which this isoform is partitioned
1270
1271 * prior: prior parameter for this isoform
1272
1273 EXAMPLES
1274 Assume the path to the bowtie executables is in the user's PATH
1275 environment variable. Reference files are under '/ref' with name
1276 'mouse_125'.
1277
1278 1) '/data/mmliver.fq', single-end reads with quality scores. Quality
1279 scores are encoded as for 'GA pipeline version &gt;= 1.3'. We want to use 8
1280 threads and generate a genome BAM file. In addition, we want to append
1281 gene/transcript names to the result files:
1282
1283 rsem-calculate-expression --phred64-quals \
1284 -p 8 \
1285 --append-names \
1286 --output-genome-bam \
1287 /data/mmliver.fq \
1288 /ref/mouse_125 \
1289 mmliver_single_quals
1290
1291 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', stranded paired-end
1292 reads with quality scores. Suppose the library is prepared using TruSeq
1293 Stranded Kit, which means the first mate should map to the reverse
1294 strand. Quality scores are in SANGER format. We want to use 8 threads
1295 and do not generate a genome BAM file:
1296
1297 rsem-calculate-expression -p 8 \
1298 --paired-end \
1299 --strandedness reverse \
1300 /data/mmliver_1.fq \
1301 /data/mmliver_2.fq \
1302 /ref/mouse_125 \
1303 mmliver_paired_end_quals
1304
1305 3) '/data/mmliver.fa', single-end reads without quality scores. We want
1306 to use 8 threads:
1307
1308 rsem-calculate-expression -p 8 \
1309 --no-qualities \
1310 /data/mmliver.fa \
1311 /ref/mouse_125 \
1312 mmliver_single_without_quals
1313
1314 4) Data are the same as 1). This time we assume the bowtie executables
1315 are under '/sw/bowtie'. We want to take a fragment length distribution
1316 into consideration. We set the fragment length mean to 150 and the
1317 standard deviation to 35. In addition to a BAM file, we also want to
1318 generate credibility intervals. We allow RSEM to use 1GB of memory for
1319 CI calculation:
1320
1321 rsem-calculate-expression --bowtie-path /sw/bowtie \
1322 --phred64-quals \
1323 --fragment-length-mean 150.0 \
1324 --fragment-length-sd 35.0 \
1325 -p 8 \
1326 --output-genome-bam \
1327 --calc-ci \
1328 --ci-memory 1024 \
1329 /data/mmliver.fq \
1330 /ref/mouse_125 \
1331 mmliver_single_quals
1332
1333 5) '/data/mmliver_paired_end_quals.bam', BAM-formatted alignments for
1334 paired-end reads with quality scores. We want to use 8 threads:
1335
1336 rsem-calculate-expression --paired-end \
1337 --alignments \
1338 -p 8 \
1339 /data/mmliver_paired_end_quals.bam \
1340 /ref/mouse_125 \
1341 mmliver_paired_end_quals
1342
1343 6) '/data/mmliver_1.fq.gz' and '/data/mmliver_2.fq.gz', paired-end reads
1344 with quality scores and read files are compressed by gzip. We want to
1345 use STAR to align reads and assume STAR executable is '/sw/STAR'.
1346 Suppose we want to use 8 threads and do not generate a genome BAM file:
1347
1348 rsem-calculate-expression --paired-end \
1349 --star \
1350 --star-path /sw/STAR \
1351 --gzipped-read-file \
1352 --paired-end \
1353 -p 8 \
1354 /data/mmliver_1.fq.gz \
1355 /data/mmliver_2.fq.gz \
1356 /ref/mouse_125 \
1357 mmliver_paired_end_quals
1358
1359 7) In the above example, suppose we want to run prior-enhanced RSEM
1360 instead. Assuming we want to learn priors from a ChIP-seq peak file
1361 '/data/mmliver.narrowPeak.gz':
1362
1363 rsem-calculate-expression --star \
1364 --star-path /sw/STAR \
1365 --gzipped-read-file \
1366 --paired-end \
1367 --calc-pme \
1368 --run-pRSEM \
1369 --chipseq-peak-file /data/mmliver.narrowPeak.gz \
1370 -p 8 \
1371 /data/mmliver_1.fq.gz \
1372 /data/mmliver_2.fq.gz \
1373 /ref/mouse_125 \
1374 mmliver_paired_end_quals
1375
1376 8) Similar to the example in 7), suppose we want to use the partition
1377 model 'pk_lm2nopk' (partitioning isoforms by Pol II TSS peak first and
1378 then partitioning 'no TSS peak' isoforms into two bins by a linear
1379 regression model), and we want to partition isoforms by RNA Pol II's
1380 ChIP-seq read files '/data/mmliver_PolIIRep1.fq.gz' and
1381 '/data/mmliver_PolIIRep2.fq.gz', and the control ChIP-seq read files
1382 '/data/mmliver_ChIPseqCtrl.fq.gz'. Also, assuming Bowtie's executables
1383 are under '/sw/bowtie/':
1384
1385 rsem-calculate-expression --star \
1386 --star-path /sw/STAR \
1387 --gzipped-read-file \
1388 --paired-end \
1389 --calc-pme \
1390 --run-pRSEM \
1391 --chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \
1392 --chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \
1393 --partition-model pk_lm2nopk \
1394 --bowtie-path /sw/bowtie \
1395 -p 8 \
1396 /data/mmliver_1.fq.gz \
1397 /data/mmliver_2.fq.gz \
1398 /ref/mouse_125 \
1399 mmliver_paired_end_quals
1400
1401 9) Similar to the example in 8), suppose we want to derive prior from
1402 four histone modification ChIP-seq read data sets:
1403 '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz',
1404 '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming
1405 Bowtie's executables are under '/sw/bowtie/':
1406
1407 rsem-calculate-expression --star \
1408 --star-path /sw/STAR \
1409 --gzipped-read-file \
1410 --paired-end \
1411 --calc-pme \
1412 --run-pRSEM \
1413 --partition-model cmb_lgt \
1414 --chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \
1415 --bowtie-path /sw/bowtie \
1416 -p 8 \
1417 /data/mmliver_1.fq.gz \
1418 /data/mmliver_2.fq.gz \
1419 /ref/mouse_125 \
1420 mmliver_paired_end_quals
1421
1422 </help>
1423 <citations>
1424 <citation type="doi">10.1186/1471-2105-12-323</citation>
1425 </citations>
1426
1427 </tool>