comparison stringtie.xml @ 15:dd4df992d93d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/stringtie commit a834a41c94d184df80e45ffa2339723826a075b1
author iuc
date Tue, 24 Jul 2018 10:23:37 -0400
parents eafd5dc95228
children eba36e001f45
comparison
equal deleted inserted replaced
14:eafd5dc95228 15:dd4df992d93d
1 <tool id="stringtie" name="StringTie" version="1.3.3.2"> 1 <tool id="stringtie" name="StringTie" version="@TOOL_VERSION@">
2 <description>transcript assembly and quantification</description> 2 <description>transcript assembly and quantification</description>
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements" /> 6 <expand macro="requirements" />
93 #end if 93 #end if
94 94
95 ## Replace commas with tabs 95 ## Replace commas with tabs
96 && 96 &&
97 sed -i.bak -e "s/,/\${TAB}/g" -e "s/\${CR}//g" gene_counts.csv transcript_counts.csv 97 sed -i.bak -e "s/,/\${TAB}/g" -e "s/\${CR}//g" gene_counts.csv transcript_counts.csv
98 #if $guide.special_outputs.keep_header: 98 ## Output header
99 && 99 &&
100 head -n 1 gene_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$gene_counts' 100 head -n 1 gene_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$gene_counts'
101 && 101 &&
102 head -n 1 transcript_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$transcript_counts' 102 head -n 1 transcript_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$transcript_counts'
103 #end if
104 ## Sort count files on the first column 103 ## Sort count files on the first column
105 && 104 &&
106 tail -n +2 gene_counts.csv | sort -t"\${TAB}" -k1,1 >> '$gene_counts' 105 tail -n +2 gene_counts.csv | sort -t"\${TAB}" -k1,1 >> '$gene_counts'
107 && 106 &&
108 tail -n +2 transcript_counts.csv | sort -t"\${TAB}" -k1,1 >> '$transcript_counts' 107 tail -n +2 transcript_counts.csv | sort -t"\${TAB}" -k1,1 >> '$transcript_counts'
143 </conditional> 142 </conditional>
144 <param name="input_estimation" argument="-e" type="boolean" truevalue="-e" falsevalue="" checked="False" label="Use Reference transcripts only?" help="Limit the processing of read alignments to only estimate and output the assembled transcripts matching the reference transcripts given with the -G option. With this option, read bundles with no reference transcripts (novel transcripts) will be entirely skipped, which may provide a considerable speed boost when the given set of reference transcripts is limited to a set of target genes, for example. Default: No"/> 143 <param name="input_estimation" argument="-e" type="boolean" truevalue="-e" falsevalue="" checked="False" label="Use Reference transcripts only?" help="Limit the processing of read alignments to only estimate and output the assembled transcripts matching the reference transcripts given with the -G option. With this option, read bundles with no reference transcripts (novel transcripts) will be entirely skipped, which may provide a considerable speed boost when the given set of reference transcripts is limited to a set of target genes, for example. Default: No"/>
145 <conditional name="special_outputs"> 144 <conditional name="special_outputs">
146 <param name="special_outputs_select" type="select" label="Output files for differential expression?" help="Select to output additional files that can be used with Ballgown or DESeq2/edgeR. See Help section below for more information"> 145 <param name="special_outputs_select" type="select" label="Output files for differential expression?" help="Select to output additional files that can be used with Ballgown or DESeq2/edgeR. See Help section below for more information">
147 <option value="ballgown">Ballgown</option> 146 <option value="ballgown">Ballgown</option>
148 <option value="deseq2">DESeq2/edgeR</option> 147 <option value="deseq2">DESeq2/edgeR/limma-voom</option>
149 <option value="no" selected="True">No additional output</option> 148 <option value="no" selected="True">No additional output</option>
150 </param> 149 </param>
151 <when value="ballgown" /> 150 <when value="ballgown" />
152 <when value="deseq2"> 151 <when value="deseq2">
153 <param name="read_length" argument="--length" type="integer" min="0" value="75" label="Specify the average read length" help="Default: 75" /> 152 <param name="read_length" argument="--length" type="integer" min="0" value="75" label="Specify the average read length" help="Default: 75" />
160 <param argument="--key" type="text" label="Prefix for clustering" help="If clustering, what prefix to use for geneIDs assigned by this script. Only letters and numbers will be retained in this field. Default: prepG"> 159 <param argument="--key" type="text" label="Prefix for clustering" help="If clustering, what prefix to use for geneIDs assigned by this script. Only letters and numbers will be retained in this field. Default: prepG">
161 <sanitizer> 160 <sanitizer>
162 <valid initial="string.letters,string.digits"></valid> 161 <valid initial="string.letters,string.digits"></valid>
163 </sanitizer> 162 </sanitizer>
164 </param> 163 </param>
165 <param name="keep_header" type="boolean" checked="true" label="Output header line?" help="Keep the header line for edgeR, remove it for DESeq2" />
166 </when> 164 </when>
167 <when value="no" /> 165 <when value="no" />
168 </conditional> 166 </conditional>
169 <param name="coverage_file" argument="-C" type="boolean" truevalue="-C" falsevalue="" checked="False" label="Output coverage file?" help="If StringTie is run with this option (requires -G), it returns a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described below. Each line of the GTF is corresponds to a gene or transcript in the reference annotation. Default: No"/> 167 <param name="coverage_file" argument="-C" type="boolean" truevalue="-C" falsevalue="" checked="False" label="Output coverage file?" help="If StringTie is run with this option (requires -G), it returns a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described below. Each line of the GTF is corresponds to a gene or transcript in the reference annotation. Default: No"/>
170 </when> 168 </when>
228 </outputs> 226 </outputs>
229 <tests> 227 <tests>
230 <!--Ensure default GTF output works --> 228 <!--Ensure default GTF output works -->
231 <test expect_num_outputs="1"> 229 <test expect_num_outputs="1">
232 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 230 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
233 <output name="output_gtf" file="stringtie_out1.gtf" ftype="gtf" lines_diff="2" /> 231 <output name="output_gtf" file="stringtie_out1.gtf" ftype="gtf" lines_diff="4" />
234 </test> 232 </test>
235 <!--Ensure fraction option works --> 233 <!--Ensure fraction option works -->
236 <test expect_num_outputs="1"> 234 <test expect_num_outputs="1">
237 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 235 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
238 <param name="fraction" value="0.17" /> 236 <param name="fraction" value="0.17" />
239 <output name="output_gtf" file="stringtie_out2.gtf" ftype="gtf" lines_diff="2" /> 237 <output name="output_gtf" file="stringtie_out2.gtf" ftype="gtf" lines_diff="4" />
240 </test> 238 </test>
241 <!--Ensure guide option works --> 239 <!--Ensure guide option works -->
242 <test expect_num_outputs="1"> 240 <test expect_num_outputs="1">
243 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 241 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
244 <param name="use_guide" value="yes" /> 242 <param name="use_guide" value="yes" />
245 <param name="guide_gff_select" value="history" /> 243 <param name="guide_gff_select" value="history" />
246 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> 244 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
247 <output name="output_gtf" file="stringtie_out3.gtf" ftype="gtf" lines_diff="2" /> 245 <output name="output_gtf" file="stringtie_out3.gtf" ftype="gtf" lines_diff="4" />
248 </test> 246 </test>
249 <!--Ensure guide with fraction works --> 247 <!--Ensure guide with fraction works -->
250 <test expect_num_outputs="1"> 248 <test expect_num_outputs="1">
251 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 249 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
252 <param name="use_guide" value="yes" /> 250 <param name="use_guide" value="yes" />
253 <param name="guide_gff_select" value="history" /> 251 <param name="guide_gff_select" value="history" />
254 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> 252 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
255 <param name="fraction" value="0.17" /> 253 <param name="fraction" value="0.17" />
256 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" /> 254 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="4" />
257 </test> 255 </test>
258 <!--Ensure coverage and output for Ballgown works --> 256 <!--Ensure coverage and output for Ballgown works -->
259 <test expect_num_outputs="7"> 257 <test expect_num_outputs="7">
260 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 258 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
261 <param name="use_guide" value="yes" /> 259 <param name="use_guide" value="yes" />
266 <output name="exon_expression" file="./ballgown/e_data.ctab" ftype="tabular" /> 264 <output name="exon_expression" file="./ballgown/e_data.ctab" ftype="tabular" />
267 <output name="intron_expression" file="./ballgown/i_data.ctab" ftype="tabular" /> 265 <output name="intron_expression" file="./ballgown/i_data.ctab" ftype="tabular" />
268 <output name="transcript_expression" file="./ballgown/t_data.ctab" ftype="tabular" /> 266 <output name="transcript_expression" file="./ballgown/t_data.ctab" ftype="tabular" />
269 <output name="exon_transcript_mapping" file="./ballgown/e2t.ctab" ftype="tabular" /> 267 <output name="exon_transcript_mapping" file="./ballgown/e2t.ctab" ftype="tabular" />
270 <output name="intron_transcript_mapping" file="./ballgown/i2t.ctab" ftype="tabular" /> 268 <output name="intron_transcript_mapping" file="./ballgown/i2t.ctab" ftype="tabular" />
271 <output name="output_gtf" file="stringtie_out5.gtf" ftype="gtf" lines_diff="2" /> 269 <output name="output_gtf" file="stringtie_out5.gtf" ftype="gtf" lines_diff="4" />
272 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> 270 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" />
273 </test> 271 </test>
274 <!--Ensure output for edgeR works --> 272 <!--Ensure output for edgeR works -->
275 <test expect_num_outputs="5"> 273 <test expect_num_outputs="5">
276 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 274 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
282 <param name="coverage_file" value="True" /> 280 <param name="coverage_file" value="True" />
283 <param name="clustering" value="True" /> 281 <param name="clustering" value="True" />
284 <output name="gene_counts" file="gene_counts_edger.tsv" ftype="tabular" /> 282 <output name="gene_counts" file="gene_counts_edger.tsv" ftype="tabular" />
285 <output name="transcript_counts" file="transcript_counts_edger.tsv" ftype="tabular" /> 283 <output name="transcript_counts" file="transcript_counts_edger.tsv" ftype="tabular" />
286 <output name="legend" file="legend.tsv" ftype="tabular" /> 284 <output name="legend" file="legend.tsv" ftype="tabular" />
287 <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" /> 285 <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="4" />
288 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" />
289 </test>
290 <!--Ensure output for DESeq2 works -->
291 <test expect_num_outputs="5">
292 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
293 <param name="use_guide" value="yes" />
294 <param name="special_outputs_select" value="deseq2" />
295 <param name="keep_header" value="False" />
296 <param name="input_estimation" value="True" />
297 <param name="guide_gff_select" value="history" />
298 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
299 <param name="coverage_file" value="True" />
300 <param name="clustering" value="True" />
301 <output name="gene_counts" file="gene_counts_deseq2.tsv" ftype="tabular" />
302 <output name="transcript_counts" file="transcript_counts_deseq2.tsv" ftype="tabular" />
303 <output name="legend" file="legend.tsv" ftype="tabular" />
304 <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" />
305 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> 286 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" />
306 </test> 287 </test>
307 <!--Ensure gene abundances output works --> 288 <!--Ensure gene abundances output works -->
308 <test expect_num_outputs="2"> 289 <test expect_num_outputs="2">
309 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 290 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
310 <param name="use_guide" value="yes" /> 291 <param name="use_guide" value="yes" />
311 <param name="guide_gff_select" value="history" /> 292 <param name="guide_gff_select" value="history" />
312 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> 293 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
313 <param name="fraction" value="0.17" /> 294 <param name="fraction" value="0.17" />
314 <param name="abundance_estimation" value="True" /> 295 <param name="abundance_estimation" value="True" />
315 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" /> 296 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="4" />
316 <output name="gene_abundance_estimation" file="stringtie_out7.gtf" ftype="gtf" lines_diff="2" /> 297 <output name="gene_abundance_estimation" file="stringtie_out7.gtf" ftype="gtf" lines_diff="2" />
317 </test> 298 </test>
318 <!--Ensure another fraction value works --> 299 <!--Ensure another fraction value works -->
319 <test expect_num_outputs="1"> 300 <test expect_num_outputs="1">
320 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 301 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
321 <param name="use_guide" value="yes" /> 302 <param name="use_guide" value="yes" />
322 <param name="guide_gff_select" value="history" /> 303 <param name="guide_gff_select" value="history" />
323 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> 304 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
324 <param name="fraction" value="0.15" /> 305 <param name="fraction" value="0.15" />
325 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="2" /> 306 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="4" />
326 </test> 307 </test>
327 <!--Ensure built-in GTFs work --> 308 <!--Ensure built-in GTFs work -->
328 <test expect_num_outputs="1"> 309 <test expect_num_outputs="1">
329 <param name="input_bam" ftype="bam" dbkey="hg38" value="stringtie_in1.bam" /> 310 <param name="input_bam" ftype="bam" dbkey="hg38" value="stringtie_in1.bam" />
330 <param name="use_guide" value="yes" /> 311 <param name="use_guide" value="yes" />
331 <param name="guide_gff_select" value="cached" /> 312 <param name="guide_gff_select" value="cached" />
332 <param name="fraction" value="0.15" /> 313 <param name="fraction" value="0.15" />
333 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="2" /> 314 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="4" />
334 </test> 315 </test>
335 </tests> 316 </tests>
336 <help><![CDATA[ 317 <help><![CDATA[
337 318
338 .. class:: infomark 319 .. class:: infomark
339 320
340 **What it does** 321 **What it does**
341 322
342 StringTie_ is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional *de novo* assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus. Its input can include not only the alignments of raw reads used by other transcript assemblers, but also alignments of longer sequences that have been assembled from those reads. In order to identify differentially expressed genes between experiments, StringTie's output can be processed by specialized software like Ballgown_, Cuffdiff_ or other programs (DESeq2_, edgeR_, etc.). 323 StringTie_ is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional *de novo* assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus. Its input can include not only the alignments of raw reads used by other transcript assemblers, but also alignments of longer sequences that have been assembled from those reads. In order to identify differentially expressed genes between experiments, StringTie's output can be processed by specialized software like Ballgown_, Cuffdiff_ or other programs (DESeq2_, edgeR_, limma_, etc.).
343 324
344 ----- 325 -----
345 326
346 **Inputs** 327 **Inputs**
347 328
368 * a TSV (tab-delimited) file of **Gene abundances** 349 * a TSV (tab-delimited) file of **Gene abundances**
369 350
370 If a reference GTF/GFF3 file is used as a guide, StringTie can also output: 351 If a reference GTF/GFF3 file is used as a guide, StringTie can also output:
371 352
372 * a GTF file containing all **fully-covered reference transcripts** in the provided reference file that are covered end-to-end by reads 353 * a GTF file containing all **fully-covered reference transcripts** in the provided reference file that are covered end-to-end by reads
373 * Files (tables) for **Ballgown** and/or **DESeq2/edgeR**, which can use them to estimate differential expression 354 * Files (tables) for **Ballgown** and/or **DESeq2/edgeR/limma-voom**, which can use them to estimate differential expression
374 355
375 356
376 **StringTie's primary GTF output** 357 **StringTie's primary GTF output**
377 358
378 The primary output of StringTie is a Gene Transfer Format (GTF) file that contains details of the transcripts that StringTie assembles from RNA-Seq data. GTF is an extension of GFF (Gene Finding Format, also called General Feature Format), and is very similar to GFF2 and GFF3. The field definitions for the 9 columns of GTF output can be found at the `Ensembl site here`_. The following is an example of a transcript assembled by StringTie as shown in a GTF file: 359 The primary output of StringTie is a Gene Transfer Format (GTF) file that contains details of the transcripts that StringTie assembles from RNA-Seq data. GTF is an extension of GFF (Gene Finding Format, also called General Feature Format), and is very similar to GFF2 and GFF3. The field definitions for the 9 columns of GTF output can be found at the `Ensembl site here`_. The following is an example of a transcript assembled by StringTie as shown in a GTF file:
449 430
450 If StringTie is run with the use reference guide option (-G), it will also return a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described above. Each line of the GTF corresponds to a gene or transcript in the reference annotation. 431 If StringTie is run with the use reference guide option (-G), it will also return a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described above. Each line of the GTF corresponds to a gene or transcript in the reference annotation.
451 432
452 **Ballgown Input Table Files** 433 **Ballgown Input Table Files**
453 434
454 An option to output files for Ballgown can be selected under **Output additional files** above. If selected, StringTie will return Ballgown input table files containing coverage data for the reference transcripts given with the -G option. These tables have these specific names: (1) e2t.ctab, (2) e_data.ctab, (3) i2t.ctab, (4) i_data.ctab, and (5) t_data.ctab. A detailed description of each of these five required inputs to Ballgown can be found at `this link`_. With this option StringTie can be used as a direct replacement of the tablemaker program included with the Ballgown distribution. 435 An option to output files for Ballgown can be selected under **Output files for differential expression?** above. If selected, StringTie will return Ballgown input table files containing coverage data for the reference transcripts given with the -G option. These tables have these specific names: (1) e2t.ctab, (2) e_data.ctab, (3) i2t.ctab, (4) i_data.ctab, and (5) t_data.ctab. A detailed description of each of these five required inputs to Ballgown can be found at `this link`_. With this option StringTie can be used as a direct replacement of the tablemaker program included with the Ballgown distribution.
455 436
456 437
457 **DESeq2/edgeR Input Table Files** 438 **DESeq2/edgeR/limma-voom Input Table Files**
458 439
459 DESeq2_ and edgeR_ are two popular Bioconductor_ packages for analyzing differential expression, which take as input a matrix of read counts mapped to particular genomic features (e.g., genes). This read count information can be extracted directly from the files generated by StringTie (run with the -e parameter) by selecting DESeq2/edgeR under **Output additional files** above. This uses the StringTie helper script ``prepDE.py`` to convert the GTF output from StringTie into two tab-delimited (TSV) files, containing the count matrices for genes and transcripts, using the coverage values found in the output of StringTie -e. 440 DESeq2_, edgeR_ and limma_ are three popular Bioconductor_ packages for analyzing differential expression, which take as input a matrix of read counts mapped to particular genomic features (e.g., genes). This read count information can be extracted directly from the files generated by StringTie (run with the -e parameter) by selecting DESeq2/edgeR/limma-voom under **Output files for differential expression?** above. This uses the StringTie helper script ``prepDE.py`` to convert the GTF output from StringTie into two tab-delimited (TSV) files, containing the count matrices for genes and transcripts, using the coverage values found in the output of StringTie -e.
460 441
461 ----- 442 -----
462 443
463 **More Information** 444 **More Information**
464 445
465 *Evaluating transcript assemblies:* 446 *Evaluating transcript assemblies:*
466 A simple way of getting more information about the transcripts assembled by StringTie (summary of gene and transcript counts, novel vs. known etc.), or even performing basic tracking of assembled isoforms across multiple RNA-Seq experiments, is to use the **gffcompare** program. Basic usage information for this program can be found on the `GFF utilities page`_. 447 A simple way of getting more information about the transcripts assembled by StringTie (summary of gene and transcript counts, novel vs. known etc.), or even performing basic tracking of assembled isoforms across multiple RNA-Seq experiments, is to use the **gffcompare** program. Basic usage information for this program can be found on the `GFF utilities page`_.
467 448
468 *Differential expression analysis:* 449 *Differential expression analysis:*
469 450
470 Together with HISAT and Ballgown (or DESeq2/edgeR), StringTie can be used for estimating differential expression across multiple RNA-Seq samples and generating plots and differential expression tables as described in our `protocol paper`_ and shown in a diagram in the `StringTie manual here`_. 451 Together with HISAT and Ballgown (or DESeq2/edgeR/limma-voom), StringTie can be used for estimating differential expression across multiple RNA-Seq samples and generating plots and differential expression tables as described in our `protocol paper`_ and shown in a diagram in the `StringTie manual here`_.
471 452
472 Our recommended workflow includes the following steps: 453 Our recommended workflow includes the following steps:
473 454
474 1. For each RNA-Seq sample, map the reads to the genome with HISAT2 using the --dta option. It is highly recommended to use the reference annotation information when mapping the reads, which can be either embedded in the genome index (built with the --ss and --exon options, see HISAT2 manual), or provided separately at run time (using the --known-splicesite-infile option of HISAT2). The SAM output of each HISAT2 run must be sorted and converted to BAM using samtools as explained above. 455 1. For each RNA-Seq sample, map the reads to the genome with HISAT2 using the --dta option. It is highly recommended to use the reference annotation information when mapping the reads, which can be either embedded in the genome index (built with the --ss and --exon options, see HISAT2 manual), or provided separately at run time (using the --known-splicesite-infile option of HISAT2). The SAM output of each HISAT2 run must be sorted and converted to BAM using samtools as explained above.
475 456
476 2. For each RNA-Seq sample, use this StringTie tool to assemble the read alignments obtained in the previous step; it is recommended to run StringTie with the -G option if the reference annotation is available. 457 2. For each RNA-Seq sample, use this StringTie tool to assemble the read alignments obtained in the previous step; it is recommended to run StringTie with the -G option if the reference annotation is available.
477 458
478 3. Run the separate **StringTie merge** tool in order to generate a non-redundant set of transcripts observed in all the RNA-Seq samples assembled previously. ``StringTie merge`` takes as input a list of all the assembled transcripts files (in GTF format) previously obtained for each sample, as well as a reference annotation file (-G option) if available. 459 3. Run the separate **StringTie merge** tool in order to generate a non-redundant set of transcripts observed in all the RNA-Seq samples assembled previously. ``StringTie merge`` takes as input a list of all the assembled transcripts files (in GTF format) previously obtained for each sample, as well as a reference annotation file (-G option) if available.
479 460
480 4. For each RNA-Seq sample, run this StringTie tool selecting to output files for Ballgown (or DESeq2/edgeR), which will generate tables of transcript and gene estimated abundances (count files). The option -e (*Use Reference transcripts only*) is not required but is recommended for this run in order to produce more accurate abundance estimations of the input transcripts. Each StringTie run in this step will take as input the sorted read alignments (BAM file) obtained in step 1 for the corresponding sample and the -G option with the merged transcripts (GTF file) generated by ``stringtie merge`` in step 3. Please note that this is the only case where the -G option is not used with a reference annotation, but with the global, merged set of transcripts as observed across all samples. (This step is the equivalent of the *Tablemaker* step described in the original Ballgown pipeline.) 461 4. For each RNA-Seq sample, run this StringTie tool selecting to output files for Ballgown (or DESeq2/edgeR/limma-voom), which will generate tables of transcript and gene estimated abundances (count files). The option -e (*Use Reference transcripts only*) is not required but is recommended for this run in order to produce more accurate abundance estimations of the input transcripts. Each StringTie run in this step will take as input the sorted read alignments (BAM file) obtained in step 1 for the corresponding sample and the -G option with the merged transcripts (GTF file) generated by ``stringtie merge`` in step 3. Please note that this is the only case where the -G option is not used with a reference annotation, but with the global, merged set of transcripts as observed across all samples. (This step is the equivalent of the *Tablemaker* step described in the original Ballgown pipeline.)
481 462
482 5. Ballgown (or DESeq2/edgeR) can now be used to load the coverage tables generated in the previous step and perform various statistical analyses for differential expression, generate plots etc. 463 5. Ballgown (or DESeq2/edgeR/limma-voom) can now be used to load the coverage tables generated in the previous step and perform various statistical analyses for differential expression, generate plots etc.
483 464
484 An alternate, faster differential expression analysis workflow can be pursued if there is no interest in novel isoforms (i.e. assembled transcripts present in the samples but missing from the reference annotation), or if only a well known set of transcripts of interest are targeted by the analysis. This simplified protocol has only 3 steps (depicted in the `StringTie manual here`_) as it bypasses the individual assembly of each RNA-Seq sample and the "transcript merge" step. This simplified workflow attempts to directly estimate and analyze the expression of a known set of transcripts as given in the reference annotation file. 465 An alternate, faster differential expression analysis workflow can be pursued if there is no interest in novel isoforms (i.e. assembled transcripts present in the samples but missing from the reference annotation), or if only a well known set of transcripts of interest are targeted by the analysis. This simplified protocol has only 3 steps (depicted in the `StringTie manual here`_) as it bypasses the individual assembly of each RNA-Seq sample and the "transcript merge" step. This simplified workflow attempts to directly estimate and analyze the expression of a known set of transcripts as given in the reference annotation file.
485 466
486 .. _StringTie: http://ccb.jhu.edu/software/stringtie/ 467 .. _StringTie: http://ccb.jhu.edu/software/stringtie/
487 .. _Ballgown: https://www.biorxiv.org/content/early/2014/09/05/003665 468 .. _Ballgown: https://www.biorxiv.org/content/early/2014/09/05/003665
488 .. _Cuffdiff: http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/ 469 .. _Cuffdiff: http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/
489 .. _DESeq2: https://bioconductor.org/packages/release/bioc/html/DESeq2.html 470 .. _DESeq2: https://bioconductor.org/packages/release/bioc/html/DESeq2.html
490 .. _edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html 471 .. _edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html
472 .. _limma: https://bioconductor.org/packages/release/bioc/html/limma.html
491 .. _Bioconductor: https://www.bioconductor.org/ 473 .. _Bioconductor: https://www.bioconductor.org/
492 .. _SAM: http://samtools.github.io/hts-specs/SAMv1.pdf 474 .. _SAM: http://samtools.github.io/hts-specs/SAMv1.pdf
493 .. _HISAT2: http://ccb.jhu.edu/software/hisat2 475 .. _HISAT2: http://ccb.jhu.edu/software/hisat2
494 .. _`GTF/GFF3`: https://ccb.jhu.edu/software/stringtie/gff.shtml 476 .. _`GTF/GFF3`: https://ccb.jhu.edu/software/stringtie/gff.shtml
495 .. _`this link`: https://github.com/alyssafrazee/ballgown#ballgown-readable-expression-output 477 .. _`this link`: https://github.com/alyssafrazee/ballgown#ballgown-readable-expression-output