comparison stringtie.xml @ 12:76d290331481 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/stringtie commit 11ee7ac206d41894c0b6a11f2439aaea490824f0
author iuc
date Thu, 09 Nov 2017 11:17:32 -0500
parents 6e45b443ef1f
children a305d75e13f2
comparison
equal deleted inserted replaced
11:6e45b443ef1f 12:76d290331481
1 <tool id="stringtie" name="StringTie" version="1.3.3"> 1 <tool id="stringtie" name="StringTie" version="1.3.3.1">
2 <description>transcript assembly and quantification</description> 2 <description>transcript assembly and quantification</description>
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements" /> 6 <expand macro="requirements" />
7 <expand macro="stdio" /> 7 <expand macro="stdio" />
8 <expand macro="version_command" /> 8 <expand macro="version_command" />
9 <command><![CDATA[ 9 <command><![CDATA[
10 mkdir -p ./special_de_output/sample1/ && 10 mkdir -p ./special_de_output/sample1/ &&
11 #if str($guide.use_guide) == 'yes': 11
12 ln -s '$guide.guide_gff' ./special_de_output/sample1/guide.gtf && 12 ## Get Guide GTF/GFF if selected
13 #end if 13
14 14 #if str($guide.use_guide) == 'yes':
15 #if $input_bam.metadata.ftype == 'sam': 15 #if $guide.guide_source.guide_gff_select == "history":
16 samtools sort -@ \${GALAXY_SLOTS:-1} '$input_bam' | stringtie 16 ln -s '$guide.guide_source.ref_hist' guide.gff &&
17 #else 17 #elif $guide.guide_source.guide_gff_select == "cached":
18 stringtie '$input_bam' 18 ln -s '$guide.guide_source.ref_builtin.fields.path' guide.gff &&
19 #end if 19 #end if
20 20 #end if
21 -o "$output_gtf" 21
22 -p "\${GALAXY_SLOTS:-1}" 22 #if $input_bam.metadata.ftype == 'sam':
23 #if str($guide.use_guide) == 'yes': 23 samtools sort -@ \${GALAXY_SLOTS:-1} '$input_bam' | stringtie
24 -C '$coverage' 24 #else
25 -G '$guide.guide_gff' 25 stringtie '$input_bam'
26 $guide.input_estimation 26 #end if
27 #if $guide.special_outputs != 'no': 27
28 -b ./special_de_output/sample1/ 28 -o '$output_gtf'
29 -p "\${GALAXY_SLOTS:-1}"
30
31 $rna_strandness
32
33 #if str($guide.use_guide) == 'yes':
34 -G guide.gff
35 #if $guide.coverage_file:
36 -C '$coverage'
37 #end if
38 $guide.input_estimation
39 #if $guide.special_outputs != 'no':
40 -b ./special_de_output/sample1/
41 #end if
42 #end if
43
44 #if $adv.name_prefix:
45 -l '$adv.name_prefix'
46 #end if
47 -f '$adv.fraction'
48 -m '$adv.min_tlen'
49 -a '$adv.min_anchor_len'
50 -j '$adv.min_anchor_cov'
51 -c '$adv.min_bundle_cov'
52 -g '$adv.bdist'
53 -M '$adv.bundle_fraction'
54 $adv.disable_trimming
55 $adv.multi_mapping
56 #if $adv.abundance_estimation:
57 -A '$gene_abundance_estimation'
58 #end if
59 #if str($adv.omit_sequences).strip() != "":
60 -x '$adv.omit_sequences'
61 #end if
62
63 #if str($guide.use_guide) == 'yes':
64 #if $guide.special_outputs.special_outputs_select == 'deseq2':
65 &&
66 ln -s '$output_gtf' ./special_de_output/sample1/output.gtf
67 &&
68 prepDE.py
69 -i ./special_de_output/
70 -g '$gene_counts'
71 -t '$transcript_counts'
72 -l $guide.special_outputs.read_length
73 #if $guide.special_outputs.string:
74 -s '$guide.special_outputs.string'
29 #end if 75 #end if
30 #end if 76 #if $guide.special_outputs.clustering:
31 #if str($option_set.options) == 'advanced': 77 -c
32 -l '$option_set.name_prefix' 78 #if $guide.special_outputs.key:
33 -f '$option_set.fraction' 79 -k '$guide.special_outputs.key'
34 -m '$option_set.min_tlen' 80 #end if
35 -a '$option_set.min_anchor_len' 81 --legend '$legend'
36 -j '$option_set.min_anchor_cov' 82 > /dev/null
37 -c '$option_set.min_bundle_cov' 83 &&
38 -g '$option_set.bdist' 84 sed -i.bak 's/,/\t/g' '$legend'
39 -M '$option_set.bundle_fraction' $option_set.sensitive $option_set.disable_trimming $option_set.multi_mapping 85 &&
40 #if $option_set.abundance_estimation: 86 sed -i.bak 's/\r//g' '$legend'
41 -A "$gene_abundance_estimation"
42 #end if 87 #end if
43 #if str($option_set.omit_sequences).strip() != "": 88
44 -x '$option_set.omit_sequences' 89 > /dev/null
45 #end if 90
46 #end if 91 &&
47 92 sed -i.bak 's/,/\t/g' '$transcript_counts'
48 #if str($guide.use_guide) == 'yes': 93 &&
49 #if $guide.special_outputs.special_outputs_select == 'deseq2': 94 sed -i.bak 's/\r//g' '$transcript_counts'
50 && 95 &&
51 prepDE.py 96 sed -i.bak 's/,/\t/g' '$gene_counts'
52 -i ./special_de_output/ 97 &&
53 -g gene_cout_matrix.tsv 98 sed -i.bak 's/\r//g' '$gene_counts'
54 -t transcripts_count_matrix.tsv 99 #end if
55 -l $guide.special_outputs.read_length 100 #end if
56 #if str($option_set.options) == 'advanced':
57 -s '$option_set.name_prefix'
58 #end if
59 #if $guide.special_outputs.clustering:
60 -c
61 --legend ./legend.tsv
62
63 &&
64 sed -i.bak 's/,/\t/g' ./legend.tsv
65
66 #end if
67 &&
68 sed -i.bak 's/,/\t/g' transcripts_count_matrix.tsv
69 &&
70 sed -i.bak 's/,/\t/g' gene_cout_matrix.tsv
71 #end if
72 #end if
73 ]]></command> 101 ]]></command>
74 <inputs> 102 <inputs>
75 <param name="input_bam" type="data" format="sam,bam" label="Mapped reads to assemble transcripts from" /> 103 <param name="input_bam" type="data" format="sam,bam" label="Input mapped reads" help="Input BAM/SAM file containing reads you want to assemble into transcripts"/>
104 <param name="rna_strandness" type="select" label="Specify strand information"
105 help="Select 'Forward (FR)' if your reads are from a forward-stranded library, 'Reverse (RF)' if your reads are from a reverse-stranded library, or 'Unstranded' if your reads are not from a stranded library. See Help section below for more information. Default: Unstranded">
106 <option value="" selected="True">Unstranded</option>
107 <option value="--fr">Forward (FR)</option>
108 <option value="--rf">Reverse (RF)</option>
109 </param>
76 <conditional name="guide"> 110 <conditional name="guide">
77 <param name="use_guide" type="select" label="Use GFF file to guide assembly"> 111 <param name="use_guide" argument="-G" type="select" label="Use a reference file to guide assembly?" help="Use the reference annotation file (in GTF or GFF3 format) to guide the assembly process. The output will include expressed reference transcripts as well as any novel transcripts that are assembled. This option is required by option -e (Use Reference transcripts only), see below.">
78 <option value="yes">Use GFF/GTF</option> 112 <option value="yes">Use reference GTF/GFF3</option>
79 <option selected="True" value="no">Do not use GFF/GTF</option> 113 <option value="no" selected="True" >Do not use reference GTF/GFF3</option>
80 </param> 114 </param>
81 <when value="no" /> 115 <when value="no" />
82 <when value="yes"> 116 <when value="yes">
83 <param name="guide_gff" argument="-G" type="data" format="gtf,gff3" 117 <conditional name="guide_source">
84 label="Reference annotation to use for guiding the assembly process" /> 118 <param name="guide_gff_select" type="select" label="Reference file">
85 <param name="input_estimation" argument="-e" type="boolean" truevalue="-e" falsevalue="" 119 <option value="cached" selected="true">Use a built-in file</option>
86 label="Perform abundance estimation only of input transcripts" /> 120 <option value="history">Use a file from history</option>
121 </param>
122 <when value="cached">
123 <param name="ref_builtin" type="select" label="Use a built-in GTF" help="If the GTF file for your transcriptome of interest is not listed, contact your Galaxy administrator">
124 <options from_data_table="gene_sets">
125 <filter type="sort_by" column="2" />
126 <validator type="no_options" message="No GTF file is available." />
127 </options>
128 </param>
129 </when>
130 <when value="history">
131 <param name="ref_hist" type="data" format="gtf,gff3" label="GTF/GFF3 dataset to guide assembly" />
132 </when>
133 </conditional>
134 <param name="input_estimation" argument="-e" type="boolean" truevalue="-e" falsevalue="" checked="False" label="Use Reference transcripts only?" help="Limit the processing of read alignments to only estimate and output the assembled transcripts matching the reference transcripts given with the -G option. With this option, read bundles with no reference transcripts (novel transcripts) will be entirely skipped, which may provide a considerable speed boost when the given set of reference transcripts is limited to a set of target genes, for example. Default: No"/>
87 <conditional name="special_outputs"> 135 <conditional name="special_outputs">
88 <param name="special_outputs_select" type="select" label="Output additional files for use in..."> 136 <param name="special_outputs_select" type="select" label="Output files for differential expression?" help="Select to output additional files that can be used with Ballgown or DESeq2/edgeR. See Help section below for more information">
89 <option value="ballgown">Ballgown</option> 137 <option value="ballgown">Ballgown</option>
90 <option selected="True" value="deseq2">DESeq2/EdgeR</option> 138 <option value="deseq2">DESeq2/edgeR</option>
91 <option value="no">No addional output</option> 139 <option value="no" selected="True">No additional output</option>
92 </param> 140 </param>
93 <when value="ballgown" /> 141 <when value="ballgown" />
94 <when value="deseq2"> 142 <when value="deseq2">
95 <param name="read_length" type="integer" value="75" label="Average read length" /> 143 <param name="read_length" argument="--length" type="integer" min="0" value="75" label="Specify the average read length" help="Default: 75" />
96 <param name="clustering" type="boolean" truevalue="--cluster" falsevalue="" label="Whether to cluster genes that overlap with different gene IDs" help="ignoring ones with geneID pattern" /> 144 <param name="clustering" argument="--cluster" type="boolean" truevalue="--cluster" falsevalue="" checked="False" label="Cluster overlapping genes" help="Choose whether to cluster genes with different gene IDs that overlap. Transcripts containing the geneID prefix will be ignored. Default: No" />
145 <param argument="--string" type="text" label="Prefix used for transcripts" help="If a different prefix was used for geneIDs assigned by StringTie than the default, specify it here. Only letters and numbers will be retained in this field. Default: MSTRG" >
146 <sanitizer>
147 <valid initial="string.letters,string.digits"></valid>
148 </sanitizer>
149 </param>
150 <param argument="--key" type="text" label="Prefix for clustering" help="If clustering, what prefix to use for geneIDs assigned by this script. Only letters and numbers will be retained in this field. Default: prepG">
151 <sanitizer>
152 <valid initial="string.letters,string.digits"></valid>
153 </sanitizer>
154 </param>
97 </when> 155 </when>
98 <when value="no" /> 156 <when value="no" />
99 </conditional> 157 </conditional>
158 <param name="coverage_file" argument="-C" type="boolean" truevalue="-C" falsevalue="" checked="False" label="Output coverage file?" help="If StringTie is run with this option (requires -G), it returns a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described below. Each line of the GTF is corresponds to a gene or transcript in the reference annotation. Default: No"/>
100 </when> 159 </when>
101 </conditional> 160 </conditional>
102 <conditional name="option_set"> 161 <section name="adv" title="Advanced Options">
103 <param name="options" type="select" label="Options"> 162 <param name="abundance_estimation" argument="-A" type="boolean" truevalue="-A" falsevalue="" checked="False" label="Output gene abundance estimation file?" help="If selected, gene abundances will be reported in a tab-delimited file, see below for more information. Default: No"/>
104 <option selected="True" value="default">Use defaults</option> 163 <param name="omit_sequences" argument="-x" type="text" value="" label="Do not assemble any transcripts on these reference sequence(s)" help="Ignore all read alignments (and thus do not attempt to perform transcript assembly) on the specified reference sequences. This parameter can be a single reference sequence name (e.g. chrM) or a comma-delimited list of sequence names (e.g. chrM,chrX,chrY). This can speed up StringTie especially in the case of excluding the mitochondrial genome, whose genes may have very high coverage in some cases, even though they may be of no interest for a particular RNA-Seq analysis. The reference sequence names are case sensitive, they must match identically the names of chromosomes/contigs of the target genome against which the RNA-Seq reads were aligned in the first place." />
105 <option value="advanced">Specify advanced options</option> 164 <param name="name_prefix" argument="-l" type="text" label="Name prefix for output transcripts" help="This prefix will be added to the name of the transcripts that are output. Only letters and numbers will be retained in this field. Default: STRG">
165 <sanitizer>
166 <valid initial="string.letters,string.digits"></valid>
167 </sanitizer>
106 </param> 168 </param>
107 <when value="default" /> 169 <param name="fraction" argument="-f" type="float" min="0.0" max="1.0" value="0.15" label="Minimum isoform fraction" help="Sets the minimum isoform abundance of the predicted transcripts as a fraction of the most abundant transcript assembled at a given locus. Lower abundance transcripts are often artifacts of incompletely spliced precursors of processed transcripts. Default: 0.15"/>
108 <when value="advanced"> 170 <param name="min_tlen" argument="-m" type="integer" min="0" value="200" label="Minimum assembled transcript length" help="Sets the minimum length allowed for the predicted transcripts. Default: 200"/>
109 <param name="disable_trimming" argument="-t" type="boolean" truevalue="-t" falsevalue="" 171 <param name="min_anchor_len" argument="-a" type="integer" min="0" value="10" label="Minimum anchor length for junctions" help="Junctions that don't have spliced reads that align across them with at least this amount of bases on both sides are filtered out. Default: 10" />
110 label="Disable trimming of predicted transcripts based on coverage" /> 172 <param name="min_anchor_cov" argument="-j" type="integer" min="0" value="1" label="Minimum junction coverage" help="There should be at least this many spliced reads that align across a junction (i.e. junction coverage). This number can be fractional, since some reads align in more than one place. A read that aligns in n places will contribute 1/n to the junction coverage. Default: 1" />
111 <param name="sensitive" argument="-S" type="boolean" truevalue="-S" falsevalue="" 173 <param name="min_bundle_cov" argument="-c" type="integer" min="0" value="2" label="Minimum bundle reads per bp coverage to consider for assembly" help="Sets the minimum read coverage allowed for the predicted transcripts. A transcript with a lower coverage than this value is not shown in the output. Default: 2"/>
112 label="Increase sensitivity" /> 174 <param name="bdist" argument="-g" type="integer" min="0" value="50" label="Gap between read mappings triggering a new bundle" help="Minimum locus gap separation value. Reads that are mapped closer than this distance are merged together in the same processing bundle. Default: 50 (bp)"/>
113 <param name="name_prefix" argument="-l" type="text" value="STRG" label="Name prefix for output transcripts" /> 175 <param name="bundle_fraction" argument="-M" type="float" min="0.0" max="1.0" value="0.95" label="Fraction of bundle allowed to be covered by multi-hit reads" help="Sets the maximum fraction of muliple-location-mapped reads that are allowed to be present at a given locus. Default: 0.95"/>
114 <param name="fraction" argument="-f" type="float" value="0.15" min="0.0" max="1.0" label="Minimum isoform fraction" /> 176 <param name="disable_trimming" argument="-t" type="boolean" truevalue="-t" falsevalue="" checked="False" label="Disable trimming of predicted transcripts based on coverage" help="This parameter disables trimming at the ends of the assembled transcripts. By default StringTie adjusts the predicted transcript's start and/or stop coordinates based on sudden drops in coverage of the assembled transcript. Default: No" />
115 <param name="min_tlen" argument="-m" type="integer" value="200" label="Minimum assembled transcript length" /> 177 <param name="multi_mapping" argument="-u" type="boolean" truevalue="-u" falsevalue="" checked="False" label="Disable multi-mapping correction" help="Default: No"/>
116 <param name="min_anchor_len" argument="-a" type="integer" value="10" label="Minimum anchor length for junctions" /> 178 </section>
117 <param name="min_anchor_cov" argument="-j" type="integer" value="1" label="Minimum junction coverage" />
118 <param name="min_bundle_cov" argument="-c" type="integer" value="2" label="Minimum bundle reads per bp coverage to consider for assembly" />
119 <param name="bdist" argument="-g" type="integer" value="50" label="Gap between read mappings triggering a new bundle" />
120 <param name="bundle_fraction" argument="-M" type="float" value="0.95" label="Fraction of bundle allowed to be covered by multi-hit reads" />
121 <param name="omit_sequences" argument="-x" type="text" value=""
122 label="Do not assemble any transcripts on these reference sequence(s)" help="e.g. chrM,chrX" />
123 <param name="abundance_estimation" argument="-A" type="boolean" truevalue="-A" falsevalue=""
124 label="Additional gene abundance estimation output file" />
125 <param name="multi_mapping" argument="-u" type="boolean" truevalue="-u" falsevalue=""
126 label="Disable multi-mapping correction" />
127 </when>
128 </conditional>
129 </inputs> 179 </inputs>
130 <outputs> 180 <outputs>
131 <data name="output_gtf" format="gtf" label="${tool.name} on ${on_string}: Assembled transcripts" /> 181 <data name="output_gtf" format="gtf" label="${tool.name} on ${on_string}: Assembled transcripts" />
132 <data name="gene_abundance_estimation" format="gtf" label="${tool.name} on ${on_string}: Gene abundance estimates"> 182 <data name="gene_abundance_estimation" format="gtf" label="${tool.name} on ${on_string}: Gene abundance estimates">
133 <filter>option_set['options'] == 'advanced' and option_set['abundance_estimation']</filter> 183 <filter>adv['abundance_estimation']</filter>
134 </data> 184 </data>
135 <data name="coverage" format="gff3" label="${tool.name} on ${on_string}: Coverage"> 185 <data name="coverage" format="gtf" label="${tool.name} on ${on_string}: Coverage">
136 <filter>guide['use_guide'] == 'yes'</filter> 186 <filter>guide['use_guide'] == 'yes' and guide['coverage_file'] is True </filter>
137 </data> 187 </data>
138 <data name="exon_expression" format="tabular" from_work_dir="special_de_output/sample1/e_data.ctab" 188 <data name="exon_expression" format="tabular" from_work_dir="special_de_output/sample1/e_data.ctab"
139 label="${tool.name} on ${on_string}: exon-level expression measurements"> 189 label="${tool.name} on ${on_string}: exon-level expression measurements">
140 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'ballgown'</filter> 190 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'ballgown'</filter>
141 </data> 191 </data>
153 </data> 203 </data>
154 <data name="intron_transcript_mapping" format="tabular" from_work_dir="special_de_output/sample1/i2t.ctab" 204 <data name="intron_transcript_mapping" format="tabular" from_work_dir="special_de_output/sample1/i2t.ctab"
155 label="${tool.name} on ${on_string}: intron to transcript mapping"> 205 label="${tool.name} on ${on_string}: intron to transcript mapping">
156 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'ballgown'</filter> 206 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'ballgown'</filter>
157 </data> 207 </data>
158 <data name="gene_counts" format="tabular" from_work_dir="gene_cout_matrix.tsv" 208 <data name="gene_counts" format="tabular" from_work_dir="special_de_output/sample1/gene_counts.tsv" label="${tool.name} on ${on_string}: Gene counts">
159 label="${tool.name} on ${on_string}: Gene counts">
160 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'deseq2'</filter> 209 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'deseq2'</filter>
161 </data> 210 </data>
162 <data name="transcript_counts" format="tabular" from_work_dir="transcripts_count_matrix.tsv" 211 <data name="transcript_counts" format="tabular" from_work_dir="special_de_output/sample1/transcript_counts.tsv" label="${tool.name} on ${on_string}: Transcript counts">
163 label="${tool.name} on ${on_string}: Transcript counts">
164 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'deseq2'</filter> 212 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'deseq2'</filter>
165 </data> 213 </data>
166 <data name="legend" format="tabular" from_work_dir="legend.tsv" 214 <data name="legend" format="tabular" from_work_dir="special_de_output/sample1/legend.tsv" label="${tool.name} on ${on_string}: legend">
167 label="${tool.name} on ${on_string}: legend">
168 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'deseq2' and guide['special_outputs']['clustering'] is True</filter> 215 <filter>guide['use_guide'] == 'yes' and guide['special_outputs']['special_outputs_select'] == 'deseq2' and guide['special_outputs']['clustering'] is True</filter>
169 </data> 216 </data>
170 </outputs> 217 </outputs>
171 <tests> 218 <tests>
172 <test> 219 <!--Ensure default GTF output works -->
173 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 220 <test expect_num_outputs="1">
174 <param name="use_guide" value="no" /> 221 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
175 <param name="options" value="default" />
176 <output name="output_gtf" file="stringtie_out1.gtf" ftype="gtf" lines_diff="2" /> 222 <output name="output_gtf" file="stringtie_out1.gtf" ftype="gtf" lines_diff="2" />
177 </test> 223 </test>
178 <test> 224 <!--Ensure fraction option works -->
179 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 225 <test expect_num_outputs="1">
180 <param name="use_guide" value="no" /> 226 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
181 <param name="options" value="advanced" />
182 <param name="fraction" value="0.17" /> 227 <param name="fraction" value="0.17" />
183 <output name="output_gtf" file="stringtie_out2.gtf" ftype="gtf" lines_diff="2" /> 228 <output name="output_gtf" file="stringtie_out2.gtf" ftype="gtf" lines_diff="2" />
184 </test> 229 </test>
185 <test> 230 <!--Ensure guide option works -->
186 <param ftype="bam" name="input_bam" value="stringtie_in1.bam" /> 231 <test expect_num_outputs="1">
187 <param name="use_guide" value="yes" /> 232 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
188 <param name="special_outputs_select" value="no" /> 233 <param name="use_guide" value="yes" />
189 <param name="guide_gff" value="stringtie_in.gtf" /> 234 <param name="guide_gff_select" value="history" />
190 <param name="options" value="default" /> 235 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
191 <output file="stringtie_out3.gtf" ftype="gtf" lines_diff="2" name="output_gtf" /> 236 <output name="output_gtf" file="stringtie_out3.gtf" ftype="gtf" lines_diff="2" />
192 </test> 237 </test>
193 <test> 238 <!--Ensure guide with fraction works -->
194 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 239 <test expect_num_outputs="1">
195 <param name="use_guide" value="yes" /> 240 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
196 <param name="special_outputs_select" value="no" /> 241 <param name="use_guide" value="yes" />
197 <param name="guide_gff" value="stringtie_in.gtf" /> 242 <param name="guide_gff_select" value="history" />
198 <param name="options" value="advanced" /> 243 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
199 <param name="fraction" value="0.17" /> 244 <param name="fraction" value="0.17" />
200 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" /> 245 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" />
201 </test> 246 </test>
202 <test> 247 <!--Ensure coverage and output for Ballgown works -->
203 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 248 <test expect_num_outputs="7">
204 <param name="use_guide" value="yes" /> 249 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
250 <param name="use_guide" value="yes" />
251 <param name="guide_gff_select" value="history" />
252 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
205 <param name="special_outputs_select" value="ballgown" /> 253 <param name="special_outputs_select" value="ballgown" />
206 <param name="guide_gff" value="stringtie_in.gtf" /> 254 <param name="coverage_file" value="True" />
207 <param name="options" value="default" />
208 <output name="exon_expression" file="./ballgown/e_data.ctab" ftype="tabular" /> 255 <output name="exon_expression" file="./ballgown/e_data.ctab" ftype="tabular" />
209 <output name="intron_expression" file="./ballgown/i_data.ctab" ftype="tabular" /> 256 <output name="intron_expression" file="./ballgown/i_data.ctab" ftype="tabular" />
210 <output name="transcript_expression" file="./ballgown/t_data.ctab" ftype="tabular" /> 257 <output name="transcript_expression" file="./ballgown/t_data.ctab" ftype="tabular" />
211 <output name="exon_transcript_mapping" file="./ballgown/e2t.ctab" ftype="tabular" /> 258 <output name="exon_transcript_mapping" file="./ballgown/e2t.ctab" ftype="tabular" />
212 <output name="intron_transcript_mapping" file="./ballgown/i2t.ctab" ftype="tabular" /> 259 <output name="intron_transcript_mapping" file="./ballgown/i2t.ctab" ftype="tabular" />
213 <output name="output_gtf" file="stringtie_out5.gtf" ftype="gtf" lines_diff="2" /> 260 <output name="output_gtf" file="stringtie_out5.gtf" ftype="gtf" lines_diff="2" />
214 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gff3" /> 261 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" />
215 </test> 262 </test>
216 <test> 263 <!--Ensure output for DESeq2/edgeR works -->
264 <test expect_num_outputs="5">
217 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 265 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
218 <param name="use_guide" value="yes" /> 266 <param name="use_guide" value="yes" />
219 <param name="special_outputs_select" value="deseq2" /> 267 <param name="special_outputs_select" value="deseq2" />
220 <param name="input_estimation" value="True" /> 268 <param name="input_estimation" value="True" />
221 <param name="guide_gff" value="stringtie_in.gtf" /> 269 <param name="guide_gff_select" value="history" />
222 <param name="options" value="default" /> 270 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
271 <param name="coverage_file" value="True" />
223 <param name="clustering" value="True" /> 272 <param name="clustering" value="True" />
224 <output name="gene_counts" file="./deseq2/gene_counts.tsv" ftype="tabular" lines_diff="2" /> 273 <output name="gene_counts" file="./deseq2/gene_counts.tsv" ftype="tabular" />
225 <output name="transcript_counts" file="./deseq2/transcript_counts.tsv" ftype="tabular" /> 274 <output name="transcript_counts" file="./deseq2/transcript_counts.tsv" ftype="tabular" />
226 <output name="legend" file="./deseq2/legend.tsv" ftype="tabular" /> 275 <output name="legend" file="./deseq2/legend.tsv" ftype="tabular" />
227 <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" /> 276 <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" />
228 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gff3" /> 277 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" />
229 </test> 278 </test>
230 <test> 279 <!--Ensure gene abundances output works -->
231 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 280 <test expect_num_outputs="2">
232 <param name="use_guide" value="yes" /> 281 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
233 <param name="guide_gff" value="stringtie_in.gtf" /> 282 <param name="use_guide" value="yes" />
234 <param name="options" value="advanced" /> 283 <param name="guide_gff_select" value="history" />
284 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
235 <param name="fraction" value="0.17" /> 285 <param name="fraction" value="0.17" />
236 <param name="abundance_estimation" value="True" /> 286 <param name="abundance_estimation" value="True" />
237 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" /> 287 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" />
238 <output name="gene_abundance_estimation" file="stringtie_out7.gtf" ftype="gtf" lines_diff="2" /> 288 <output name="gene_abundance_estimation" file="stringtie_out7.gtf" ftype="gtf" lines_diff="2" />
239 </test> 289 </test>
240 <test> 290 <!--Ensure another fraction value works -->
241 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> 291 <test expect_num_outputs="1">
242 <param name="use_guide" value="yes" /> 292 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
243 <param name="special_outputs_select" value="no" /> 293 <param name="use_guide" value="yes" />
244 <param name="guide_gff" value="stringtie_in.gtf" /> 294 <param name="guide_gff_select" value="history" />
245 <param name="options" value="advanced" /> 295 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
246 <param name="fraction" value="0.15" /> 296 <param name="fraction" value="0.15" />
247 <param name="c" value="test_chromosome" /> 297 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="2" />
298 </test>
299 <!--Ensure built-in GTFs work -->
300 <test expect_num_outputs="1">
301 <param name="input_bam" ftype="bam" dbkey="hg38" value="stringtie_in1.bam" />
302 <param name="use_guide" value="yes" />
303 <param name="guide_gff_select" value="cached" />
304 <param name="fraction" value="0.15" />
248 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="2" /> 305 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="2" />
249 </test> 306 </test>
250 </tests> 307 </tests>
251 <help><![CDATA[ 308 <help><![CDATA[
252 **What it does?** 309
253 310 .. class:: infomark
254 StringTie_ is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional *de novo* assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus. Its input can include not only the alignments of raw reads used by other transcript assemblers, but also alignments longer sequences that have been assembled from those reads.To identify differentially expressed genes between experiments, StringTie's output can be processed either by the Cuffdiff or Ballgown programs. 311
312 **What it does**
313
314 StringTie_ is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional *de novo* assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus. Its input can include not only the alignments of raw reads used by other transcript assemblers, but also alignments of longer sequences that have been assembled from those reads. In order to identify differentially expressed genes between experiments, StringTie's output can be processed by specialized software like Ballgown_, Cuffdiff_ or other programs (DESeq2_, edgeR_, etc.).
315
316 -----
317
318 **Inputs**
319
320 StringTie takes as input a BAM (or SAM) file of paired-end RNA-seq reads, which must be sorted by genomic location (coordinate position). This file contains spliced read alignments and can be produced directly by programs such as HISAT2_. We recommend using HISAT2 as it is a fast and accurate alignment program. Every spliced read alignment (i.e. an alignment across at least one junction) in the input BAM file must contain the tag XS to indicate the genomic strand that produced the RNA from which the read was sequenced. Alignments produced by HISAT2 (when run with --dta option) already include this tag, but if you use a different read mapper you should check that this XS tag is included for spliced alignments.
321
322 *NOTE: be sure to run HISAT2 with the --dta option for alignment (under Spliced alignment options), or your results will suffer.*
323
324 Also note that if your reads are from a stranded library, you need to choose the appropriate setting under **Specify strand information** above. If Forward (FR) is selected, StringTie will assume the reads are from a --fr library, while if Reverse (RF) is selected, StringTie will assume the reads are from a --rf library; otherwise it is assumed that the reads are from an unstranded library (The widely-used, although now deprecated, TopHat had a similar --library-type option, where fr-firststrand corresponded to RF; fr-secondstrand corresponded to FR). If you don't know whether your reads are from a stranded library or not, you could use the tool **RSeQC Infer Experiment** to try to determine this.
325
326 As an option, a reference annotation file in `GTF/GFF3`_ format can be provided to StringTie. In this case, StringTie will prefer to use these "known" genes from the annotation file, and for the ones that are expressed it will compute coverage, TPM and FPKM values. It will also produce additional transcripts to account for RNA-seq data that aren't covered by (or explained by) the annotation. Note that if option -e is not used the reference transcripts need to be fully covered by reads in order to be included in StringTie's output. In that case, other transcripts assembled from the data by StringTie and not present in the reference file will be printed as well.
327
328 *NOTE: we highly recommend that you provide annotation if you are analyzing a genome that is well-annotated, such as human, mouse, or other model organisms.*
329
330 -----
331
332 **Outputs**
333
334 StringTie's primary output is
335
336 * a GTF file containing the **Assembled transcripts**
337
338 Optionally, it can output
339
340 * a TSV (tab-delimited) file of **Gene abundances**
341
342 If a reference GTF/GFF3 file is used as a guide, StringTie can also output:
343
344 * a GTF file containing all **fully-covered reference transcripts** in the provided reference file that are covered end-to-end by reads
345 * Files (tables) for **Ballgown** and/or **DESeq2/edgeR**, which can use them to estimate differential expression
346
347
348 **StringTie's primary GTF output**
349
350 The primary output of StringTie is a Gene Transfer Format (GTF) file that contains details of the transcripts that StringTie assembles from RNA-Seq data. GTF is an extension of GFF (Gene Finding Format, also called General Feature Format), and is very similar to GFF2 and GFF3. The field definitions for the 9 columns of GTF output can be found at the `Ensembl site here`_. The following is an example of a transcript assembled by StringTie as shown in a GTF file:
351
352 =========== ========== =========== ========= ======= ========= ========== ========= ===========================================================================================
353 **seqname** **source** **feature** **start** **end** **score** **strand** **frame** **attributes**
354 ----------- ---------- ----------- --------- ------- --------- ---------- --------- -------------------------------------------------------------------------------------------
355 chrX StringTie transcript 281394 303355 1000 \+ . gene_id "ERR188044.1"; transcript_id "ERR188044.1.1"; reference_id "NM_018390"; ref_gene_id
356 "NM_018390"; ref_gene_name "PLCXD1"; cov "101.256691"; FPKM "530.078918"; TPM "705.667908";
357 chrX StringTie exon 281394 281684 1000 \+ . gene_id "ERR188044.1"; transcript_id "ERR188044.1.1"; exon_number "1"; reference_id
358 "NM_018390"; ref_gene_id "NM_018390"; ref_gene_name "PLCXD1"; cov "116.270836";
359 =========== ========== =========== ========= ======= ========= ========== ========= ===========================================================================================
360
361
362 * **seqname**: Denotes the chromosome, contig, or scaffold for this transcript. Here the assembled transcript is on chromosome X.
363
364 * **source**: The source of the GTF file. Since this example was produced by StringTie, this column simply shows 'StringTie'.
365
366 * **feature**: Feature type (e.g., exon, transcript, mRNA, 5'UTR).
367
368 * **start**: Start position of the feature (exon, transcript, etc), using a 1-based index.
369
370 * **end**: End position of the feature, using a 1-based index.
371
372 * **score**: A confidence score for the assembled transcript. Currently this field is not used, and StringTie reports a constant value of 1000 if the transcript has a connection to a read alignment bundle.
373
374 * **strand**: If the transcript resides on the forward strand, '+'. If the transcript resides on the reverse strand, '-'.
375
376 * **frame**: Frame or phase of CDS features. StringTie does not use this field and simply records a ".".
377
378 * **attributes**: A semicolon-separated list of tag-value pairs, providing additional information about each feature. Depending on whether an instance is a transcript or an exon and on whether the transcript matches the reference annotation file provided by the user, the content of the attributes field will differ. The following list describes the possible attributes shown in this column:
379
380 * *gene_id*: A unique identifier for a single gene and its child transcript and exons based on the alignments' file name.
381
382 * *transcript_id*: A unique identifier for a single transcript and its child exons based on the alignments' file name.
383
384 * *exon_number*: A unique identifier for a single exon, starting from 1, within a given transcript.
385
386 * *reference_id*: The transcript_id in the reference annotation (optional) that the instance matched.
387
388 * *ref_gene_id*: The gene_id in the reference annotation (optional) that the instance matched.
389
390 * *ref_gene_name*: The gene_name in the reference annotation (optional) that the instance matched.
391
392 * *cov*: The average per-base coverage for the transcript or exon.
393
394 * *FPKM*: Fragments per kilobase of transcript per million read pairs. This is the number of pairs of reads aligning to this feature, normalized by the total number of fragments sequenced (in millions) and the length of the transcript (in kilobases).
395
396 * *TPM*: Transcripts per million. This is the number of transcripts from this particular gene normalized first by gene length, and then by sequencing depth (in millions) in the sample. A detailed explanation and a comparison of TPM and FPKM can be found here_, and TPM was defined `by B. Li and C. Dewey here`_.
397
398
399 **Gene abundances in tab-delimited format**
400
401 If StringTie is run with the -A option, it returns a file containing gene abundances. The tab-delimited gene abundances output file has nine fields; below is an example of a gene abundance file produced by StringTie using reference annotation:
402
403 =========== ============= ============= ========== ========= ======= ============ ======== ========
404 **Gene ID** **Gene Name** **Reference** **Strand** **Start** **End** **Coverage** **FPKM** **TPM**
405 ----------- ------------- ------------- ---------- --------- ------- ------------ -------- --------
406 NM_000451 SHOX chrX \+ 624344 646823 0.000000 0.000000 0.000000
407 NM_006883 SHOX chrX \+ 624344 659411 0.000000 0.000000 0.000000
408 =========== ============= ============= ========== ========= ======= ============ ======== ========
409
410 * **Gene ID**: The gene identifier comes from the reference annotation provided with the -G option. If no reference is provided this field is replaced with the name prefix for output transcripts (-l).
411 * **Gene Name**: This field contains the gene name in the reference annotation provided with the -G option. If no reference is provided this field is populated with '-'.
412 * **Reference**: Name of the reference sequence that was used in the alignment of the reads. Equivalent to the 3rd column in the .SAM alignment.
413 * **Strand**: '+' denotes that the gene is on the forward strand, '-' for the reverse strand.
414 * **Start**: Start position of the gene (1-based index).
415 * **End**: End position of the gene (1-based index).
416 * **Coverage**: Per-base coverage of the gene.
417 * **FPKM**: normalized expression level in FPKM units (see previous section).
418 * **TPM**: normalized expression level in TPM units (see previous section).
419
420 **Fully covered transcripts matching the reference annotation transcripts (in GTF format)**
421
422 If StringTie is run with the use reference guide option (-G), it will also return a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described above. Each line of the GTF corresponds to a gene or transcript in the reference annotation.
423
424 **Ballgown Input Table Files**
425
426 An option to output files for Ballgown can be selected under **Output additional files** above. If selected, StringTie will return Ballgown input table files containing coverage data for the reference transcripts given with the -G option. These tables have these specific names: (1) e2t.ctab, (2) e_data.ctab, (3) i2t.ctab, (4) i_data.ctab, and (5) t_data.ctab. A detailed description of each of these five required inputs to Ballgown can be found at `this link`_. With this option StringTie can be used as a direct replacement of the tablemaker program included with the Ballgown distribution.
427
428
429 **DESeq2/edgeR Input Table Files**
430
431 DESeq2_ and edgeR_ are two popular Bioconductor_ packages for analyzing differential expression, which take as input a matrix of read counts mapped to particular genomic features (e.g., genes). This read count information can be extracted directly from the files generated by StringTie (run with the -e parameter) by selecting DESeq2/edgeR under **Output additional files** above. This uses the StringTie helper script ``prepDE.py`` to convert the GTF output from StringTie into two tab-delimited (TSV) files, containing the count matrices for genes and transcripts, using the coverage values found in the output of StringTie -e.
432
433 -----
434
435 **More Information**
436
437 *Evaluating transcript assemblies:*
438 A simple way of getting more information about the transcripts assembled by StringTie (summary of gene and transcript counts, novel vs. known etc.), or even performing basic tracking of assembled isoforms across multiple RNA-Seq experiments, is to use the **gffcompare** program. Basic usage information for this program can be found on the `GFF utilities page`_.
439
440 *Differential expression analysis:*
441
442 Together with HISAT and Ballgown (or DESeq2/edgeR), StringTie can be used for estimating differential expression across multiple RNA-Seq samples and generating plots and differential expression tables as described in our `protocol paper`_ and shown in a diagram in the `StringTie manual here`_.
443
444 Our recommended workflow includes the following steps:
445
446 1. For each RNA-Seq sample, map the reads to the genome with HISAT2 using the --dta option. It is highly recommended to use the reference annotation information when mapping the reads, which can be either embedded in the genome index (built with the --ss and --exon options, see HISAT2 manual), or provided separately at run time (using the --known-splicesite-infile option of HISAT2). The SAM output of each HISAT2 run must be sorted and converted to BAM using samtools as explained above.
447
448 2. For each RNA-Seq sample, use this StringTie tool to assemble the read alignments obtained in the previous step; it is recommended to run StringTie with the -G option if the reference annotation is available.
449
450 3. Run the separate **StringTie merge** tool in order to generate a non-redundant set of transcripts observed in all the RNA-Seq samples assembled previously. ``StringTie merge`` takes as input a list of all the assembled transcripts files (in GTF format) previously obtained for each sample, as well as a reference annotation file (-G option) if available.
451
452 4. For each RNA-Seq sample, run this StringTie tool selecting to output files for Ballgown (or DESeq2/edgeR), which will generate tables of transcript and gene estimated abundances (count files). The option -e (*Use Reference transcripts only*) is not required but is recommended for this run in order to produce more accurate abundance estimations of the input transcripts. Each StringTie run in this step will take as input the sorted read alignments (BAM file) obtained in step 1 for the corresponding sample and the -G option with the merged transcripts (GTF file) generated by ``stringtie merge`` in step 3. Please note that this is the only case where the -G option is not used with a reference annotation, but with the global, merged set of transcripts as observed across all samples. (This step is the equivalent of the *Tablemaker* step described in the original Ballgown pipeline.)
453
454 5. Ballgown (or DESeq2/edgeR) can now be used to load the coverage tables generated in the previous step and perform various statistical analyses for differential expression, generate plots etc.
455
456 An alternate, faster differential expression analysis workflow can be pursued if there is no interest in novel isoforms (i.e. assembled transcripts present in the samples but missing from the reference annotation), or if only a well known set of transcripts of interest are targeted by the analysis. This simplified protocol has only 3 steps (depicted in the `StringTie manual here`_) as it bypasses the individual assembly of each RNA-Seq sample and the "transcript merge" step. This simplified workflow attempts to directly estimate and analyze the expression of a known set of transcripts as given in the reference annotation file.
255 457
256 .. _StringTie: http://ccb.jhu.edu/software/stringtie/ 458 .. _StringTie: http://ccb.jhu.edu/software/stringtie/
459 .. _Ballgown: https://www.biorxiv.org/content/early/2014/09/05/003665
460 .. _Cuffdiff: http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/
461 .. _DESeq2: https://bioconductor.org/packages/release/bioc/html/DESeq2.html
462 .. _edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html
463 .. _Bioconductor: https://www.bioconductor.org/
464 .. _SAM: http://samtools.github.io/hts-specs/SAMv1.pdf
465 .. _HISAT2: http://ccb.jhu.edu/software/hisat2
466 .. _`GTF/GFF3`: https://ccb.jhu.edu/software/stringtie/gff.shtml
467 .. _`this link`: https://github.com/alyssafrazee/ballgown#ballgown-readable-expression-output
468 .. _`Ensembl site here`: http://useast.ensembl.org/info/website/upload/gff.html
469 .. _here: http://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/
470 .. _`by B. Li and C. Dewey here`: http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-323
471 .. _`GFF utilities page`: https://ccb.jhu.edu/software/stringtie/gff.shtml#gffcompare
472 .. _`protocol paper`: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5032908/
473 .. _`StringTie manual here`: https://ccb.jhu.edu/software/stringtie/index.shtml?t=manual
474
257 ]]></help> 475 ]]></help>
258 <expand macro="citations" /> 476 <expand macro="citations" />
259 </tool> 477 </tool>