| Previous changeset 11:6e45b443ef1f (2017-06-01) Next changeset 13:a305d75e13f2 (2018-04-12) |
|
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/stringtie commit 11ee7ac206d41894c0b6a11f2439aaea490824f0 |
|
modified:
macros.xml stringtie.xml test-data/deseq2/gene_counts.tsv test-data/deseq2/transcript_counts.tsv test-data/stringtie_out6.gtf |
|
added:
test-data/cached_locally/gene_sets.loc test-data/cached_locally/ref.gtf tool-data/gene_sets.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 macros.xml --- a/macros.xml Thu Jun 01 12:16:04 2017 -0400 +++ b/macros.xml Thu Nov 09 11:17:32 2017 -0500 |
| b |
| @@ -2,6 +2,7 @@ <xml name="requirements"> <requirements> <requirement type="package" version="1.3.3">stringtie</requirement> + <requirement type="package" version="1.6">samtools</requirement> <yield/> </requirements> </xml> |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 stringtie.xml --- a/stringtie.xml Thu Jun 01 12:16:04 2017 -0400 +++ b/stringtie.xml Thu Nov 09 11:17:32 2017 -0500 |
| [ |
| b'@@ -1,4 +1,4 @@\n-<tool id="stringtie" name="StringTie" version="1.3.3">\n+<tool id="stringtie" name="StringTie" version="1.3.3.1">\n <description>transcript assembly and quantification</description>\n <macros>\n <import>macros.xml</import>\n@@ -7,133 +7,183 @@\n <expand macro="stdio" />\n <expand macro="version_command" />\n <command><![CDATA[\n- mkdir -p ./special_de_output/sample1/ &&\n- #if str($guide.use_guide) == \'yes\':\n- ln -s \'$guide.guide_gff\' ./special_de_output/sample1/guide.gtf &&\n- #end if\n+mkdir -p ./special_de_output/sample1/ &&\n+\n+## Get Guide GTF/GFF if selected\n+\n+#if str($guide.use_guide) == \'yes\':\n+ #if $guide.guide_source.guide_gff_select == "history":\n+ ln -s \'$guide.guide_source.ref_hist\' guide.gff &&\n+ #elif $guide.guide_source.guide_gff_select == "cached":\n+ ln -s \'$guide.guide_source.ref_builtin.fields.path\' guide.gff &&\n+ #end if\n+#end if\n+\n+#if $input_bam.metadata.ftype == \'sam\':\n+ samtools sort -@ \\${GALAXY_SLOTS:-1} \'$input_bam\' | stringtie\n+#else\n+ stringtie \'$input_bam\'\n+#end if\n+\n+-o \'$output_gtf\'\n+-p "\\${GALAXY_SLOTS:-1}"\n+\n+$rna_strandness\n \n- #if $input_bam.metadata.ftype == \'sam\':\n- samtools sort -@ \\${GALAXY_SLOTS:-1} \'$input_bam\' | stringtie\n- #else\n- stringtie \'$input_bam\'\n- #end if\n+#if str($guide.use_guide) == \'yes\':\n+ -G guide.gff\n+ #if $guide.coverage_file:\n+ -C \'$coverage\'\n+ #end if\n+ $guide.input_estimation\n+ #if $guide.special_outputs != \'no\':\n+ -b ./special_de_output/sample1/\n+ #end if\n+#end if\n \n- -o "$output_gtf"\n- -p "\\${GALAXY_SLOTS:-1}"\n- #if str($guide.use_guide) == \'yes\':\n- -C \'$coverage\'\n- -G \'$guide.guide_gff\'\n- $guide.input_estimation\n- #if $guide.special_outputs != \'no\':\n- -b ./special_de_output/sample1/\n+#if $adv.name_prefix:\n+ -l \'$adv.name_prefix\'\n+#end if\n+-f \'$adv.fraction\'\n+-m \'$adv.min_tlen\'\n+-a \'$adv.min_anchor_len\'\n+-j \'$adv.min_anchor_cov\'\n+-c \'$adv.min_bundle_cov\'\n+-g 
\'$adv.bdist\'\n+-M \'$adv.bundle_fraction\'\n+$adv.disable_trimming\n+$adv.multi_mapping\n+#if $adv.abundance_estimation:\n+ -A \'$gene_abundance_estimation\'\n+#end if\n+#if str($adv.omit_sequences).strip() != "":\n+ -x \'$adv.omit_sequences\'\n+#end if\n+\n+#if str($guide.use_guide) == \'yes\':\n+ #if $guide.special_outputs.special_outputs_select == \'deseq2\':\n+ &&\n+ ln -s \'$output_gtf\' ./special_de_output/sample1/output.gtf\n+ &&\n+ prepDE.py\n+ -i ./special_de_output/\n+ -g \'$gene_counts\'\n+ -t \'$transcript_counts\'\n+ -l $guide.special_outputs.read_length\n+ #if $guide.special_outputs.string:\n+ -s \'$guide.special_outputs.string\'\n #end if\n- #end if\n- #if str($option_set.options) == \'advanced\':\n- -l \'$option_set.name_prefix\'\n- -f \'$option_set.fraction\'\n- -m \'$option_set.min_tlen\'\n- -a \'$option_set.min_anchor_len\'\n- -j \'$option_set.min_anchor_cov\'\n- -c \'$option_set.min_bundle_cov\'\n- -g \'$option_set.bdist\'\n- -M \'$option_set.bundle_fraction\' $option_set.sensitive $option_set.disable_trimming $option_set.multi_mapping\n- #if $option_set.abundance_estimation:\n- -A "$gene_abundance_estimation"\n- #end if\n- #if str($option_set.omit_sequences).strip() != "":\n- -x \'$option_set.omit_sequences\'\n- #end if\n- #end if\n-\n- #if str($guide.use_guide) == \'yes\':\n- #if $guide.special_outputs.special_outputs_select == \'deseq2\':\n+ #if $guide.special_outputs.clustering:\n+ -c\n+ #if $guide.special_outputs.key:\n+ -k \'$guide.special_outputs.key\'\n+ #end if\n+ --legend \'$legend\'\n+ > /dev/null\n+ '..b'tion. It is highly recommended to use the reference annotation information when mapping the reads, which can be either embedded in the genome index (built with the --ss and --exon options, see HISAT2 manual), or provided separately at run time (using the --known-splicesite-infile option of HISAT2). The SAM output of each HISAT2 run must be sorted and converted to BAM using samtools as explained above.\n+\n+ 2. 
For each RNA-Seq sample, use this StringTie tool to assemble the read alignments obtained in the previous step; it is recommended to run StringTie with the -G option if the reference annotation is available.\n+\n+ 3. Run the separate **StringTie merge** tool in order to generate a non-redundant set of transcripts observed in all the RNA-Seq samples assembled previously. ``StringTie merge`` takes as input a list of all the assembled transcripts files (in GTF format) previously obtained for each sample, as well as a reference annotation file (-G option) if available.\n+\n+ 4. For each RNA-Seq sample, run this StringTie tool selecting to output files for Ballgown (or DESeq2/edgeR), which will generate tables of transcript and gene estimated abundances (count files). The option -e (*Use Reference transcripts only*) is not required but is recommended for this run in order to produce more accurate abundance estimations of the input transcripts. Each StringTie run in this step will take as input the sorted read alignments (BAM file) obtained in step 1 for the corresponding sample and the -G option with the merged transcripts (GTF file) generated by ``stringtie merge`` in step 3. Please note that this is the only case where the -G option is not used with a reference annotation, but with the global, merged set of transcripts as observed across all samples. (This step is the equivalent of the *Tablemaker* step described in the original Ballgown pipeline.)\n+\n+ 5. Ballgown (or DESeq2/edgeR) can now be used to load the coverage tables generated in the previous step and perform various statistical analyses for differential expression, generate plots etc.\n+\n+An alternate, faster differential expression analysis workflow can be pursued if there is no interest in novel isoforms (i.e. assembled transcripts present in the samples but missing from the reference annotation), or if only a well known set of transcripts of interest are targeted by the analysis. 
This simplified protocol has only 3 steps (depicted in the `StringTie manual here`_) as it bypasses the individual assembly of each RNA-Seq sample and the "transcript merge" step. This simplified workflow attempts to directly estimate and analyze the expression of a known set of transcripts as given in the reference annotation file.\n \n .. _StringTie: http://ccb.jhu.edu/software/stringtie/\n+.. _Ballgown: https://www.biorxiv.org/content/early/2014/09/05/003665\n+.. _Cuffdiff: http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/\n+.. _DESeq2: https://bioconductor.org/packages/release/bioc/html/DESeq2.html\n+.. _edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html\n+.. _Bioconductor: https://www.bioconductor.org/\n+.. _SAM: http://samtools.github.io/hts-specs/SAMv1.pdf\n+.. _HISAT2: http://ccb.jhu.edu/software/hisat2\n+.. _`GTF/GFF3`: https://ccb.jhu.edu/software/stringtie/gff.shtml\n+.. _`this link`: https://github.com/alyssafrazee/ballgown#ballgown-readable-expression-output\n+.. _`Ensembl site here`: http://useast.ensembl.org/info/website/upload/gff.html\n+.. _here: http://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/\n+.. _`by B. Li and C. Dewey here`: http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-323\n+.. _`GFF utilities page`: https://ccb.jhu.edu/software/stringtie/gff.shtml#gffcompare\n+.. _`protocol paper`: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5032908/\n+.. _`StringTie manual here`: https://ccb.jhu.edu/software/stringtie/index.shtml?t=manual\n+\n ]]></help>\n <expand macro="citations" />\n-</tool>\n+</tool>\n\\ No newline at end of file\n' |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 test-data/cached_locally/gene_sets.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cached_locally/gene_sets.loc Thu Nov 09 11:17:32 2017 -0500 |
| b |
| @@ -0,0 +1,1 @@ +hg38 hg38 hg38GTF ${__HERE__}/ref.gtf |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 test-data/cached_locally/ref.gtf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cached_locally/ref.gtf Thu Nov 09 11:17:32 2017 -0500 |
| b |
| @@ -0,0 +1,4 @@ +test_chromosome Cufflinks transcript 53 550 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; FPKM "10679134.4063403048"; frac "1.000000"; conf_lo "8542701.791788"; conf_hi "12815567.020892"; cov "145.770185"; +test_chromosome Cufflinks exon 53 250 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; exon_number "1"; FPKM "10679134.4063403048"; frac "1.000000"; conf_lo "8542701.791788"; conf_hi "12815567.020892"; cov "145.770185"; +test_chromosome Cufflinks exon 351 400 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; exon_number "2"; FPKM "10679134.4063403048"; frac "1.000000"; conf_lo "8542701.791788"; conf_hi "12815567.020892"; cov "145.770185"; +test_chromosome Cufflinks exon 501 550 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; exon_number "3"; FPKM "10679134.4063403048"; frac "1.000000"; conf_lo "8542701.791788"; conf_hi "12815567.020892"; cov "145.770185"; |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 test-data/deseq2/gene_counts.tsv --- a/test-data/deseq2/gene_counts.tsv Thu Jun 01 12:16:04 2017 -0400 +++ b/test-data/deseq2/gene_counts.tsv Thu Nov 09 11:17:32 2017 -0500 |
| b |
| @@ -1,2 +1,2 @@ - sample1 -CUFF.1 574 +gene_id sample1 +CUFF.1 182 |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 test-data/deseq2/transcript_counts.tsv --- a/test-data/deseq2/transcript_counts.tsv Thu Jun 01 12:16:04 2017 -0400 +++ b/test-data/deseq2/transcript_counts.tsv Thu Nov 09 11:17:32 2017 -0500 |
| b |
| @@ -1,2 +1,2 @@ - sample1 -CUFF.1.1 574 +transcript_id sample1 +CUFF.1.1 182 |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 test-data/stringtie_out6.gtf --- a/test-data/stringtie_out6.gtf Thu Jun 01 12:16:04 2017 -0400 +++ b/test-data/stringtie_out6.gtf Thu Nov 09 11:17:32 2017 -0500 |
| b |
| @@ -1,4 +1,4 @@ -# stringtie /tmp/tmpJfKWNy/files/000/dataset_22.dat -o /tmp/tmpJfKWNy/files/000/dataset_24.dat -p 1 -C /tmp/tmpJfKWNy/files/000/dataset_25.dat -G /tmp/tmpJfKWNy/files/000/dataset_23.dat -e -b ./special_de_output/sample1/ +# stringtie /tmp/tmpSoPTYX/files/000/dataset_1.dat -o /tmp/tmpSoPTYX/files/000/dataset_3.dat -p 1 -G guide.gff -C /tmp/tmpSoPTYX/files/000/dataset_4.dat -e -b ./special_de_output/sample1/ -f 0.15 -m 200 -a 10 -j 1 -c 2 -g 50 -M 0.95 # StringTie version 1.3.3 test_chromosome StringTie transcript 53 550 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; cov "45.795296"; FPKM "3354966.750000"; TPM "1000000.000000"; test_chromosome StringTie exon 53 250 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; exon_number "1"; cov "49.777779"; |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 tool-data/gene_sets.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gene_sets.loc.sample Thu Nov 09 11:17:32 2017 -0500 |
| b |
| @@ -0,0 +1,14 @@ +# This is a sample file distributed with featureCounts that enables it and other tools to use gene/exon annotations in the GFF/GTF format. +# +# The gene_sets.loc file syntax is: +#<unique_build_id> <dbkey> <display_name> <path> +# +# Please ensure that the above fields are tab separated. +# +# In case you have TWO or MORE providers PER dbkey, the one mentioned +# first in the file, should have the "default" priority. +# +#Example: +# +#Homo_sapiens.GRCh37.74 hg19 GRCh37 (hg19) annotation from Ensembl, release 74 /depot/data2/galaxy/hg19/gene_sets/Homo_sapiens.GRCh37.74.gtf +#Homo_sapiens.NCBI36.54 hg18 hg18 annotation from Ensembl, release 54 /depot/data2/galaxy/hg18/gene_sets/Homo_sapiens.NCBI36.54.gtf |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Nov 09 11:17:32 2017 -0500 |
| b |
| @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of all gtf files with annotations of genome builds --> + <table name="gene_sets" comment_char="#" allow_duplicate_entries="False"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/gene_sets.loc" /> + </table> +</tables> |
| b |
| diff -r 6e45b443ef1f -r 76d290331481 tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Thu Nov 09 11:17:32 2017 -0500 |
| b |
| @@ -0,0 +1,6 @@ +<tables> + <table name="gene_sets" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="${__HERE__}/test-data/cached_locally/gene_sets.loc" /> + </table> +</tables> |