view glimmer_w_icm.xml @ 0:841357e0acbf draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 10:09:30 -0400
parents
children
line wrap: on
line source

<tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.2">
    <description>Predict ORFs in prokaryotic genomes (knowledge-based)</description>
    <!-- Packages and environment settings this tool declares as dependencies. -->
    <requirements>
        <requirement type="package" version="3.02b">glimmer</requirement>
        <!-- NOTE(review): presumably needed by glimmer2seq.py; confirm against that script. -->
        <requirement type="package" version="1.61">biopython</requirement>
        <!-- The command reads GLIMMER_SCRIPT_PATH to locate glimmer2seq.py. -->
        <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement>
    </requirements>
    <!--
        Cheetah template for the job command line. It runs glimmer3 on the
        genome sequence and the ICM, converts the resulting prediction table
        to FASTA with glimmer2seq.py, and finally moves the optional reports
        into their output datasets. The steps are separated with ";" because
        the rendered template is executed as a single shell command line.
    -->
    <command><![CDATA[
    ## glimmer3 names its result files "<tag>.predict" and "<tag>.detail".
    ## A fixed tag relative to the job working directory keeps both files
    ## inside the job directory, so no temporary file has to be created
    ## while this template is rendered and no manual clean-up is needed.
    #set $tag = "glimmer3_out"

    glimmer3
        --max_olap $max_olap
        --gene_len $gene_len
        --threshold $threshold
        ## 0.0 means "derive the GC content from the input", so the option is omitted
        #if float( str($gc_percent) ) > 0.0:
            --gc_percent $gc_percent
        #end if

        #if $stop_codon_opts.stop_codon_opts_selector == "gb":
            --trans_table "${stop_codon_opts.genbank_gencode}"
        #else:
            --stop_codons "${stop_codon_opts.stop_codons}"
        #end if

        --start_codons "${start_codons}"

        ## boolean flags render as the flag itself or as nothing; they must
        ## stay unquoted so that an unchecked box adds no empty argument
        $linear
        $no_indep
        $extend
        '$seq_input'
        '$icm_input'
        ## glimmer3 reports progress on stderr; fold it into stdout
        $tag 2>&1;

    ## convert prediction to FASTA sequences
    \$GLIMMER_SCRIPT_PATH/glimmer2seq.py ${tag}.predict '$seq_input' '$genes_output';

    ## keep the optional reports only when they were requested
    #if $report:
        mv ${tag}.predict '$report_output';
    #end if

    #if $detailed_report:
        mv ${tag}.detail '$detailed_output';
    #end if
    ]]></command>
    <!-- User-facing parameters; the command template reads each one by its name. -->
    <inputs>
        <!-- Datasets passed to glimmer3 as positional arguments. -->
        <param name="seq_input" type="data" format="fasta" label="Genome Sequence" />
        <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" />

        <!-- Numeric options, each forwarded to the glimmer3 option of the same name. -->
        <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." />
        <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" help="This does not include the bases in the stop codon." />
        <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." />
        <!-- The command only passes this option when the value is greater than 0.0. -->
        <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." />

        <!-- Flags: truevalue is the literal text inserted into the command line, falsevalue inserts nothing. -->
        <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
        <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." />
        <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" />
        <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" />

        <!-- Stop codons come either from a GenBank translation table or from a free-form list. -->
        <conditional name="stop_codon_opts">
            <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as">
              <option value="gb" selected="true">Genbank translation table entry</option>
              <option value="free_form">Comma-separated list</option>
            </param>
            <when value="gb">
                <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
                    <option value="1" selected="true">1. Standard</option>
                    <option value="2">2. Vertebrate Mitochondrial</option>
                    <option value="3">3. Yeast Mitochondrial</option>
                    <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
                    <option value="5">5. Invertebrate Mitochondrial</option>
                    <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
                    <option value="9">9. Echinoderm Mitochondrial</option>
                    <option value="10">10. Euplotid Nuclear</option>
                    <option value="11">11. Bacteria and Archaea</option>
                    <option value="12">12. Alternative Yeast Nuclear</option>
                    <option value="13">13. Ascidian Mitochondrial</option>
                    <option value="14">14. Flatworm Mitochondrial</option>
                    <option value="15">15. Blepharisma Macronuclear</option>
                    <option value="16">16. Chlorophycean Mitochondrial</option>
                    <option value="21">21. Trematode Mitochondrial</option>
                    <option value="22">22. Scenedesmus obliquus mitochondrial</option>
                    <option value="23">23. Thraustochytrium Mitochondrial</option>
                    <option value="24">24. Pterobranchia mitochondrial</option>
                </param>
            </when>
            <when value="free_form">
                <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
            </when>
        </conditional>

        <!--
            These two switches add no text to the command line (truevalue and
            falsevalue are both empty); the command and the output filters only
            test whether they are checked to decide which reports are kept.
        -->
        <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/>
        <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
    </inputs>
    <!-- Datasets produced by the tool; the two reports are optional. -->
    <outputs>
        <!-- Always created: the FASTA file written by glimmer2seq.py in the command. -->
        <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
        <!-- Created only when the "report" input is checked; the command moves the .predict file here. -->
        <data name="report_output" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)">
            <filter>report == True</filter>
        </data>
        <!-- Created only when the "detailed_report" input is checked; the command moves the .detail file here. -->
        <data name="detailed_output" format="txt" label="Glimmer3 on ${on_string} (detailed report)">
            <filter>detailed_report == True</filter>
        </data>
    </outputs>
    <tests>
        <!--
            Single regression test: every option is left at the value the
            inputs section declares as default, except that translation
            table 11 is selected. Only the FASTA output is compared.
        -->
        <test>
            <param name="seq_input" value="streptomyces_Tu6071_genomic.fasta" />
            <param name="icm_input" value="streptomyces_Tu6071_plasmid_genes.icm" />
            <param name="max_olap" value="50" />
            <param name="gene_len" value="90" />
            <param name="threshold" value="30" />
            <param name="gc_percent" value="0.0" />
            <param name="linear" value="--linear" />
            <param name="no_indep" value="" />
            <param name="extend" value="" />
            <param name="start_codons" value="atg,gtg,ttg" />
            <param name="genbank_gencode" value="11" />
            <param name="detailed_report" value="" />
            <param name="report" value="" />
            <output name="genes_output" file="glimmer_w_icm_trans-table-11_genomic.fasta" ftype="fasta" />
        </test>
    </tests>
    <help>


**What it does**

This is the main program that makes gene predictions based on an interpolated context model (ICM).

The ICM can be built from CDS extracted from related organisms (see the ICM builder tool). If you cannot generate an ICM, use the non-knowledge-based Glimmer tool, which performs a de novo prediction instead.

-----

**Example**

*Input*::
	
	- interpolated context model (ICM): Use the 'Glimmer ICM builder' tool to create one
	- Genome Sequence in FASTA format
	
		>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7
		GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
		GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
		TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
		TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
		GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
		ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
		AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
		CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
		TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
		AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
		GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
		AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
		CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
		AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
		GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
		.....

*Output*:: 

	- FASTA file with predicted proteins
	- Glimmer prediction file (optional)

		>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
		orf00001    40137       52  +2     8.68
		orf00004      603       34  -1     2.91
		orf00006     1289     1095  -3     3.16
		orf00007     1555     1391  -2     2.33
		orf00008     1809     1576  -1     1.02
		orf00010     1953     2066  +3     3.09
		orf00011     2182     2304  +1     0.89
		orf00013     2390     2521  +2     0.60
		orf00018     2570     3073  +2     2.54
		orf00020     3196     3747  +1     2.91
		orf00022     3758     4000  +2     0.83
		orf00023     4399     4157  -2     1.31
		orf00025     4463     4759  +2     2.92
		orf00026     4878     5111  +3     0.78
		orf00027     5468     5166  -3     1.64
		orf00029     5590     5832  +1     0.29
		orf00032     6023     6226  +2     6.02
		orf00033     6217     6336  +1     3.09
		........

	- Glimmer detailed report (optional)

		>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
		Sequence length = 40222
			
			   ----- Start -----           --- Length ----  ------------- Scores -------------
		 ID  Frame   of Orf  of Gene     Stop   of Orf of Gene      Raw InFrm F1 F2 F3 R1 R2 R3 NC
		0001    +2    40137    40137       52      135     135     9.26    96  - 96  -  -  3  -  0
		0002    +1       58       64      180      120     114     5.01    69 69  -  - 30  -  -  0
			+3      300      309      422      120     111    -0.68    20  -  - 20 38  -  - 41
			+3      423      432      545      120     111     1.29    21  - 51 21 13  -  8  5
		0003    +2      401      416      595      192     177     2.51    93  - 93  -  5  -  -  1
		0004    -1      645      552       34      609     516     2.33    99  -  -  - 99  -  -  0
			+1      562      592      762      198     168    -2.54     1  1  -  -  -  -  - 98
			+1      763      772      915      150     141    -1.34     1  1  -  -  -  - 86 11
			+3      837      846     1007      168     159     1.35    28  - 50 28  -  - 17  3
		0005    -3     1073      977      654      417     321     0.52    84  -  -  -  -  - 84 15
		0006    -3     1373     1319     1095      276     222     3.80    99  -  -  -  -  - 99  0
		0007    -2     1585     1555     1391      192     162     2.70    98  -  -  -  - 98  -  1
		0008    -1     1812     1809     1576      234     231     1.26    94  -  -  - 94  -  -  5
		0009    +2     1721     1730     1945      222     213     0.68    80  - 80  -  -  -  - 19
		.....

-------

**References**

A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics 23(6):673-679 (2007).


    </help>

</tool>