Mercurial > repos > iuc > miniprot

<?xml version="1.0"?>
<tool id="miniprot" name="Miniprot align" version="@TOOL_VERSION@+galaxy0" profile="21.05">
    <description>align a protein sequence against a genome with affine gap penalty, splicing and frameshift</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">miniprot</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        miniprot
        -t \${GALAXY_SLOTS:-1}
        #if str($adv.options) == "yes"
            $adv.mapping.no_splicing
            -c $adv.mapping.max_kmer
            -n $adv.mapping.min_syncmers
            -m $adv.mapping.min_chain_score
            -l $adv.mapping.second_round_kmer_size
            -e $adv.mapping.max_extension
            -p $adv.mapping.score_ratio
            -N $adv.mapping.max_secondary_alignments
            -w $adv.mapping.log_gap_penalty_weight
            -O $adv.alignment.gap_open
            -E $adv.alignment.gap_extension
            -J $adv.alignment.intron_open
            -C $adv.alignment.non_canonical_splice
            -F $adv.alignment.frameshift
            -B $adv.alignment.end_bonus
            -j $adv.alignment.splice_model
        #if str($adv.mapping.intron_size.mode) == 'manual'
            -G $adv.mapping.intron_size.max_intron
        #elif str($adv.mapping.intron_size.mode) == 'auto'
             -I
        #end if

        #if str($adv.output.prefix) != 'MP'
            -P '$adv.output.prefix'
        #end if
            $adv.output.print_unmapped_proteins
            --outn=$adv.output.outputs_per_query
            --outc=$adv.output.output_fraction_query
            --outs=$adv.output.output_score_least
            $adv.output.output_residue_alignment
        #end if
        #if str($db.dbtype) == 'fasta'
            '$db.genomic_fasta'
            -k $db.kmer_size
            -b $db.bits_per_block
            -M $db.modimisers
            -L $db.min_ORF
        #else
            '$db.genomic_db'
        #end if
        #if str($output_format) == "gff"
            --gff
        #else if str($output_format) == "gtf"
            --gtf
        #end if
        '$protein_fasta' > '$output_alignment'
    ]]></command>
    <inputs>
        <conditional name="db">
            <param name="dbtype" type="select" label="Database type" help="Build an index from FASTA or use a pre-indexed database">
                <option value="fasta" selected="true">FASTA</option>
                <option value="preindexed">Pre-indexed</option>
            </param>
            <when value="fasta">
                <param name="genomic_fasta" type="data" format="fasta,fasta.gz" label="Genomic sequence (FASTA)" help="Genomic contigs / scaffolds to be aligned against in FASTA format" />
                <param argument="-k" name="kmer_size" type="integer" min="1" value="6" label="K-mer size for genome-wide indexing" />
                <param argument="-b" name="bits_per_block" type="integer" min="1" value="8" label="Number of bits per bin" help="Miniprot splits the genome into non-overlapping bins of 2^8 bp in size" />
                <param argument="-M" name="modimisers" type="integer" value="1" label="Sample k-mers at a rate 1/2**INT" help="Increasing this option reduces peak memory but decreases sensitivity" />
                <param argument="-L" name="min_ORF" type="integer" value="30" label="Minimum ORF length to index" />
            </when>
            <when value="preindexed">
                <!-- refine the datatype here once Miniprot index data type is in Galaxy -->
                <param name="genomic_db" type="data" format="binary" label="Pre-indexed genomic database" help="A pre-indexed database built by miniprot" />
            </when>
        </conditional>
        <param name="protein_fasta" type="data" format="fasta,fasta.gz" label="Protein sequence (FASTA)" help="Protein sequences to be aligned in FASTA format" />
        <param name="output_format" type="select" label="Output format" >
            <option value="gff" selected="true">GFF3</option>
            <option value="paf">PAF</option>
            <option value="gtf">GTF</option>
        </param>
        <conditional name="adv">
            <param name="options" type="select" label="Advanced options">
                <option value="yes">Show</option>
                <option value="no" selected="true">Hide</option>
            </param>
            <when value="yes">
                <section name="mapping" title="Mapping">
                    <param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)" />
                    <param argument="-c" name="max_kmer" type="integer" min="1" value="20000" label="Max k-mer occurences" />
                    <param argument="-w" name="log_gap_penalty_weight" type="float" min="0" max="1" value="0.75" label="Log gap penalty weight" />
                    <param argument="-n" name="min_syncmers" type="integer" min="1" value="3" label="Minimum number of syncmers in a chain" />
                    <param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score" />
                    <param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining" />
                    <param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining" />
                    <param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.7" label="Minimum secondary-to-primary score ratio" />
                    <param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="30" label="Max secondary alignments to consider" />
                    <conditional name="intron_size">
                        <param name="mode" type="select" label="Maximum intron size">
                            <option value="manual" selected="true">Manual</option>
                            <option value="auto">Auto (3.6*sqrt)</option>
                        </param>
                        <when value="manual">
                            <param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Maximum intron size" />
                        </when>
                        <when value="auto" />
                    </conditional>
                </section>
                <section name="alignment" title="Alignment">
                    <param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty" />
                    <param argument="-E" name="gap_extension" type="integer" min="0" value="1" label="Gap extension penalty" help="A k-long gap costs open_penalty+k*extension_penalty" />
                    <param argument="-J" name="intron_open" type="integer" min="0" value="29" label="Intron open penalty" />
                    <param argument="-C" name="non_canonical_splice" type="float" value="1" label="Weight of splice penalty; 0 to ignore splice signals" />
                    <param argument="-F" name="frameshift" type="integer" min="0" value="23" label="Frameshift penalty" />
                    <param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="Bonus score for alignment reaching query ends" />
                    <param argument="-j" name="splice_model" type="select" label="Splicing model for the target genome" help="2=mammal, 1=general, 0=none" >
                        <option value="O" >None: No splicing model (0)</option>
                        <option value="1" selected="true">General: Optimal splicing sequence: '|GTR...YAG|' (1)</option>
                        <option value="2">Mammal: Optimal splicing sequence: 'G|GTR...YYYNYAG|' (2)</option>
                    </param>
                </section>
                <section name="output" title="Output">
                    <param argument="-P" name="prefix" type="text" label="Prefix for IDs in GFF3 output" value="MP">
                        <sanitizer invalid_char="">
                            <valid initial="string.ascii_letters,string.digits">
                                <add value="_" />
                                <add value="-" />
                            </valid>
                        </sanitizer>
                    </param>
                    <param argument="-u" name="print_unmapped_proteins" type="boolean" truevalue="-u" falsevalue="" label="Print unmapped proteins" checked="false" />
                    <param argument="--outn" name="outputs_per_query" type="integer" min="0" value="100" label="Outputs per query" help="The number of outputs will be the minimum of this and the max secondary alignments option" />
                    <param argument="--aln" name="output_residue_alignment" type="boolean" truevalue="--aln" falsevalue="" checked="false" label="Output residue alignment" help="Only for GFF output" />
                    <param argument="--outs" name="output_score_least" type="float" min="0" max="1" value="0.99"
                        label="For each protein, only output alignments with a score higher than 'best_score' multiplied by this value"/>
                    <param argument="--outc" name="output_fraction_query" type="float" value="0.1" label="Output if at least this fraction of query is aligned" />
                </section>
                <param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size" />
            </when>
            <when value="no">
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="output_alignment" format="gff3" label="Miniprot on ${on_string}">
            <change_format>
                <when input="output_format" value="paf" format="paf" />
                <when input="output_format" value="gtf" format="gtf"/>
            </change_format>
        </data>
    </outputs>
    <tests>
        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta" />
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
            <output name="output_alignment" ftype="gff3">
                <assert_contents>
                    <has_text text="ID=MP000001;Rank=1;Identity=0.3420;Positive=0.5104;Target=tr|I6YGH7|I6YGH7_MYCTU 1 375" />
                    <has_text text="ID=MP000066;Rank=1;Identity=0.3613;Positive=0.5178;Target=sp|P9WQE5|PPSB_MYCTU 1 1214" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta" />
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
            <param name="output_format" value="paf" />
            <output name="output_alignment" ftype="paf">
                <assert_contents>
                    <has_text text="tr|O06302|O06302_MYCTU" />
                    <has_text text="cs:Z::29*agcG:3*gtgA:5*ccgA:9*accS:1*gccV:4*cagL:1*gtcS:3*gtcA*gtcI*accA*gccG:8*gccS:2*ggtA:5*gccI*agcG:1*ctgA:4*gccV:5*gggL:1*gtgS:2" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta" />
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
            <param name="output_format" value="gff" />
            <conditional name="adv">
                <param name="options" value="yes" />
                <param name="second_round_kmer_size" value="32" />
            </conditional>
            <output name="output_alignment" ftype="gff3">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                </assert_contents>
            </output>
        </test>

        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta"></param>
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param>
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param>
            <param name="output_format" value="gtf"></param>
            <conditional name="adv">
                <param name="options" value="yes"></param>
                <param name="second_round_kmer_size" value="32"></param>
            </conditional>
            <output name="output_alignment" ftype="gtf">
                <assert_contents>
                    <has_text text="NC_000962.3" />
                    <has_text text='transcript_id "MPT000004"; gene_id "MPG000004"' />
                </assert_contents>
            </output>
        </test>


    </tests>
    <help><![CDATA[
        miniprot_  rapidly aligns a protein sequence against a genome with affine gap penalty, splicing and frameshift.
        It is primarily intended for annotating protein-coding genes in a new species using known genes from other species.

        While an index of the genome to be mapped to can be built "on the fly", the Miniprot index tool can pre-index a genome
        and will result in faster performance if the genome index is reused multiple times.

        For details of the algorithm and some insight into how parameters can be tuned see this overview_.

        .. _miniprot: https://github.com/lh3/miniprot
        .. _overview: https://github.com/lh3/miniprot#algorithm-overview
    ]]></help>
    <expand macro="citation"></expand>
</tool>
author	iuc
date	Thu, 06 Apr 2023 09:20:48 +0000
parents	ce04c239454b
children	52bdc302299b