Mercurial > repos > iuc > miniprot
diff miniprot.xml @ 3:52bdc302299b draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/miniprot commit 7686c033edb974379f855d2563149d56920f063e
author | iuc |
---|---|
date | Thu, 13 Jul 2023 09:37:56 +0000 |
parents | d518cf04b55c |
children | 00b22efdd100 |
line wrap: on
line diff
--- a/miniprot.xml Thu Apr 06 09:20:48 2023 +0000 +++ b/miniprot.xml Thu Jul 13 09:37:56 2023 +0000 @@ -30,7 +30,7 @@ #if str($adv.mapping.intron_size.mode) == 'manual' -G $adv.mapping.intron_size.max_intron #elif str($adv.mapping.intron_size.mode) == 'auto' - -I + -I #end if #if str($adv.output.prefix) != 'MP' @@ -40,6 +40,8 @@ --outn=$adv.output.outputs_per_query --outc=$adv.output.output_fraction_query --outs=$adv.output.output_score_least + $adv.output.output_translated_protein + $adv.output.output_no_cs $adv.output.output_residue_alignment #end if #if str($db.dbtype) == 'fasta' @@ -58,6 +60,7 @@ #end if '$protein_fasta' > '$output_alignment' ]]></command> + <inputs> <conditional name="db"> <param name="dbtype" type="select" label="Database type" help="Build an index from FASTA or use a pre-indexed database"> @@ -65,19 +68,19 @@ <option value="preindexed">Pre-indexed</option> </param> <when value="fasta"> - <param name="genomic_fasta" type="data" format="fasta,fasta.gz" label="Genomic sequence (FASTA)" help="Genomic contigs / scaffolds to be aligned against in FASTA format" /> - <param argument="-k" name="kmer_size" type="integer" min="1" value="6" label="K-mer size for genome-wide indexing" /> - <param argument="-b" name="bits_per_block" type="integer" min="1" value="8" label="Number of bits per bin" help="Miniprot splits the genome into non-overlapping bins of 2^8 bp in size" /> - <param argument="-M" name="modimisers" type="integer" value="1" label="Sample k-mers at a rate 1/2**INT" help="Increasing this option reduces peak memory but decreases sensitivity" /> - <param argument="-L" name="min_ORF" type="integer" value="30" label="Minimum ORF length to index" /> + <param name="genomic_fasta" type="data" format="fasta,fasta.gz" label="Genomic sequence (FASTA)" help="Genomic contigs / scaffolds to be aligned against in FASTA format"/> + <param argument="-k" name="kmer_size" type="integer" min="1" value="6" label="K-mer size for genome-wide indexing"/> + <param argument="-b" name="bits_per_block" type="integer" min="1" value="8" label="Number of bits per bin" help="Miniprot splits the genome into non-overlapping bins of 2^8 bp in size"/> + <param argument="-M" name="modimisers" type="integer" value="1" label="Sample k-mers at a rate 1/2**INT" help="Increasing this option reduces peak memory but decreases sensitivity"/> + <param argument="-L" name="min_ORF" type="integer" value="30" label="Minimum ORF length to index"/> </when> <when value="preindexed"> <!-- refine the datatype here once Miniprot index data type is in Galaxy --> - <param name="genomic_db" type="data" format="binary" label="Pre-indexed genomic database" help="A pre-indexed database built by miniprot" /> + <param name="genomic_db" type="data" format="binary" label="Pre-indexed genomic database" help="A pre-indexed database built by miniprot"/> </when> </conditional> - <param name="protein_fasta" type="data" format="fasta,fasta.gz" label="Protein sequence (FASTA)" help="Protein sequences to be aligned in FASTA format" /> - <param name="output_format" type="select" label="Output format" > + <param name="protein_fasta" type="data" format="fasta,fasta.gz" label="Protein sequence (FASTA)" help="Protein sequences to be aligned in FASTA format"/> + <param name="output_format" type="select" label="Output format"> <option value="gff" selected="true">GFF3</option> <option value="paf">PAF</option> <option value="gtf">GTF</option> @@ -89,34 +92,34 @@ </param> <when value="yes"> <section name="mapping" title="Mapping"> - <param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)" /> - <param argument="-c" name="max_kmer" type="integer" min="1" value="20000" label="Max k-mer occurences" /> - <param argument="-w" name="log_gap_penalty_weight" type="float" min="0" max="1" value="0.75" label="Log gap penalty weight" /> - <param argument="-n" name="min_syncmers" type="integer" min="1" value="3" label="Minimum number of syncmers in a chain" /> - <param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score" /> - <param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining" /> - <param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining" /> - <param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.7" label="Minimum secondary-to-primary score ratio" /> - <param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="30" label="Max secondary alignments to consider" /> + <param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)"/> + <param argument="-c" name="max_kmer" type="integer" min="1" value="20000" label="Max k-mer occurences"/> + <param argument="-w" name="log_gap_penalty_weight" type="float" min="0" max="1" value="0.75" label="Log gap penalty weight"/> + <param argument="-n" name="min_syncmers" type="integer" min="1" value="3" label="Minimum number of syncmers in a chain"/> + <param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score"/> + <param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining"/> + <param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining"/> + <param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.7" label="Minimum secondary-to-primary score ratio"/> + <param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="30" label="Max secondary alignments to consider"/> <conditional name="intron_size"> <param name="mode" type="select" label="Maximum intron size"> <option value="manual" selected="true">Manual</option> <option value="auto">Auto (3.6*sqrt)</option> </param> <when value="manual"> - <param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Maximum intron size" /> + <param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Maximum intron size"/> </when> - <when value="auto" /> + <when value="auto"/> </conditional> </section> <section name="alignment" title="Alignment"> - <param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty" /> - <param argument="-E" name="gap_extension" type="integer" min="0" value="1" label="Gap extension penalty" help="A k-long gap costs open_penalty+k*extension_penalty" /> - <param argument="-J" name="intron_open" type="integer" min="0" value="29" label="Intron open penalty" /> - <param argument="-C" name="non_canonical_splice" type="float" value="1" label="Weight of splice penalty; 0 to ignore splice signals" /> - <param argument="-F" name="frameshift" type="integer" min="0" value="23" label="Frameshift penalty" /> - <param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="Bonus score for alignment reaching query ends" /> - <param argument="-j" name="splice_model" type="select" label="Splicing model for the target genome" help="2=mammal, 1=general, 0=none" > + <param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty"/> + <param argument="-E" name="gap_extension" type="integer" min="0" value="1" label="Gap extension penalty" help="A k-long gap costs open_penalty+k*extension_penalty"/> + <param argument="-J" name="intron_open" type="integer" min="0" value="29" label="Intron open penalty"/> + <param argument="-C" name="non_canonical_splice" type="float" value="1" label="Weight of splice penalty; 0 to ignore splice signals"/> + <param argument="-F" name="frameshift" type="integer" min="0" value="23" label="Frameshift penalty"/> + <param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="Bonus score for alignment reaching query ends"/> + <param argument="-j" name="splice_model" type="select" label="Splicing model for the target genome" help="2=mammal, 1=general, 0=none"> <option value="O" >None: No splicing model (0)</option> <option value="1" selected="true">General: Optimal splicing sequence: '|GTR...YAG|' (1)</option> <option value="2">Mammal: Optimal splicing sequence: 'G|GTR...YYYNYAG|' (2)</option> @@ -131,14 +134,15 @@ </valid> </sanitizer> </param> - <param argument="-u" name="print_unmapped_proteins" type="boolean" truevalue="-u" falsevalue="" label="Print unmapped proteins" checked="false" /> - <param argument="--outn" name="outputs_per_query" type="integer" min="0" value="100" label="Outputs per query" help="The number of outputs will be the minimum of this and the max secondary alignments option" /> + <param argument="-u" name="print_unmapped_proteins" type="boolean" truevalue="-u" falsevalue="" label="Print unmapped proteins" checked="false"/> + <param argument="--outn" name="outputs_per_query" type="integer" min="0" value="100" label="Outputs per query" help="The number of outputs will be the minimum of this and the max secondary alignments option"/> <param argument="--aln" name="output_residue_alignment" type="boolean" truevalue="--aln" falsevalue="" checked="false" label="Output residue alignment" help="Only for GFF output" /> - <param argument="--outs" name="output_score_least" type="float" min="0" max="1" value="0.99" - label="For each protein, only output alignments with a score higher than 'best_score' multiplied by this value"/> - <param argument="--outc" name="output_fraction_query" type="float" value="0.1" label="Output if at least this fraction of query is aligned" /> + <param argument="--outs" name="output_score_least" type="float" min="0" max="1" value="0.99" label="For each protein, only output alignments with a score higher than 'best_score' multiplied by this value"/> + <param argument="--outc" name="output_fraction_query" type="float" value="0.1" label="Output if at least this fraction of query is aligned"/> + <param argument="--trans" name="output_translated_protein" type="boolean" truevalue="--trans" falsevalue="" checked="false" label="Output translated protein sequences" help="Skipping frameshift"/> + <param argument="--no-cs" name="output_no_cs" type="boolean" truevalue="--no-cs" falsevalue="" checked="false" label="Disable the cs tag"/> </section> - <param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size" /> + <param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size"/> </when> <when value="no"> </when> @@ -147,7 +151,7 @@ <outputs> <data name="output_alignment" format="gff3" label="Miniprot on ${on_string}"> <change_format> - <when input="output_format" value="paf" format="paf" /> + <when input="output_format" value="paf" format="paf"/> <when input="output_format" value="gtf" format="gtf"/> </change_format> </data> @@ -156,40 +160,40 @@ <test expect_num_outputs="1"> <conditional name="db"> <param name="dbtype" value="fasta" /> - <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" /> + <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"/> </conditional> - <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" /> + <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"/> <output name="output_alignment" ftype="gff3"> <assert_contents> - <has_text text="ID=MP000001;Rank=1;Identity=0.3420;Positive=0.5104;Target=tr|I6YGH7|I6YGH7_MYCTU 1 375" /> - <has_text text="ID=MP000066;Rank=1;Identity=0.3613;Positive=0.5178;Target=sp|P9WQE5|PPSB_MYCTU 1 1214" /> + <has_text text="ID=MP000001;Rank=1;Identity=0.3420;Positive=0.5104;Target=tr|I6YGH7|I6YGH7_MYCTU 1 375"/> + <has_text text="ID=MP000066;Rank=1;Identity=0.3613;Positive=0.5178;Target=sp|P9WQE5|PPSB_MYCTU 1 1214"/> </assert_contents> </output> </test> <test expect_num_outputs="1"> <conditional name="db"> - <param name="dbtype" value="fasta" /> - <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" /> + <param name="dbtype" value="fasta"/> + <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"/> </conditional> - <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" /> - <param name="output_format" value="paf" /> + <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"/> + <param name="output_format" value="paf"/> <output name="output_alignment" ftype="paf"> <assert_contents> <has_text text="tr|O06302|O06302_MYCTU" /> - <has_text text="cs:Z::29*agcG:3*gtgA:5*ccgA:9*accS:1*gccV:4*cagL:1*gtcS:3*gtcA*gtcI*accA*gccG:8*gccS:2*ggtA:5*gccI*agcG:1*ctgA:4*gccV:5*gggL:1*gtgS:2" /> + <has_text text="cs:Z::29*agcG:3*gtgA:5*ccgA:9*accS:1*gccV:4*cagL:1*gtcS:3*gtcA*gtcI*accA*gccG:8*gccS:2*ggtA:5*gccI*agcG:1*ctgA:4*gccV:5*gggL:1*gtgS:2"/> </assert_contents> </output> </test> <test expect_num_outputs="1"> <conditional name="db"> <param name="dbtype" value="fasta" /> - <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" /> + <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"/> </conditional> - <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" /> + <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"/> <param name="output_format" value="gff" /> <conditional name="adv"> <param name="options" value="yes" /> - <param name="second_round_kmer_size" value="32" /> + <param name="second_round_kmer_size" value="32"/> </conditional> <output name="output_alignment" ftype="gff3"> <assert_contents> @@ -197,7 +201,6 @@ </assert_contents> </output> </test> - <test expect_num_outputs="1"> <conditional name="db"> <param name="dbtype" value="fasta"></param> @@ -212,13 +215,139 @@ <output name="output_alignment" ftype="gtf"> <assert_contents> <has_text text="NC_000962.3" /> - <has_text text='transcript_id "MPT000004"; gene_id "MPG000004"' /> + <has_text text='transcript_id "MPT000004"; gene_id "MPG000004"'/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="db"> + <param name="dbtype" value="fasta"></param> + <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param> + </conditional> + <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param> + <param name="output_format" value="gff"></param> + <conditional name="adv"> + <param name="options" value="yes"/> + <section name="output"> + <param name="output_translated_protein" value="true"/> + </section> + </conditional> + <output name="output_alignment" ftype="gff3"> + <assert_contents> + <has_text text="tr|I6YGH7|I6YGH7_MYCTU" /> + <has_text text="VDIDLDPSTEKLRAQIRAEVAALKAMPREPRTVAIAEGGWVLPYLPKPWGRAASPVEQIIIAQEFTAGRVKRPQIAIATWIVPSIVAFGTDNQKQRLLPPTFRGDIFWCQLFSEPGAGSDLASLATKATRVDGGWRITGQKIWTTGAQYSQWGALLARTDPSAPKHNGITYFLLDMKSEGVQVKPLRELTGKEFFNTVYLDDVFVPDELVLGEVNRGWEVSRNTLTAERVSIGGSDSTFLPTLGEFVDFVRDYRFEGQFDQVARHRAGQLIAEGHATKLLNLRSTLLTLAGGDPMAPAAISKLLSMRTGQGYAEFAVSSFGTDAVIGDTERLPGKWGEYLLASRATTIYGGTSEVQLNIIAERLLGLPRDP"/> + <has_n_lines n="1633"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="db"> + <param name="dbtype" value="fasta"></param> + <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param> + </conditional> + <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param> + <param name="output_format" value="gtf"></param> + <conditional name="adv"> + <param name="options" value="yes"></param> + <section name="output"> + <param name="output_translated_protein" value="true"/> + </section> + </conditional> + <output name="output_alignment" ftype="gtf"> + <assert_contents> + <has_text text="tr|I6YGH7|I6YGH7_MYCTU"/> + <has_text text="VDIDLDPSTEKLRAQIRAEVAALKAMPREPRTVAIAEGGWVLPYLPKPWGRAASPVEQIIIAQEFTAGRVKRPQIAIATWIVPSIVAFGTDNQKQRLLPPTFRGDIFWCQLFSEPGAGSDLASLATKATRVDGGWRITGQKIWTTGAQYSQWGALLARTDPSAPKHNGITYFLLDMKSEGVQVKPLRELTGKEFFNTVYLDDVFVPDELVLGEVNRGWEVSRNTLTAERVSIGGSDSTFLPTLGEFVDFVRDYRFEGQFDQVARHRAGQLIAEGHATKLLNLRSTLLTLAGGDPMAPAAISKLLSMRTGQGYAEFAVSSFGTDAVIGDTERLPGKWGEYLLASRATTIYGGTSEVQLNIIAERLLGLPRDP"/> + <has_text text="NC_000962.3"/> + <has_n_lines n="2172"/> </assert_contents> </output> </test> - + <test expect_num_outputs="1"> + <conditional name="db"> + <param name="dbtype" value="fasta"></param> + <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param> + </conditional> + <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param> + <param name="output_format" value="paf"></param> + <conditional name="adv"> + <param name="options" value="yes"></param> + <section name="output"> + <param name="output_translated_protein" value="true"/> + </section> + </conditional> + <output name="output_alignment" ftype="paf"> + <assert_contents> + <has_text text="tr|I6YGH7|I6YGH7_MYCTU"/> + <has_text text="VDIDLDPSTEKLRAQIRAEVAALKAMPREPRTVAIAEGGWVLPYLPKPWGRAASPVEQIIIAQEFTAGRVKRPQIAIATWIVPSIVAFGTDNQKQRLLPPTFRGDIFWCQLFSEPGAGSDLASLATKATRVDGGWRITGQKIWTTGAQYSQWGALLARTDPSAPKHNGITYFLLDMKSEGVQVKPLRELTGKEFFNTVYLDDVFVPDELVLGEVNRGWEVSRNTLTAERVSIGGSDSTFLPTLGEFVDFVRDYRFEGQFDQVARHRAGQLIAEGHATKLLNLRSTLLTLAGGDPMAPAAISKLLSMRTGQGYAEFAVSSFGTDAVIGDTERLPGKWGEYLLASRATTIYGGTSEVQLNIIAERLLGLPRDP"/> + <has_text text="sp|O05779|FTSE_MYCTU"/> + <has_n_lines n="650"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="db"> + <param name="dbtype" value="fasta"></param> + <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param> + </conditional> + <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param> + <param name="output_format" value="paf"></param> + <conditional name="adv"> + <param name="options" value="yes"></param> + <section name="output"> + <param name="output_no_cs" value="true"/> + </section> + </conditional> + <output name="output_alignment" ftype="paf"> + <assert_contents> + <has_text text="tr|I6YGH7|I6YGH7_MYCTU"/> + <has_text text="cg:Z:17M1D20M14I2M1I24M1I195M4D61M6D40M"/> + <has_text text="cg:Z:188M3I4M5I11M6I3M1I31M2D1M282U7M1D6M3D28M3I20M1D52M3D4M1D24M2I8M5I5M2I32M1D21M"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="db"> + <param name="dbtype" value="fasta"></param> + <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param> + </conditional> + <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param> + <param name="output_format" value="gff"></param> + <conditional name="adv"> + <param name="options" value="yes"></param> + <section name="output"> + <param name="output_no_cs" value="true"/> + </section> + </conditional> + <output name="output_alignment" ftype="gff3"> + <assert_contents> + <has_text text="##gff-version 3"/> + <has_text text="tr|I6YGH7|I6YGH7_MYCTU"/> + <has_text text="cg:Z:17M1D20M14I2M1I24M1I195M4D61M6D40M"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="db"> + <param name="dbtype" value="fasta"></param> + <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param> + </conditional> + <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param> + <param name="output_format" value="gtf"></param> + <conditional name="adv"> + <param name="options" value="yes"></param> + <section name="output"> + <param name="output_no_cs" value="true"/> + </section> + </conditional> + <output name="output_alignment" ftype="gtf"> + <assert_contents> + <has_text text="NC_000962.3"/> + <has_text text='gene_id "MPG000001"'/> + </assert_contents> + </output> + </test> + </tests> - </tests> <help><![CDATA[ miniprot_ rapidly aligns a protein sequence against a genome with affine gap penalty, splicing and frameshift. It is primarily intended for annotating protein-coding genes in a new species using known genes from other species.