comparison miniprot.xml @ 2:d518cf04b55c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/miniprot commit bda011acaca773960c85bfa1288fa8b74cfabb15
author iuc
date Thu, 06 Apr 2023 09:20:48 +0000
parents ce04c239454b
children 52bdc302299b
comparison
equal deleted inserted replaced
1:ce04c239454b 2:d518cf04b55c
11 miniprot 11 miniprot
12 -t \${GALAXY_SLOTS:-1} 12 -t \${GALAXY_SLOTS:-1}
13 #if str($adv.options) == "yes" 13 #if str($adv.options) == "yes"
14 $adv.mapping.no_splicing 14 $adv.mapping.no_splicing
15 -c $adv.mapping.max_kmer 15 -c $adv.mapping.max_kmer
16 -G $adv.mapping.max_intron
17 -n $adv.mapping.min_syncmers 16 -n $adv.mapping.min_syncmers
18 -m $adv.mapping.min_chain_score 17 -m $adv.mapping.min_chain_score
19 -l $adv.mapping.second_round_kmer_size 18 -l $adv.mapping.second_round_kmer_size
20 -e $adv.mapping.max_extension 19 -e $adv.mapping.max_extension
21 -p $adv.mapping.score_ratio 20 -p $adv.mapping.score_ratio
22 -N $adv.mapping.max_secondary_alignments 21 -N $adv.mapping.max_secondary_alignments
22 -w $adv.mapping.log_gap_penalty_weight
23 -O $adv.alignment.gap_open 23 -O $adv.alignment.gap_open
24 -E $adv.alignment.gap_extension 24 -E $adv.alignment.gap_extension
25 -J $adv.alignment.intron_open 25 -J $adv.alignment.intron_open
26 -C $adv.alignment.non_canonical_splice 26 -C $adv.alignment.non_canonical_splice
27 -F $adv.alignment.frameshift 27 -F $adv.alignment.frameshift
28 -B $adv.alignment.end_bonus 28 -B $adv.alignment.end_bonus
29 -j $adv.alignment.splice_model
30 #if str($adv.mapping.intron_size.mode) == 'manual'
31 -G $adv.mapping.intron_size.max_intron
32 #elif str($adv.mapping.intron_size.mode) == 'auto'
33 -I
34 #end if
35
29 #if str($adv.output.prefix) != 'MP' 36 #if str($adv.output.prefix) != 'MP'
30 -P '$adv.output.prefix' 37 -P '$adv.output.prefix'
31 #end if 38 #end if
32 $adv.output.print_unmapped_proteins 39 $adv.output.print_unmapped_proteins
33 --outn=$adv.output.outputs_per_query 40 --outn=$adv.output.outputs_per_query
41 --outc=$adv.output.output_fraction_query
42 --outs=$adv.output.output_score_least
43 $adv.output.output_residue_alignment
34 #end if 44 #end if
35 #if str($db.dbtype) == 'fasta' 45 #if str($db.dbtype) == 'fasta'
36 '$db.genomic_fasta' 46 '$db.genomic_fasta'
37 -k $db.kmer_size 47 -k $db.kmer_size
38 -s $db.submer_size 48 -b $db.bits_per_block
39 -b $db.bits_per_block 49 -M $db.modimisers
50 -L $db.min_ORF
40 #else 51 #else
41 '$db.genomic_db' 52 '$db.genomic_db'
42 #end if 53 #end if
43 #if str($output_format) == "gff" 54 #if str($output_format) == "gff"
44 --gff 55 --gff
45 #end if 56 #else if str($output_format) == "gtf"
46 '$protein_fasta' 57 --gtf
47 >'$output_alignment' 58 #end if
59 '$protein_fasta' > '$output_alignment'
48 ]]></command> 60 ]]></command>
49 <inputs> 61 <inputs>
50 <conditional name="db"> 62 <conditional name="db">
51 <param name="dbtype" type="select" label="Database type" help="Build an index from FASTA or use a pre-indexed database"> 63 <param name="dbtype" type="select" label="Database type" help="Build an index from FASTA or use a pre-indexed database">
52 <option value="fasta" selected="true">FASTA</option> 64 <option value="fasta" selected="true">FASTA</option>
53 <option value="preindexed">Pre-indexed</option> 65 <option value="preindexed">Pre-indexed</option>
54 </param> 66 </param>
55 <when value="fasta"> 67 <when value="fasta">
56 <param name="genomic_fasta" type="data" format="fasta,fasta.gz" label="Genomic sequence (FASTA)" help="Genomic contigs / scaffolds to be aligned against in FASTA format" /> 68 <param name="genomic_fasta" type="data" format="fasta,fasta.gz" label="Genomic sequence (FASTA)" help="Genomic contigs / scaffolds to be aligned against in FASTA format" />
57 <param argument="-k" name="kmer_size" type="integer" min="1" value="6" label="K-mer size" /> 69 <param argument="-k" name="kmer_size" type="integer" min="1" value="6" label="K-mer size for genome-wide indexing" />
58 <param argument="-s" name="submer_size" type="integer" min="1" value="4" label="Submer size" help="Submer size (density: 1/(2*(kmer_size-submer_size)+1))" /> 70 <param argument="-b" name="bits_per_block" type="integer" min="1" value="8" label="Number of bits per bin" help="Miniprot splits the genome into non-overlapping bins of 2^8 bp in size" />
59 <param argument="-b" name="bits_per_block" type="integer" min="1" value="8" label="Bits per block" /> 71 <param argument="-M" name="modimisers" type="integer" value="1" label="Sample k-mers at a rate 1/2**INT" help="Increasing this option reduces peak memory but decreases sensitivity" />
72 <param argument="-L" name="min_ORF" type="integer" value="30" label="Minimum ORF length to index" />
60 </when> 73 </when>
61 <when value="preindexed"> 74 <when value="preindexed">
62 <!-- refine the datatype here once Miniprot index data type is in Galaxy --> 75 <!-- refine the datatype here once Miniprot index data type is in Galaxy -->
63 <param name="genomic_db" type="data" format="binary" label="Pre-indexed genomic database" help="A pre-indexed database built by miniprot" /> 76 <param name="genomic_db" type="data" format="binary" label="Pre-indexed genomic database" help="A pre-indexed database built by miniprot" />
64 </when> 77 </when>
65 </conditional> 78 </conditional>
66 <param name="protein_fasta" type="data" format="fasta,fasta.gz" label="Protein sequence (FASTA)" help="Protein sequences to be aligned in FASTA format" /> 79 <param name="protein_fasta" type="data" format="fasta,fasta.gz" label="Protein sequence (FASTA)" help="Protein sequences to be aligned in FASTA format" />
67 <param name="output_format" type="select" label="Output format" > 80 <param name="output_format" type="select" label="Output format" >
68 <option value="gff" selected="true">GFF3</option> 81 <option value="gff" selected="true">GFF3</option>
69 <option value="paf">PAF</option> 82 <option value="paf">PAF</option>
83 <option value="gtf">GTF</option>
70 </param> 84 </param>
71 <conditional name="adv"> 85 <conditional name="adv">
72 <param name="options" type="select" label="Advanced options"> 86 <param name="options" type="select" label="Advanced options">
73 <option value="yes">Show</option> 87 <option value="yes">Show</option>
74 <option value="no" selected="true">Hide</option> 88 <option value="no" selected="true">Hide</option>
75 </param> 89 </param>
76 <when value="yes"> 90 <when value="yes">
77 <section name="mapping" title="Mapping"> 91 <section name="mapping" title="Mapping">
78 <param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)" /> 92 <param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)" />
79 <param argument="-c" name="max_kmer" type="integer" min="1" value="50000" label="Max k-mer occurrences" /> 93 <param argument="-c" name="max_kmer" type="integer" min="1" value="20000" label="Max k-mer occurrences" />
80 <param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Max intron size" /> 94 <param argument="-w" name="log_gap_penalty_weight" type="float" min="0" max="1" value="0.75" label="Log gap penalty weight" />
81 <!-- the -w option is mentioned in the help text but apparently not implemented: https://github.com/lh3/miniprot/issues/12 --> 95 <param argument="-n" name="min_syncmers" type="integer" min="1" value="3" label="Minimum number of syncmers in a chain" />
82 <!-- <param argument="-w" name="log_gap_penalty_weight" type="float" value="0.75" label="Log gap penalty weight" /> -->
83 <param argument="-n" name="min_syncmers" type="integer" min="1" value="5" label="Minimum number of syncmers in a chain" />
84 <param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score" /> 96 <param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score" />
85 <param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining" /> 97 <param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining" />
86 <param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining" /> 98 <param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining" />
87 <param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.7" label="Minimum secondary-to-primary score ratio" /> 99 <param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.7" label="Minimum secondary-to-primary score ratio" />
88 <param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="50" label="Max secondary alignments to consider" /> 100 <param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="30" label="Max secondary alignments to consider" />
101 <conditional name="intron_size">
102 <param name="mode" type="select" label="Maximum intron size">
103 <option value="manual" selected="true">Manual</option>
104 <option value="auto">Auto (3.6*sqrt)</option>
105 </param>
106 <when value="manual">
107 <param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Maximum intron size" />
108 </when>
109 <when value="auto" />
110 </conditional>
89 </section> 111 </section>
90 <section name="alignment" title="Alignment"> 112 <section name="alignment" title="Alignment">
91 <param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty" /> 113 <param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty" />
92 <param argument="-E" name="gap_extension" type="integer" min="0" value="1" label="Gap extension penalty" help="A k-long gap costs open_penalty+k*extension_penalty" /> 114 <param argument="-E" name="gap_extension" type="integer" min="0" value="1" label="Gap extension penalty" help="A k-long gap costs open_penalty+k*extension_penalty" />
93 <param argument="-J" name="intron_open" type="integer" min="0" value="31" label="Intron open penalty" /> 115 <param argument="-J" name="intron_open" type="integer" min="0" value="29" label="Intron open penalty" />
94 <param argument="-C" name="non_canonical_splice" type="integer" min="0" value="11" label="Penalty for non-canonical splicing" /> 116 <param argument="-C" name="non_canonical_splice" type="float" value="1" label="Weight of splice penalty; 0 to ignore splice signals" />
95 <param argument="-F" name="frameshift" type="integer" min="0" value="17" label="Frameshift penalty" /> 117 <param argument="-F" name="frameshift" type="integer" min="0" value="23" label="Frameshift penalty" />
96 <param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="End bonus" /> 118 <param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="Bonus score for alignment reaching query ends" />
119 <param argument="-j" name="splice_model" type="select" label="Splicing model for the target genome" help="2=mammal, 1=general, 0=none" >
120 <option value="0" >None: No splicing model (0)</option>
121 <option value="1" selected="true">General: Optimal splicing sequence: '|GTR...YAG|' (1)</option>
122 <option value="2">Mammal: Optimal splicing sequence: 'G|GTR...YYYNYAG|' (2)</option>
123 </param>
97 </section> 124 </section>
98 <section name="output" title="Output"> 125 <section name="output" title="Output">
99 <param argument="-P" name="prefix" type="text" label="Prefix for IDs in GFF3 output" value="MP"> 126 <param argument="-P" name="prefix" type="text" label="Prefix for IDs in GFF3 output" value="MP">
100 <sanitizer invalid_char=""> 127 <sanitizer invalid_char="">
101 <valid initial="string.ascii_letters,string.digits"> 128 <valid initial="string.ascii_letters,string.digits">
104 </valid> 131 </valid>
105 </sanitizer> 132 </sanitizer>
106 </param> 133 </param>
107 <param argument="-u" name="print_unmapped_proteins" type="boolean" truevalue="-u" falsevalue="" label="Print unmapped proteins" checked="false" /> 134 <param argument="-u" name="print_unmapped_proteins" type="boolean" truevalue="-u" falsevalue="" label="Print unmapped proteins" checked="false" />
108 <param argument="--outn" name="outputs_per_query" type="integer" min="0" value="100" label="Outputs per query" help="The number of outputs will be the minimum of this and the max secondary alignments option" /> 135 <param argument="--outn" name="outputs_per_query" type="integer" min="0" value="100" label="Outputs per query" help="The number of outputs will be the minimum of this and the max secondary alignments option" />
136 <param argument="--aln" name="output_residue_alignment" type="boolean" truevalue="--aln" falsevalue="" checked="false" label="Output residue alignment" help="Only for GFF output" />
137 <param argument="--outs" name="output_score_least" type="float" min="0" max="1" value="0.99"
138 label="For each protein, only output alignments with a score higher than 'best_score' multiplied by this value"/>
139 <param argument="--outc" name="output_fraction_query" type="float" value="0.1" label="Output if at least this fraction of query is aligned" />
109 </section> 140 </section>
110 <param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size" /> 141 <param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size" />
111 </when> 142 </when>
112 <when value="no"> 143 <when value="no">
113 </when> 144 </when>
115 </inputs> 146 </inputs>
116 <outputs> 147 <outputs>
117 <data name="output_alignment" format="gff3" label="Miniprot on ${on_string}"> 148 <data name="output_alignment" format="gff3" label="Miniprot on ${on_string}">
118 <change_format> 149 <change_format>
119 <when input="output_format" value="paf" format="paf" /> 150 <when input="output_format" value="paf" format="paf" />
151 <when input="output_format" value="gtf" format="gtf"/>
120 </change_format> 152 </change_format>
121 </data> 153 </data>
122 </outputs> 154 </outputs>
123 <tests> 155 <tests>
124 <test expect_num_outputs="1"> 156 <test expect_num_outputs="1">
127 <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" /> 159 <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
128 </conditional> 160 </conditional>
129 <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" /> 161 <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
130 <output name="output_alignment" ftype="gff3"> 162 <output name="output_alignment" ftype="gff3">
131 <assert_contents> 163 <assert_contents>
132 <has_text text="ID=MP000001;Identity=1.0000;Positive=1.0000;Target=tr|O06302|O06302_MYCTU 1 126" /> 164 <has_text text="ID=MP000001;Rank=1;Identity=0.3420;Positive=0.5104;Target=tr|I6YGH7|I6YGH7_MYCTU 1 375" />
133 <has_text text="ID=MP000359;Identity=0.9811;Positive=1.0000;Target=tr|V5QPR5|V5QPR5_MYCTU 1 53" /> 165 <has_text text="ID=MP000066;Rank=1;Identity=0.3613;Positive=0.5178;Target=sp|P9WQE5|PPSB_MYCTU 1 1214" />
134 </assert_contents> 166 </assert_contents>
135 </output> 167 </output>
136 </test> 168 </test>
137 <test expect_num_outputs="1"> 169 <test expect_num_outputs="1">
138 <conditional name="db"> 170 <conditional name="db">
145 <assert_contents> 177 <assert_contents>
146 <has_text text="tr|O06302|O06302_MYCTU" /> 178 <has_text text="tr|O06302|O06302_MYCTU" />
147 <has_text text="cs:Z::29*agcG:3*gtgA:5*ccgA:9*accS:1*gccV:4*cagL:1*gtcS:3*gtcA*gtcI*accA*gccG:8*gccS:2*ggtA:5*gccI*agcG:1*ctgA:4*gccV:5*gggL:1*gtgS:2" /> 179 <has_text text="cs:Z::29*agcG:3*gtgA:5*ccgA:9*accS:1*gccV:4*cagL:1*gtcS:3*gtcA*gtcI*accA*gccG:8*gccS:2*ggtA:5*gccI*agcG:1*ctgA:4*gccV:5*gggL:1*gtgS:2" />
148 </assert_contents> 180 </assert_contents>
149 </output> 181 </output>
150 </test> 182 </test>
151 <test expect_num_outputs="1"> 183 <test expect_num_outputs="1">
152 <conditional name="db"> 184 <conditional name="db">
153 <param name="dbtype" value="fasta" /> 185 <param name="dbtype" value="fasta" />
154 <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" /> 186 <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
155 </conditional> 187 </conditional>
163 <assert_contents> 195 <assert_contents>
164 <has_text text="##gff-version 3" /> 196 <has_text text="##gff-version 3" />
165 </assert_contents> 197 </assert_contents>
166 </output> 198 </output>
167 </test> 199 </test>
200
201 <test expect_num_outputs="1">
202 <conditional name="db">
203 <param name="dbtype" value="fasta"></param>
204 <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param>
205 </conditional>
206 <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param>
207 <param name="output_format" value="gtf"></param>
208 <conditional name="adv">
209 <param name="options" value="yes"></param>
210 <param name="second_round_kmer_size" value="32"></param>
211 </conditional>
212 <output name="output_alignment" ftype="gtf">
213 <assert_contents>
214 <has_text text="NC_000962.3" />
215 <has_text text='transcript_id "MPT000004"; gene_id "MPG000004"' />
216 </assert_contents>
217 </output>
218 </test>
219
220
168 </tests> 221 </tests>
169 <help><![CDATA[ 222 <help><![CDATA[
170 miniprot_ rapidly aligns a protein sequence against a genome with affine gap penalty, splicing and frameshift. 223 miniprot_ rapidly aligns a protein sequence against a genome with affine gap penalty, splicing and frameshift.
171 It is primarily intended for annotating protein-coding genes in a new species using known genes from other species. 224 It is primarily intended for annotating protein-coding genes in a new species using known genes from other species.
172 225
175 228
176 For details of the algorithm and some insight into how parameters can be tuned see this overview_. 229 For details of the algorithm and some insight into how parameters can be tuned see this overview_.
177 230
178 .. _miniprot: https://github.com/lh3/miniprot 231 .. _miniprot: https://github.com/lh3/miniprot
179 .. _overview: https://github.com/lh3/miniprot#algorithm-overview 232 .. _overview: https://github.com/lh3/miniprot#algorithm-overview
180 ]]></help> 233 ]]></help>
181 <citations> 234 <expand macro="citation"></expand>
182 <citation type="bibtex"><![CDATA[
183 @misc{Li2022,
184 author = {Li, Heng},
185 title = {miniprot},
186 year = {2022},
187 publisher = {GitHub},
188 journal = {GitHub repository},
189 howpublished = {\url{https://github.com/lh3/miniprot}},
190 commit = {b442b7a6b60dbd15f460ea9af75fa0b7293d4a8c}
191 }
192 ]]></citation>
193 </citations>
194 </tool> 235 </tool>