annotate mashmap.xml @ 7:53f601fb8664 draft

planemo upload
author fubar
date Sat, 24 Feb 2024 04:32:36 +0000
parents 27df186d5446
children 9ba0184870ef
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
53f601fb8664 planemo upload
fubar
parents: 6
diff changeset
1 <tool name="mashmap" id="mashmap" version="1.19.2" profile="22.05">
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
2 <!--Source in git at: https://github.com/fubar2/galaxy_tf_overlay-->
7
53f601fb8664 planemo upload
fubar
parents: 6
diff changeset
3 <!--Created by toolfactory@galaxy.org at 24/02/2024 15:30:44 using the Galaxy Tool Factory.-->
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
4 <description>Fast local alignment boundaries</description>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
5 <requirements>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
6 <requirement version="3.1.3" type="package">mashmap</requirement>
7
53f601fb8664 planemo upload
fubar
parents: 6
diff changeset
7 <requirement version="1.19.2" type="package">samtools</requirement>
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
8 </requirements>
7
53f601fb8664 planemo upload
fubar
parents: 6
diff changeset
9 <version_command><![CDATA[echo "1.19.2"]]></version_command>
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
10 <command><![CDATA[bash '$runme']]></command>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
11 <configfiles>
5
10e4181a6443 planemo upload
fubar
parents: 4
diff changeset
12 <configfile name="runme"><![CDATA[#if len($reflist) > 1:
10e4181a6443 planemo upload
fubar
parents: 4
diff changeset
13 #for i, mash in enumerate($reflist):
10e4181a6443 planemo upload
fubar
parents: 4
diff changeset
14 #if i == 0:
6
27df186d5446 planemo upload
fubar
parents: 5
diff changeset
15 echo '$mash' > 'reflist' &&
5
10e4181a6443 planemo upload
fubar
parents: 4
diff changeset
16 #else:
6
27df186d5446 planemo upload
fubar
parents: 5
diff changeset
17 echo '$mash' >> 'reflist' &&
5
10e4181a6443 planemo upload
fubar
parents: 4
diff changeset
18 #end if
10e4181a6443 planemo upload
fubar
parents: 4
diff changeset
19 #end for
10e4181a6443 planemo upload
fubar
parents: 4
diff changeset
20 #end if
7
53f601fb8664 planemo upload
fubar
parents: 6
diff changeset
21 samtools faidx '$query' &&
5
10e4181a6443 planemo upload
fubar
parents: 4
diff changeset
22 mashmap --pi '$perc_identity' -s '$seqLength' -f '$filtermode' $dense \
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
23 #if int($sketchSize) > 0:
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
24 -J '$sketchSize' \
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
25 #end if
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
26 #if len($reflist) == 1:
6
27df186d5446 planemo upload
fubar
parents: 5
diff changeset
27 -r '$reflist' -q '$query' &&
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
28 #else
5
10e4181a6443 planemo upload
fubar
parents: 4
diff changeset
29 --rl 'reflist' -q '$query' &&
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
30 #end if
6
27df186d5446 planemo upload
fubar
parents: 5
diff changeset
31 cp 'mashmap.out' '$mashout']]></configfile>
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
32 </configfiles>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
33 <inputs>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
34 <param name="query" type="data" optional="false" label="Query sequences (as fasta) to mash against the references supplied below" help="" format="fasta" multiple="false"/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
35 <param name="reflist" type="data" optional="false" label="Reference or references to mash the query sequences on" help="Choose one or more reference sequences to mash the query sequences against." format="fasta" multiple="true"/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
36 <param name="perc_identity" type="float" value="85.0" label="Identity threshold" help="By default, it is set to 85, implying mappings with 85% or more identity should be reported. For example, it can be set to 80 to account for more noisy long-read datasets or 95 for mapping a human genome assembly to the human reference."/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
37 <param name="seqLength" type="integer" value="5000" label="Minimum segment length" help="Default is 5,000 bp. Sequences below this length are ignored. Mashmap provides guarantees on reporting local alignments of length twice this value."/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
38 <param name="sketchSize" type="integer" value="0" label="Sketch size - leave 0 for automatic setting based on the identity threshold" help="This parameter sets the seed density of the winnowing scheme, guaranteeing that the minhash will be calculated from a sample of sketchSize k-mers for each segment. It is set automatically based on --pi but can be manually set as well."/>
4
fba99cb9b0ef planemo upload
fubar
parents: 3
diff changeset
39 <param name="dense" type="select" label="Dense sketching" help="This flag will increase the seed density substantially, resulting in a density of roughly 0.02 * (1 + (1 - pi) / .05) where pi is the perc_identity threshold. This leads to longer runtimes and higher RAM usage, but significantly more accurate estimates of ANI.">
fba99cb9b0ef planemo upload
fubar
parents: 3
diff changeset
40 <option value="">No dense sketching</option>
fba99cb9b0ef planemo upload
fubar
parents: 3
diff changeset
41 <option value="--dense">Dense sketching</option>
3
aa2234f3b23a planemo upload
fubar
parents: 2
diff changeset
42 </param>
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
43 <param name="filtermode" type="select" label="Filter mode" help="Mashmap implements a plane-sweep based algorithm to perform the alignment filtering. Similar to delta-filter in nucmer, different filtering options are provided that are suitable for long read or assembly mapping. Option -f map is suitable for reporting the best mappings for long reads, whereas -f one-to-one is suitable for reporting orthologous mappings among all computed assembly to genome mappings.">
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
44 <option value="map">map - best mapping for long reads</option>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
45 <option value="one-to-one">one-to-one - best for mapping orthologous reads</option>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
46 <option value="none">None</option>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
47 </param>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
48 </inputs>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
49 <outputs>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
50 <data name="mashout" format="paf" label="mashmap on $query.element_identifier" hidden="false"/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
51 </outputs>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
52 <tests>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
53 <test>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
54 <output name="mashout" value="mashout_sample" compare="diff" lines_diff="0"/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
55 <param name="query" value="query_sample"/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
56 <param name="reflist" value="reflist_sample"/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
57 <param name="perc_identity" value="85.0"/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
58 <param name="seqLength" value="5000"/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
59 <param name="sketchSize" value="0"/>
4
fba99cb9b0ef planemo upload
fubar
parents: 3
diff changeset
60 <param name="dense" value=""/>
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
61 <param name="filtermode" value="map"/>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
62 </test>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
63 </tests>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
64 <help><![CDATA[
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
65 *MashMap* implements a fast and approximate algorithm for computing local alignment boundaries between long DNA sequences. It can be useful for mapping genome assembly or long reads (PacBio/ONT) to reference genome(s).
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
66
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
67 Given a minimum alignment length and an identity threshold for the desired local alignments, Mashmap computes alignment boundaries and identity estimates using k-mers. It does not compute the alignments explicitly, but rather estimates an unbiased k-mer based Jaccard similarity using a combination of minmers (a novel winnowing scheme) and MinHash. This is then converted to an estimate of sequence identity using the Mash distance. An appropriate k-mer sampling rate is automatically determined using the given minimum local alignment length and identity thresholds.
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
68
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
69 As an example, Mashmap can map a human genome assembly to the human reference genome in about one minute total execution time and < 4 GB memory using just 8 CPU threads, achieving more than an order of magnitude improvement in both runtime and memory over alternative methods. We describe the algorithms associated with Mashmap, and report on speed, scalability, and accuracy of the software in the publications listed below. Unlike traditional mappers, MashMap does not compute exact sequence alignments. In the future, we plan to add optional alignment support to generate base-to-base alignments.
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
70
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
71 Map a set of query sequences against a reference genome:
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
72
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
73 mashmap -r reference.fna -q query.fa
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
74
7
53f601fb8664 planemo upload
fubar
parents: 6
diff changeset
75 The output is space-delimited with each line consisting of query name, length, 0-based start, end, strand, target name, length, start, end and mapping nucleotide identity.
2
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
76
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
77 Map a set of query sequences against a list of reference genomes:
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
78
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
79 mashmap --rl referenceList.txt -q query.fa
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
80
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
81 The file 'referenceList.txt' containing the list of reference genomes should contain the paths to the reference genomes, one per line.
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
82
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
83 Source code: https://github.com/marbl/MashMap
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
84 ]]></help>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
85 <citations>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
86 <citation type="doi">10.1093/bioinformatics/btad512</citation>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
87 <citation type="doi">10.1093/bioinformatics/bts573</citation>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
88 </citations>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
89 </tool>
6c6bf2bee1ca planemo upload
fubar
parents:
diff changeset
90