Mercurial > repos > iuc > quickmerge

diff quickmerge.xml @ 0:436c1d3c990a draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/quickmerge commit c171c7bde4063ec25fff4a284f5d0330d46ca290
author: iuc
date: Fri, 08 Jul 2022 10:35:37 +0000
children: 9acf91225108
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/quickmerge.xml	Fri Jul 08 10:35:37 2022 +0000
@@ -0,0 +1,351 @@
+<tool
+  id="quickmerge"
+  name="QuickMerge"
+  version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"
+  profile="21.05"
+>
+  <description>merge long-read and hybrid genome assemblies to increase contiguity</description>
+
+  <macros>
+    <token name="@TOOL_VERSION@">0.3</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+  </macros>
+
+  <xrefs>
+    <xref type="bio.tools">quickmerge</xref>
+  </xrefs>
+
+  <requirements>
+    <requirement type="package" version="@TOOL_VERSION@">quickmerge</requirement>
+    <requirement type="package" version="0.1.4">fastkit</requirement>
+  </requirements>
+
+  <command detect_errors="exit_code"><![CDATA[
+
+## Strip spaces from FASTA header
+fastkit format --strip-header-space '$input_ref' > input.ref.fasta
+&& fastkit format --strip-header-space '$input_query' > input.query.fasta
+
+## Nucmer step
+## ----------------------------------------------------------------------------
+
+&& nucmer
+  $advanced.nucmer_match_protocol
+  -l $advanced.nucmer_min_len
+  input.ref.fasta
+  input.query.fasta
+
+
+## Delta-filter step
+## ----------------------------------------------------------------------------
+
+&& delta-filter
+  $advanced.delta_map_ref
+  $advanced.delta_map_query
+  -l $advanced.delta_min_len
+
+out.delta > out.rq.delta
+
+
+## Quickmerge step
+## ----------------------------------------------------------------------------
+
+&& quickmerge
+  -d out.rq.delta
+  -q input.query.fasta
+  -r input.ref.fasta
+  -p out
+  -hco $advanced.hco
+  -c $advanced.c
+  -l $advanced.l
+  -ml $advanced.ml
+
+  ]]></command>
+
+  <inputs>
+    <param
+      name="input_query"
+      type="data"
+      format="fasta,fasta.gz"
+      label="Query (hybrid) assembly"
+      help="A hydrid or long-read assembly."
+    />
+    <param
+      name="input_ref"
+      type="data"
+      format="fasta,fasta.gz"
+      label="Reference assembly"
+      help="The self assembly - this can also be a long-read assembly."
+    />
+
+    <section name="advanced" title="Advanced parameters" expanded="false">
+      <param
+        name="nucmer_match_protocol"
+        type="select"
+        display="radio"
+        label="Anchor matching protocol"
+        help="By default, nucmer will include anchor matches that are unique
+          in the reference but not necessarily unique in the query. You can
+          change the protocol to include all matches, or only those unique to
+          both the reference and query sequence."
+      >
+        <option value="" selected="true">
+          Default (unique in reference only)
+        </option>
+        <option value="--mum">
+          Include matches that are unique to both reference and query
+        </option>
+        <option value="--maxmatch">
+          Include all matches
+        </option>
+      </param>
+
+      <param
+        name="nucmer_min_len"
+        type="integer"
+        label="Nucmer -l"
+        help="Minimum length for a single match (nt)"
+        value="20"
+        min="0"
+      />
+
+      <param
+        name="delta_min_len"
+        type="integer"
+        label="Delta-filter -l"
+        help="Minimum alignment length (nt)"
+        value="0"
+        min="0"
+      />
+
+      <param
+        name="delta_map_ref"
+        type="boolean"
+        truevalue="-r"
+        falsevalue=""
+        checked="true"
+        label="Delta-filter -r"
+        help="Maps each position of each reference to its best hit in the
+          query, allowing for query overlaps."
+      />
+
+      <param
+        name="delta_map_query"
+        type="boolean"
+        truevalue="-q"
+        falsevalue=""
+        checked="true"
+        label="Delta-filter -q"
+        help="Maps each position of each query to its best hit in the
+          reference, allowing for reference overlaps."
+      />
+
+      <param
+        type="integer"
+        argument="-l"
+        help="Minimum seed contig length to be merged (nt). A good rule of
+          thumb is to start with the N50 of the self assembly."
+        value="0"
+        min="0"
+      />
+
+      <param
+        type="integer"
+        argument="-ml"
+        help="Set the merging length cutoff (nt). This is especially helpful
+          for repeat-rich genomes, where a higher minimum length can reduce
+          errors arising from long repeats."
+        value="5000"
+        min="0"
+      />
+
+      <param
+        type="float"
+        argument="-hco"
+        help="Controls the overlap cutoff used in selection of anchor contigs.
+          Increasing this will raise stringency, resulting in fewer joins but
+          higher confidence."
+        value="5.0"
+        min="0.0"
+      />
+
+      <param
+        type="float"
+        argument="-c"
+        help="Controls the overlap cutoff for contigs used for extension of
+          the anchor contig. Increasing this will raise stringency,
+          resulting in fewer joins but higher confidence."
+        value="1.5"
+        min="0.0"
+      />
+    </section>
+
+    <section name="output" title="Optional outputs" expanded="true">
+      <param
+        name="check_align_summary"
+        type="boolean"
+        label="Alignment summary table"
+      />
+      <param
+        name="check_anchor_summary"
+        type="boolean"
+        label="Anchor summary table"
+      />
+      <param
+        name="check_param_summary"
+        type="boolean"
+        label="Parameter summary table"
+      />
+    </section>
+  </inputs>
+
+  <outputs>
+    <data
+      name="merged_fasta"
+      format="fasta"
+      from_work_dir="merged_out.fasta"
+      label="${tool.name} on ${on_string}: Merged assembly"
+    />
+
+    <data
+      name="align_summary"
+      format="tabular"
+      from_work_dir="aln_summary_out.tsv"
+      label="${tool.name} on ${on_string}: Alignment summary"
+    >
+      <filter>output['check_align_summary']</filter>
+    </data>
+
+    <data
+      name="anchor_summary"
+      format="tabular"
+      from_work_dir="anchor_summary_out.txt"
+      label="${tool.name} on ${on_string}: Anchor summary"
+    >
+      <filter>output['check_anchor_summary']</filter>
+    </data>
+
+    <data
+      name="param_summary"
+      format="tabular"
+      from_work_dir="param_summary_out.txt"
+      label="${tool.name} on ${on_string}: Parameter summary"
+    >
+      <filter>output['check_param_summary']</filter>
+    </data>
+  </outputs>
+
+  <tests>
+    <!-- Standard run -->
+    <test expect_num_outputs="1">
+      <param name="input_query" value="ecoli_nanopore.fasta" ftype="fasta" />
+      <param name="input_ref" value="ecoli_illumina.fasta" ftype="fasta" />
+      <output name="merged_fasta" ftype="fasta" file="merged.fasta" />
+    </test>
+
+    <!-- Standard run with additional outputs -->
+    <test expect_num_outputs="4">
+      <param name="input_query" value="ecoli_nanopore.fasta" ftype="fasta" />
+      <param name="input_ref" value="ecoli_illumina.fasta" ftype="fasta" />
+      <param name="check_align_summary" value="true" />
+      <param name="check_anchor_summary" value="true" />
+      <param name="check_param_summary" value="true" />
+      <output name="merged_fasta" ftype="fasta" file="merged.fasta" />
+      <output name="align_summary" ftype="tabular" file="2_aln_summary_out.tsv" />
+      <output name="anchor_summary" ftype="tabular" file="2_anchor_summary_out.txt" />
+      <output name="param_summary" ftype="tabular" file="2_param_summary_out.txt" />
+    </test>
+
+    <!-- Standard run with advanced params set -->
+    <test expect_num_outputs="1">
+      <param name="input_query" value="ecoli_nanopore.fasta" ftype="fasta" />
+      <param name="input_ref" value="ecoli_illumina.fasta" ftype="fasta" />
+      <param name="advanced.nucmer_min_len" value="20" />
+      <param name="advanced.delta_min_len" value="0" />
+      <param name="advanced.hco" value="0" />
+      <param name="advanced.c" value="5000" />
+      <param name="advanced.l" value="1.5" />
+      <param name="advanced.ml" value="5.0" />
+      <output name="merged_fasta" ftype="fasta" file="merged.fasta" />
+    </test>
+  </tests>
+
+  <help><![CDATA[
+
+.. class:: infomark
+
+**What it does**
+
+The program uses complementary information from genomes assembled with long reads in order to improve contiguity, and works with assemblies derived from both Pacific Biosciences or Oxford Nanopore. Quickmerge will even work with hybrid assemblies made by combining long reads and Illumina short reads. This can be useful when long read coverage is limiting. For more details, please see the paper: (`10.1093/nar/gkw654 <https://doi.org/10.1093/nar/gkw654>`_) that describes it.
+
+Although this program was written to merge a hybrid assembly (e.g. as generated by DBG2OLC) and a PacBio or ONP only assembly, it can also be used to merge two different long molecule only assemblies (e.g. one generated with PBcR or canu and another generated with FALCON).
+
+Please consult the `QuickMerge Wiki <https://github.com/mahulchak/quickmerge/wiki>`_ for extended usage information.
+
+**Why use quickmerge?**
+
+- More bang for your buck by using complementary information from different assemblers.
+- It is fast. Takes less than a minute to run on most genomes. You run nucmer once (nucmer is the most time consuming step) and then you can run quickmerge over a large number of parameters in a very short time.
+- Requires only FASTA files and does not depend on any special data or computational resources.
+- Saves money. When long read coverage is limiting, there are some hybrid approaches that lead to good results (e.g. DBG2OLC). So quickmerge allows you to cut your long molecule requirement by as much as half by replacing the same with Illumina short reads. E.g. if you think you would get a N50 of 8Mb from 75X long reads (ONP or PacBio), try sequencing 45X long and 70X Illumina reads instead of 75X long reads. You may not need that extra 35X long reads.
+- Allows dramatic improvements with reanalysis of legacy data collected when long reads were prohibitively expensive. See an excellent example from `this paper by Thomas Mather <https://academic.oup.com/g3journal/article/10/3/899/6026189>`_ on the Soybean Aphid genome.
+
+
+**The process**
+
+- **Nucmer** aligns the two assemblies so that the merger can find the correct splice sites
+- **Delta-filter** filters out alignments with repeats and duplicates
+- **Quickmerge** merges the aligned assemblies
+
+**Inputs**
+
+- **Query assembly**: a hydrid or long-read assembly (see `QuickMerge Wiki <https://github.com/mahulchak/quickmerge/wiki>`_ for more details)
+
+- **Reference assembly**: "Self" assembly. Can also be a hybrid assembly
+
+- **Nucmer -l**: the minimum length of a single match
+
+- **Delta-filter -l**: the minimum length of a single alignment
+
+- **hco**: controls the overlap cutoff used in selection of anchor contigs (default 5.0)
+
+- **c**: controls the overlap cutoff for contigs used for extension of the anchor contig (default 1.5)
+
+- **l**: controls the length cutoff for anchor contigs. A good rule of thumb is to start with the N50 of the self assembly. E.g. if the N50 of your self assembly is 2Mb then use 2000000 as your cutoff. Lowering this value may lead to more merging but may increase the probability of mis-joins.
+
+- **ml**: controls the minimum alignment length to be considered for merging. This is especially helpful for repeat-rich genomes (default 5000; higher recommended).
+
+For both ``hco`` and ``c``, bigger the number, more stringent is the criteria for contig selection (which will lead to fewer contigs being merged). If they are too small (<1), chances of spurious merging will increase. It is better to be conservative while merging contigs!
+
+
+**Outputs**
+
+- **Merged assembly**: your merged assembly in FASTA format
+
+- **Alignment summary**: tabular summary of alignments used for merging
+
+- **Anchor summary**: tabular summary of anchor matches used for merging
+
+- **Parameter summary**: summary of merge parameters
+
+
+**Helpful tips**
+
+- For optimal merging results, identify the major misassemblies (especially translocations and inversions) in the component assemblies and break the contigs at such misassembly boundaries. Alignment of the component assemblies to the merged assembly may help to identify such assembly errors because a specific error typically occurs in only one of the assemblies.
+
+- The FASTA sequence headers *should not have white spaces* in them. In case they do, as might happen for assemblies obtained from FALCON assembler, the white space needs to be removed before running.
+
+- You can run Ka-kit's `finisherSC <https://github.com/kakitone/finishingTool>`_ after running quickmerge to improve the contiguity even further.
+
+- Assembly polishing with Quiver and pilon before and after assembly merging is strongly recommended. However, if you are running finisherSC, you may perform the quiver polishing after the finisher step.
+
+- Check the merged assembly by aligning the hybrid and/or non-hybrid assembly to the merged assembly. You can use ``nucmer`` for alignment (with default anchoring) and ``mummerplot`` for dot plot visualization.
+
+
+Quickmerge was wrapped by the Galaxy Australia team.
+
+  ]]></help>
+  <citations>
+    <citation type="doi">https://doi.org/10.1093/nar/gkw654</citation>
+  </citations>
+</tool>
author	iuc
date	Fri, 08 Jul 2022 10:35:37 +0000
parents
children	9acf91225108