Mercurial > repos > iuc > unicycler
view unicycler.xml @ 11:8f9f06995f98 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/unicycler commit c1cd9b717bce328b276f24d910e2e6858585b2cc
author | iuc |
---|---|
date | Tue, 19 Dec 2023 15:58:30 +0000 |
parents | d10bdad2fd17 |
children | 3e44ef016b45 |
line wrap: on
line source
<tool id="unicycler" name="Create assemblies with Unicycler" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.09">
    <description>pipeline for bacterial genomes</description>
    <macros>
        <token name="@TOOL_VERSION@">0.5.0</token>
        <!-- NOTE(review): wrapper-level changes normally require bumping this suffix on release - confirm before publishing -->
        <token name="@VERSION_SUFFIX@">1</token>
    </macros>
    <edam_topics>
        <edam_topic>topic_0196</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_0525</edam_operation>
    </edam_operations>
    <xrefs>
        <xref type="bio.tools">unicycler</xref>
    </xrefs>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">unicycler</requirement>
        <requirement type="package" version="1.15.1">samtools</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
## Link checkpoint graphs from earlier runs into the working directory, which is
## also the Unicycler output directory (-o ./), under the selected checkpoint name.
#for $r in $reuse
    ln -s '${r.reuse_file}' '${r.reuse_step}.gfa' &&
#end for

## Preparing files
## Inputs are linked under names whose extension reflects the Galaxy datatype
## (plain vs. gzip-compressed FASTQ, or FASTA for long reads).
#set $uncompressed = ('fastqsanger','fastq')
#set $compressed = ('fastqsanger.gz','fastq.gz')
#set $selector = str( $paired_unpaired.fastq_input_selector )
#if $selector == "paired"
    #if $paired_unpaired.fastq_input1.file_ext in $uncompressed
        #set $fq1 = "fq1.fastq"
    #elif $paired_unpaired.fastq_input1.file_ext in $compressed
        #set $fq1 = "fq1.fastq.gz"
    #end if
    #if $paired_unpaired.fastq_input2.file_ext in $uncompressed
        #set $fq2 = "fq2.fastq"
    #elif $paired_unpaired.fastq_input2.file_ext in $compressed
        #set $fq2 = "fq2.fastq.gz"
    #end if
    ln -s '${paired_unpaired.fastq_input1}' '$fq1' &&
    ln -s '${paired_unpaired.fastq_input2}' '$fq2' &&
#elif $selector == "paired_collection"
    #if $paired_unpaired.fastq_input1.forward.file_ext in $uncompressed
        #set $fq1 = "fq1.fastq"
    #elif $paired_unpaired.fastq_input1.forward.file_ext in $compressed
        #set $fq1 = "fq1.fastq.gz"
    #end if
    #if $paired_unpaired.fastq_input1.reverse.file_ext in $uncompressed
        #set $fq2 = "fq2.fastq"
    #elif $paired_unpaired.fastq_input1.reverse.file_ext in $compressed
        #set $fq2 = "fq2.fastq.gz"
    #end if
    ln -s '${paired_unpaired.fastq_input1.forward}' '$fq1' &&
    ln -s '${paired_unpaired.fastq_input1.reverse}' '$fq2' &&
#elif $selector == "single"
    #if $paired_unpaired.fastq_input1.file_ext in $uncompressed
        #set $fq = "fq.fastq"
    #elif $paired_unpaired.fastq_input1.file_ext in $compressed
        #set $fq = "fq.fastq.gz"
    #end if
    ln -s '${paired_unpaired.fastq_input1}' '$fq' &&
#end if
#if $long
    #if $long.file_ext in $uncompressed
        #set $lr = "lr.fastq"
    #elif $long.file_ext in $compressed
        #set $lr = "lr.fastq.gz"
    #elif $long.is_of_type('fasta')
        #set $lr = "lr.fasta"
    #end if
    ln -s '${long}' '$lr' &&
#end if

## Running Unicycler
unicycler -t "\${GALAXY_SLOTS:-4}"
-o ./
--verbosity 3
## Both paired variants end up as the same fq1/fq2 links prepared above.
#if $selector in ("paired", "paired_collection")
    -1 '$fq1'
    -2 '$fq2'
#elif $selector == "single"
    -s '$fq'
#end if
#if $long
    -l '$lr'
#end if

## General Unicycler Options section
## ----------------------------------------------------------
--mode '$mode'
--min_fasta_length '$min_fasta_length'
--linear_seqs '$linear_seqs'
#if str($min_anchor_seg_len) != ''
    --min_anchor_seg_len '$min_anchor_seg_len'
#end if

## Spades Options section
## ----------------------------------------------------------
--min_kmer_frac '$spades.min_kmer_frac'
--max_kmer_frac '$spades.max_kmer_frac'
#if str($spades.kmers) != ''
    --kmers '$spades.kmers'
#end if
--kmer_count '$spades.kmer_count'
--depth_filter '$spades.depth_filter'
#if $spades.largest_component
    --largest_component
#end if

## Rotation Options section
## ----------------------------------------------------------
$rotation.no_rotate
#if $rotation.start_genes
    --start_genes '$rotation.start_genes'
#end if
--start_gene_id '$rotation.start_gene_id'
--start_gene_cov '$rotation.start_gene_cov'

## Graph cleaning Options section
## ----------------------------------------------------------
--min_component_size '$graph_clean.min_component_size'
--min_dead_end_size '$graph_clean.min_dead_end_size'

## Long Read Alignment Options
## ----------------------------------------------------------
#if $lr_align.contamination
    --contamination '$lr_align.contamination'
#end if
--scores '${lr_align.scores}'
#if str($lr_align.low_score) != ''
    --low_score '$lr_align.low_score'
#end if
$lr_align.no_simple_bridges

--keep '$keep'

## Collect the intermediate graphs (file names starting with 00) into the
## directory that the spades_collection output discovers its datasets from.
#if str($keep) != '0'
    && mkdir 'spades_graphs'
    && mv 00*gfa './spades_graphs/'
#end if
## Convert the retained SAM of long read alignments to a sorted BAM for the bam_file output.
#if str($keep) == '2' and $long
    && samtools view -@ "\${GALAXY_SLOTS:-4}" -u 'read_alignment/long_read_alignments.sam'
    | samtools sort -@ "\${GALAXY_SLOTS:-4}" -o 'read_alignment/long_read_alignments.bam'
#end if
    ]]></command>
    <inputs>
        <conditional name="paired_unpaired">
            <param name="fastq_input_selector" type="select" label="Paired or Single end data?" help="Select between paired and single end data">
                <option selected="True" value="paired">Paired</option>
                <option value="paired_collection">Paired Collection</option>
                <option value="single">Single</option>
                <option value="none">None</option>
            </param>
            <when value="paired">
                <param name="fastq_input1" argument="-1" type="data" format="fastqsanger,fastqsanger.gz,fastq,fastq.gz" label="Select first set of reads" help="Specify dataset with forward reads"/>
                <param name="fastq_input2" argument="-2" type="data" format="fastqsanger,fastqsanger.gz,fastq,fastq.gz" label="Select second set of reads" help="Specify dataset with reverse reads"/>
            </when>
            <when value="paired_collection">
                <param name="fastq_input1" format="fastqsanger,fastqsanger.gz,fastq,fastq.gz" type="data_collection" collection_type="paired" label="Select a paired collection" />
            </when>
            <when value="single">
                <param name="fastq_input1" argument="-s" type="data" format="fastqsanger,fastqsanger.gz,fastq,fastq.gz" label="Select unpaired reads" help="Specify dataset with unpaired reads"/>
            </when>
            <when value="none">
            </when>
        </conditional>
        <param argument="--long" optional="true" type="data" format="fastqsanger,fastqsanger.gz,fasta,fastq,fastq.gz" label="Select long reads. If there are no long reads, leave this empty"/>
        <param argument="--mode" type="select" label="Select Bridging mode">
            <option value="conservative">Conservative (smaller contigs, lower misassembly)</option>
            <option value="normal" selected="True">Normal (moderate contig size and misassembly rate)</option>
            <option value="bold">Bold (longest contigs, higher misassembly rate)</option>
        </param>
        <param argument="--min_fasta_length" type="integer" value="100" label="Exclude contigs from the FASTA file which are shorter than this length (bp)"/>
        <param argument="--linear_seqs" type="integer" value="0" label="The expected number of linear (i.e. non-circular) sequences in the assembly"/>
        <param argument="--min_anchor_seg_len" type="integer" min="0" optional="true" label="Unicycler will not use segments shorter than this as scaffolding anchors"/>
        <section name="spades" expanded="False" title="SPAdes options" help="Unicycler uses SPAdes to construct assembly graphs. You can modify some of the SPAdes settings here. Use this ONLY if you know what you are doing!">
            <param argument="--min_kmer_frac" type="float" min="0" max="1" value="0.2" label="Lowest k-mer size for SPAdes assembly, expressed as a fraction of the read length"/>
            <param argument="--max_kmer_frac" type="float" min="0" max="1" value="0.95" label="Highest k-mer size for SPAdes assembly, expressed as a fraction of the read length"/>
            <param argument="--kmers" type="text" value="" optional="true" label="Exact k-mers to use for SPAdes assembly, comma-separated">
                <!-- The regex only enforces comma-separated odd integers; the 11 to 127 range named in the message is not checked here. -->
                <validator type="regex" message="Kmers must be comma-separated odd integers (no repetition) without space in the range of 11 to 127 (inclusive)">^(\d*[13579],)*(\d*[13579])$</validator>
            </param>
            <param argument="--kmer_count" type="integer" min="0" value="10" label="Number of k-mer steps to use in SPAdes assembly"/>
            <param argument="--depth_filter" type="float" min="0" max="1" value="0.25" label="Filter out contigs lower than this fraction of the chromosomal depth" help="It is done if does not result in graph dead ends"/>
            <param argument="--largest_component" type="boolean" checked="false" label="Only keep the largest connected component of the assembly graph"/>
        </section>
        <section name="rotation" expanded="false" title="Rotation options" help="These options control the rotation of completed circular sequence near the end of the Unicycler pipeline. Use this ONLY if you know what you are doing!">
            <param argument="--no_rotate" type="boolean" checked="false" truevalue="--no_rotate" falsevalue="" label="Do not rotate completed replicons to start at a standard gene." help="Unicycler uses TBLASTN to search for dnaA or repA alleles in each completed replicon. If one is found, the sequence is rotated and/or flipped so that it begins with that gene encoded on the forward strand. This provides consistently oriented assemblies and reduces the risk that a gene will be split across the start and end of the sequence."/>
            <param argument="--start_genes" optional="true" type="data" format="fasta" label="FASTA file of genes for start point of rotated replicons" />
            <param argument="--start_gene_id" type="float" min="0" max="100" value="90" label="The minimum required BLAST percent identity for a start gene search"/>
            <param argument="--start_gene_cov" type="float" min="0" max="100" value="95" label="The minimum required BLAST percent coverage for a start gene search"/>
        </section>
        <section name="graph_clean" expanded="false" title="Graph cleaning options" help="These options control the removal of small leftover sequences after bridging is complete.">
            <param argument="--min_component_size" type="integer" min="0" value="1000" label="Unbridged graph components smaller than this size will be removed from the final graph" />
            <param argument="--min_dead_end_size" type="integer" min="0" value="1000" label="Graph dead ends smaller than this size will be removed from the final graph"/>
        </section>
        <section name="lr_align" expanded="false" title="Long read alignment parameters" help="These options control the alignment of long reads to the assembly graph.">
            <param argument="--contamination" optional="true" type="data" format="fasta" label="FASTA file of known contamination in long reads, e.g. lambda, phiXm or puc18 spike-ins." />
            <param argument="--scores" type="text" value="3,-6,-5,-2" label="Comma-delimited string of alignment scores: match, mismatch, gap open, gap extend"/>
            <param argument="--low_score" optional="true" type="integer" value="" label="Score threshold - alignments below this are considered poor" help="default = set automatically"/>
            <!-- Enabling this passes the flag, i.e. it turns simple bridging OFF; the label is worded accordingly. -->
            <param argument="--no_simple_bridges" type="boolean" truevalue="--no_simple_bridges" falsevalue="" checked="false" label="Skip simple long-read bridging" help="Default: No (simple long-read bridging is used)" />
        </section>
        <param argument="--keep" type="select" label="Outputs to keep" help="Level of file retention. Default: 1">
            <option value="0">0: only keep final files</option>
            <option value="1" selected="true">1: save graphs at main checkpoints</option>
            <option value="2">2: also keep SAM</option>
        </param>
        <!-- The checkpoint names below must match the graph file names asserted in the keep=2 test. -->
        <repeat name="reuse" title="Reuse checkpoint files from earlier runs" max="1" help="">
            <param name="reuse_file" type="data" optional="false" format="gfa1" label="Checkpoint file"/>
            <param name="reuse_step" type="select" label="Checkpoint">
                <option value="002_depth_filter">002_depth_filter</option>
                <option value="003_overlaps_removed">003_overlaps_removed</option>
                <option value="004_bridges_applied">004_bridges_applied</option>
            </param>
        </repeat>
    </inputs>
    <outputs>
        <data name="assembly_graph" format="gfa1" from_work_dir="assembly.gfa" label="${tool.name} on ${on_string}: Final Assembly Graph" />
        <data name="assembly" format="fasta" from_work_dir="assembly.fasta" label="${tool.name} on ${on_string}: Final Assembly"/>
        <!-- Intermediate graphs moved into spades_graphs/ by the command when keep is 1 or 2. -->
        <collection name="spades_collection" type="list" label="${tool.name} on ${on_string}: SPAdes graphs">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.gfa" format="gfa1" directory="spades_graphs"/>
            <filter>keep != "0"</filter>
        </collection>
        <!-- Sorted BAM produced by the samtools step; only exists for keep=2 with long reads. -->
        <data name="bam_file" format="bam" from_work_dir="read_alignment/long_read_alignments.bam" label="${tool.name} on ${on_string}: Long read alignments BAM">
            <filter>keep == "2" and long</filter>
        </data>
    </outputs>
    <tests>
        <test expect_num_outputs="2">
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired" />
                <param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
                <param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
            </conditional>
            <param name="mode" value="normal" />
            <param name="min_fasta_length" value="100"/>
            <param name="linear_seqs" value="0"/>
            <section name="spades">
                <param name="min_kmer_frac" value="0.2"/>
                <param name="max_kmer_frac" value="0.95"/>
                <param name="kmer_count" value="10"/>
                <param name="depth_filter" value="0.25"/>
            </section>
            <section name="rotation">
                <param name="no_rotate" value=""/>
                <param name="start_gene_id" value="90"/>
                <param name="start_gene_cov" value="95"/>
            </section>
            <section name="graph_clean">
                <param name="min_component_size" value="1000"/>
                <param name="min_dead_end_size" value="1000"/>
            </section>
            <section name="lr_align">
                <param name="scores" value="3,-6,-5,-2"/>
            </section>
            <param name="keep" value="0"/>
            <output name="assembly_graph" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="length=5386" />
                </assert_contents>
            </output>
        </test>
        <!-- Following test corresponds to the command: unicycler -t "${GALAXY_SLOTS:-8}" -o ./ - -verbose 3 - -pilon_path `pilon - -jar_dir` \ -1 test-data/phix_f.fq.gz -2 test-data/phix_r.fq.gz -l test-data/onp.fa \ - -mode 'normal' - -no_correct This command causes a segfault with the current version of unicycler on bioconda for Linux during the minimap step (which seems to be compiled C code). A gist of the log can be found at: https://gist.github.com/jmchilton/b411b695170c1daea6589f5d76e326cb. -->
        <test expect_num_outputs="2">
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired" />
                <param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger.gz" />
                <param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger.gz" />
            </conditional>
            <param name="long" value="onp.fa" ftype="fasta" />
            <param name="mode" value="normal" />
            <param name="min_fasta_length" value="100"/>
            <param name="linear_seqs" value="0"/>
            <section name="spades">
                <param name="min_kmer_frac" value="0.2"/>
                <param name="max_kmer_frac" value="0.95"/>
                <param name="kmer_count" value="10"/>
                <param name="depth_filter" value="0.25"/>
            </section>
            <section name="rotation">
                <param name="no_rotate" value=""/>
                <param name="start_gene_id" value="90"/>
                <param name="start_gene_cov" value="95"/>
            </section>
            <section name="graph_clean">
                <param name="min_component_size" value="1000"/>
                <param name="min_dead_end_size" value="1000"/>
            </section>
            <section name="lr_align">
                <param name="scores" value="3,-6,-5,-2"/>
            </section>
            <param name="keep" value="0"/>
            <output name="assembly_graph" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="length=5386" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="2">
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired_collection"/>
                <param name="fastq_input1">
                    <collection type="paired">
                        <element name="forward" value="phix_f.fq.gz" ftype="fastqsanger" />
                        <element name="reverse" value="phix_r.fq.gz" ftype="fastqsanger" />
                    </collection>
                </param>
            </conditional>
            <param name="mode" value="normal" />
            <param name="min_fasta_length" value="100"/>
            <param name="linear_seqs" value="0"/>
            <section name="spades">
                <param name="min_kmer_frac" value="0.2"/>
                <param name="max_kmer_frac" value="0.95"/>
                <param name="kmer_count" value="10"/>
                <param name="depth_filter" value="0.25"/>
            </section>
            <section name="rotation">
                <param name="no_rotate" value=""/>
                <param name="start_gene_id" value="90"/>
                <param name="start_gene_cov" value="95"/>
            </section>
            <section name="graph_clean">
                <param name="min_component_size" value="1000"/>
                <param name="min_dead_end_size" value="1000"/>
            </section>
            <section name="lr_align">
                <param name="scores" value="3,-6,-5,-2"/>
            </section>
            <param name="keep" value="0"/>
            <output name="assembly_graph" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="length=5386" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="2">
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="none"/>
            </conditional>
            <param name="min_anchor_seg_len" value="10"/>
            <section name="spades">
                <param name="kmers" value="21,23"/>
            </section>
            <param name="long" value="only_long.fasta" ftype="fasta" />
            <param name="keep" value="0"/>
            <output name="assembly_graph" ftype="gfa1">
                <assert_contents>
                    <has_text text="S" />
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="&gt;1" />
                </assert_contents>
            </output>
        </test>
        <!-- test checkpoint graph reuse TODO more precise test and check difference to call wo reuse -->
        <test expect_num_outputs="2">
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired_collection"/>
                <param name="fastq_input1">
                    <collection type="paired">
                        <element name="forward" value="phix_f.fq.gz" ftype="fastqsanger" />
                        <element name="reverse" value="phix_r.fq.gz" ftype="fastqsanger" />
                    </collection>
                </param>
            </conditional>
            <param name="long" value="only_long.fasta" ftype="fasta" />
            <repeat name="reuse">
                <param name="reuse_file" value="phix__spades_graph.gfa1"/>
                <param name="reuse_step" value="002_depth_filter"/>
            </repeat>
            <param name="keep" value="0"/>
            <output name="assembly_graph" ftype="gfa1">
                <assert_contents>
                    <has_text text="S" />
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="&gt;1" />
                </assert_contents>
            </output>
        </test>
        <!-- Test keep value = 1 -->
        <test expect_num_outputs="3">
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired" />
                <param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
                <param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
            </conditional>
            <param name="mode" value="normal" />
            <param name="keep" value="1"/>
            <output name="assembly_graph" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="length=5386" />
                </assert_contents>
            </output>
            <output_collection name="spades_collection" type="list" count="14">
                <element name="001_spades_graph_k027">
                    <assert_contents>
                        <has_text text="TTGAATGCCACCGGAGGCGGCTTTTTGACCGCCTCCAAAC"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Test keep value = 2 -->
        <test expect_num_outputs="4">
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired" />
                <param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
                <param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
            </conditional>
            <param name="long" value="onp.fa" ftype="fasta" />
            <param name="mode" value="normal" />
            <param name="keep" value="2"/>
            <output name="assembly_graph" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="length=5386" />
                </assert_contents>
            </output>
            <output_collection name="spades_collection" type="list" count="14">
                <element name="001_spades_graph_k027">
                    <assert_contents>
                        <has_text text="TTGAATGCCACCGGAGGCGGCTTTTTGACCGCCTCCAAAC"/>
                    </assert_contents>
                </element>
                <!-- there are gfa files for more k that are not tested explicitly
                     Aim of testing these is to be sure about the names of the graphs, since they are used for reuse.
                     Hence if there is a change here update reuse accordingly-->
                <element name="001_spades_graph_k127">
                    <assert_contents>
                        <has_line_matching expression="^S.*"/>
                    </assert_contents>
                </element>
                <element name="002_depth_filter">
                    <assert_contents>
                        <has_line_matching expression="^S.*"/>
                    </assert_contents>
                </element>
                <element name="003_overlaps_removed">
                    <assert_contents>
                        <has_line_matching expression="^S.*"/>
                    </assert_contents>
                </element>
                <element name="004_bridges_applied">
                    <assert_contents>
                        <has_line_matching expression="^S.*"/>
                    </assert_contents>
                </element>
                <element name="005_final_clean">
                    <assert_contents>
                        <has_line_matching expression="^S.*"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output name="bam_file" ftype="bam">
                <assert_contents>
                    <has_size value="2084" delta="100"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test no simple bridges option -->
        <test expect_num_outputs="2">
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired" />
                <param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
                <param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
            </conditional>
            <param name="long" value="onp.fa" ftype="fasta" />
            <param name="mode" value="normal" />
            <param name="keep" value="0"/>
            <section name="lr_align">
                <param name="no_simple_bridges" value="true"/>
            </section>
            <output name="assembly_graph" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="length=5386" />
                </assert_contents>
            </output>
            <assert_command>
                <has_text text="--no_simple_bridges" />
            </assert_command>
        </test>
    </tests>
    <!-- NOTE(review): the help below still documents skipping SPAdes error correction and the
         no_pilon switch, neither of which is exposed as a parameter by this wrapper; confirm
         whether those sections should be removed or reworded. -->
    <help><![CDATA[
**Unicycler**

Unicycler is a hybrid assembly pipeline for bacterial genomes. It uses both Illumina reads and long reads (PacBio or Nanopore) to produce complete and accurate assemblies. It is written by `Ryan Wick`_ at the University of Melbourne's Centre for Systems Genomics. Much of the description below is lifted from Unicycler's `github page`_.

.. _`Ryan Wick`: https://github.com/rrwick
.. _`github page`: https://github.com/rrwick/Unicycler

-----

**Input data**

Unicycler accepts short (Illumina) reads in FASTQ format. Galaxy places an additional requirement of having these in FASTQ format with `Sanger encoding`_ of quality scores. Long reads (from Oxford Nanopore or PacBio) can be either in FASTQ or FASTA format.

.. _`Sanger encoding`: https://en.wikipedia.org/wiki/FASTQ_format#Quality

The input options are::

    -1 SHORT1, --short1 SHORT1
        FASTQ file of short reads (first reads in each pair)
    -2 SHORT2, --short2 SHORT2
        FASTQ file of short reads (second reads in each pair)
    -s SHORT_UNPAIRED, --short_unpaired SHORT_UNPAIRED
        FASTQ file of unpaired short reads
    -l LONG, --long LONG
        FASTQ or FASTA file of long reads, if all reads are available at start.

-----

**Bridging mode**

Unicycler can be run in three modes: conservative, normal (the default) and bold, set with the --mode option. Conservative mode is least likely to produce a complete assembly but has a very low risk of misassembly. Bold mode is most likely to produce a complete assembly but carries greater risk of misassembly. Normal mode is intermediate regarding both completeness and misassembly risk. See `description of modes`_ for more information.

.. _`description of modes`: https://github.com/rrwick/Unicycler#conservative-normal-and-bold

The available modes are::

    --mode {conservative,normal,bold}
        Bridging mode (default: normal)
        conservative = smaller contigs, lowest misassembly rate
        normal = moderate contig size and misassembly rate
        bold = longest contigs, higher misassembly rate

----

**Skip SPAdes error correction step**

Sequencing data contains a substantial number of sequencing errors that manifest themselves as deviations (bulges and non-connected components) within the assembly graph. One of the ways to improve the graph even before constructing it is to minimize the amount of sequencing errors by performing error correction. SPAdes, which is used by Unicycler for error correction and assembly, uses `BayesHammer`_ to correct the reads. Here is a brief summary of what it does:

1. SPAdes (or rather BayesHammer) counts *k*-mers in reads and computes *k*-mer statistics that take into account base quality values.
2. A `Hamming graph`_ is constructed in which *k*-mers are nodes. In this graph edges connect nodes (*k*-mers) if they differ from each other by a number of nucleotides up to a certain threshold (the `Hamming distance`_). The graph is central to the error correction algorithm.
3. At this step Bayesian subclustering of the graph produced in the previous step is performed. For each *k*-mer we now know the center of its subcluster.
4. Solid *k*-mers are derived from cluster centers and are assumed to be *error free*.
5. Solid *k*-mers are mapped back to the reads and used to correct them.

This step takes considerable time, so if one needs to quickly evaluate assemblies this step can be skipped. However, this is not recommended if one is trying to produce a final high quality assembly.

.. _`BayesHammer`: https://goo.gl/1iGkMe
.. _`Hamming graph`: https://en.wikipedia.org/wiki/Hamming_graph
.. _`Hamming distance`: https://en.wikipedia.org/wiki/Hamming_distance

-----

**Do not rotate completed replicons to start at a standard gene**

Unicycler uses TBLASTN to search for dnaA or repA alleles in each completed replicon. If one is found, the sequence is rotated and/or flipped so that it begins with that gene encoded on the forward strand. This provides consistently oriented assemblies and reduces the risk that a gene will be split across the start and end of the sequence.

The following option turns rotation on and off::

    --no_rotate
        Do not rotate completed replicons to start at a standard gene
        (default: completed replicons are rotated)

**Do not use Pilon to polish the final assembly**

`Pilon`_ is a tool for improving overall quality of draft assemblies and finding variation among strains. Unicycler uses it for assembly *polishing*.

The following option turns the Pilon part of the Unicycler pipeline on and off::

    --no_pilon
        Do not use Pilon to polish the final assembly
        (default: Pilon is used)

.. _`Pilon`: https://github.com/broadinstitute/pilon/wiki

------

**Expected number of linear sequences**

If you expect your sample to contain linear (non circular) sequences, set this option::

    --linear_seqs EXPECTED_LINEAR_SEQS
        The expected number of linear (i.e. non-circular) sequences in the underlying sequence

----

**SPAdes options**

This section provides control of SPAdes options::

    --min_kmer_frac MIN_KMER_FRAC
        Lowest k-mer size for SPAdes assembly, expressed as a fraction of the read length (default: 0.2)
    --max_kmer_frac MAX_KMER_FRAC
        Highest k-mer size for SPAdes assembly, expressed as a fraction of the read length (default: 0.95)
    --kmer_count KMER_COUNT
        Number of k-mer steps to use in SPAdes assembly (default: 10)
    --depth_filter DEPTH_FILTER
        Filter out contigs lower than this fraction of the chromosomal depth, if doing so does not result in graph dead ends (default: 0.25)

----

**Rotation options**

Unicycler attempts to rotate circular assemblies to make sure that they begin at a consistent starting gene. The following parameters control assembly rotation::

    --start_genes START_GENES
        FASTA file of genes for start point of rotated replicons (default: start_genes.fasta)
    --start_gene_id START_GENE_ID
        The minimum required BLAST percent identity for a start gene search (default: 90.0)
    --start_gene_cov START_GENE_COV
        The minimum required BLAST percent coverage for a start gene search (default: 95.0)

-----

**Graph cleaning options**

These options control the removal of small leftover sequences after bridging is complete::

    --min_component_size MIN_COMPONENT_SIZE
        Unbridged graph components smaller than this size (bp) will be removed from the final graph (default: 1000)
    --min_dead_end_size MIN_DEAD_END_SIZE
        Graph dead ends smaller than this size (bp) will be removed from the final graph (default: 1000)

-----

**Long read alignment options**

These options control the alignment of long reads to the assembly graph::

    --contamination CONTAMINATION
        FASTA file of known contamination in long reads
    --scores SCORES
        Comma-delimited string of alignment scores: match, mismatch, gap open, gap extend (default: 3,-6,-5,-2)
    --low_score LOW_SCORE
        Score threshold - alignments below this are considered poor (default: set threshold automatically)

-----

**Outputs**

Galaxy's wrapper for Unicycler produces two main outputs:

* final assembly in FASTA format
* final assembly graph in GFA format

Depending on the *Outputs to keep* setting, a collection of intermediate SPAdes graphs (levels 1 and 2) and a BAM file of long read alignments (level 2, only when long reads are supplied) are produced as well.

While most will likely be interested in the FASTA dataset, the graph dataset is also quite useful and can be visualized using tools such as `Bandage`_.

.. _`Bandage`: https://github.com/rrwick/Bandage
    ]]></help>
    <citations>
        <citation type="doi">10.1101/096412</citation>
    </citations>
</tool>