<!-- Mercurial > repos > iuc > necat -->

<tool id="necat" name="necat" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
    <description>Error correction and de-novo assembly for ONT Nanopore reads</description>
    <!-- Shared definitions are imported from macros.xml; the @TOOL_VERSION@ /
         @VERSION_SUFFIX@ tokens and the macros expanded further down in this file
         are presumably defined there (file not shown here). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Cross-reference to the tool's bio.tools registry entry. -->
    <xrefs>
        <xref type="bio.tools">necat</xref>
    </xrefs>
    <!-- Runtime dependency: the necat package at the version given by @TOOL_VERSION@. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">necat</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        ## Name the staged copy of the i-th input dataset: "reads_" + i + "." + extension.
        ## The Galaxy datatype extension "fastqsanger" is rewritten to "fastq";
        ## every other extension is used unchanged.
        #def make_filename($i, $input_param)
            #if $input_param.extension == "fastqsanger"
                #set suffix = "fastq"
            #else
                #set suffix = $input_param.extension
            #end if
            #return "reads_" + str($i) + "." + $suffix
        #end def

        ## Copy every selected dataset into the working directory (numbered from 1)
        ## and append the staged file name to read_list.txt, one name per line.
        #for $idx, $input in enumerate($input_fastqs, 1)
            #set staged = $make_filename($idx, $input)
            cp '$input' $staged
            && echo $staged >> read_list.txt &&
        #end for

        ## Read correction always runs; assembly and contig bridging are only
        ## chained on when the user enabled assembly.
        necat correct '${job_configfile}'
        #if $assembly.should_assemble == "yes"
            && necat assemble '${job_configfile}'
            && necat bridge '${job_configfile}'
        #end if
    ]]></command>
    <configfiles>
        <expand macro="job_conf" />
    </configfiles>
    <inputs>
        <!-- Read files (one or more). The command block copies each selected dataset
             into the working directory and lists it in read_list.txt. -->
        <param name="input_fastqs" type="data" format="fastq,fastq.gz,fasta,fasta.gz" multiple="true" label="Input reads" help="Input read files (FASTQ or FASTA). To select more than one file or collection from your history, use the 'ctrl' key" />

        <!-- genome_size has no default value, so the user has to supply one. -->
        <param name="genome_size" type="integer" value="" min="1" max="100000000000" label="Genome size" help="Estimated size of genome (bp)" />
        <param name="min_read_length" type="integer" value="1000" min="1" max="10000000" label="Min read length" help="Minimum length for input reads" />
        <param name="correction_coverage" type="integer" value="40" min="1" max="10000" label="Correction coverage" help="Number of reads to correct in terms of genome coverage. For a 4Gb genome, setting correction coverage = 10 will correct the longest 40Gb worth of reads from the input fastq. " />
        <!-- Assembly is optional and off by default. The command block only runs
             "necat assemble" and "necat bridge" when should_assemble is "yes";
             polish_contigs additionally selects which assembly output is created. -->
        <conditional name="assembly">
            <param name="should_assemble" type="select" label="Assembly">
                <option value="no" selected="true">Don't perform assembly</option>
                <option value="yes">Perform assembly on corrected reads</option>
            </param>
            <when value="no" />
            <when value="yes">
                <param name="assembly_coverage" type="integer" value="30" min="1" max="10000" label="Assembly coverage" help="Number of reads to use in genome assembly in terms of genome coverage" />
                <param name="polish_contigs" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Polish contigs" help="Polish contigs as final step after bridging" />
            </when>
        </conditional>

        <!-- Advanced option groups; each group is expanded from a macro
             (presumably defined in macros.xml, which is not shown here). -->
        <section name="adv" title="Advanced options" expanded="false" help="Warning: only change these if you really know what you are doing">
            <expand macro="overlap_sensitive_options" />
            <expand macro="consensus_sensitive_options" />
            <expand macro="overlap_fast_options" />
            <expand macro="consensus_fast_options" />
            <expand macro="trimming_overlap_options" />
            <expand macro="assembly_overlap_options" />
            <expand macro="assembly_overlap_filtering" />
            <expand macro="contig_assembly" />
            <expand macro="contig_bridging" />
        </section>
    </inputs>
    <outputs>
        <!-- Corrected reads: this output has no filter, so it is created for every job. -->
        <data name="out_reads" format="fasta.gz" from_work_dir="project/1-consensus/cns_final.fasta.gz" label="${tool.name} on ${on_string}: corrected reads" />
        <!-- When assembly is enabled exactly one of the two datasets below is created:
             the bridged contigs if polishing is off, the polished contigs if it is on.
             Each dataset must collect the file that matches its name, label and filter. -->
        <data name="out_assembly" format="fasta" from_work_dir="project/6-bridge_contigs/bridged_contigs.fasta" label="${tool.name} on ${on_string}: bridged assembly">
            <filter>assembly['should_assemble'] == 'yes' and not assembly['polish_contigs']</filter>
        </data>
        <data name="out_polished_assembly" format="fasta" from_work_dir="project/6-bridge_contigs/polished_contigs.fasta" label="${tool.name} on ${on_string}: polished assembly">
            <filter>assembly['should_assemble'] == 'yes' and assembly['polish_contigs']</filter>
        </data>
    </outputs>
    <tests>
        <!-- Single input file (test1.fa, a FASTA file): assembly with polishing enabled,
             so the two expected outputs are the corrected reads and the polished assembly. -->
        <test expect_num_outputs="2">
            <param name="input_fastqs" value="test1.fa" />
            <param name="genome_size" value="13000" />
            <param name="min_read_length" value="1000" />
            <param name="correction_coverage" value="40" />
            <conditional name="assembly">
                <param name="should_assemble" value="yes" />
                <param name="assembly_coverage" value="30"/>
                <param name="polish_contigs" value="true"/>
            </conditional>
            <!-- Outputs are checked by approximate size (value +/- delta) rather than exact content. -->
            <output name="out_reads" ftype="fasta.gz">
                <assert_contents>
                    <has_size value="75000" delta="2000" />
                </assert_contents>
            </output>
            <output name="out_polished_assembly" ftype="fasta">
                <assert_contents>
                    <!-- The character reference at the start of the line value is ">", i.e. a FASTA header line. -->
                    <has_line line="&#62;bctg00000000 000000F" />
                    <has_size value="13000" delta="1000" />
                </assert_contents>
            </output>
        </test>
        <!-- Multiple input files of different formats (one FASTQ, one FASTA) passed
             together to the multi-select input; assembly with polishing enabled. -->
        <test expect_num_outputs="2">
            <param name="input_fastqs" value="test1_head.fastq,test1_tail.fasta" />
            <param name="genome_size" value="13000" />
            <param name="min_read_length" value="1000" />
            <param name="correction_coverage" value="40" />
            <conditional name="assembly">
                <param name="should_assemble" value="yes" />
                <param name="assembly_coverage" value="30"/>
                <param name="polish_contigs" value="true"/>
            </conditional>
            <!-- Outputs are checked by approximate size (value +/- delta) rather than exact content. -->
            <output name="out_reads" ftype="fasta.gz">
                <assert_contents>
                    <has_size value="29000" delta="2000" />
                </assert_contents>
            </output>
            <output name="out_polished_assembly" ftype="fasta">
                <assert_contents>
                    <has_line line="&#62;bctg00000000 000000F" />
                    <has_size value="13000" delta="1000" />
                </assert_contents>
            </output>
        </test>
        <!-- Advanced params 1: same inputs as the single-file test, but overrides the
             values in the adv/ovs section (appears to be the overlap-sensitive options;
             the section itself comes from a macro not shown here). -->
        <test expect_num_outputs="2">
            <param name="input_fastqs" value="test1.fa" />
            <param name="genome_size" value="13000" />
            <param name="min_read_length" value="1000" />
            <param name="correction_coverage" value="40" />
            <conditional name="assembly">
                <param name="should_assemble" value="yes" />
                <param name="assembly_coverage" value="30"/>
                <param name="polish_contigs" value="true"/>
            </conditional>
            <section name="adv">
                <section name="ovs">
                    <param name="n" value="600" />
                    <param name="k" value="14" />
                    <param name="q" value="600" />
                    <param name="z" value="15" />
                    <param name="b" value="2500" />
                    <param name="a" value="800" />
                    <param name="d" value="0.25" />
                    <param name="e" value="0.4" />
                    <param name="m" value="600" />
                </section>
            </section>
            <!-- Outputs are checked by approximate size (value +/- delta) rather than exact content. -->
            <output name="out_reads" ftype="fasta.gz">
                <assert_contents>
                    <has_size value="75000" delta="2000" />
                </assert_contents>
            </output>
            <output name="out_polished_assembly" ftype="fasta">
                <assert_contents>
                    <has_line line="&#62;bctg00000000 000000F" />
                    <has_size value="13000" delta="1000" />
                </assert_contents>
            </output>
        </test>
        <!-- Advanced params 2: same inputs as the single-file test, but overrides the
             values in the adv/fol section (parameter names match the fsa_ol_filter
             options listed in the help; the section comes from a macro not shown here). -->
        <test expect_num_outputs="2">
            <param name="input_fastqs" value="test1.fa" />
            <param name="genome_size" value="13000" />
            <param name="min_read_length" value="1000" />
            <param name="correction_coverage" value="40" />
            <conditional name="assembly">
                <param name="should_assemble" value="yes" />
                <param name="assembly_coverage" value="30"/>
                <param name="polish_contigs" value="true"/>
            </conditional>
            <section name="adv">
                <section name="fol">
                    <param name="min_length" value="2000" />
                    <param name="max_length" value="200000" />
                    <param name="min_aligned_length" value="2000" />
                    <param name="max_overhang" value="20000" />
                    <param name="min_coverage" value="5" />
                    <param name="bestn" value="5" />
                    <param name="overhang_local_deviation1" value="5" />
                </section>
            </section>
            <!-- Outputs are checked by approximate size (value +/- delta) rather than exact content. -->
            <output name="out_reads" ftype="fasta.gz">
                <assert_contents>
                    <has_size value="75000" delta="2000" />
                </assert_contents>
            </output>
            <output name="out_polished_assembly" ftype="fasta">
                <assert_contents>
                    <has_line line="&#62;bctg00000000 000000F" />
                    <has_size value="13000" delta="1000" />
                </assert_contents>
            </output>
        </test>
        <!-- Advanced params 3: same inputs as the single-file test, but overrides the
             values in the adv/fa section (parameter names match the fsa_assemble
             options listed in the help; the section comes from a macro not shown here). -->
        <test expect_num_outputs="2">
            <param name="input_fastqs" value="test1.fa" />
            <param name="genome_size" value="13000" />
            <param name="min_read_length" value="1000" />
            <param name="correction_coverage" value="40" />
            <conditional name="assembly">
                <param name="should_assemble" value="yes" />
                <param name="assembly_coverage" value="30"/>
                <param name="polish_contigs" value="true"/>
            </conditional>
            <section name="adv">
                <section name="fa">
                    <param name="min_length" value="1000" />
                    <param name="min_identity" value="40" />
                    <param name="min_contig_length" value="600" />
                    <param name="select_branch" value="true" />
                </section>
            </section>
            <!-- Outputs are checked by approximate size (value +/- delta) rather than exact content. -->
            <output name="out_reads" ftype="fasta.gz">
                <assert_contents>
                    <has_size value="75000" delta="2000" />
                </assert_contents>
            </output>
            <output name="out_polished_assembly" ftype="fasta">
                <assert_contents>
                    <has_line line="&#62;bctg00000000 000000F" />
                    <has_size value="13000" delta="1000" />
                </assert_contents>
            </output>
        </test>
        <!-- Advanced params 4: same inputs as the single-file test, but overrides the
             values in the adv/fcb section (parameter names match the fsa_ctg_bridge
             options listed in the help; the section comes from a macro not shown here). -->
        <test expect_num_outputs="2">
            <param name="input_fastqs" value="test1.fa" />
            <param name="genome_size" value="13000" />
            <param name="min_read_length" value="1000" />
            <param name="correction_coverage" value="40" />
            <conditional name="assembly">
                <param name="should_assemble" value="yes" />
                <param name="assembly_coverage" value="30"/>
                <param name="polish_contigs" value="true"/>
            </conditional>
            <section name="adv">
                <section name="fcb">
                    <param name="read_min_length" value="4000" />
                    <param name="ctg_min_length" value="1000" />
                    <param name="ctg2ctg_min_identity" value="90" />
                    <param name="read2ctg_min_identity" value="60" />
                    <param name="min_contig_length" value="1000" />
                </section>
            </section>
            <!-- Outputs are checked by approximate size (value +/- delta) rather than exact content. -->
            <output name="out_reads" ftype="fasta.gz">
                <assert_contents>
                    <has_size value="75000" delta="2000" />
                </assert_contents>
            </output>
            <output name="out_polished_assembly" ftype="fasta">
                <assert_contents>
                    <has_line line="&#62;bctg00000000 000000F" />
                    <has_size value="13000" delta="1000" />
                </assert_contents>
            </output>
        </test>
    </tests>

    <help><![CDATA[

NECAT
.....

**What it does**

| NECAT performs error correction to remove complex errors in nanopore reads. It can also optionally perform de novo assembly.
| After assembly it is recommended to use MEDAKA for long-read polishing, then NextPolish for short-read polishing.
|
| Github: https://github.com/xiaochuanle/NECAT
|

**Input**

- One or more files or collections containing sequence reads (fastq / fasta)

**Output**

- Corrected reads (gzipped fasta)
- Genome assembly (fasta) (Optional)

|

**Advanced Settings**

| Necat runs multiple subprograms in an assembly pipeline to create its final output.
| Each subprogram does a specific task, then hands its output to the next.
| The subprograms are listed in order below, alongside the settings which can be configured:
|

*oc2pmov*

| Finds overlaps between raw-reads
| *Overlap Sensitive Options & Overlap Fast Options*
|

-k <Integer>    kmer size
-z <Integer>    scan window size
-q <Integer>    kmer occurs > q times will be ignored
-b <Integer>    block size
-n <Integer>    number of candidates
-a <Integer>    min align length
-d <Real>       ddf score cutoff
-e <Real>       sequencing error
-m <Integer>    number of output

|

| DEFAULT OPTIONS:
| -k 15 -z 10 -q 500 -b 2000 -s 3 -n 500 -a 500 -d 0.250000 -e 0.500000 -m 500 -t 1

|
|

*oc2cns*

| Creates consensus reads from raw-read overlaps
| *Consensus Sensitive Options & Consensus Fast Options*
|

-a <Integer>    align length cutoff
-x <Integer>    minimal coverage
-y <Integer>    maximal coverage
-l <Integer>    minimal length of corrected reads.
-f <0 or 1>     full consensus or not: 1 = yes, 0 = no
-e <Real>       sequencing error
-p <Real>       minimal mapping ratio
-r <0 or 1>     rescue long indels or not: 1 = yes, 0 = no
-u <0 or 1>     use dynamic or fixed ident cutoff: 1 = fixed, 0 = dynamic

|

| DEFAULT OPTIONS:
| -a 400 -x 4 -y 12 -l 500 -f 0 -e 0.500000 -p 0.800000 -t 1 -r 0 -u 0 -s 0

|
|

*oc2asmpm*

| Identifies corrected-read overlaps for assembly
| *Trimming Overlap Options & Assembly Overlap Options*
|


-k <Integer>    kmer size
-z <Integer>    scan window size
-q <Integer>    kmer occurs > q times will be ignored
-b <Integer>    block size
-n <Integer>    number of candidates
-a <Integer>    min align length
-d <Real>       ddf score cutoff
-e <Real>       sequencing error
-m <Integer>    number of output

|
|

*fsa_ol_filter*

| Filters out low-quality corrected-read overlaps for assembly
| *Assembly Overlap Filtering Options*
|

--min_length=INT                      minimum length of reads. default: 2500
--max_length=INT                      maximum length of reads. default: 2147483647
--min_identity=DOUBLE                 minimum identity of overlaps default: -1
--min_aligned_length=INT              minimum aligned length of overlaps default: 2500
--max_overhang=INT                    maximum overhang of overlaps, negative number = determined by the program. default: -1
--min_coverage=INT                    minimum base coverage, negative number = determined by the program. default: -1
--max_coverage=INT                    maximum base coverage, negative number = determined by the program default: -1
--max_diff_coverage=INT               maximum difference of base coverage, negative number = determined by the program default: -1
--coverage_discard=DOUBLE             discard ratio of base coverage. If max_coverage or max_diff_coverage is negative, it will be reset to (100-coverage_discard)th percentile. default: 0.01
--bestn=INT                           output best n overlaps on 5' or 3' end for each read.  default: 10
--genome_size=INT                     genome size. It determines the maximum length of reads with coverage together default: 0
--coverage=INT                        coverage. It determines the maximum length of reads with genome_size together default: 40
--identity_global_deviation1=DOUBLE   If min_identity < 0, min_identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 98
--identity_global_deviation2=DOUBLE   If min_identity < 0, min_identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 6
--overhang_global_deviation1=DOUBLE   If max_overhang < 0, max_overhang is set to max(m, deviation1) + 1.4826*mad*deviation2 default: 30
--overhang_global_deviation2=DOUBLE   If max_overhang < 0, max_overhang is set to max(m, deviation1) + 1.4826*mad*deviation2 default: 6
--identity_local_deviation1=DOUBLE    The local threshold of identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 99
--identity_local_deviation2=DOUBLE    The local threshold of identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 6
--overhang_local_deviation1=DOUBLE    The local threshold of overhang is set to max(m, deviation1) + 1.253*mad*deviation2 default: 10
--overhang_local_deviation2=DOUBLE    The local threshold of overhang is set to max(m, deviation1) + 1.253*mad*deviation2 default: 6
--identity_local_condition=INT        Local filtering conditions. 0 = overlap identity < threshold, 1 = overlap identity < threshold and query identity >= target identity default: 0
--local_low_coverage=INT              If the coverage of reads is less than local_low_coverage, min_identity and max_overhang are used to filter out low-quality overlaps. Otherwise, the local threshold is used. default: 25

|
|

*fsa_assemble*

| Constructs contigs from filtered overlaps
| *Contig Assembly Options*
|

--min_length=INT            minimum length of reads default: 0
--min_identity=DOUBLE       minimum identity of overlaps default: 0
--min_aligned_length=INT    minimum aligned length of overlaps default: 0
--min_contig_length=INT     minimum length of contigs default: 500
--select_branch=BOOL        select the most probable branch default: "no"
--max_spur_length=INT       branches shorter than the threshold are treated as spurs default: 50000

|
|

*fsa_ctg_bridge*

| Bridges contigs using input long raw-reads
| *Contig Bridging Options*
|

--read_min_length=INT               minimum rawread length default: 5000
--ctg_min_length=INT                minimum contig length default: 500
--ctg2ctg_min_identity=DOUBLE       minimum identity of overlaps between contigs default: 95
--ctg2ctg_max_overhang=INT          maximum overhang of overlaps between contigs default: 100
--ctg2ctg_min_aligned_length=INT    minimum aligned length of overlaps between contigs default: 2000
--read2ctg_min_identity=DOUBLE      minimum identity of overlaps between rawreads and contigs default: 80
--read2ctg_max_overhang=INT         maximum overhang of overlaps between rawreads and contigs default: 500
--read2ctg_min_aligned_length=INT   minimum aligned length of overlaps between rawreads and contigs default: 5000
--read2ctg_min_coverage=INT         minimum coverage of links between rawreads and contigs default: 3
--min_contig_length=INT             minimum length of bridged contig default: 500
--select_branch=BOOL                select the most probable branch default: "no"
--window_size=INT                   threshold is used to group rawreads that bridge contigs default: 1000

|


    ]]></help>
    <expand macro="citations" />
</tool>