Mercurial > repos > iuc > necat

diff necat.xml @ 0:6ee7eb5821f0 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/necat commit 6946d81de9419c90e9bc4ea2f7bd5e4168dd6dd6
author: iuc
date: Fri, 25 Nov 2022 14:24:27 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/necat.xml	Fri Nov 25 14:24:27 2022 +0000
@@ -0,0 +1,440 @@
+<tool id="necat" name="necat" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+    <description>Error correction and de-novo assembly for ONT Nanopore reads</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <xrefs>       
+        <xref type="bio.tools">necat</xref>
+    </xrefs>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">necat</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        ## helper function
+        #def make_filename($i, $input_param)
+            #set ext = $input_param.extension
+            #if $ext == "fastqsanger"
+                #set $ext = "fastq"
+            #end if
+            #set filename = "reads_" + str($i) + "." + $ext
+            #return $filename
+        #end def
+
+        ## push each input file and everything in input collections into read_list.txt
+        #set i = 1
+        #for input in $input_fastqs
+            #set filename = $make_filename($i, $input)
+            cp '$input' $filename  
+            && echo $filename >> read_list.txt &&
+            #set i = $i + 1
+        #end for
+
+        ## #for $i, $input in enumerate($input_fastqs):
+        ##     #set filename = 'reads_${i}.$input.ext'
+        ##     ln -s '$input' $filename &&
+        ##     echo $filename >> read_list.txt &&
+        ## #end for
+        
+        ## necat commands
+        necat correct '${job_configfile}' 
+        #if $assembly.should_assemble == "yes":
+            && necat assemble '${job_configfile}'
+            && necat bridge '${job_configfile}' 
+        #end if
+    ]]></command>
+    <configfiles>   
+        <expand macro="job_conf" />
+    </configfiles>
+    <inputs>
+        <param name="input_fastqs" type="data" format="fastq,fastq.gz,fasta,fasta.gz" multiple="true" label="Input reads" help="Input read files (FASTQ or FASTA). To select more than one file or collection from your history, use the 'ctrl' key" />
+
+        <param name="genome_size" type="integer" value="" min="1" max="100000000000" label="Genome size" help="Estimated size of genome (bp)" />
+        <param name="min_read_length" type="integer" value="1000" min="1" max="10000000" label="Min read length" help="Minimum length for input reads" />
+        <param name="correction_coverage" type="integer" value="40" min="1" max="10000" label="Correction coverage" help="Number of reads to correct in terms of genome coverage. For a 4Gb genome, setting correction coverage = 10 will correct the longest 40Gb worth of reads from the input fastq. " />
+        <conditional name="assembly">
+            <param name="should_assemble" type="select" label="Assembly">
+                <option value="no" selected="true">Don't perform assembly</option>
+                <option value="yes">Perform assembly on corrected reads</option>
+            </param>
+            <when value="no" />
+            <when value="yes">
+                <param name="assembly_coverage" type="integer" value="30" min="1" max="10000" label="Assembly coverage" help="Number of reads to use in genome assembly in terms of genome coverage" />
+                <param name="polish_contigs" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Polish contigs" help="Polish contigs as final step after briding" />
+            </when>
+        </conditional>
+
+        <section name="adv" title="Advanced options" expanded="false" help="Warning: only change these if you really know what you are doing">
+            <expand macro="overlap_sensitive_options" />
+            <expand macro="consensus_sensitive_options" />
+            <expand macro="overlap_fast_options" />
+            <expand macro="consensus_fast_options" />
+            <expand macro="trimming_overlap_options" />
+            <expand macro="assembly_overlap_options" />
+            <expand macro="assembly_overlap_filtering" />
+            <expand macro="contig_assembly" />
+            <expand macro="contig_bridging" />
+        </section>
+    </inputs>
+    <outputs>
+        <data name="out_reads" format="fasta.gz" from_work_dir="project/1-consensus/cns_final.fasta.gz" label="${tool.name} on ${on_string}: corrected reads" />
+        <data name="out_assembly" format="fasta" from_work_dir="project/6-bridge_contigs/polished_contigs.fasta" label="${tool.name} on ${on_string}: bridged assembly">
+            <filter>assembly['should_assemble'] == 'yes' and not assembly['polish_contigs']</filter>
+        </data>
+        <data name="out_polished_assembly" format="fasta" from_work_dir="project/6-bridge_contigs/bridged_contigs.fasta" label="${tool.name} on ${on_string}: polished assembly">
+            <filter>assembly['should_assemble'] == 'yes' and assembly['polish_contigs']</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- single input fastq -->
+        <test expect_num_outputs="2">
+            <param name="input_fastqs" value="test1.fa" />
+            <param name="genome_size" value="13000" />
+            <param name="min_read_length" value="1000" />
+            <param name="correction_coverage" value="40" />
+            <conditional name="assembly">
+                <param name="should_assemble" value="yes" />
+                <param name="assembly_coverage" value="30"/>
+                <param name="polish_contigs" value="true"/>
+            </conditional>
+            <output name="out_reads" ftype="fasta.gz">
+                <assert_contents>
+                    <has_size value="75000" delta="2000" />
+                </assert_contents>
+            </output>
+            <output name="out_polished_assembly" ftype="fasta">
+                <assert_contents>
+                    <has_line line="&#62;bctg00000000 000000F" />
+                    <has_size value="13000" delta="1000" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- multiple input files of different format -->
+        <test expect_num_outputs="2">
+            <param name="input_fastqs" value="test1_head.fastq,test1_tail.fasta" />
+            <param name="genome_size" value="13000" />
+            <param name="min_read_length" value="1000" />
+            <param name="correction_coverage" value="40" />
+            <conditional name="assembly">
+                <param name="should_assemble" value="yes" />
+                <param name="assembly_coverage" value="30"/>
+                <param name="polish_contigs" value="true"/>
+            </conditional>
+            <output name="out_reads" ftype="fasta.gz">
+                <assert_contents>
+                    <has_size value="29000" delta="2000" />
+                </assert_contents>
+            </output>
+            <output name="out_polished_assembly" ftype="fasta">
+                <assert_contents>
+                    <has_line line="&#62;bctg00000000 000000F" />
+                    <has_size value="13000" delta="1000" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- advanced params 1 -->
+        <test expect_num_outputs="2">
+            <param name="input_fastqs" value="test1.fa" />
+            <param name="genome_size" value="13000" />
+            <param name="min_read_length" value="1000" />
+            <param name="correction_coverage" value="40" />
+            <conditional name="assembly">
+                <param name="should_assemble" value="yes" />
+                <param name="assembly_coverage" value="30"/>
+                <param name="polish_contigs" value="true"/>
+            </conditional>
+            <section name="adv">
+                <section name="ovs">
+                    <param name="n" value="600" />
+                    <param name="k" value="14" />
+                    <param name="q" value="600" />
+                    <param name="z" value="15" />
+                    <param name="b" value="2500" />
+                    <param name="a" value="800" />
+                    <param name="d" value="0.25" />
+                    <param name="e" value="0.4" />
+                    <param name="m" value="600" />
+                </section>
+            </section>
+            <output name="out_reads" ftype="fasta.gz">
+                <assert_contents>
+                    <has_size value="75000" delta="2000" />
+                </assert_contents>
+            </output>
+            <output name="out_polished_assembly" ftype="fasta">
+                <assert_contents>
+                    <has_line line="&#62;bctg00000000 000000F" />
+                    <has_size value="13000" delta="1000" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- advanced params 2 -->
+        <test expect_num_outputs="2">
+            <param name="input_fastqs" value="test1.fa" />
+            <param name="genome_size" value="13000" />
+            <param name="min_read_length" value="1000" />
+            <param name="correction_coverage" value="40" />
+            <conditional name="assembly">
+                <param name="should_assemble" value="yes" />
+                <param name="assembly_coverage" value="30"/>
+                <param name="polish_contigs" value="true"/>
+            </conditional>
+            <section name="adv">
+                <section name="fol">
+                    <param name="min_length" value="2000" />
+                    <param name="max_length" value="200000" />
+                    <param name="min_aligned_length" value="2000" />
+                    <param name="max_overhang" value="20000" />
+                    <param name="min_coverage" value="5" />
+                    <param name="bestn" value="5" />
+                    <param name="overhang_local_deviation1" value="5" />
+                </section>
+            </section>
+            <output name="out_reads" ftype="fasta.gz">
+                <assert_contents>
+                    <has_size value="75000" delta="2000" />
+                </assert_contents>
+            </output>
+            <output name="out_polished_assembly" ftype="fasta">
+                <assert_contents>
+                    <has_line line="&#62;bctg00000000 000000F" />
+                    <has_size value="13000" delta="1000" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- advanced params 3 -->
+        <test expect_num_outputs="2">
+            <param name="input_fastqs" value="test1.fa" />
+            <param name="genome_size" value="13000" />
+            <param name="min_read_length" value="1000" />
+            <param name="correction_coverage" value="40" />
+            <conditional name="assembly">
+                <param name="should_assemble" value="yes" />
+                <param name="assembly_coverage" value="30"/>
+                <param name="polish_contigs" value="true"/>
+            </conditional>
+            <section name="adv">
+                <section name="fa">
+                    <param name="min_length" value="1000" />
+                    <param name="min_identity" value="40" />
+                    <param name="min_contig_length" value="600" />
+                    <param name="select_branch" value="true" />
+                </section>
+            </section>
+            <output name="out_reads" ftype="fasta.gz">
+                <assert_contents>
+                    <has_size value="75000" delta="2000" />
+                </assert_contents>
+            </output>
+            <output name="out_polished_assembly" ftype="fasta">
+                <assert_contents>
+                    <has_line line="&#62;bctg00000000 000000F" />
+                    <has_size value="13000" delta="1000" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- advanced params 4 -->
+        <test expect_num_outputs="2">
+            <param name="input_fastqs" value="test1.fa" />
+            <param name="genome_size" value="13000" />
+            <param name="min_read_length" value="1000" />
+            <param name="correction_coverage" value="40" />
+            <conditional name="assembly">
+                <param name="should_assemble" value="yes" />
+                <param name="assembly_coverage" value="30"/>
+                <param name="polish_contigs" value="true"/>
+            </conditional>
+            <section name="adv">
+                <section name="fcb">
+                    <param name="read_min_length" value="4000" />
+                    <param name="ctg_min_length" value="1000" />
+                    <param name="ctg2ctg_min_identity" value="90" />
+                    <param name="read2ctg_min_identity" value="60" />
+                    <param name="min_contig_length" value="1000" />
+                </section>
+            </section>
+            <output name="out_reads" ftype="fasta.gz">
+                <assert_contents>
+                    <has_size value="75000" delta="2000" />
+                </assert_contents>
+            </output>
+            <output name="out_polished_assembly" ftype="fasta">
+                <assert_contents>
+                    <has_line line="&#62;bctg00000000 000000F" />
+                    <has_size value="13000" delta="1000" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+
+    <help><![CDATA[ 
+
+NECAT
+.....
+
+**What it does**
+
+| NECAT performs error correction to remove complex errors in nanopore reads. It can also optionally de novo assembly. 
+| After assembly it is recommended to use MEDAKA for long-read polishing, then NextPolish for short-read polishing.
+| 
+| Github: https://github.com/xiaochuanle/NECAT
+| 
+
+**Input**
+
+- One or more files or collections containing sequence reads (fastq / fasta)
+
+**Output**
+
+- Corrected reads (fasta)
+- Genome assembly (fasta) (Optional)
+
+|
+
+**Advanced Settings**
+
+| Necat runs multiple subprograms in an assembly pipeline to create its final output.  
+| Each subprogram does a specific task, then hands its output to the next. 
+| The subprograms are listed in order below, alongside the settings which can be configured:
+| 
+
+*oc2pmov*
+
+| Finds overlaps between raw-reads
+| *Overlap Sensitive Options & Overlap Fast Options*
+| 
+
+-k <Integer>    kmer size
+-z <Integer>    scan window size
+-q <Integer>    kmer occurs > q times will be ignored
+-b <Integer>    block size
+-n <Integer>    number of candidates
+-a <Integer>    min align length
+-d <Real>       ddf score cutoff
+-e <Real>       sequencing error
+-m <Integer>    number of output
+
+|
+
+| DEFAULT OPTIONS:
+| -k 15 -z 10 -q 500 -b 2000 -s 3 -n 500 -a 500 -d 0.250000 -e 0.500000 -m 500 -t 1 
+
+|
+|
+
+*oc2cns*
+
+| Creates consensus reads from raw-read overlaps
+| *Consensus Sensitive Options & Consensus Fast Options*
+| 
+
+-a <Integer>    align length cutoff
+-x <Integer>    minimal coverage
+-y <Integer>    maximal coverage
+-l <Integer>    minimal length of corrected reads.
+-f <0 or 1>     full consensus or not: 1 = yes, 0 = no
+-e <Real>       sequencing error
+-p <Real>       minimal mapping ratio
+-r <0 or 1>     rescue long indels or not: 1 = yes, 0 = no
+-u <0 or 1>     use dynamic or fixed ident cutoff: 1 = fixed, 0 = dynamic
+
+|
+
+| DEFAULT OPTIONS:
+| -a 400 -x 4 -y 12 -l 500 -f 0 -e 0.500000 -p 0.800000 -t 1 -r 0 -u 0 -s 0
+
+|
+|
+
+*oc2asmpm*
+
+| Identifies corrected-read overlaps for assembly
+| *Trimming Overlap Options & Assembly Overlap Options*
+| 
+
+
+-k <Integer>    kmer size
+-z <Integer>    scan window size
+-q <Integer>    kmer occurs > q times will be ignored
+-b <Integer>    block size
+-n <Integer>    number of candidates
+-a <Integer>    min align length
+-d <Real>       ddf score cutoff
+-e <Real>       sequencing error
+-m <Integer>    number of output
+
+|
+|
+
+*fsa_ol_filter*
+
+| Filters out low-quality corrected-read overlaps for assembly
+| *Assembly Overlap Filtering Options*
+| 
+
+--min_length=INT                      minimum length of reads. default: 2500 
+--max_length=INT                      maximum length of reads. default: 2147483647 
+--min_identity=DOUBLE                 minimum identity of overlaps default: -1 
+--min_aligned_length=INT              minimum aligned length of overlaps default: 2500
+--max_overhang=INT                    maximum overhang of overlaps, negative number = determined by the program. default: -1
+--min_coverage=INT                    minimum base coverage, negative number = determined by the program. default: -1
+--max_coverage=INT                    maximum base coverage, negative number = determined by the program default: -1
+--max_diff_coverage=INT               maximum difference of base coverage, negative number = determined by the program default: -1
+--coverage_discard=DOUBLE             discard ratio of base coverage. If max_coverage or max_diff_coverage is negative, it will be reset to (100-coverage_discard)th percentile. default: 0.01
+--bestn=INT                           output best n overlaps on 5' or 3' end for each read.  default: 10
+--genome_size=INT                     genome size. It determines the maximum length of reads with coverage together default: 0
+--coverage=INT                        coverage. It determines the maximum length of reads with genome_size together default: 40
+--identity_global_deviation1=DOUBLE   If min_identity < 0, min_identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 98
+--identity_global_deviation2=DOUBLE   If min_identity < 0, min_identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 6
+--overhang_global_deviation1=DOUBLE   If max_overhang < 0, max_overhang is set to max(m, deviation1) + 1.4826*mad*deviation2 default: 30
+--overhang_global_deviation2=DOUBLE   If max_overhang < 0, max_overhang is set to max(m, deviation1) + 1.4826*mad*deviation2 default: 6
+--identity_local_deviation1=DOUBLE    The local threshold of identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 99
+--identity_local_deviation2=DOUBLE    The local threshold of identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 6
+--overhang_local_deviation1=DOUBLE    The local threshold of overhang is set to max(m, deviation1) + 1.253*mad*deviation2 default: 10
+--overhang_local_deviation2=DOUBLE    The local threshold of overhang is set to max(m, deviation1) + 1.253*mad*deviation2 default: 6
+--identity_local_condition=INT        Local filtering conditions. 0 = overlap idenitity < threshold, 1 = overlap idenitity < threshold and query identity >= target identity default: 0
+--local_low_coverage=INT              If the coverage of reads is less than local_low_coverage, min_identity and max_overhang are used to filter out low-quality overlaps. Otherwise, the local threshold is used. default: 25
+
+|
+|
+
+*fsa_assemble*
+
+| Constructs contigs from filtered overlaps
+| *Contig Assembly Options*
+| 
+
+--min_length=INT            minimum length of reads default: 0
+--min_identity=DOUBLE       minimum identity of overlaps default: 0
+--min_aligned_length=INT    minimum aligned length of overlaps default: 0
+--min_contig_length=INT     minimum length of contigs default: 500
+--select_branch=BOOL        select the most probable branch default: "no"
+--max_spur_length=INT       branches less the threshod are treated as spurs default: 50000
+
+|
+|
+
+*fsa_ctg_bridge*
+
+| Bridges contigs using input long raw-reads
+| *Contig Bridging Options*
+| 
+
+--read_min_length=INT               minimum rawread length default: 5000
+--ctg_min_length=INT                minimum contig length default: 500
+--ctg2ctg_min_identity=DOUBLE       minimum identity of overlaps between contigs default: 95
+--ctg2ctg_max_overhang=INT          maximum overhang of overlaps between contigs default: 100
+--ctg2ctg_min_aligned_length=INT    minimum aligned length of overlaps between contigs default: 2000
+--read2ctg_min_identity=DOUBLE      minimum identity of overlaps between rawreads and contigs default: 80
+--read2ctg_max_overhang=INT         maximum overhang of overlaps between rawreads and contigs default: 500
+--read2ctg_min_aligned_length=INT   minimum aligned length of overlaps between rawreads and contigs default: 5000
+--read2ctg_min_coverage=INT         minimum coverage of links between rawreads and contigs default: 3
+--min_contig_length=INT             minimum length of bridged contig default: 500
+--select_branch=BOOL                select the most probable branch default: "no"
+--window_size=INT                   threshold is used to group rawreads that bridge contigs default: 1000
+
+|
+
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>
\ No newline at end of file