<!--
view kmernator.xml @ 0:d9da256384e1 default tip

Uploaded
author edward-kirton
date Thu, 14 Jul 2011 22:19:53 -0400
parents
children
line wrap: on
line source
-->

<tool id='kmernator' name='kmernator' version='1.0.1' description='Filter reads by kmer frequency' force_history_refresh='True'>
<command interpreter="perl">kmernator_wrapper.pl
## Leading positional arguments for kmernator_wrapper.pl (wrapper not shown here;
## keep this order unchanged).
${logfile.id}
${__new_file_path__}
${logfile.extra_files_path}
${mmap_infile}
${kmer_size}
${mmap_outfile}
${gc_map_outfile}
${num_outfiles.partition_by_depth}
${input.format}
## One "file hid" pair per entry of the Read Files repeat, closed by the END token.
#for $read in $input.files
${read.file} ${read.file.hid}
#end for
END
## Fixed switches that the form does not expose.
--separate-outputs 0
--mmap-input 1
--save-kmer-mmap 1
--build-partitions 1
--gc-heat-map 1
--log-file ${logfile}
--temp-dir ${logfile.extra_files_path}
## Exactly one of the two output-depth options is passed, chosen by the
## "Number of output files" selector.
#if $num_outfiles.select == 'one':
--max-kmer-output-depth ${num_outfiles.max_kmer_output_depth}
#else:
--partition-by-depth ${num_outfiles.partition_by_depth}
#end if
## Remaining options map one-to-one onto form parameters of the same name.
--min-passing-in-pair ${min_passing_in_pair}
--kmer-size ${kmer_size}
--format-output ${format_output}
--ignore-quality ${ignore_quality}
--filter-output ${filter_output}
--phix-output ${phix_output}
--min-read-length ${min_read_length}
--min-kmer-quality ${min_kmer_quality}
--min-quality-score ${min_quality_score}
--min-depth ${min_depth}
--bimodal-sigmas ${bimodal_sigmas}
--variant-sigmas ${variant_sigmas}
--periodic-singleton-purge ${periodic_singleton_purge}
--skip-artifact-filter ${skip_artifact_filter}
--artifact-match-length ${artifact_match_length}
--artifact-edit-distance ${artifact_edit_distance}
--mask-simple-repeats ${mask_simple_repeats}
--dedup-mode ${dedup_mode}
--dedup-single ${dedup_single}
--dedup-edit-distance ${dedup_edit_distance}
--dedup-start-offset ${dedup_start_offset}
--dedup-length ${dedup_length}
</command>
<inputs>
    <!-- INPUT FILES -->
    <!-- The chosen format selects which datatype the repeated "file" inputs accept;
         both branches expose the same repeat name ("files") so the command template
         can iterate over $input.files regardless of the selection. -->
    <conditional name="input">
        <param name="format" type="select" label="Input format">
            <option value="fastqsanger">Fastq-Sanger</option>
            <option value="fastqillumina">Fastq-Illumina</option>
        </param>
        <when value="fastqsanger">
            <repeat name="files" title="Read Files">
                <param name="file"
                       type="data"
                       format="fastqsanger"
                       label="FastqSanger File"/>
            </repeat>
        </when>
        <when value="fastqillumina">
            <repeat name="files" title="Read Files">
                <param name="file"
                       type="data"
                       format="fastqillumina"
                       label="FastqIllumina File"/>
            </repeat>
        </when>
    </conditional>
    <!-- Passed as 1/0 to the ignore-quality option. -->
    <param name="ignore_quality"
           type="boolean"
           checked="false"
           truevalue="1"
           falsevalue="0"
           label="ignore the quality scores"
           help="to save memory or if they are untrusted" />
    <!-- Optional previously saved kmer spectrum. -->
    <param name="mmap_infile"
           type="data"
           format="mmap"
           optional="true"
           label="mmap file"
           help="Instead of generating kmer spectrum, load an existing one" />

    <!-- OUTPUT OPTIONS -->
    <!-- Numeric codes are handed to the format-output option; the Fasta variants
         (codes 1 and 3) are intentionally left disabled. -->
    <param name="format_output"
           type="select"
           display="radio"
           label="Output format">
        <option value="0" selected="true">Fastq masked</option>
        <!-- <option value="1">Fasta</option> -->
        <option value="2">Fastq unmasked</option>
        <!-- <option value="3">Fasta unmasked</option> -->
    </param>
    <!-- Single output file versus depth-partitioned output files. -->
    <conditional name="num_outfiles">
        <param name="select" type="select" label="Number of output files">
            <option value="one">One</option>
            <option value="many">Many; partition by kmer depth</option>
        </param>
        <when value="one">
            <param name="max_kmer_output_depth"
                   type="integer"
                   value="-1"
                   label="maximum number of times a kmer will be output among the selected reads (to reduce the redundancy of the output)"
                   help="-1 = no maximum" />
            <!-- The command always passes partition_by_depth as a positional argument,
                 so this branch supplies a hidden placeholder value of -1. -->
            <param name="partition_by_depth"
                   type="hidden"
                   value="-1" />
        </when>
        <when value="many">
            <param name="partition_by_depth"
                   type="integer"
                   value="512"
                   label="Trimmed reads will be output into powers-of-two buckets whereby average k-mer depth is &gt;= this size"
                   help="Outputs are nonredundant; useful for meta-genome/-transcriptome sequencing" />
        </when>
    </conditional>
    <!-- Both flags are passed to kmernator as 1/0 (filter-output and phix-output options). -->
    <param name="filter_output" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Save artifact filtered reads in a separate file?" help="If not set, then affected reads will be trimmed and then output normally." />
    <!-- Boolean parameters take their default from "checked"; "selected" only applies to select options. -->
    <param name="phix_output" type="boolean" truevalue="1" falsevalue="0" checked="false" label="if set, artifact filter also screens for PhiX174, and any matching reads will be output to Artifacts file" help="Requires filter-output option (above)" />
    <!-- NOT IMPLEMENTED: -separate-outputs 1
    For multiple infiles:
    If set, each input (plus consensus) will generate a new outputfile.
    If set to 0, all input files will be merged into one output file.
    -->

    <!-- FILTER OPTIONS -->
    <!-- Every parameter name in this section is declared exactly once, because the
         command template refers to each value by name ($min_depth, $skip_artifact_filter, ...). -->
    <param name="kmer_size" type="integer" value="21" label="kmer size" help="A size of 0 will skip k-mer calculations" />
    <!-- NOTE(review): declared as an integer depth; confirm against kmernator's min-depth option type. -->
    <param name="min_depth" type="integer" value="2" label="minimum depth for a solid kmer" help="Requires nonzero kmer size" />

    <param name="min_passing_in_pair" type="select" display="radio" label="Minimum number of reads in a pair that must pass filters">
        <option value="1">only 1 read in a pair must pass filters</option>
        <option value="2">both reads in a pair must pass filters</option>
    </param>
    <param name="min_read_length" type="integer" value="25" label="minimum (trimmed) read length of selected reads." help="Special values: 0=no minimum; 1=full read length" />
    <param name="min_kmer_quality" type="float" value="0.10000000000000001" label="minimum quality-adjusted kmer probability" help="0.0-1.0" />
    <param name="min_quality_score" type="integer" value="5" label="minimum quality score over entire kmer" />
    <param name="skip_artifact_filter" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Skip homo-polymer, primer-dimer and duplicated fragment pair filtering" />

    <!-- ADVANCED OPTIONS -->
    <!-- Sigma thresholds: negative values disable the corresponding detection (see help texts). -->
    <param name="bimodal_sigmas" type="float" value="-1" label="Detect bimodal kmer-signatures across reads and trim at transition point if the two means are separated by bimodal-sigmas * stdDev (2.0 to 3.0 suggested)." help="disabled if &lt; 0.0" />
    <param name="variant_sigmas" type="float" value="-1" label="Detect and purge kmer-variants if &gt;= variant-sigmas * Poisson-stdDev (2.0-3.0 suggested)." help="Disabled if &lt; 0.0" />
    <!-- The label describes a read count ("every # of reads"), so this is an integer
         rather than a 1/0 checkbox; the default of 0 yields the same command line as
         the unchecked checkbox did.
         NOTE(review): confirm the option's type and the meaning of 0 against kmernator. -->
    <param name="periodic_singleton_purge" type="integer" value="0" label="Purge singleton memory structure every # of reads" help="0 = no periodic purge" />
    <!-- Artifact screening -->
    <param name="artifact_match_length" type="integer" value="24" label="Kmer match length to known artifact sequences" />
    <param name="artifact_edit_distance" type="integer" value="2" label="edit-distance to apply to artifact-match-length matches to know artifacts" />
    <param name="mask_simple_repeats" type="boolean" truevalue="1" falsevalue="0" checked="true" label="if filtering artifacts, also mask simple repeats" />
    <!-- De-duplication -->
    <param name="dedup_mode" type="select" label="De-duplication mode" display="radio" help="If de-duplication is performed, additional consensus outfiles will be generated">
        <option value="0">No fragment de-duplication</option>
        <option value="1" selected="true">single orientation will collapse to consensus (AB and BA are separated)</option>
        <option value="2">both orientations will collapse (AB and BA are the same)</option>
    </param>
    <param name="dedup_single" type="select" display="radio" label="De-duplication for single reads">
        <option value="0" selected="true">no single read de-duplication</option>
        <option value="1">perform single read de-duplication</option>
    </param>
    <param name="dedup_edit_distance" type="integer" value="0" label="De-duplication edit distance" help="-1 = no fragment de-duplication" />
    <param name="dedup_start_offset" type="integer" value="0" label="de-duplication start offset to find unique fragments" help="must be multiple of 4" />
    <param name="dedup_length" type="integer" value="16" label="de-duplication length to find unique fragments" help="must be multiple of 4 (doubled when in single-end mode)" />
</inputs>
<outputs>
    <!-- Log dataset; the command also uses its id and extra_files_path. -->
    <data name="logfile"
          format="txt" />
    <!-- The two datasets below are only created when a nonzero kmer size is requested. -->
    <data name="gc_map_outfile"
          format="tabular"
          label="GC heat map">
        <filter>kmer_size > 0</filter>
    </data>
    <data name="mmap_outfile"
          format="mmap"
          label="mmap file">
        <filter>kmer_size > 0</filter>
    </data>
</outputs>
<help>
**What It Does**

Firstly, this tool may be used to remove sequencing library artifacts (adapter/linker sequences and PCR amplified reads).
If you specify the filter-output option, then reads that have at least a partial hit to the screens will be separated.
Otherwise only those reads which match the screens for the reads' entire length will be filtered.
You may wish to set the kmer size to 0 to avoid doing kmer filtering in this first-pass.

Secondly, this tool may be used to filter reads with low-abundance kmers.  Such reads contain sequencing errors or the
reference sequence appears in such low abundance you probably cannot assemble it anyway.  Reducing the dataset in this
way will make assemblies faster (or possible).  Set the kmer size depending on read length (e.g. ~2/3 read length);
increasing the kmer size will require less RAM, run faster, and eliminate more error-containing reads.  Also set the
min-depth option depending upon your expected depth of coverage.  You may also set the max-kmer-output-depth option;
this is useful for eliminating ribosomal contaminants, for example.

For metagenome or metatranscriptome sequencing projects, instead of specifying the max-kmer-output-depth, it may be
advantageous to partition the reads into several files based upon kmer abundance using the partition-by-depth option.

**Author**

Rob Egan (RSEgan@LBL.gov)
</help>
</tool>