Mercurial > repos > edward-kirton > kmernator
changeset 0:d9da256384e1 default tip
Uploaded
author | edward-kirton |
---|---|
date | Thu, 14 Jul 2011 22:19:53 -0400 |
parents | |
children | |
files | kmernator.xml kmernator_wrapper.pl |
diffstat | 2 files changed, 253 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
<!-- kmernator.xml : Galaxy tool definition for FilterReads (AKA Kmernator). -->
<tool id='kmernator' name='kmernator' version='1.0.1' description='Filter reads by kmer frequency' force_history_refresh='True'>
<!--
  The wrapper takes nine positional arguments, then (file, history id) pairs
  terminated by the literal END; everything after END is handed to FilterReads.
-->
<command interpreter="perl">kmernator_wrapper.pl
$logfile.id
$__new_file_path__
$logfile.extra_files_path
$mmap_infile
$kmer_size
$mmap_outfile
$gc_map_outfile
$num_outfiles.partition_by_depth
$input.format
#for $i in $input.files
${i.file} ${i.file.hid}
#end for
END
--separate-outputs 0
--mmap-input 1
--save-kmer-mmap 1
--build-partitions 1
--gc-heat-map 1
--log-file $logfile
--temp-dir $logfile.extra_files_path
#if $num_outfiles.select == 'one':
--max-kmer-output-depth $num_outfiles.max_kmer_output_depth
#else:
--partition-by-depth $num_outfiles.partition_by_depth
#end if
--min-passing-in-pair $min_passing_in_pair
--kmer-size $kmer_size
--format-output $format_output
--ignore-quality $ignore_quality
--filter-output $filter_output
--phix-output $phix_output
--min-read-length $min_read_length
--min-kmer-quality $min_kmer_quality
--min-quality-score $min_quality_score
--min-depth $min_depth
--bimodal-sigmas $bimodal_sigmas
--variant-sigmas $variant_sigmas
--periodic-singleton-purge $periodic_singleton_purge
--skip-artifact-filter $skip_artifact_filter
--artifact-match-length $artifact_match_length
--artifact-edit-distance $artifact_edit_distance
--mask-simple-repeats $mask_simple_repeats
--dedup-mode $dedup_mode
--dedup-single $dedup_single
--dedup-edit-distance $dedup_edit_distance
--dedup-start-offset $dedup_start_offset
--dedup-length $dedup_length
</command>
<inputs>
  <!-- INPUT FILES -->
  <conditional name="input">
    <param name="format" type="select" label="Input format">
      <option value="fastqsanger">Fastq-Sanger</option>
      <option value="fastqillumina">Fastq-Illumina</option>
    </param>
    <when value="fastqsanger">
      <repeat name="files" title="Read Files">
        <param name="file" type="data" format="fastqsanger" label="FastqSanger File"/>
      </repeat>
    </when>
    <when value="fastqillumina">
      <repeat name="files" title="Read Files">
        <param name="file" type="data" format="fastqillumina" label="FastqIllumina File"/>
      </repeat>
    </when>
  </conditional>
  <param name="ignore_quality" type="boolean" truevalue="1" falsevalue="0" checked="false" label="ignore the quality scores" help="to save memory or if they are untrusted" />
  <param name="mmap_infile" type="data" format="mmap" optional="true" label="mmap file" help="Instead of generating kmer spectrum, load an existing one" />

  <!-- OUTPUT OPTIONS -->
  <param name="format_output" type="select" display="radio" label="Output format">
    <option value="0" selected="true">Fastq masked</option>
    <!-- <option value="1">Fasta</option> -->
    <option value="2">Fastq unmasked</option>
    <!-- <option value="3">Fasta unmasked</option> -->
  </param>
  <conditional name="num_outfiles">
    <param name="select" type="select" label="Number of output files">
      <option value="one">One</option>
      <option value="many">Many; partition by kmer depth</option>
    </param>
    <when value="one">
      <param name="max_kmer_output_depth" type="integer" value="-1" label="maximum number of times a kmer will be output among the selected reads (to reduce the redundancy of the output)" help="-1 = no maximum" />
      <!-- Hidden so that $num_outfiles.partition_by_depth always resolves in the command template. -->
      <param name="partition_by_depth" type="hidden" value="-1" />
    </when>
    <when value="many">
      <param name="partition_by_depth" type="integer" value="512" label="Trimmed reads will be output into powers-of-two buckets whereby average k-mer depth is &gt;= this size" help="Outputs are nonredundant; useful for meta-genome/-transcriptome sequencing" />
    </when>
  </conditional>
  <param name="filter_output" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Save artifact filtered reads in a separate file?" help="If not set, then affected reads will be trimmed and then output normally." />
  <!-- Boolean parameters take "checked", not "selected". -->
  <param name="phix_output" type="boolean" truevalue="1" falsevalue="0" checked="false" label="if set, artifact filter also screens for PhiX174, and any matching reads will be output to Artifacts file" help="Requires filter-output option (above)" />
  <!-- NOT IMPLEMENTED: -separate-outputs 1
       For multiple infiles:
       If set, each input (plus consensus) will generate a new outputfile.
       If set to 0, all input files will be merged into one output file.
  -->

  <!-- FILTER OPTIONS -->
  <!-- skip_artifact_filter and min_depth were each declared twice; one declaration of each is kept. -->
  <param name="skip_artifact_filter" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Skip homo-polymer, primer-dimer and duplicated fragment pair filtering" />
  <param name="kmer_size" type="integer" value="21" label="kmer size" help="A size of 0 will skip k-mer calculations" />
  <param name="min_depth" type="integer" value="2" label="minimum depth for a solid kmer" help="Requires nonzero kmer size" />

  <param name="min_passing_in_pair" type="select" display="radio" label="Minimum number of reads in a pair that must pass filters">
    <option value="1">only 1 read in a pair must pass filters</option>
    <option value="2">both reads in a pair must pass filters</option>
  </param>
  <param name="min_read_length" type="integer" value="25" label="minimum (trimmed) read length of selected reads." help="Special values: 0=no minimum; 1=full read length" />
  <param name="min_kmer_quality" type="float" value="0.10000000000000001" label="minimum quality-adjusted kmer probability" help="0.0-1.0" />
  <param name="min_quality_score" type="integer" value="5" label="minimum quality score over entire kmer" />

  <!-- ADVANCED OPTIONS -->
  <param name="bimodal_sigmas" type="float" value="-1" label="Detect bimodal kmer-signatures across reads and trim at transition point if the two means are separated by bimodal-sigmas * stdDev (2.0 to 3.0 suggested)." help="disabled if &lt; 0.0" />
  <param name="variant_sigmas" type="float" value="-1" label="Detect and purge kmer-variants if &gt;= variant-sigmas * Poisson-stdDev (2.0-3.0 suggested)." help="Disabled if &lt; 0.0" />
  <param name="periodic_singleton_purge" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Purge singleton memory structure every # of reads" />
  <param name="artifact_match_length" type="integer" value="24" label="Kmer match length to known artifact sequences" />
  <param name="artifact_edit_distance" type="integer" value="2" label="edit-distance to apply to artifact-match-length matches to known artifacts" />
  <param name="mask_simple_repeats" type="boolean" truevalue="1" falsevalue="0" checked="true" label="if filtering artifacts, also mask simple repeats" />
  <param name="dedup_mode" type="select" label="De-duplication mode" display="radio" help="If de-duplication is performed, additional consensus outfiles will be generated">
    <option value="0">No fragment de-duplication</option>
    <option value="1" selected="true">single orientation will collapse to consensus (AB and BA are separated)</option>
    <option value="2">both orientations will collapse (AB and BA are the same)</option>
  </param>
  <param name="dedup_single" type="select" display="radio" label="De-duplication for single reads">
    <option value="0" selected="true">no single read de-duplication</option>
    <option value="1">perform single read de-duplication</option>
  </param>
  <param name="dedup_edit_distance" type="integer" value="0" label="De-duplication edit distance" help="-1 = no fragment de-duplication" />
  <param name="dedup_start_offset" type="integer" value="0" label="de-duplication start offset to find unique fragments" help="must be multiple of 4" />
  <param name="dedup_length" type="integer" value="16" label="de-duplication length to find unique fragments" help="must be multiple of 4 (doubled when in single-end mode)" />
</inputs>
<outputs>
  <!-- Read files are not declared here; the wrapper moves them into
       $__new_file_path__ under names of the form primary_<id>_<name>_visible_<format>. -->
  <data name="logfile" format="txt" />
  <data name="gc_map_outfile" format="tabular" label="GC heat map">
    <filter>kmer_size &gt; 0</filter>
  </data>
  <data name="mmap_outfile" format="mmap" label="mmap file">
    <filter>kmer_size &gt; 0</filter>
  </data>
</outputs>
<help>
**What It Does**

Firstly, this tool may be used to remove sequencing library artifacts (adapter/linker sequences and PCR amplified reads).
If you specify the filter-output option, then reads that have at least a partial hit to the screens will be separated.
Otherwise only those reads which match the screens for the reads' entire length will be filtered.
You may wish to set the kmer size to 0 to avoid doing kmer filtering in this first-pass.

Secondly, this tool may be used to filter reads with low-abundance kmers. Such reads contain sequencing errors or the
reference sequence appears in such low abundance you probably cannot assemble it anyway. Reducing the dataset in this
way will make assemblies faster (or possible). Set the kmer size depending on read length (e.g. ~2/3 read length);
increasing the kmer size will require less RAM, run faster, and eliminate more error-containing reads. Also set the
min-depth option depending upon your expected depth of coverage. You may also set the max-kmer-output-depth option;
this is useful for eliminating ribosomal contaminants, for example.

For metagenome or metatranscriptome sequencing projects, instead of specifying the max-kmer-output-depth, it may be
advantageous to partition the reads into several files based upon kmer abundance using the partition-by-depth option.

**Author**

Rob Egan (RSEgan@LBL.gov)
</help>
</tool>
#!/usr/bin/env perl

# kmernator_wrapper.pl
# FilterReads (AKA Kmernator) wrapper for Galaxy.
# Kmernator by Rob Egan (RSEgan@LBL.gov)
# Wrapper for Galaxy supported by Edward Kirton (ESKirton@LBL.gov)
# Produced under funding from the United States Department of Energy Office of Science
# Freely distributable under the same license as Galaxy itself.
#
# Usage:
#   kmernator_wrapper.pl <id> <results_dir> <working_dir> <mmap_infile> <kmer_size>
#       <mmap_outfile> <gc_map_outfile> <partition_by_depth> <format>
#       [<infile> <history_id>]... END [FilterReads options...]
#
# The wrapper symlinks the inputs into <working_dir>, runs FilterReads there
# with the options that follow END, and then moves the files it produced to
# <results_dir> (named primary_<id>_<name>_visible_<format>) or to the explicit
# GC-map / mmap output paths.

use strict;
use warnings;
use File::Copy;

# SHIFT OUTFILE PATHS OFF ARGV
my $usage = "Usage: $0 <id> <results_dir> <working_dir> <mmap_infile> <kmer_size> "
          . "<mmap_outfile> <gc_map_outfile> <partition_by_depth> <format> "
          . "[<infile> <history_id>]... END [FilterReads options...]\n";
die($usage) unless @ARGV >= 9;
my ($id, $results_dir, $working_dir, $mmap_infile, $kmer,
    $mmap_outfile, $gc_map_outfile, $partition_by_depth, $format) = splice(@ARGV, 0, 9);

# Input files arrive as (path, history id) pairs terminated by the literal END.
# Test definedness rather than truth so a token such as "0" cannot end the loop early.
my @infiles     = ();
my @history_ids = ();
while (defined(my $infile = shift @ARGV)) {
    last if $infile eq 'END';
    my $hid = shift @ARGV;
    die("Input file $infile is missing its history id\n") unless defined $hid;
    die("Input file $hid does not exist ($infile)\n") unless -e $infile;
    push @infiles,     $infile;
    push @history_ids, $hid;
}

# PREPARE
unless (-d $working_dir) {
    mkdir($working_dir) or die("Cannot create working directory $working_dir: $!\n");
}
chdir($working_dir) or die("Cannot chdir to $working_dir: $!\n");

# Build the command as a list so no argument is re-parsed by a shell.
my @command = ('FilterReads');
if ($mmap_infile ne 'None') {
    # NOTE(review): out-mmap is also the name moved to $mmap_outfile after the
    # run, so with a preloaded mmap that move relocates this symlink -- confirm
    # this is the intended result.
    symlink($mmap_infile, "$working_dir/out-mmap")
        or die("Cannot symlink $mmap_infile to $working_dir/out-mmap: $!\n");
    push @command, '--load-kmer-mmap', '1';
}
# Inputs are passed in the order given so that the per-input index used in the
# "out-<i>-Artifact.fastq" names lines up with @history_ids.
# Assumes FilterReads numbers its inputs by command-line position -- TODO confirm.
for my $i (0 .. $#infiles) {
    my $link = "$working_dir/$i.fastq";
    symlink($infiles[$i], $link)
        or die("Cannot symlink $infiles[$i] to $link: $!\n");
    push @command, '--input-file', $link;
}
push @command, @ARGV, '--output-file', "$working_dir/out";

# RUN
# List-form pipe open: captures stdout like backticks did, without a shell.
open(my $pipe, '-|', @command) or die("Cannot run FilterReads: $!\n");
my $output = do { local $/; <$pipe> };
$output = '' unless defined $output;
close($pipe);    # sets $? to the exit status of FilterReads
if ($?) {
    my $status = $? >> 8;
    die("Kmernator died (exit status $status) while running command: @command\nOUTPUT:\n$output\n");
}

# MOVE OUTFILES
if ($kmer < 1) {
    # No k-mer pass: a single filtered read file is expected.
    my $dest = join("_", 'primary', $id, 'FilteredReads', 'visible', $format);
    move("out-in.fastq", "$results_dir/$dest")
        or die("Cannot move out-in.fastq to $results_dir/$dest: $!\n");
}
else {
    # Partitioned runs add a PartitionDepth component to each output name;
    # both kinds of run may also produce "-consensus-<n>" files.
    my $stem = $partition_by_depth > 0
        ? qr/MinDepth\d+-PartitionDepth\d+/
        : qr/MinDepth\d+/;
    move_read_files($working_dir, $results_dir, $id, $format,
        qr/\Aout-($stem(?:-consensus-\d+)?)\.fastq\z/);
    move("out-GC.txt", $gc_map_outfile)
        or die("Cannot move out-GC.txt to $gc_map_outfile: $!\n");
    move("out-mmap", $mmap_outfile)
        or die("Cannot move out-mmap to $mmap_outfile: $!\n");
}

# Artifact files are collected only when the one for the first input exists;
# in that case one per input is required.
if (-f "out-0-Artifact.fastq") {
    for my $i (0 .. $#infiles) {
        my $hid  = $history_ids[$i];
        my $dest = join("_", 'primary', $id, "Dataset-$hid-Artifact", 'visible', $format);
        move("out-$i-Artifact.fastq", "$results_dir/$dest")
            or die("Cannot move out-$i-Artifact.fastq to $results_dir/$dest: $!\n");
    }
}
exit;

# Move every file in $dir whose name matches $pattern into $dest_dir.
#   $dir      - directory to scan (also the current directory when called)
#   $dest_dir - directory receiving the renamed files
#   $id       - identifier embedded in each destination name
#   $format   - format suffix embedded in each destination name
#   $pattern  - compiled regex; capture group 1 becomes the dataset name
# Dies if the directory cannot be read or a move fails.
sub move_read_files {
    my ($dir, $dest_dir, $id, $format, $pattern) = @_;
    opendir(my $dh, $dir) or die("Cannot read directory $dir: $!\n");
    my @files = readdir($dh);
    closedir($dh);
    for my $file (@files) {
        next unless $file =~ $pattern;
        my $dest = join("_", 'primary', $id, $1, 'visible', $format);
        move($file, "$dest_dir/$dest")
            or die("Cannot move $file to $dest_dir/$dest: $!\n");
    }
    return;
}