# HG changeset patch # User edward-kirton # Date 1310696393 14400 # Node ID d9da256384e18f780ec156a497d6e2518c2662f1 Uploaded diff -r 000000000000 -r d9da256384e1 kmernator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmernator.xml Thu Jul 14 22:19:53 2011 -0400 @@ -0,0 +1,166 @@ + +kmernator_wrapper.pl +$logfile.id +$__new_file_path__ +$logfile.extra_files_path +$mmap_infile +$kmer_size +$mmap_outfile +$gc_map_outfile +$num_outfiles.partition_by_depth +$input.format +#for $i in $input.files +${i.file} ${i.file.hid} +#end for +END +--separate-outputs 0 +--mmap-input 1 +--save-kmer-mmap 1 +--build-partitions 1 +--gc-heat-map 1 +--log-file $logfile +--temp-dir $logfile.extra_files_path +#if $num_outfiles.select == 'one': +--max-kmer-output-depth $num_outfiles.max_kmer_output_depth +#else: +--partition-by-depth $num_outfiles.partition_by_depth +#end if +--min-passing-in-pair $min_passing_in_pair +--kmer-size $kmer_size +--format-output $format_output +--ignore-quality $ignore_quality +--filter-output $filter_output +--phix-output $phix_output +--min-read-length $min_read_length +--min-kmer-quality $min_kmer_quality +--min-quality-score $min_quality_score +--min-depth $min_depth +--bimodal-sigmas $bimodal_sigmas +--variant-sigmas $variant_sigmas +--periodic-singleton-purge $periodic_singleton_purge +--skip-artifact-filter $skip_artifact_filter +--artifact-match-length $artifact_match_length +--artifact-edit-distance $artifact_edit_distance +--mask-simple-repeats $mask_simple_repeats +--dedup-mode $dedup_mode +--dedup-single $dedup_single +--dedup-edit-distance $dedup_edit_distance +--dedup-start-offset $dedup_start_offset +--dedup-length $dedup_length + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + kmer_size > 0 + + + kmer_size > 0 + + + +**What It Does** + +Firstly, this tool may be used to remove sequencing library artifacts 
(adapter/linker sequences and PCR amplified reads). +If you specify the filter-output option, then reads that have at least a partial hit to the screens will be separated. +Otherwise only those reads which match the screens for the reads' entire length will be filtered. +You may wish to set the kmer size to 0 to avoid doing kmer filtering in this first pass. + +Secondly, this tool may be used to filter reads with low-abundance kmers. Such reads contain sequencing errors or the +reference sequence appears in such low abundance you probably cannot assemble it anyway. Reducing the dataset in this +way will make assemblies faster (or possible). Set the kmer size depending on read length (e.g. ~2/3 read length); +increasing the kmer size will require less RAM, run faster, and eliminate more error-containing reads. Also set the +min-depth option depending upon your expected depth of coverage. You may also set the max-kmer-output-depth option; +this is useful for eliminating ribosomal contaminants, for example. + +For metagenome or metatranscriptome sequencing projects, instead of specifying the max-kmer-output-depth, it may be +advantageous to partition the reads into several files based upon kmer abundance using the partition-by-depth option. + +**Author** + +Rob Egan (RSEgan@LBL.gov) + + diff -r 000000000000 -r d9da256384e1 kmernator_wrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmernator_wrapper.pl Thu Jul 14 22:19:53 2011 -0400 @@ -0,0 +1,87 @@ +#!/usr/bin/env perl + +# FilterReads (AKA Kmernator) wrapper for Galaxy. +# Kmernator by Rob Egan (RSEgan@LBL.gov) +# Wrapper for Galaxy supported by Edward Kirton (ESKirton@LBL.gov) +# Produced under funding from the United States Department of Energy Office of Science +# Freely distributable under the same license as Galaxy itself. 
use strict;
use warnings;
use File::Copy;

# Driver script: runs FilterReads on the supplied inputs inside a working
# directory and then moves the files it produced to their destinations.
#
# Positional arguments, consumed from @ARGV in this order:
#   id, results_dir, working_dir, mmap_infile, kmer, mmap_outfile,
#   gc_map_outfile, partition_by_depth, format,
#   zero or more (input file, history id) pairs, then the literal token END.
# Every argument after END is handed to FilterReads untouched.

# SHIFT OUTFILE PATHS OFF ARGV
die("Expected at least 9 positional arguments followed by input files and END\n")
    unless @ARGV >= 9;
my $id                 = shift @ARGV;
my $results_dir        = shift @ARGV;
my $working_dir        = shift @ARGV;
my $mmap_infile        = shift @ARGV;
my $kmer               = shift @ARGV;
my $mmap_outfile       = shift @ARGV;
my $gc_map_outfile     = shift @ARGV;
my $partition_by_depth = shift @ARGV;
my $format             = shift @ARGV;

my @infiles     = ();
my @history_ids = ();
# Test definedness rather than truth so a false-looking token cannot end the
# list early; only the END marker (or running out of arguments) stops it.
while (defined(my $infile = shift @ARGV)) {
    last if $infile eq 'END';
    my $hid = shift @ARGV;
    die("Missing history id for input file $infile\n")
        if !defined $hid or $hid eq 'END';
    die("Input file $hid does not exist ($infile)\n") unless -e $infile;
    push @infiles,     $infile;
    push @history_ids, $hid;
}

# PREPARE
unless (-d $working_dir) {
    mkdir($working_dir)
        or die("Unable to create working directory $working_dir: $!\n");
}
chdir($working_dir) or die("Unable to enter working directory $working_dir: $!\n");
for my $i (0 .. $#infiles) {
    my $link = "$working_dir/$i.fastq";
    _replace_symlink($infiles[$i], $link);
    # Option and value are separate list elements; unshift keeps the same
    # token order the tool received before (last input file first).
    unshift @ARGV, '--input-file', $link;
}
if ($mmap_infile ne 'None') {
    _replace_symlink($mmap_infile, "$working_dir/out-mmap");
    unshift @ARGV, '--load-kmer-mmap', '1';
}

# RUN
# List-form pipe open: no shell is involved, so arguments containing spaces
# or shell metacharacters reach FilterReads exactly as given.  As with the
# backtick form, only standard output is captured.
my @command = ('FilterReads', @ARGV, '--output-file', "$working_dir/out");
open(my $pipe, '-|', @command) or die("Unable to run FilterReads: $!\n");
my $output = do { local $/; <$pipe> };
$output = '' unless defined $output;
# close() on a pipe returns false when the child exits non-zero (or a
# system call fails), which is exactly the failure condition to report.
unless (close($pipe)) {
    die("Kmernator died while running command: @ARGV\nOUTPUT:\n$output\n");
}

# MOVE OUTFILES
if ($kmer < 1) {
    my $dest = join("_", 'primary', $id, 'FilteredReads', 'visible', $format);
    _move_or_die("out-in.fastq", "$results_dir/$dest");
}
else {
    # Accepted labels are MinDepth<n>[-PartitionDepth<n>][-consensus-<n>];
    # the PartitionDepth part is required only when partitioning by depth.
    # Files named out-<label>-dataset_<n>.fastq are not collected.
    my $label = $partition_by_depth > 0
        ? qr/MinDepth\d+-PartitionDepth\d+/
        : qr/MinDepth\d+/;
    _collect_outputs($working_dir, $results_dir, $id, $format,
        qr/^out-($label(?:-consensus-\d+)?)\.fastq$/);
    _move_or_die("out-GC.txt", $gc_map_outfile);
    _move_or_die("out-mmap",   $mmap_outfile);
}
# The presence of the first artifact file is taken to mean there is one
# artifact file per input, indexed in the same order as @infiles.
if (-f "out-0-Artifact.fastq") {
    for my $i (0 .. $#infiles) {
        my $hid  = $history_ids[$i];
        my $dest = join("_", 'primary', $id, "Dataset-$hid-Artifact", 'visible', $format);
        _move_or_die("out-$i-Artifact.fastq", "$results_dir/$dest");
    }
}
exit;

# Create symlink $link pointing at $target, replacing a stale link of the
# same name so an earlier run cannot leave the wrong input in place.
sub _replace_symlink {
    my ($target, $link) = @_;
    if (-l $link) {
        unlink($link) or die("Unable to remove stale link $link: $!\n");
    }
    symlink($target, $link) or die("Unable to link $target to $link: $!\n");
    return;
}

# Move $from to $to, dying with both paths and the system error on failure.
sub _move_or_die {
    my ($from, $to) = @_;
    move($from, $to) or die("Unable to move $from to $to: $!\n");
    return;
}

# Move every entry of $dir whose name matches $pattern into $results_dir as
# primary_<id>_<label>_visible_<format>, where <label> is the pattern's first
# capture group.  Entries are moved by bare name, so the current directory
# must be $dir (the script has already changed into it).
sub _collect_outputs {
    my ($dir, $results_dir, $id, $format, $pattern) = @_;
    opendir(my $dh, $dir) or die("Unable to read directory $dir: $!\n");
    my @files = readdir($dh);
    closedir($dh);
    foreach my $file (@files) {
        my ($label) = $file =~ $pattern;
        next unless defined $label;
        my $dest = join("_", 'primary', $id, $label, 'visible', $format);
        _move_or_die($file, "$results_dir/$dest");
    }
    return;
}