changeset 0:d9da256384e1 default tip

Uploaded
author edward-kirton
date Thu, 14 Jul 2011 22:19:53 -0400
parents
children
files kmernator.xml kmernator_wrapper.pl
diffstat 2 files changed, 253 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kmernator.xml	Thu Jul 14 22:19:53 2011 -0400
@@ -0,0 +1,166 @@
+<tool id='kmernator' name='kmernator' version='1.0.1' description='Filter reads by kmer frequency' force_history_refresh='True'>
+<command interpreter="perl">kmernator_wrapper.pl
+$logfile.id
+$__new_file_path__
+$logfile.extra_files_path
+$mmap_infile
+$kmer_size
+$mmap_outfile
+$gc_map_outfile
+$num_outfiles.partition_by_depth
+$input.format
+#for $i in $input.files
+${i.file} ${i.file.hid}
+#end for
+END
+--separate-outputs 0
+--mmap-input 1
+--save-kmer-mmap 1
+--build-partitions 1
+--gc-heat-map 1
+--log-file $logfile
+--temp-dir $logfile.extra_files_path
+#if $num_outfiles.select == 'one':
+--max-kmer-output-depth $num_outfiles.max_kmer_output_depth
+#else:
+--partition-by-depth $num_outfiles.partition_by_depth
+#end if
+--min-passing-in-pair $min_passing_in_pair
+--kmer-size $kmer_size
+--format-output $format_output
+--ignore-quality $ignore_quality
+--filter-output $filter_output
+--phix-output $phix_output
+--min-read-length $min_read_length
+--min-kmer-quality $min_kmer_quality
+--min-quality-score $min_quality_score
+--min-depth $min_depth
+--bimodal-sigmas $bimodal_sigmas
+--variant-sigmas $variant_sigmas
+--periodic-singleton-purge $periodic_singleton_purge
+--skip-artifact-filter $skip_artifact_filter
+--artifact-match-length $artifact_match_length
+--artifact-edit-distance $artifact_edit_distance
+--mask-simple-repeats $mask_simple_repeats
+--dedup-mode $dedup_mode
+--dedup-single $dedup_single
+--dedup-edit-distance $dedup_edit_distance
+--dedup-start-offset $dedup_start_offset
+--dedup-length $dedup_length
+</command>
+<inputs>
+    <!-- INPUT FILES -->
+    <conditional name="input">
+        <param name="format" type="select" label="Input format">
+            <option value="fastqsanger">Fastq-Sanger</option>
+            <option value="fastqillumina">Fastq-Illumina</option>
+        </param>
+        <when value="fastqsanger">
+            <repeat name="files" title="Read Files">
+                <param name="file" type="data" format="fastqsanger" label="FastqSanger File"/>
+            </repeat>
+        </when>
+        <when value="fastqillumina">
+            <repeat name="files" title="Read Files">
+                <param name="file" type="data" format="fastqillumina" label="FastqIllumina File"/>
+            </repeat>
+        </when>
+    </conditional>
+    <param name="ignore_quality" type="boolean" truevalue="1" falsevalue="0" checked="false" label="ignore the quality scores" help="to save memory or if they are untrusted" />
+    <param name="mmap_infile" type="data" format="mmap" optional="true" label="mmap file" help="Instead of generating kmer spectrum, load an existing one" />
+
+    <!-- OUTPUT OPTIONS -->
+    <param name="format_output" type="select" display="radio" label="Output format">
+        <option value="0" selected="true">Fastq masked</option>
+        <!-- <option value="1">Fasta</option> -->
+        <option value="2">Fastq unmasked</option>
+        <!-- <option value="3">Fasta unmasked</option> -->
+    </param>
+    <conditional name="num_outfiles">
+        <param name="select" type="select" label="Number of output files">
+            <option value="one">One</option>
+            <option value="many">Many; partition by kmer depth</option>
+        </param>
+        <when value="one">
+            <param name="max_kmer_output_depth" type="integer" value="-1" label="maximum number of times a kmer will be output among the selected reads (to reduce the redundancy of the output)" help="-1 = no maximum" />
+            <param name="partition_by_depth" type="hidden" value="-1" />
+        </when>
+        <when value="many">
+            <param name="partition_by_depth" type="integer" value="512" label="Trimmed reads will be output into powers-of-two buckets whereby average k-mer depth is &gt;= this size" help="Outputs are nonredundant; useful for meta-genome/-transcriptome sequencing" />
+        </when>
+    </conditional>
+    <param name="filter_output" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Save artifact filtered reads in a separate file?" help="If not set, then affected reads will be trimmed and then output normally." />
+    <param name="phix_output" type="boolean" truevalue="1" falsevalue="0" checked="false" label="if set, artifact filter also screens for PhiX174, and any matching reads will be output to Artifacts file" help="Requires filter-output option (above)" />
+    <!-- NOT IMPLEMENTED: -separate-outputs 1
+    For multiple infiles:
+    If set, each input (plus consensus) will generate a new outputfile.
+    If set to 0, all input files will be merged into one output file.
+    -->
+
+    <!-- FILTER OPTIONS -->
+    <param name="skip_artifact_filter" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Skip artifact filter" help="Skips homo-polymer, primer-dimer and duplicated fragment pair filtering" />
+    <param name="kmer_size" type="integer" value="21" label="kmer size" help="A size of 0 will skip k-mer calculations" />
+    <param name="min_depth" type="integer" value="2" label="Minimum depth for a solid kmer" help="Requires nonzero kmer size" />
+
+    <param name="min_passing_in_pair" type="select" display="radio" label="Number of reads in a pair that must pass filters">
+        <option value="1">only 1 read in a pair must pass filters</option>
+        <option value="2">both reads in a pair must pass filters</option>
+    </param>
+    <param name="min_read_length" type="integer" value="25" label="minimum (trimmed) read length of selected reads." help="Special values: 0=no minimum; 1=full read length" />
+    <param name="min_kmer_quality" type="float" value="0.10000000000000001" label="minimum quality-adjusted kmer probability" help="0.0-1.0" />
+    <param name="min_quality_score" type="integer" value="5" label="minimum quality score over entire kmer" />
+    <!-- min_depth is declared once, under FILTER OPTIONS; a second declaration here reused the same name -->
+    <!-- skip_artifact_filter is declared once, under FILTER OPTIONS; a second declaration here reused the same name -->
+
+    <!-- ADVANCED OPTIONS -->
+    <param name="bimodal_sigmas" type="float" value="-1" label="Detect bimodal kmer-signatures across reads and trim at transition point if the two means are separated by bimodal-sigmas * stdDev (2.0 to 3.0 suggested)." help="disabled if &lt; 0.0" />
+    <param name="variant_sigmas" type="float" value="-1" label="Detect and purge kmer-variants if &gt;= variant-sigmas * Poisson-stdDev (2.0-3.0 suggested)." help="Disabled if &lt; 0.0" />
+    <param name="periodic_singleton_purge" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Purge singleton memory structure every # of reads" />
+    <param name="artifact_match_length" type="integer" value="24" label="Kmer match length to known artifact sequences" />
+    <param name="artifact_edit_distance" type="integer" value="2" label="edit-distance to apply to artifact-match-length matches to known artifacts" />
+    <param name="mask_simple_repeats" type="boolean" truevalue="1" falsevalue="0" checked="true" label="if filtering artifacts, also mask simple repeats" />
+    <param name="dedup_mode" type="select" label="De-duplication mode" display="radio" help="If de-duplication is performed, additional consensus outfiles will be generated">
+        <option value="0">No fragment de-duplication</option>
+        <option value="1" selected="true">single orientation will collapse to consensus (AB and BA are separated)</option>
+        <option value="2">both orientations will collapse (AB and BA are the same)</option>
+    </param>
+    <param name="dedup_single" type="select" display="radio" label="De-duplication for single reads">
+        <option value="0" selected="true">no single read de-duplication</option>
+        <option value="1">perform single read de-duplication</option>
+    </param>
+    <param name="dedup_edit_distance" type="integer" value="0" label="De-duplication edit distance" help="-1 = no fragment de-duplication" />
+    <param name="dedup_start_offset" type="integer" value="0" label="de-duplication start offset to find unique fragments" help="must be multiple of 4" />
+    <param name="dedup_length" type="integer" value="16" label="de-duplication length to find unique fragments" help="must be multiple of 4 (doubled when in single-end mode)" />
+</inputs>
+<outputs>
+    <data name="logfile" format="txt" />
+    <data name="gc_map_outfile" format="tabular" label="GC heat map">
+        <filter>kmer_size > 0</filter>
+    </data>
+    <data name="mmap_outfile" format="mmap" label="mmap file">
+        <filter>kmer_size > 0</filter>
+    </data>
+</outputs>
+<help>
+**What It Does**
+
+Firstly, this tool may be used to remove sequencing library artifacts (adapter/linker sequences and PCR amplified reads).
+If you specify the filter-output option, then reads that have at least a partial hit to the screens will be separated.
+Otherwise only those reads which match the screens for the reads' entire length will be filtered.
+You may wish to set the kmer size to 0 to avoid doing kmer filtering in this first-pass.
+
+Secondly, this tool may be used to filter reads with low-abundance kmers.  Such reads contain sequencing errors or the
+reference sequence appears in such low abundance you probably cannot assemble it anyway.  Reducing the dataset in this
+way will make assemblies faster (or possible).  Set the kmer size depending on read length (e.g. ~2/3 read length);
+increasing the kmer size will require less RAM, run faster, and eliminate more error-containing reads.  Also set the
+min-depth option depending upon your expected depth of coverage.  You may also set the max-kmer-output-depth option;
+this is useful for eliminating ribosomal contaminants, for example.
+
+For metagenome or metatranscriptome sequencing projects, instead of specifying the max-kmer-output-depth, it may be
+advantageous to partition the reads into several files based upon kmer abundance using the partition-by-depth option.
+
+**Author**
+
+Rob Egan (RSEgan@LBL.gov)
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kmernator_wrapper.pl	Thu Jul 14 22:19:53 2011 -0400
@@ -0,0 +1,87 @@
+#!/usr/bin/env perl
+
+# FilterReads (AKA Kmernator) wrapper for Galaxy.
+# Kmernator by Rob Egan (RSEgan@LBL.gov)
+# Wrapper for Galaxy supported by Edward Kirton (ESKirton@LBL.gov)
+# Produced under funding from the United States Department of Energy Office of Science
+# Freely distributable under the same license as Galaxy itself.
+#
+# Usage:
+#   kmernator_wrapper.pl ID RESULTS_DIR WORKING_DIR MMAP_INFILE KMER_SIZE MMAP_OUTFILE
+#       GC_MAP_OUTFILE PARTITION_BY_DEPTH FORMAT (INFILE HISTORY_ID)... END [FilterReads options]
+#
+# The string None for MMAP_INFILE means no kmer mmap is loaded.  Everything after END is
+# handed to FilterReads unchanged.  FilterReads runs inside WORKING_DIR; its fastq outputs
+# are then moved to RESULTS_DIR/primary_ID_NAME_visible_FORMAT, and the GC map and kmer
+# mmap are moved to GC_MAP_OUTFILE and MMAP_OUTFILE (only when KMER_SIZE is at least 1).
+
+use strict;
+use warnings;
+use File::Copy;
+
+my $usage="Usage: $0 id results_dir working_dir mmap_infile kmer_size mmap_outfile gc_map_outfile partition_by_depth format (infile hid)... END [FilterReads options]\n";
+
+# SHIFT OUTFILE PATHS OFF ARGV
+my ($id, $results_dir, $working_dir, $mmap_infile, $kmer)=splice(@ARGV, 0, 5);
+my ($mmap_outfile, $gc_map_outfile, $partition_by_depth, $format)=splice(@ARGV, 0, 4);
+die($usage) unless defined $format;
+my @infiles=();
+my @history_ids=();
+# Test with defined() so that a file literally named "0" cannot end the list early.
+while (defined(my $infile=shift @ARGV)) {
+    last if $infile eq 'END';
+    my $hid=shift @ARGV;
+    die("Input file $infile has no history id\n") unless defined $hid;
+    die("Input file $hid does not exist ($infile)\n") unless -e $infile;
+    push @infiles, $infile;
+    push @history_ids, $hid;
+}
+
+# PREPARE
+unless (-d $working_dir) { mkdir($working_dir) or die("Cannot create $working_dir: $!\n"); }
+chdir($working_dir) or die("Cannot chdir to $working_dir: $!\n");
+for my $i (0 .. $#infiles) {
+    symlink($infiles[$i],"$working_dir/$i.fastq") or die("Cannot link $infiles[$i]: $!\n");
+    unshift @ARGV, '--input-file', "$working_dir/$i.fastq";
+}
+if ($mmap_infile ne 'None') {
+    # NOTE(review): out-mmap is a symlink to the input here and is later moved to MMAP_OUTFILE; confirm that is intended
+    symlink($mmap_infile, "$working_dir/out-mmap") or die("Cannot link $mmap_infile: $!\n");
+    unshift @ARGV, '--load-kmer-mmap', 1;
+}
+
+# RUN
+# List-form pipe open bypasses the shell, so arguments containing spaces or shell
+# metacharacters reach FilterReads intact.
+open(my $pipe, '-|', 'FilterReads', @ARGV, '--output-file', "$working_dir/out") or die("Cannot run FilterReads: $!\n");
+my $output=do { local $/; <$pipe> };
+# close() returns false when FilterReads exits with a nonzero status
+close($pipe) or die("Kmernator died while running command: @ARGV\nOUTPUT:\n$output\n");
+
+# MOVE OUTFILES
+my $dest_for=sub { return "$results_dir/".join("_",'primary',$id,$_[0],'visible',$format) };
+if ($kmer < 1) {
+    move("out-in.fastq",$dest_for->('FilteredReads')) or die("Cannot move out-in.fastq: $!\n");
+} else {
+    # Partitioned runs carry an extra PartitionDepth component in each output name.
+    my $stem=$partition_by_depth > 0 ? qr/MinDepth\d+\-PartitionDepth\d+/ : qr/MinDepth\d+/;
+    opendir(my $dh, $working_dir) or die("Cannot read $working_dir: $!\n");
+    my @files=readdir($dh);
+    closedir($dh);
+    foreach my $file ( @files ) {
+        # Accept the per-depth output and the optional consensus outputs.
+        next unless $file =~ /^out\-(${stem}(?:\-consensus\-\d+)?)\.fastq$/;
+        my $name=$1;
+        move($file,$dest_for->($name)) or die("Cannot move $file: $!\n");
+    }
+    move("out-GC.txt",$gc_map_outfile) or die("Cannot move out-GC.txt: $!\n");
+    move("out-mmap",$mmap_outfile) or die("Cannot move out-mmap: $!\n");
+}
+# Artifact files are numbered by input position; name them after the history ids instead.
+if ( -f "out-0-Artifact.fastq" ) {
+    for my $i (0 .. $#infiles) {
+        my $name="Dataset-$history_ids[$i]-Artifact";
+        move("out-$i-Artifact.fastq", $dest_for->($name)) or die("Cannot move out-$i-Artifact.fastq: $!\n");
+    }
+}
+exit;