Mercurial > repos > devteam > fastq_trimmer_by_quality
changeset 3:c64d534a763c draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/fastq_trimmer_by_quality commit f2582539542b33240234e8ea6093e25d0aee9b6a
author | devteam |
---|---|
date | Sat, 30 Sep 2017 13:56:36 -0400 |
parents | 25c24379693a |
children | 8050e091e99b |
files | fastq_trimmer_by_quality.py fastq_trimmer_by_quality.xml |
diffstat | 2 files changed, 132 insertions(+), 257 deletions(-) [+] |
line wrap: on
line diff
# --- a/fastq_trimmer_by_quality.py Thu Feb 02 12:12:55 2017 -0500
# +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
# @@ -1,126 +0,0 @@
#
# NOTE: this script is deleted by the changeset; the tool XML below now calls
# the `gx-fastq-trimmer-by-quality` executable instead. The pre-removal
# content is shown here reformatted, with the fixes described in the
# docstrings and comments.
#
# Dan Blankenberg
"""Trim the ends of FASTQ reads using aggregate quality over a sliding window.

usage: fastq_trimmer_by_quality.py [options] input_file output_file
"""
from itertools import combinations
from operator import eq, ge, gt, le, lt, ne
from optparse import OptionParser


def mean(score_list):
    """Return the arithmetic mean of *score_list* as a float.

    Raises ZeroDivisionError for an empty list; callers in this module guard
    against empty windows before aggregating.
    """
    return float(sum(score_list)) / float(len(score_list))


# Maps the --aggregation_action choice to the callable applied to a window.
ACTION_METHODS = {'min': min, 'max': max, 'sum': sum, 'mean': mean}

# Maps the --score_comparison choice to the matching binary predicate.
_COMPARISONS = {'>': gt, '>=': ge, '==': eq, '<': lt, '<=': le, '!=': ne}


def compare(aggregated_value, operator, threshold_value):
    """Return ``aggregated_value <operator> threshold_value``.

    *operator* is one of '>', '>=', '==', '<', '<=' or '!='. Any other value
    yields None (falsy), exactly as the original if/elif chain did.
    """
    comparison = _COMPARISONS.get(operator)
    if comparison is None:
        return None
    return comparison(aggregated_value, threshold_value)


def exclude(value_list, exclude_indexes):
    """Return a copy of *value_list* without the positions in *exclude_indexes*.

    Indexes that fall outside the list are ignored.
    """
    skipped = frozenset(exclude_indexes)
    return [val for i, val in enumerate(value_list) if i not in skipped]


def exclude_and_compare(aggregate_action, aggregate_list, operator, threshold_value, exclude_indexes=None):
    """Return True when the window passes, with or without excluded bases.

    The window passes when it is empty, when the aggregate of the full window
    satisfies the comparison, or when that holds for the window after removing
    any one of the index sets in *exclude_indexes* (an iterable of index
    lists). Returns False otherwise.
    """
    if not aggregate_list or compare(aggregate_action(aggregate_list), operator, threshold_value):
        return True
    for index_set in exclude_indexes or ():
        remaining = exclude(aggregate_list, index_set)
        if not remaining or compare(aggregate_action(remaining), operator, threshold_value):
            return True
    return False


def _build_exclude_indexes(exclude_count, window_size):
    """List every set of 1..min(exclude_count, window_size) window indexes."""
    largest = min(exclude_count, window_size)
    return [
        list(index_set)
        for size in range(1, largest + 1)
        for index_set in combinations(range(window_size), size)
    ]


def _trim_read(fastq_read, options, action, exclude_window_indexes):
    """Trim *fastq_read* at each end named in options.trim_ends; return it."""
    for trim_end in options.trim_ends:
        # Re-read the scores each pass: the previous pass may have sliced the read.
        quality_list = fastq_read.get_decimal_quality_scores()
        if trim_end == '5':
            lwindow_position = 0  # left position of window
            while True:
                if lwindow_position >= len(quality_list):
                    # No window passed: the whole read is trimmed away.
                    fastq_read.sequence = ''
                    fastq_read.quality = ''
                    break
                window = quality_list[lwindow_position:lwindow_position + options.window_size]
                if exclude_and_compare(action, window, options.score_comparison, options.quality_score, exclude_window_indexes):
                    fastq_read = fastq_read.slice(lwindow_position, None)
                    break
                lwindow_position += options.window_step
        else:
            rwindow_position = len(quality_list)  # right position of window
            while True:
                lwindow_position = rwindow_position - options.window_size  # left position of window
                if rwindow_position <= 0 or lwindow_position < 0:
                    # A full window no longer fits: the whole read is trimmed away.
                    fastq_read.sequence = ''
                    fastq_read.quality = ''
                    break
                window = quality_list[lwindow_position:rwindow_position]
                if exclude_and_compare(action, window, options.score_comparison, options.quality_score, exclude_window_indexes):
                    fastq_read = fastq_read.slice(None, rwindow_position)
                    break
                rwindow_position -= options.window_step
    return fastq_read


def _parse_args():
    """Parse and validate the command line; return (options, args)."""
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger', choices=('sanger', 'cssanger', 'solexa', 'illumina'), help='FASTQ variant type')
    parser.add_option('-s', '--window_size', type="int", dest='window_size', default=1, help='Window size')
    parser.add_option('-t', '--window_step', type="int", dest='window_step', default=1, help='Window step')
    parser.add_option('-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5', '3', '53', '35'), help='Ends to Trim')
    parser.add_option('-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min', 'max', 'sum', 'mean'), help='Aggregate action for window')
    parser.add_option('-x', '--exclude_count', type="int", dest='exclude_count', default=0, help='Maximum number of bases to exclude from the window during aggregation')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>', '>=', '==', '<', '<=', '!='), help='Keep read when aggregate score is')
    parser.add_option('-q', '--quality_score', type="float", dest='quality_score', default=0.0, help='Quality Score')
    parser.add_option("-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    if options.window_size < 1:
        parser.error('You must specify a strictly positive window size')

    if options.window_step < 1:
        parser.error('You must specify a strictly positive step size')

    return options, args


def main():
    """Trim every read of the input FASTQ file and write the result.

    Reads of zero length after trimming are dropped unless --keep_zero_length
    is given. A summary of processed and excluded reads is printed to stdout.
    """
    # Imported here so the pure helpers above stay importable without the
    # Galaxy sequence utilities installed.
    from galaxy_utils.sequence.fastq import fastqReader, fastqWriter

    options, args = _parse_args()

    # Exhaustive list of window index sets that may be excluded from aggregation.
    exclude_window_indexes = _build_exclude_indexes(options.exclude_count, options.window_size)
    action = ACTION_METHODS[options.aggregation_action]

    num_reads = None
    num_reads_excluded = 0
    # NOTE(review): 'wb' is kept from the original; confirm it matches what
    # fastqWriter writes if this is ever run under Python 3.
    out = fastqWriter(open(args[1], 'wb'), format=options.format)
    try:
        with open(args[0]) as in_handle:
            for num_reads, fastq_read in enumerate(fastqReader(in_handle, format=options.format)):
                fastq_read = _trim_read(fastq_read, options, action, exclude_window_indexes)
                if options.keep_zero_length or len(fastq_read):
                    out.write(fastq_read)
                else:
                    num_reads_excluded += 1
    finally:
        # Close the writer even when reading or trimming fails part-way.
        out.close()

    # Single-argument print() calls behave identically on Python 2 and 3.
    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." % num_reads_excluded)


if __name__ == "__main__":
    main()
--- a/fastq_trimmer_by_quality.xml Thu Feb 02 12:12:55 2017 -0500 +++ b/fastq_trimmer_by_quality.xml Sat Sep 30 13:56:36 2017 -0400 @@ -1,130 +1,131 @@ -<tool id="fastq_quality_trimmer" name="FASTQ Quality Trimmer" version="1.0.1"> - <description>by sliding window</description> - <requirements> - <requirement type="package" version="1.0.1">galaxy_sequence_utils</requirement> - </requirements> - <command>python '$__tool_directory__/fastq_trimmer_by_quality.py' '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s $window_size - -t $step_size -e $trim_ends -a $aggregation_action -x $exclude_count -c '$score_comparison' -q $quality_score - #if $keep_zero_length: - -k - #end if - </command> - <inputs> - <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File"/> - <param name="keep_zero_length" label="Keep reads with zero length" type="boolean" checked="false"/> - <param name="trim_ends" type="select" label="Trim ends"> - <option value="53" selected="True">5' and 3'</option> - <option value="5">5' only</option> - <option value="3">3' only</option> - </param> - <param name="window_size" type="integer" value="1" label="Window size"/> - <param name="step_size" type="integer" value="1" label="Step Size" /> - <param name="exclude_count" label="Maximum number of bases to exclude from the window during aggregation" value="0" type="integer" /> - <param name="aggregation_action" type="select" label="Aggregate action for window"> - <option value="min" selected="True">min score</option> - <option value="max">max score</option> - <option value="sum">sum of scores</option> - <option value="mean">mean of scores</option> - </param> - <param name="score_comparison" type="select" label="Trim until aggregate score is"> - <sanitizer> - <valid initial="none"> - <add value="<>=!"/> <!-- only allow lt, gt, e, le, ge, ne for this parameter; will be single-quote escaped on commandline --> - </valid> - </sanitizer> - <option 
value=">">></option> - <option value=">=" selected="true">>=</option> - <option value="==">==</option> - <option value="!=">!=</option> - <option value="<"><</option> - <option value="<="><=</option> - </param> - <param name="quality_score" label="Quality Score" value="0" type="float" /> - </inputs> - <outputs> - <data name="output_file" format_source="input_file" /> - </outputs> - <tests> - <test> - <!-- Trim until window size 1 >= 20;both ends --> - <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> - <param name="keep_zero_length" value="false" /> - <param name="trim_ends" value="53"/> - <param name="window_size" value="1"/> - <param name="step_size" value="1"/> - <param name="exclude_count" value="0"/> - <param name="aggregation_action" value="min"/> - <param name="score_comparison" value=">="/> - <param name="quality_score" value="20"/> - <output name="output_file" file="sanger_full_range_quality_trimmed_out_1.fastqsanger" /> - </test> - <test> - <!-- Trim until window size 1 >= 20; 5' end only --> - <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> - <param name="keep_zero_length" value="false" /> - <param name="trim_ends" value="5"/> - <param name="window_size" value="1"/> - <param name="step_size" value="1"/> - <param name="exclude_count" value="0"/> - <param name="aggregation_action" value="min"/> - <param name="score_comparison" value=">="/> - <param name="quality_score" value="20"/> - <output name="output_file" file="sanger_full_range_quality_trimmed_out_2.fastqsanger" /> - </test> - <test> - <!-- Trim until window size 1 >= 20; 3' end only --> - <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> - <param name="keep_zero_length" value="false" /> - <param name="trim_ends" value="3"/> - <param name="window_size" value="1"/> - <param name="step_size" value="1"/> - <param name="exclude_count" value="0"/> - 
<param name="aggregation_action" value="min"/> - <param name="score_comparison" value=">="/> - <param name="quality_score" value="20"/> - <output name="output_file" file="sanger_full_range_quality_trimmed_out_3.fastqsanger" /> - </test> - <test> - <!-- Trim until window size 2 >= 1;both ends, 1 deviant score --> - <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> - <param name="keep_zero_length" value="false" /> - <param name="trim_ends" value="53"/> - <param name="window_size" value="2"/> - <param name="step_size" value="1"/> - <param name="exclude_count" value="1"/> - <param name="aggregation_action" value="min"/> - <param name="score_comparison" value=">="/> - <param name="quality_score" value="1"/> - <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" /> - </test> - <test> - <!-- Trim entire sequences; keep empty reads --> - <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> - <param name="keep_zero_length" value="true" /> - <param name="trim_ends" value="53"/> - <param name="window_size" value="1"/> - <param name="step_size" value="1"/> - <param name="exclude_count" value="0"/> - <param name="aggregation_action" value="min"/> - <param name="score_comparison" value=">="/> - <param name="quality_score" value="999"/> - <output name="output_file" file="sanger_full_range_empty_reads.fastqsanger" /> - </test> - <test> - <!-- Trim entire sequences; discard empty reads --> - <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> - <param name="keep_zero_length" value="false" /> - <param name="trim_ends" value="53"/> - <param name="window_size" value="1"/> - <param name="step_size" value="1"/> - <param name="exclude_count" value="0"/> - <param name="aggregation_action" value="min"/> - <param name="score_comparison" value=">="/> - <param name="quality_score" value="999"/> - <output 
name="output_file" file="empty_file.dat" /> - </test> - </tests> - <help> +<tool id="fastq_quality_trimmer" name="FASTQ Quality Trimmer" version="1.1.1"> + <description>by sliding window</description> + <requirements> + <requirement type="package" version="1.1.1">galaxy_sequence_utils</requirement> + </requirements> + <command><![CDATA[ +gx-fastq-trimmer-by-quality '$input_file' '$output_file' -f '${input_file.extension[len('fastq'):]}' -s $window_size +-t $step_size -e $trim_ends -a $aggregation_action -x $exclude_count -c '$score_comparison' -q $quality_score +#if $keep_zero_length: + -k +#end if + ]]></command> + <inputs> + <param name="input_file" type="data" format="fastqsanger,fastqcssanger,fastqsanger.gz,fastqcssanger.gz,fastqsanger.bz2,fastqcssanger.bz2" label="FASTQ File"/> + <param name="keep_zero_length" type="boolean" checked="false" label="Keep reads with zero length"/> + <param name="trim_ends" type="select" label="Trim ends"> + <option value="53" selected="true">5' and 3'</option> + <option value="5">5' only</option> + <option value="3">3' only</option> + </param> + <param name="window_size" type="integer" value="1" label="Window size"/> + <param name="step_size" type="integer" value="1" label="Step size" /> + <param name="exclude_count" type="integer" value="0" label="Maximum number of bases to exclude from the window during aggregation" /> + <param name="aggregation_action" type="select" label="Aggregate action for window"> + <option value="min" selected="true">min score</option> + <option value="max">max score</option> + <option value="sum">sum of scores</option> + <option value="mean">mean of scores</option> + </param> + <param name="score_comparison" type="select" label="Trim until aggregate score is"> + <sanitizer> + <valid initial="none"> + <add value="<>=!"/> <!-- only allow lt, gt, e, le, ge, ne for this parameter; will be single-quote escaped on commandline --> + </valid> + </sanitizer> + <option value=">">></option> + <option value=">=" 
selected="true">>=</option> + <option value="==">==</option> + <option value="!=">!=</option> + <option value="<"><</option> + <option value="<="><=</option> + </param> + <param name="quality_score" type="float" value="0" label="Quality score" /> + </inputs> + <outputs> + <data name="output_file" format_source="input_file" /> + </outputs> + <tests> + <!-- Trim until window size 1 >= 20;both ends --> + <test> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="false" /> + <param name="trim_ends" value="53"/> + <param name="window_size" value="1"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="0"/> + <param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + <param name="quality_score" value="20"/> + <output name="output_file" file="sanger_full_range_quality_trimmed_out_1.fastqsanger" ftype="fastqsanger" /> + </test> + <!-- Trim until window size 1 >= 20; 5' end only --> + <test> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="false" /> + <param name="trim_ends" value="5"/> + <param name="window_size" value="1"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="0"/> + <param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + <param name="quality_score" value="20"/> + <output name="output_file" file="sanger_full_range_quality_trimmed_out_2.fastqsanger" ftype="fastqsanger" /> + </test> + <!-- Trim until window size 1 >= 20; 3' end only --> + <test> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="false" /> + <param name="trim_ends" value="3"/> + <param name="window_size" value="1"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="0"/> + 
<param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + <param name="quality_score" value="20"/> + <output name="output_file" file="sanger_full_range_quality_trimmed_out_3.fastqsanger" ftype="fastqsanger" /> + </test> + <!-- Trim until window size 2 >= 1;both ends, 1 deviant score --> + <test> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="false" /> + <param name="trim_ends" value="53"/> + <param name="window_size" value="2"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="1"/> + <param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + <param name="quality_score" value="1"/> + <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + </test> + <!-- Trim entire sequences; keep empty reads --> + <test> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="true" /> + <param name="trim_ends" value="53"/> + <param name="window_size" value="1"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="0"/> + <param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + <param name="quality_score" value="999"/> + <output name="output_file" file="sanger_full_range_empty_reads.fastqsanger" ftype="fastqsanger" /> + </test> + <!-- Trim entire sequences; discard empty reads --> + <test> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="false" /> + <param name="trim_ends" value="53"/> + <param name="window_size" value="1"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="0"/> + <param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + 
<param name="quality_score" value="999"/> + <output name="output_file" file="empty_file.dat" ftype="fastqsanger" /> + </test> + </tests> + <help><![CDATA[ **What it does** This tool allows you to trim the ends of reads based upon the aggregate value of quality scores found within a sliding window; a sliding window of size 1 is equivalent to 'simple' trimming of the ends. @@ -138,8 +139,8 @@ .. class:: warningmark Trimming a color space read will cause any adapter base to be lost. - </help> - <citations> - <citation type="doi">10.1093/bioinformatics/btq281</citation> - </citations> + ]]></help> + <citations> + <citation type="doi">10.1093/bioinformatics/btq281</citation> + </citations> </tool>