Mercurial > repos > richard-burhans > rdeval
diff rdeval.xml @ 5:7cfeba6facd1 draft
planemo upload for repository https://github.com/vgl-hub/rdeval commit d55c4b3d6b91d0418950ed6b7806ef779a916099
author | richard-burhans |
---|---|
date | Fri, 02 May 2025 22:49:30 +0000 |
parents | 7bf95986aaa4 |
children | 24b05d3958d7 |
line wrap: on
line diff
--- a/rdeval.xml Wed Apr 23 19:31:12 2025 +0000 +++ b/rdeval.xml Fri May 02 22:49:30 2025 +0000 @@ -15,46 +15,62 @@ #end for #if $output_options.output_type.type_selector == "combined_reads" ln -s '$reads_outfile' 'output.${output_type.format_selector}' && - #end if - rdeval --input-reads #echo " ".join([f"'{input}'" for $input in $mangled_inputs]) + #end if + rdeval --input-reads + #for $input in $mangled_inputs + '$input' + #end for #if $expected_gsize '$expected_gsize' #end if - #if $input_filter.filter_selector == "exclude_file" - --exclude-list '$exclude_file' - #else if $input_filter.filter_selector == "include_file" - --include-list '$include_file' - #end if - #if $filter - --filter '$filter' + #if $input_filter.include_list + --include-list '$input_filter.include_list' + #end if + #if $input_filter.exclude_list + --exclude-list '$input_filter.exclude_list' #end if - --sample '$sample' - #if $input_subsample.seed_selector == "yes" - --random-seed '$random_seed' + #set $filter_exp_type = $input_filter.filter_expression.filter_selector + #if $filter_exp_type != "no_exp" + #set $l_exp = "l" + str($input_filter.filter_expression.length_comparison) + str($input_filter.filter_expression.length_value) + #set $q_exp = "q" + str($input_filter.filter_expression.quality_comparison) + str($input_filter.filter_expression.quality_value) + #if $filter_exp_type == "l_exp" + #set $filter_exp = $l_exp + #else if $filter_exp_type == "q_exp" + #set $filter_exp = $q_exp + #else if $filter_exp_type == "lq_exp" + #set $filter_exp = $l_exp + str($input_filter.filter_expression.exp_operator) + $q_exp + #end if + --filter '$filter_exp' + #end if + #if int($input_subsample.sample) != 1 + --sample '$input_subsample.sample' + #end if + #if $input_subsample.random_seed.seed_selector == "yes" + --random-seed '$input_subsample.random_seed.random_seed' #end if #if $input_compress.compress_selector == "yes" - --homopolymer-compress '$homopolymer_compress' + --homopolymer-compress 
'$input_compress.homopolymer_compress' #end if - #if $stats_flavor.flavor_selector == "stats" - #if $sequence_report + #set $stats_type = $output_options.stats_flavor.flavor_selector + #if $stats_type == "stats" + #if $output_options.stats_flavor.sequence_report --sequence-report #end if - #else if $stats_flavor.flavor_selector == "quality" - --quality '$quality' - #else if $stats_flavor.flavor_selector == "size" - --out-size '$out_size' + #else if $stats_type == "quality" + --quality '$output_options.stats_flavor.quality' + #else if $stats_type == "size" + --out-size '$output_options.stats_flavor.out_size' #end if - #if $output_options.output_type.type_selector == "rd_file" - #if $md5 + #set $output_type = $output_options.output_type.type_selector + #if $output_type == "rd_file" + #if $output_options.output_type.md5 --md5 #end if -o output.rd - #else if $output_options.output_type.type_selector == "combined_reads" - -o 'output.${output_type.format_selector}' + #else if $output_type == "combined_reads" + -o 'output.${output_options.output_type.format_selector}' #end if - #if $verbose --verbose - #end if --tabular --threads \${GALAXY_SLOTS:-2} > '$stats_outfile' @@ -63,21 +79,56 @@ <param argument="--input-reads" type="data" format="bam,cram,fasta,fasta.gz,fastq,fastq.gz" multiple="true" label="Input dataset" help="BAM, CRAM, FASTA, FASTQ, or RD files"/> <param name="expected_gsize" type="integer" label="Expected Genome Size" optional="true" help="Integer (e.g., 3000000000 for human)."/> <section name="input_filter" title="Filter input reads" expanded="false"> - <conditional name="file_filter"> - <param name="filter_selector" type="select" label="Use an exclude or include file"> - <option value="no_file" selected="true">no</option> - <option value="exclude_file">Use an exclude file</option> - <option value="include_file">Use an include file</option> + <param argument="--include-list" type="data" format="txt" optional="true" label="File containing headers to 
include"/> + <param argument="--exclude-list" type="data" format="txt" optional="true" label="File containing headers to exclude"/> + <conditional name="filter_expression"> + <param name="filter_selector" type="select" label="Filter using length and/or quality" help="filter help"> + <option value="no_exp" selected="true">No</option> + <option value="l_exp">Read length</option> + <option value="q_exp">Average read quality</option> + <option value="lq_exp">Both read length and average read quality</option> </param> - <when value="no_file"/> - <when value="exclude_file"> - <param argument="--exclude-list" type="data" format="txt" optional="true" label="File containing headers to exclude"/> + <when value="no_exp"/> + <when value="l_exp"> + <param name="length_comparison" type="select" label="Retain reads with length"> + <option value="<" selected="true">less than</option> + <option value="=">equal to</option> + <option value=">">greater than</option> + <sanitizer sanitize="false"/> + </param> + <param name="length_value" type="integer" min="0" value="0" label="Length in bp" /> + </when> + <when value="q_exp"> + <param name="quality_comparison" type="select" label="Retain reads with average read quality"> + <option value="<" selected="true">less than</option> + <option value="=">equal to</option> + <option value=">">greater than</option> + <sanitizer sanitize="false"/> + </param> + <param name="quality_value" type="integer" min="0" value="0" label="Average read quality" /> </when> - <when value="include_file"> - <param argument="--include-list" type="data" format="txt" optional="true" label="File containing headers to include"/> + <when value="lq_exp"> + <param name="length_comparison" type="select" label="Retain reads with length"> + <option value="<" selected="true">less than</option> + <option value="=">equal to</option> + <option value=">">greater than</option> + <sanitizer sanitize="false"/> + </param> + <param name="length_value" type="integer" min="0" value="0" 
label="Length in bp" /> + <param name="exp_operator" type="select" label="Combination operator"> + <option value="|" selected="true">or</option> + <option value="&">and</option> + <sanitizer sanitize="false"/> + </param> + <param name="quality_comparison" type="select" label="Average read quality"> + <option value="<" selected="true">less than</option> + <option value="=">equal to</option> + <option value=">">greater than</option> + <sanitizer sanitize="false"/> + </param> + <param name="quality_value" type="integer" min="0" value="0" label="average read quality" /> </when> </conditional> - <param argument="--filter" type="text" optional="true" label="filter" help="e.g. l>1000 & q>20"/> </section> <section name="input_subsample" title="Subsample input reads" expanded="false"> <param argument="--sample" type="float" min="0" max="1" value="1" label="fraction of reads to subsample"/> @@ -144,7 +195,6 @@ <param argument="--md5" type="boolean" checked="false" label="Print md5 of .rd files"/> </when> </conditional> - <param argument="--verbose" type="boolean" checked="false" label="Verbose output"/> </section> </inputs> <outputs> @@ -164,91 +214,103 @@ </outputs> <tests> <test expect_num_outputs="2"> - <param name="input_reads" value="test1.fasta.gz" ftype="fasta.gz"/> + <param name="input_reads" value="input1.fastq.gz" ftype="fastq.gz"/> <output name="stats_outfile" file="output1.tabular" ftype="tabular"/> <output name="rd_outfile" ftype="binary"> <assert_contents> - <has_size size="109" delta="1"/> + <has_size size="119" delta="1"/> </assert_contents> </output> </test> <test expect_num_outputs="2"> - <param name="input_reads" value="test1.fastq.gz" ftype="fastq.gz"/> - <output name="stats_outfile" file="output1.tabular" ftype="tabular" lines_diff="2"/> + <param name="input_reads" value="input1.fastq.gz" ftype="fastq.gz"/> + <section name="input_filter"> + <conditional name="filter_expression"> + <param name="filter_selector" value="lq_exp"/> + <param 
name="length_comparison" value=">"/> + <param name="length_value" value="10"/> + <param name="exp_operator" value="&"/> + <param name="quality_comparison" value=">"/> + <param name="quality_value" value="10"/> + </conditional> + </section> + <output name="stats_outfile" file="output2.tabular" ftype="tabular"/> <output name="rd_outfile" ftype="binary"> <assert_contents> - <has_size size="128" delta="1"/> + <has_size size="100" delta="1"/> </assert_contents> </output> </test> <test expect_num_outputs="2"> - <param name="input_reads" value="test2.bam" ftype="bam"/> + <param name="input_reads" value="input1.fastq.gz" ftype="fastq.gz"/> + <section name="input_compress"> + <param name="compress_selector" value="yes"/> + <param name="homopolymer_compress" value="1"/> + </section> <section name="output_options"> <conditional name="output_type"> <param name="type_selector" value="combined_reads"/> <param name="format_selector" value="fastq.gz"/> </conditional> </section> - <output name="stats_outfile" file="output2.tabular" ftype="tabular"/> - <output name="reads_outfile" file="output2.fastq.gz" ftype="fastq.gz"/> + <output name="stats_outfile" file="output3.tabular" ftype="tabular"/> + <output name="reads_outfile" ftype="fastq.gz" md5="23a14631cb075817967752021deb6ec4"> + <assert_contents> + <has_size size="159"/> + </assert_contents> + </output> </test> </tests> <help><![CDATA[ - -**rdeval** is a general-purpose, multithreaded tool for analyzing and manipulating reads (FASTA/FASTQ/BAM/CRAM/RD). +What it does +============ - rdeval input.fa*[.gz]|bam|cram|rd [expected genome size] +**rdeval** accepts an arbitrary number of sequencing files and optionally **filters**, **subsamples**, and/or **compresses homopolymers** within the reads. The retained reads can be saved in multiple formats, and metrics on these reads can be stored in a '*sketch*' file. Statistics can then be efficiently retrieved from these sketch files for further processing. 
-:: - - Dataset report example: +.. image:: pipeline.svg - +++Read summary+++: - # reads: 10000 - Total read length: 134014104 - Average read length: 13401.41 - Read N50: 14270 - Smallest read length: 1142 - Largest read length: 40910 - Coverage: inf - GC content %: 43.78 - Base composition (A:C:T:G): 37693226:29331833:37655925:29333120 - Average per base quality: 26.47 +Filtering +========= + +Input reads can be filtered using any combination of the three methods listed below, which are applied sequentially in the order shown. -:: - - Per sequence/read report (--sequence-report) example: +1. Retain reads whose header lines are listed in the include dataset. +2. Discard reads whose header lines are listed in the exclude dataset. +3. Retain reads that match the provided filter expression. + +The filter expression can be used to select reads based on read length (l), average read quality (q), or a combination of both. The grammar for constructing filter expressions is outlined below: - Header Comment Length A C G T N GC Average Quality - m54306U_210528_154706/69206614/ccs 22812 6170 5146 4802 6694 0 0.44 89.9705 - m54306U_210528_154706/25888573/ccs 32200 9162 7270 7112 8656 0 0.45 56.8306 - m54306U_210528_154706/40634168/ccs 8487 2443 1858 1876 2310 0 0.44 90.3828 - m54306U_210528_154706/103745617/ccs 16496 4546 3752 3760 4438 0 0.46 88.3554 + * filter-expression ::= <length-expression> | <quality-expression> | <length-expression> <combination-operator> <quality-expression> | <quality-expression> <combination-operator> <length-expression> + * length-expression ::= "l" <comparison-operator> <integer> + * quality-expression ::= "q" <comparison-operator> <integer> + * combination-operator ::= "&" | "|" + * comparison-operator ::= "<" | "=" | ">" + * integer ::= <digit> | <digit><integer> + * digit ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" -:: +Retain reads longer than 10 base pairs + + l>10 + +Retain reads with average quality greater than 20 + + q>20 - Options: - 
--sequence-report generates a per-read report - -e --exclude-list <file> generates output on a excluding list of headers. - -f --filter <exp> filter reads using <exp> in quotes, e.g. 'l>10' for longer than 10bp or 'l>10 & q>10' to further exclude reads by quality (default: none). - -i --include-list <file> generates output on a subset list of headers. - -o --out-format <file> output file (fa*[.gz], bam, cram, rd). Optionally write reads to file or generate rd summary file. - -q --quality q|a generates list of average quality for each read (q) or both length and quality (a). - -r --input-reads <file1> <file2> <file n> input file (fa*[.gz], bam, cram, rd). - -s --out-size u|s|h|c generates size list (unsorted|sorted|histogram|inverse cumulative table). - --homopolymer-compress <int> compress all the homopolymers longer than n in the input. - --sample <float> fraction of reads to subsample. - --random-seed <int> an optional random seed to make subsampling reproducible. - --md5 print md5 of .rd files. - --tabular tabular output. - --verbose verbose output. - -j --threads <int> numbers of threads (default:5). - -v --version software version. - --cmd print $0 to stdout. +Retain reads longer than 10 base pairs with average quality greater than 20 + + l>10 & q>20 + +.. _sampling-label: -**Attribution** +Sub-sampling +============ + +4. Retain a subsample of the reads by specifying the fraction to be kept. Use the *random seed* option to keep subsampling reproducible. -This tool relies on the gfastar suite and the gfalibs toolkit `vgl-hub/gfalibs <https://github.com/vgl-hub/gfalibs>`_, developed by Giulio Formenti at the Rockefeller University +Homopolymer Compression +======================= + +5. Runs of repeated nucleotides in each read are collapsed, with any associated quality data discarded. For example, CAGGCTTT would become CAGCT. ]]></help> <expand macro="citations"/> </tool>