Mercurial > repos > richard-burhans > rdeval

diff rdeval.xml @ 5:7cfeba6facd1 draft
planemo upload for repository https://github.com/vgl-hub/rdeval commit d55c4b3d6b91d0418950ed6b7806ef779a916099
author: richard-burhans
date: Fri, 02 May 2025 22:49:30 +0000
parents: 7bf95986aaa4
children: 24b05d3958d7
--- a/rdeval.xml	Wed Apr 23 19:31:12 2025 +0000
+++ b/rdeval.xml	Fri May 02 22:49:30 2025 +0000
@@ -15,46 +15,62 @@
     #end for
     #if $output_options.output_type.type_selector == "combined_reads"
         ln -s '$reads_outfile' 'output.${output_type.format_selector}' &&
-    #end if  
-	rdeval --input-reads #echo " ".join([f"'{input}'" for $input in $mangled_inputs])
+    #end if
+	rdeval --input-reads
+    #for $input in $mangled_inputs
+        '$input'
+    #end for
 	#if $expected_gsize
 		'$expected_gsize'
 	#end if
-	#if $input_filter.filter_selector == "exclude_file"
-		--exclude-list '$exclude_file'
-	#else if $input_filter.filter_selector == "include_file"
-		--include-list '$include_file'
-	#end if
-    #if $filter
-		--filter '$filter'
+	#if $input_filter.include_list
+		--include-list '$input_filter.include_list'
+    #end if
+	#if $input_filter.exclude_list
+		--exclude-list '$input_filter.exclude_list'
     #end if
-		--sample '$sample'
-	#if $input_subsample.seed_selector == "yes"
-		--random-seed '$random_seed'
+    #set $filter_exp_type = $input_filter.filter_expression.filter_selector
+    #if $filter_exp_type != "no_exp"
+        #set $l_exp = "l" + str($input_filter.filter_expression.length_comparison) + str($input_filter.filter_expression.length_value)
+        #set $q_exp = "q" + str($input_filter.filter_expression.quality_comparison) + str($input_filter.filter_expression.quality_value)
+        #if $filter_exp_type == "l_exp"
+            #set $filter_exp = $l_exp
+        #else if $filter_exp_type == "q_exp"
+            #set $filter_exp = $q_exp
+        #else if $filter_exp_type == "lq_exp"
+            #set $filter_exp = $l_exp + str($input_filter.filter_expression.exp_operator) + $q_exp
+        #end if
+        --filter '$filter_exp'
+    #end if
+    #if int($input_subsample.sample) != 1
+		--sample '$input_subsample.sample'
+    #end if
+	#if $input_subsample.random_seed.seed_selector == "yes"
+		--random-seed '$input_subsample.random_seed.random_seed'
 	#end if
 	#if $input_compress.compress_selector == "yes"
-		--homopolymer-compress '$homopolymer_compress'
+		--homopolymer-compress '$input_compress.homopolymer_compress'
 	#end if
-    #if $stats_flavor.flavor_selector == "stats"
-        #if $sequence_report
+    #set $stats_type = $output_options.stats_flavor.flavor_selector
+    #if $stats_type == "stats"
+        #if $output_options.stats_flavor.sequence_report
             --sequence-report
         #end if
-    #else if $stats_flavor.flavor_selector == "quality"
-		--quality '$quality'
-    #else if $stats_flavor.flavor_selector == "size"
-		--out-size '$out_size'
+    #else if $stats_type == "quality"
+		--quality '$output_options.stats_flavor.quality'
+    #else if $stats_type == "size"
+		--out-size '$output_options.stats_flavor.out_size'
     #end if
-    #if $output_options.output_type.type_selector == "rd_file"
-        #if $md5
+    #set $output_type = $output_options.output_type.type_selector
+    #if $output_type == "rd_file"
+        #if $output_options.output_type.md5
             --md5
         #end if
         -o output.rd
-    #else if $output_options.output_type.type_selector == "combined_reads"
-        -o 'output.${output_type.format_selector}'
+    #else if $output_type == "combined_reads"
+        -o 'output.${output_options.output_type.format_selector}'
     #end if
-	#if $verbose
 		--verbose
-	#end if
 		--tabular
 		--threads \${GALAXY_SLOTS:-2}
 		> '$stats_outfile'
@@ -63,21 +79,56 @@
         <param argument="--input-reads" type="data" format="bam,cram,fasta,fasta.gz,fastq,fastq.gz" multiple="true" label="Input dataset" help="BAM, CRAM, FASTA, FASTQ, or RD files"/>
         <param name="expected_gsize" type="integer" label="Expected Genome Size" optional="true" help="Integer (e.g., 3000000000 for human)."/>
         <section name="input_filter" title="Filter input reads" expanded="false">
-            <conditional name="file_filter">
-                <param name="filter_selector" type="select" label="Use an exclude or include file">
-                    <option value="no_file" selected="true">no</option>
-                    <option value="exclude_file">Use an exclude file</option>
-                    <option value="include_file">Use an include file</option>
+            <param argument="--include-list" type="data" format="txt" optional="true" label="File containing headers to include"/>
+            <param argument="--exclude-list" type="data" format="txt" optional="true" label="File containing headers to exclude"/>
+            <conditional name="filter_expression">
+                <param name="filter_selector" type="select" label="Filter using length and/or quality" help="filter help">
+                    <option value="no_exp" selected="true">No</option>
+                    <option value="l_exp">Read length</option>
+                    <option value="q_exp">Average read quality</option>
+                    <option value="lq_exp">Both read length and average read quality</option>
                 </param>
-                <when value="no_file"/>
-                <when value="exclude_file">
-                    <param argument="--exclude-list" type="data" format="txt" optional="true" label="File containing headers to exclude"/>
+                <when value="no_exp"/>
+                <when value="l_exp">
+                    <param name="length_comparison" type="select" label="Retain reads with length">
+                        <option value="&lt;" selected="true">less than</option>
+                        <option value="=">equal to</option>
+                        <option value=">">greater than</option>
+                        <sanitizer sanitize="false"/>
+                    </param>
+                    <param name="length_value" type="integer" min="0" value="0" label="Length in bp" />
+                </when>
+                <when value="q_exp">
+                    <param name="quality_comparison" type="select" label="Retain reads with average read quality">
+                        <option value="&lt;" selected="true">less than</option>
+                        <option value="=">equal to</option>
+                        <option value=">">greater than</option>
+                        <sanitizer sanitize="false"/>
+                    </param>
+                    <param name="quality_value" type="integer" min="0" value="0" label="Average read quality" />
                 </when>
-                <when value="include_file">
-                    <param argument="--include-list" type="data" format="txt" optional="true" label="File containing headers to include"/>
+                <when value="lq_exp">
+                    <param name="length_comparison" type="select" label="Retain reads with length">
+                        <option value="&lt;" selected="true">less than</option>
+                        <option value="=">equal to</option>
+                        <option value=">">greater than</option>
+                        <sanitizer sanitize="false"/>
+                    </param>
+                    <param name="length_value" type="integer" min="0" value="0" label="Length in bp" />
+                    <param name="exp_operator" type="select" label="Combination operator">
+                        <option value="|" selected="true">or</option>
+                        <option value="&amp;">and</option>
+                        <sanitizer sanitize="false"/>
+                    </param>
+                    <param name="quality_comparison" type="select" label="Average read quality">
+                        <option value="&lt;" selected="true">less than</option>
+                        <option value="=">equal to</option>
+                        <option value=">">greater than</option>
+                        <sanitizer sanitize="false"/>
+                    </param>
+                    <param name="quality_value" type="integer" min="0" value="0" label="average read quality" />
                 </when>
             </conditional>
-            <param argument="--filter" type="text" optional="true" label="filter" help="e.g. l&gt;1000 &amp; q&gt;20"/>
         </section>
         <section name="input_subsample" title="Subsample input reads" expanded="false">
             <param argument="--sample" type="float" min="0" max="1" value="1" label="fraction of reads to subsample"/>
@@ -144,7 +195,6 @@
                     <param argument="--md5" type="boolean" checked="false" label="Print md5 of .rd files"/>
                 </when>
             </conditional>
-            <param argument="--verbose" type="boolean" checked="false" label="Verbose output"/>
         </section>
     </inputs>
     <outputs>
@@ -164,91 +214,103 @@
     </outputs>
     <tests>
         <test expect_num_outputs="2">
-            <param name="input_reads" value="test1.fasta.gz" ftype="fasta.gz"/>
+            <param name="input_reads" value="input1.fastq.gz" ftype="fastq.gz"/>
             <output name="stats_outfile" file="output1.tabular" ftype="tabular"/>
             <output name="rd_outfile" ftype="binary">
                 <assert_contents>
-                    <has_size size="109" delta="1"/>
+                    <has_size size="119" delta="1"/>
                 </assert_contents>
             </output>
         </test>
         <test expect_num_outputs="2">
-            <param name="input_reads" value="test1.fastq.gz" ftype="fastq.gz"/>
-            <output name="stats_outfile" file="output1.tabular" ftype="tabular" lines_diff="2"/>
+            <param name="input_reads" value="input1.fastq.gz" ftype="fastq.gz"/>
+            <section name="input_filter">
+                <conditional name="filter_expression">
+                    <param name="filter_selector" value="lq_exp"/>
+                    <param name="length_comparison" value=">"/>
+                    <param name="length_value" value="10"/>
+                    <param name="exp_operator" value="&amp;"/>
+                    <param name="quality_comparison" value=">"/>
+                    <param name="quality_value" value="10"/>
+                </conditional>
+            </section>
+            <output name="stats_outfile" file="output2.tabular" ftype="tabular"/>
             <output name="rd_outfile" ftype="binary">
                 <assert_contents>
-                    <has_size size="128" delta="1"/>
+                    <has_size size="100" delta="1"/>
                 </assert_contents>
             </output>
         </test>
         <test expect_num_outputs="2">
-            <param name="input_reads" value="test2.bam" ftype="bam"/>
+            <param name="input_reads" value="input1.fastq.gz" ftype="fastq.gz"/>
+            <section name="input_compress">
+                <param name="compress_selector" value="yes"/>
+                <param name="homopolymer_compress" value="1"/>
+            </section>
             <section name="output_options">
                 <conditional name="output_type">
                     <param name="type_selector" value="combined_reads"/>
                     <param name="format_selector" value="fastq.gz"/>
                 </conditional>
             </section>
-            <output name="stats_outfile" file="output2.tabular" ftype="tabular"/>
-            <output name="reads_outfile" file="output2.fastq.gz" ftype="fastq.gz"/>
+            <output name="stats_outfile" file="output3.tabular" ftype="tabular"/>
+            <output name="reads_outfile" ftype="fastq.gz" md5="23a14631cb075817967752021deb6ec4">
+                <assert_contents>
+                    <has_size size="159"/>
+                </assert_contents>
+            </output>
         </test>
     </tests>
     <help><![CDATA[
-
-**rdeval** is a general-purpose, multithreaded tool for analyzing and manipulating reads (FASTA/FASTQ/BAM/CRAM/RD).
+What it does
+============
 
-        rdeval input.fa*[.gz]|bam|cram|rd [expected genome size]
+**rdeval** accepts an arbitrary number of sequencing files and optionally **filters**, **subsamples**, and/or **compresses homopolymers** within the reads. The retained reads can be saved in multiple formats, and metrics on these reads can be stored in a '*sketch*' file. Statistics can then be efficiently retrieved from these sketch files for further processing.
 
-::
-
-	Dataset report example:
+.. image:: pipeline.svg
 
-	+++Read summary+++:
-	# reads: 10000
-	Total read length: 134014104
-	Average read length: 13401.41
-	Read N50: 14270
-	Smallest read length: 1142
-	Largest read length: 40910
-	Coverage: inf
-	GC content %: 43.78
-	Base composition (A:C:T:G): 37693226:29331833:37655925:29333120
-	Average per base quality: 26.47
+Filtering
+=========
+
+Input reads can be filtered using any combination of the three methods listed below; when more than one is used, they are applied sequentially in the order listed.
 
-::
-	
-	Per sequence/read report (--sequence-report) example:
+1. Retain reads whose header lines are listed in the include dataset.
+2. Discard reads whose header lines are listed in the exclude dataset.
+3. Retain reads that match the provided filter expression.
+
+The filter expression can be used to select reads based on read length (l), average read quality (q), or a combination of both. The grammar for constructing filter expressions is outlined below:
 
-	Header  Comment Length  A       C       G       T       N       GC      Average Quality
-	m54306U_210528_154706/69206614/ccs              22812   6170    5146    4802    6694    0       0.44    89.9705
-	m54306U_210528_154706/25888573/ccs              32200   9162    7270    7112    8656    0       0.45    56.8306
-	m54306U_210528_154706/40634168/ccs              8487    2443    1858    1876    2310    0       0.44    90.3828
-	m54306U_210528_154706/103745617/ccs             16496   4546    3752    3760    4438    0       0.46    88.3554
+     * filter-expression ::= <length-expression> | <quality-expression> | <length-expression> <combination-operator> <quality-expression> | <quality-expression> <combination-operator> <length-expression>
+     * length-expression ::= "l" <comparison-operator> <integer>
+     * quality-expression ::= "q" <comparison-operator> <integer>
+     * combination-operator ::= "&" | "|"
+     * comparison-operator ::= "<" | "=" | ">"
+     * integer ::= <digit> | <digit><integer>
+     * digit ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
 
-::
+Retain reads longer than 10 base pairs
+
+    l>10
+
+Retain reads with average quality greater than 20
+
+     q>20
 
-	Options:
-	--sequence-report generates a per-read report
-	-e --exclude-list <file> generates output on a excluding list of headers.
-	-f --filter <exp> filter reads using <exp> in quotes, e.g. 'l>10' for longer than 10bp or 'l>10 & q>10' to further exclude reads by quality (default: none).
-	-i --include-list <file> generates output on a subset list of headers.
-	-o --out-format <file> output file (fa*[.gz], bam, cram, rd). Optionally write reads to file or generate rd summary file.
-	-q --quality q|a generates list of average quality for each read (q) or both length and quality (a).
-	-r --input-reads <file1> <file2> <file n> input file (fa*[.gz], bam, cram, rd).
-	-s --out-size u|s|h|c  generates size list (unsorted|sorted|histogram|inverse cumulative table).
-	--homopolymer-compress <int> compress all the homopolymers longer than n in the input.
-	--sample <float> fraction of reads to subsample.
-	--random-seed <int> an optional random seed to make subsampling reproducible.
-	--md5 print md5 of .rd files.
-	--tabular tabular output.
-	--verbose verbose output.
-	-j --threads <int> numbers of threads (default:5).
-	-v --version software version.
-	--cmd print $0 to stdout.
+Retain reads longer than 10 base pairs with average quality greater than 20
+
+     l>10 & q>20
+
+.. _sampling-label:
 
-**Attribution**
+Sub-sampling
+============
+
+4. Retain a subsample of the reads by specifying the fraction to be kept. Use the *random seed* option to keep subsampling reproducible.
 
-This tool relies on the gfastar suite and the gfalibs toolkit `vgl-hub/gfalibs <https://github.com/vgl-hub/gfalibs>`_, developed by Giulio Formenti at the Rockefeller University
+Homopolymer Compression
+=======================
+
+5. Runs of repeated nucleotides in each read are collapsed, with any associated quality data discarded. For example, CAGGCTTT would become CAGCT.
     ]]></help>
     <expand macro="citations"/>
 </tool>
author	richard-burhans
date	Fri, 02 May 2025 22:49:30 +0000
parents	7bf95986aaa4
children	24b05d3958d7