Mercurial > repos > richard-burhans > rdeval
changeset 0:425a2aa541df draft
planemo upload for repository https://github.com/vgl-hub/rdeval commit bdcd7b1991f872540fb5f87ee945077b5ca68780
author | richard-burhans |
---|---|
date | Mon, 17 Feb 2025 19:17:26 +0000 |
parents | |
children | 853c32c31a6c |
files | macros.xml rdeval.xml test-data/output1.tabular test-data/output2.fastq.gz test-data/output2.tabular test-data/test1.fasta.gz test-data/test1.fastq.gz test-data/test2.bam |
diffstat | 8 files changed, 301 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Feb 17 19:17:26 2025 +0000 @@ -0,0 +1,33 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">rdeval</requirement> + </requirements> + </xml> + <token name="@TOOL_VERSION@">0.0.5</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">23.02</token> + <token name="@ATTRIBUTION@"><![CDATA[ + **Attribution** + + This Galaxy tool is available at `vgl-hub/rdeval <https://github.com/vgl-hub/rdeval/>`_, and + relies on the gfastar suite and the gfalibs toolkit `vgl-hub/gfalibs + <https://github.com/vgl-hub/gfalibs>`_, developed by Giulio Formenti at the Rockefeller University + ]]></token> + <xml name="citations"> + <citations> + <citation type="bibtex"> + @article{formenti2022gfastats, + title={Gfastats: conversion, evaluation and manipulation of genome sequences using assembly graphs}, + author={Formenti, Giulio and Abueg, Linelle and Brajuka, Angelo and Brajuka, Nadolina and Gallardo-Alba, Crist{\'o}bal and Giani, Alice and Fedrigo, Olivier and Jarvis, Erich D}, + journal={Bioinformatics}, + volume={38}, + number={17}, + pages={4214--4216}, + year={2022}, + publisher={Oxford University Press} + } + </citation> + </citations> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rdeval.xml Mon Feb 17 19:17:26 2025 +0000 @@ -0,0 +1,248 @@ +<tool id="rdeval" name="rdeval" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>Multithreaded read analysis and manipulation tool.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="exit_code"><![CDATA[ + #import re + #set $mangled_inputs = [] + #for $input in $input_reads + #set $mangled_base = re.sub(r"[^\w\-\s]", "_", str($input.element_identifier)) + #set $mangled_input = $mangled_base + "." + str($input.ext) + #silent $mangled_inputs.append($mangled_input) + ln -s '$input' '$mangled_input' && + #end for + #if $output_options.output_type.type_selector == "combined_reads" + ln -s '$reads_outfile' 'output.${output_type.format_selector}' && + #end if + rdeval --input-reads #echo " ".join([f"'{input}'" for $input in $mangled_inputs]) + #if $expected_gsize + $expected_gsize + #end if + #if $input_filter.filter_selector == "exclude_file" + --exclude-list '$exclude_file' + #else if $input_filter.filter_selector == "include_file" + --include-list '$include_file' + #end if + #if $filter + --filter '$filter' + #end if + --sample '$sample' + #if $input_subsample.seed_selector == "yes" + --random-seed '$random_seed' + #end if + #if $homopolymer_compress.compress_selector == "yes" + --homopolymer-compress '$homopolymer_compress' + #end if + #if $stats_flavor.flavor_selector == "stats" + #if $sequence_report + --sequence-report + #end if + #else if $stats_flavor.flavor_selector == "quality" + --quality '$quality' + #else if $stats_flavor.flavor_selector == "size" + --out-size '$out_size' + #end if + #if $output_options.output_type.type_selector == "rd_file" + #if $md5 + --md5 + #end if + -o output.rd + #else if $output_options.output_type.type_selector == "combined_reads" + -o 'output.${output_type.format_selector}' + #end if + #if $verbose + --verbose + #end if + --tabular + --threads \${GALAXY_SLOTS:-2} + > '$stats_outfile' + ]]></command> + <inputs> + <param argument="--input-reads" type="data" format="bam,cram,fasta,fasta.gz,fastq,fastq.gz" multiple="true" label="Input dataset" help="FASTA/FASTQ, BAM, or CRAM files."/> + <param name="expected_gsize" type="integer" label="Expected Genome Size" optional="true" help="Integer (e.g., 3000000000 for human)."/> + <section name="input_filter" title="Filter input reads" expanded="false"> + <conditional name="file_filter"> + <param name="filter_selector" type="select" label="Use an exclude or include file"> + <option value="no_file" selected="true">no</option> + <option value="exclude_file">Use an exclude file</option> + <option value="include_file">Use an include file</option> + </param> + <when value="no_file"/> + <when value="exclude_file"> + <param argument="--exclude-list" type="data" format="txt" optional="true" label="File containing headers to exclude"/> + </when> + <when value="include_file"> + <param argument="--include-list" type="data" format="txt" optional="true" label="File containing headers to include"/> + </when> + </conditional> + <param argument="--filter" type="text" optional="true" label="filter" help="e.g. l>1000 & q>20"/> + </section> + <section name="input_subsample" title="Subsample input reads" expanded="false"> + <param argument="--sample" type="float" min="0" max="1" value="1" label="fraction of reads to subsample"/> + <conditional name="random_seed"> + <param name="seed_selector" type="select" label="supply random seed to make subsampling reproducible"> + <option value="no" selected="true">no</option> + <option value="yes">yes</option> + </param> + <when value="no"/> + <when value="yes"> + <param argument="--random-seed" type="integer" min="0" label="random seed to make subsampling reproducible"/> + </when> + </conditional> + </section> + <conditional name="homopolymer_compress"> + <param name="compress_selector" type="select" label="Compress homopolymers"> + <option value="no" selected="true">no</option> + <option value="yes">yes</option> + </param> + <when value="no"/> + <when value="yes"> + <param argument="--homopolymer-compress" type="integer" min="0" label="Compress homopolymers longer than n in the input"/> + </when> + </conditional> + <section name="output_options" title="Output options"> + <conditional name="stats_flavor"> + <param name="flavor_selector" type="select" label="Stats output"> + <option value="stats" selected="true">Stats</option> + <option value="quality">Quality</option> + <option value="size">Size</option> + </param> + <when value="stats"> + <param argument="--sequence-report" type="boolean" checked="false" label="Per read sequence report"/> + </when> + <when value="quality"> + <param argument="--quality" type="select" optional="true" label="quality type"> + <option value="q" selected="true">Average quality for each read</option> + <option value="a">Both length and quality for each read</option> + </param> + </when> + <when value="size"> + <param argument="--out-size" type="select" optional="true" label="size list type"> + <option value="u" selected="true">unsorted</option> + <option value="s">sorted</option> + <option value="h">histogram</option> + <option value="c">inverse cumulative table</option> + </param> + </when> + </conditional> + <conditional name="output_type"> + <param name="type_selector" type="select" label="output type"> + <option value="rd_file" selected="true">RD file</option> + <option value="combined_reads">Combined reads</option> + </param> + <when value="combined_reads"> + <param name="format_selector" type="select" optional="true" label="Output format"> + <option value="fasta.gz" selected="true">fasta</option> + <option value="fastq.gz">fastq</option> + <option value="bam">bam</option> + <option value="cram">cram</option> + </param> + </when> + <when value="rd_file"> + <param argument="--md5" type="boolean" checked="false" label="Print md5 of .rd files"/> + </when> + </conditional> + <param argument="--verbose" type="boolean" checked="false" label="Verbose output"/> + </section> + </inputs> + <outputs> + <data name="stats_outfile" format="tabular" label="Rdeval summary"/> + <data name="rd_outfile" from_work_dir="output.rd" format="binary" label="RD File"> + <filter>output_options["output_type"]["type_selector"] == "rd_file"</filter> + </data> + <data name="reads_outfile" format="binary" label="Output reads"> + <filter>output_options["output_type"]["type_selector"] == "combined_reads"</filter> + <change_format> + <when input="format_selector" value="fasta.gz" format="fasta.gz"/> + <when input="format_selector" value="fastq.gz" format="fastq.gz"/> + <when input="format_selector" value="bam" format="bam"/> + <when input="format_selector" value="cram" format="cram"/> + </change_format> + </data> + </outputs> + <tests> + <test expect_num_outputs="2"> + <param name="input_reads" value="test1.fasta.gz" ftype="fasta.gz"/> + <output name="stats_outfile" file="output1.tabular" ftype="tabular"/> + <output name="rd_outfile" ftype="binary"> + <assert_contents> + <has_size size="109" delta="1"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="2"> + <param name="input_reads" value="test1.fastq.gz" ftype="fastq.gz"/> + <output name="stats_outfile" file="output1.tabular" ftype="tabular" lines_diff="2"/> + <output name="rd_outfile" ftype="binary"> + <assert_contents> + <has_size size="128" delta="1"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="2"> + <param name="input_reads" value="test2.bam" ftype="bam"/> + <param name="type_selector" value="combined_reads"/> + <output name="stats_outfile" file="output2.tabular" ftype="tabular"/> + <param name="format_selector" value="fastq.gz"/> + <output name="reads_outfile" file="output2.fastq.gz" ftype="fastq.gz"/> + </test> + </tests> + <help><![CDATA[ + +**rdeval** is a general-purpose, multithreaded tool for analyzing and manipulating reads (FASTA/FASTQ/BAM/CRAM/RD). + + rdeval input.fa*[.gz]|bam|cram|rd [expected genome size] + +:: + + Dataset report example: + + +++Read summary+++: + # reads: 10000 + Total read length: 134014104 + Average read length: 13401.41 + Read N50: 14270 + Smallest read length: 1142 + Largest read length: 40910 + Coverage: inf + GC content %: 43.78 + Base composition (A:C:T:G): 37693226:29331833:37655925:29333120 + Average per base quality: 26.47 + +:: + + Per sequence/read report (--sequence-report) example: + + Header Comment Length A C G T N GC Average Quality + m54306U_210528_154706/69206614/ccs 22812 6170 5146 4802 6694 0 0.44 89.9705 + m54306U_210528_154706/25888573/ccs 32200 9162 7270 7112 8656 0 0.45 56.8306 + m54306U_210528_154706/40634168/ccs 8487 2443 1858 1876 2310 0 0.44 90.3828 + m54306U_210528_154706/103745617/ccs 16496 4546 3752 3760 4438 0 0.46 88.3554 + +:: + + Options: + --sequence-report generates a per-read report + -e --exclude-list <file> generates output on a excluding list of headers. + -f --filter <exp> filter reads using <exp> in quotes, e.g. 'l>10' for longer than 10bp or 'l>10 & q>10' to further exclude reads by quality (default: none). + -i --include-list <file> generates output on a subset list of headers. + -o --out-format <file> output file (fa*[.gz], bam, cram, rd). Optionally write reads to file or generate rd summary file. + -q --quality q|a generates list of average quality for each read (q) or both length and quality (a). + -r --input-reads <file1> <file2> <file n> input file (fa*[.gz], bam, cram, rd). + -s --out-size u|s|h|c generates size list (unsorted|sorted|histogram|inverse cumulative table). + --homopolymer-compress <int> compress all the homopolymers longer than n in the input. + --sample <float> fraction of reads to subsample. + --random-seed <int> an optional random seed to make subsampling reproducible. + --md5 print md5 of .rd files. + --tabular tabular output. + --verbose verbose output. + -j --threads <int> numbers of threads (default:5). + -v --version software version. + --cmd print $0 to stdout. + +@ATTRIBUTION@ + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output1.tabular Mon Feb 17 19:17:26 2025 +0000 @@ -0,0 +1,10 @@ +# reads 5 +Total read length 50 +Average read length 10.00 +Read N50 15 +Smallest read length 5 +Largest read length 15 +Coverage inf +GC content % 50.00 +Base composition (A:C:T:G) 9:14:11:6 +Average per base quality 0.00
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output2.tabular Mon Feb 17 19:17:26 2025 +0000 @@ -0,0 +1,10 @@ +# reads 11 +Total read length 264855 +Average read length 24077.73 +Read N50 24322 +Smallest read length 17465 +Largest read length 36274 +Coverage inf +GC content % 40.81 +Base composition (A:C:T:G) 79479:54455:77277:53644 +Average per base quality 23.58