Mercurial > repos > iuc > seqkit_grep

diff seqkit_grep.xml @ 0:731f3256c2b3 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/seqkit commit 4b34a4e0d3a8482acd3c0773a79658ec6add3fb3
author: iuc
date: Wed, 20 Aug 2025 15:04:47 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seqkit_grep.xml	Wed Aug 20 15:04:47 2025 +0000
@@ -0,0 +1,185 @@
+<tool id="seqkit_grep" name="SeqKit grep" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <!-- Short summary of what the tool does. -->
+    <description>grep-like tools for FASTA/Q files</description>
+    <!-- Shared definitions are imported from macros.xml; the macros expanded below
+         (bio_tools, requirements) are presumably defined there, as are the @...@ tokens
+         used on the tool element. Confirm against macros.xml. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="bio_tools"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+## Runs one seqkit grep call on the input and redirects its stdout into the output dataset.
+#import re
+
+## Symlink the dataset under a name derived from its element identifier; every character
+## other than whitespace, word characters and '-' is replaced by '_'.
+#set input_identifier = re.sub('[^\s\w\-]', '_', str($input.element_identifier))
+ln -s '${input}' '${input_identifier}' &&
+
+seqkit grep
+--threads "\${GALAXY_SLOTS:-4}"
+#if $conditional_pattern.mode == 'expression'
+    ## The inner double quotes are handed to seqkit on purpose: its option parser splits
+    ## values on commas, and the quotes keep a pattern such as A{2,} in one piece.
+    --pattern '"$conditional_pattern.pattern"'
+    $conditional_pattern.use_regexp
+#else
+    --pattern-file '$conditional_pattern.pattern_file'
+#end if
+## Each boolean below renders as its flag when checked and as an empty string otherwise.
+$search_options.allow_duplicated_patterns
+$search_options.by_name
+$search_options.by_seq
+$search_options.circular
+$search_options.count
+$search_options.degenerate
+$search_options.delete_matched
+$search_options.ignore_case
+$search_options.invert_match
+## The mismatch limit is only passed for sequence searches without degenerate bases.
+#if $search_options.by_seq and not $search_options.degenerate
+    --max-mismatch $search_options.max_mismatch
+#end if
+$search_options.only_positive_strand
+## The region is free text, so it must be emitted together with its flag and only when
+## it is set; a bare value would be read by seqkit as a second input file.
+## The '=' form keeps negative coordinates such as -12:-1 from being parsed as an option.
+#if str($search_options.region).strip()
+    --region='$search_options.region'
+#end if
+'${input_identifier}'
+## NOTE(review): the result is written through a plain stdout redirect while the output
+## datatype follows the input; confirm the data really is gzip-compressed for *.gz inputs.
+> '$output'
+]]></command>
+    <inputs>
+        <!-- Sequence dataset to search; plain and gzip-compressed FASTA/FASTQ are accepted. -->
+        <param name="input" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input FASTA/FASTQ file"/>
+        <!-- The search pattern is either typed in directly or taken from a dataset. -->
+        <conditional name="conditional_pattern">
+            <param name="mode" type="select" label="Pattern mode" help="Specify pattern directly or upload a file with multiple patterns">
+                <option value="expression">Pattern/motif sequence</option>
+                <option value="file">FASTA file with the pattern/motif of interest</option>
+            </param>
+            <when value="expression">
+                <param argument="--pattern" type="text" label="Search pattern" help="Pattern to search for. Quoting is added automatically; enable the option below to interpret the pattern as a regular expression">
+                    <!-- Only the characters listed here are kept. Everything else, including
+                         quotes and whitespace, is removed because invalid_char is empty. -->
+                    <sanitizer invalid_char="">
+                        <valid initial="string.letters,string.digits">
+                            <add value="^"/>
+                            <add value="$"/>
+                            <add value="("/>
+                            <add value=")"/>
+                            <add value="|"/>
+                            <add value="?"/>
+                            <add value="*"/>
+                            <add value="+"/>
+                            <add value="{"/>
+                            <add value="}"/>
+                            <add value="\"/>
+                            <add value="["/>
+                            <add value="]"/>
+                            <add value="."/>
+                            <add value=","/>
+                            <add value=":"/>
+                        </valid>
+                    </sanitizer>
+                    <!-- Rejects empty patterns and patterns whose last character is a backslash. -->
+                    <validator type="regex" message="Pattern must not end with backslash.">.*[^\\]$</validator>
+                </param>
+                <param argument="--use-regexp" type="boolean" truevalue="--use-regexp" falsevalue="" checked="false" label="Interpret pattern as regular expression" help="Enable regular expression matching"/>
+            </when>
+            <when value="file">
+                <param argument="--pattern-file" type="data" format="fasta" label="Pattern/motif file"/>
+            </when>
+        </conditional>
+        <!-- Optional switches: each checked boolean contributes its flag to the command line,
+             an unchecked one contributes an empty string. -->
+        <section name="search_options" title="Search options">
+            <param argument="--by-name" type="boolean" truevalue="--by-name" falsevalue="" checked="false" label="Search by sequence name" help="match by full name instead of just ID"/>
+            <param argument="--by-seq" type="boolean" truevalue="--by-seq" falsevalue="" checked="false" label="Search by sequence content" help="search subseq on seq. Both positive and negative strand are searched by default, you might use only-positive-strand. Mismatch allowed using max-mismatch"/>
+            <param argument="--ignore-case" type="boolean" truevalue="--ignore-case" falsevalue="" checked="false" label="Ignore case" help="ignore case"/>
+            <param argument="--only-positive-strand" type="boolean" truevalue="--only-positive-strand" falsevalue="" checked="false" label="Only search positive strand" help="Only search on positive strand (only applies when searching by sequence)"/>
+            <param argument="--max-mismatch" type="integer" min="0" value="0" label="Maximum mismatches" help="Maximum number of mismatches allowed (only for sequence search, 0 = exact match)"/>
+            <param argument="--invert-match" type="boolean" truevalue="--invert-match" falsevalue="" checked="false" label="Invert match" help="invert the sense of matching, to select non-matching records"/>
+            <param argument="--degenerate" type="boolean" truevalue="--degenerate" falsevalue="" checked="false" label="Pattern contains degenerate bases" help="Pattern contains degenerate bases"/>
+            <param argument="--circular" type="boolean" truevalue="--circular" falsevalue="" checked="false" label="Circular genome" help="Treat sequences as circular for matching"/>
+            <param argument="--count" type="boolean" truevalue="--count" falsevalue="" checked="false" label="Count" help="just print a count of matching records. with the -v/--invert-match flag, count non-matching records"/>
+            <param argument="--delete-matched" type="boolean" truevalue="--delete-matched" falsevalue="" checked="false" label="Delete matched patterns" help="delete a pattern right after being matched, this keeps the firstly matched data and speedups when using regular expressions"/>
+            <param argument="--allow-duplicated-patterns" type="boolean" truevalue="--allow-duplicated-patterns" falsevalue="" checked="false" label="Allow duplicated patterns" help="output records multiple times when duplicated patterns are given"/>
+            <!-- Free-text region; empty means no restriction. The validator accepts an empty
+                 value or two optional signed integers separated by a colon. -->
+            <param argument="--region" type="text" value="" label="Sequence region" help="Specify region for searching (e.g., 1:30 for first 30 bases, -12:-1 for last 12).">
+                <validator type="regex" message="Region must be in format 'start:end' or 'start:' or ':end'">^$|^-?[0-9]*:-?[0-9]*$</validator>
+            </param>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Matching records keep the datatype of the input. In count mode the tool prints
+             a single number instead of sequences, so the dataset is typed as plain text. -->
+        <data name="output" format_source="input" label="${tool.name} on ${on_string}">
+            <change_format>
+                <when input="search_options.count" value="--count" format="txt"/>
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Test 1: literal motif searched in the sequences of a gzipped FASTA, no mismatches. -->
+        <test expect_num_outputs="1">
+            <param name="input" value="input1.fasta.gz" ftype="fasta.gz"/>
+            <conditional name="conditional_pattern">
+                <param name="mode" value="expression"/>
+                <param name="pattern" value="ATGC"/>
+            </conditional>
+            <section name="search_options">
+                <param name="by_seq" value="true"/>
+                <param name="max_mismatch" value="0"/>
+            </section>
+            <output name="output" file="grep_output1.fasta.gz" ftype="fasta.gz" decompress="true"/>
+        </test>
+        <!-- Test 2: patterns read from a file, with the match inverted. -->
+        <test expect_num_outputs="1">
+            <param name="input" value="input1.fasta.gz" ftype="fasta.gz"/>
+            <conditional name="conditional_pattern">
+                <param name="mode" value="file"/>
+                <param name="pattern_file" value="grep_pattern.fasta"/>
+            </conditional>
+            <section name="search_options">
+                <param name="invert_match" value="true"/>
+            </section>
+            <output name="output" file="grep_output2.fasta.gz" ftype="fasta.gz" decompress="true"/>
+        </test>
+        <!-- Test 3: regular expression matched against full sequence names of a gzipped FASTQ. -->
+        <test expect_num_outputs="1">
+            <param name="input" value="input1.fastq.gz" ftype="fastq.gz"/>
+            <conditional name="conditional_pattern">
+                <param name="mode" value="expression"/>
+                <param name="pattern" value="^5"/>
+                <param name="use_regexp" value="true"/>
+            </conditional>
+            <section name="search_options">
+                <param name="by_name" value="true"/>
+            </section>
+            <output name="output" file="grep_output3.fastq.gz" ftype="fastq.gz" decompress="true"/>
+        </test>
+        <!-- Test 4: motif with degenerate bases searched in the sequences. -->
+        <test expect_num_outputs="1">
+            <param name="input" value="input1.fasta.gz" ftype="fasta.gz"/>
+            <conditional name="conditional_pattern">
+                <param name="mode" value="expression"/>
+                <param name="pattern" value="NNNNATGC"/>
+            </conditional>
+            <section name="search_options">
+                <param name="by_seq" value="true"/>
+                <param name="degenerate" value="true"/>
+            </section>
+            <output name="output" file="grep_output4.fasta.gz" ftype="fasta.gz" decompress="true"/>
+        </test>
+    </tests>
+    <help>
+.. class:: infomark
+
+**What it does**
+
+Search sequences by ID, name, sequence, or sequence motif; mismatches are allowed when searching by sequence.
+
+------
+
+.. class:: infomark
+
+**Attention**
+
+  0. By default, we match sequence ID with patterns, use "-n/--by-name"
+     for matching full name instead of just ID.
+  1. Unlike POSIX/GNU grep, we compare the pattern to the whole target
+     (ID/full header) by default. Please switch "-r/--use-regexp" on
+     for partial matching.
+  2. When searching by sequence, matching is partial, and both the positive
+     and negative strands are searched.
+     Please switch on "-P/--only-positive-strand" if you would like to
+     search only on the positive strand.
+     Mismatches are allowed using the flag "-m/--max-mismatch"; you can increase
+     the value of "-j/--threads" to accelerate processing.
+  3. Degenerate bases/residues like "RYMM.." are also supported by flag -d.
+     But do not use degenerate bases/residues in regular expressions; you need
+     to convert them to regular expression syntax, e.g., change "N" or "X" to ".".
+  4. When providing search patterns (motifs) via flag '-p',
+     please use double quotation marks for patterns containing comma,
+     e.g., -p '"A{2,}"' or -p "\"A{2,}\"". Because the command line argument
+     parser accepts comma-separated-values (CSV) for multiple values (motifs).
+     Patterns in file do not follow this rule.
+  5. The order of sequences in the result is consistent with that in the original
+     file, not the order of the query patterns.
+     But for a FASTA file, you can use:
+     seqkit faidx seqs.fasta --infile-list IDs.txt
+  6. For multiple patterns, you can either set "-p" multiple times, i.e.,
+     -p pattern1 -p pattern2, or give a file of patterns via "-f/--pattern-file".
+    </help>
+    <expand macro="citations"/>
+</tool>