Mercurial > repos > iuc > seqkit_grep

<tool id="seqkit_grep" name="SeqKit grep" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Short description shown next to the tool name. -->
    <description>grep-like tools for FASTA/Q files</description>
    <!-- Pull in shared definitions from macros.xml. The macros expanded below and the
         @TOKEN@ placeholders used on the <tool> element are presumably defined there
         (not visible in this file). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
#import re

## Symlink the input under its element identifier, with every character that is
## not whitespace, a word character or '-' replaced by '_'.
#set input_identifier = re.sub('[^\s\w\-]', '_', str($input.element_identifier))
ln -s '${input}' '${input_identifier}' &&

seqkit grep
--threads "\${GALAXY_SLOTS:-4}"
#if $conditional_pattern.mode == 'expression'
    ## The inner double quotes keep a pattern that contains commas (e.g. A{2,})
    ## from being split into several patterns by seqkit's CSV argument parsing.
    --pattern '"$conditional_pattern.pattern"'
    $conditional_pattern.use_regexp
#else
    --pattern-file '$conditional_pattern.pattern_file'
#end if
## Boolean parameters render as their flag when checked and as nothing otherwise.
$search_options.allow_duplicated_patterns
$search_options.by_name
$search_options.by_seq
$search_options.circular
$search_options.count
$search_options.degenerate
$search_options.delete_matched
$search_options.ignore_case
$search_options.invert_match
## --max-mismatch is only emitted for sequence searches without degenerate bases.
#if $search_options.by_seq and not $search_options.degenerate
    --max-mismatch $search_options.max_mismatch
#end if
$search_options.only_positive_strand
## The region parameter is a text field and renders only its value (e.g. 1:30),
## so the flag itself has to be written here. Nothing is emitted when it is empty.
#set region_value = str($search_options.region).strip()
#if $region_value
    --region '$region_value'
#end if
'${input_identifier}'
## NOTE(review): the result is written to stdout. For *.gz inputs the output dataset
## inherits a compressed datatype, while seqkit is documented to gzip its output only
## for out-file names ending in .gz - confirm that the datatype matches the content.
> '$output'
]]></command>
    <inputs>
        <!-- Sequence dataset to search; plain and gzip-compressed FASTA/FASTQ are accepted. -->
        <param name="input" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input FASTA/FASTQ file"/>
        <!-- Where the search pattern(s) come from: typed directly (expression) or read from a dataset (file). -->
        <conditional name="conditional_pattern">
            <param name="mode" type="select" label="Pattern mode" help="Specify pattern directly or upload a file with multiple patterns">
                <option value="expression">Pattern/motif sequence</option>
                <option value="file">File with patterns/motifs (one per line)</option>
            </param>
            <when value="expression">
                <param argument="--pattern" type="text" label="Search pattern" help="Pattern to search for. The pattern is quoted automatically on the command line, so do not add quotes yourself (quote characters are removed).">
                    <!-- Whitelist sanitizer: any character not listed here is dropped (invalid_char="").
                         Quote characters stay excluded because the command wraps the pattern in quotes.
                         '_', '-' and space are allowed so that typical sequence IDs and full names
                         can be searched without being silently altered. -->
                    <sanitizer invalid_char="">
                        <valid initial="string.letters,string.digits">
                            <add value="^"/>
                            <add value="$"/>
                            <add value="("/>
                            <add value=")"/>
                            <add value="|"/>
                            <add value="?"/>
                            <add value="*"/>
                            <add value="+"/>
                            <add value="{"/>
                            <add value="}"/>
                            <add value="\"/>
                            <add value="["/>
                            <add value="]"/>
                            <add value="."/>
                            <add value=","/>
                            <add value=":"/>
                            <add value="_"/>
                            <add value="-"/>
                            <add value=" "/>
                        </valid>
                    </sanitizer>
                    <!-- Requires at least one character and rejects a trailing backslash. -->
                    <validator type="regex" message="Pattern must not end with backslash.">.*[^\\]$</validator>
                </param>
                <param argument="--use-regexp" type="boolean" truevalue="--use-regexp" falsevalue="" checked="false" label="Interpret pattern as regular expression" help="Enable regular expression matching"/>
            </when>
            <when value="file">
                <!-- seqkit grep reads one pattern per line from this file; "txt" also accepts
                     FASTA datasets, so previously valid selections keep working. -->
                <param argument="--pattern-file" type="data" format="txt" label="Pattern/motif file" help="Text file with one pattern per line"/>
            </when>
        </conditional>
        <!-- Optional switches for seqkit grep. Each boolean renders as its flag when checked
             and as an empty string otherwise. -->
        <section name="search_options" title="Search options">
            <param argument="--by-name" type="boolean" truevalue="--by-name" falsevalue="" checked="false" label="Search by sequence name" help="match by full name instead of just ID"/>
            <param argument="--by-seq" type="boolean" truevalue="--by-seq" falsevalue="" checked="false" label="Search by sequence content" help="search subseq on seq. Both positive and negative strand are searched by default, you might use only-positive-strand. Mismatch allowed using max-mismatch"/>
            <param argument="--ignore-case" type="boolean" truevalue="--ignore-case" falsevalue="" checked="false" label="Ignore case" help="ignore case"/>
            <param argument="--only-positive-strand" type="boolean" truevalue="--only-positive-strand" falsevalue="" checked="false" label="Only search positive strand" help="Only search on positive strand (only applies when searching by sequence)"/>
            <!-- Only passed to seqkit when "by-seq" is checked and "degenerate" is not. -->
            <param argument="--max-mismatch" type="integer" min="0" value="0" label="Maximum mismatches" help="Maximum number of mismatches allowed (only used when searching by sequence without degenerate bases, 0 = exact match)"/>
            <param argument="--invert-match" type="boolean" truevalue="--invert-match" falsevalue="" checked="false" label="Invert match" help="invert the sense of matching, to select non-matching records"/>
            <param argument="--degenerate" type="boolean" truevalue="--degenerate" falsevalue="" checked="false" label="Pattern contains degenerate bases" help="Pattern contains degenerate bases"/>
            <param argument="--circular" type="boolean" truevalue="--circular" falsevalue="" checked="false" label="Circular genome" help="Treat sequences as circular for matching"/>
            <param argument="--count" type="boolean" truevalue="--count" falsevalue="" checked="false" label="Count" help="just print a count of matching records. with the -v/--invert-match flag, count non-matching records"/>
            <param argument="--delete-matched" type="boolean" truevalue="--delete-matched" falsevalue="" checked="false" label="Delete matched patterns" help="delete a pattern right after being matched, this keeps the firstly matched data and speedups when using regular expressions"/>
            <param argument="--allow-duplicated-patterns" type="boolean" truevalue="--allow-duplicated-patterns" falsevalue="" checked="false" label="Allow duplicated patterns" help="output records multiple times when duplicated patterns are given"/>
            <!-- Empty means "no region restriction"; otherwise optional signed integers around a colon. -->
            <param argument="--region" type="text" value="" label="Sequence region" help="Specify region for searching (e.g., 1:30 for first 30 bases, -12:-1 for last 12).">
                <validator type="regex" message="Region must be in format 'start:end' or 'start:' or ':end'">^$|^-?[0-9]*:-?[0-9]*$</validator>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- The result normally has the datatype of the input. With "Count" checked the tool
             prints a number instead of sequence records, so the datatype is switched to
             plain text. The comparison value is the rendered boolean, i.e. its truevalue. -->
        <data name="output" format_source="input" label="${tool.name} on ${on_string}">
            <change_format>
                <when input="search_options.count" value="--count" format="txt"/>
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- 1: literal pattern, searched in the sequence, no mismatches (gzipped FASTA) -->
        <test expect_num_outputs="1">
            <param name="input" ftype="fasta.gz" value="input1.fasta.gz"/>
            <conditional name="conditional_pattern">
                <param name="mode" value="expression"/>
                <param name="pattern" value="ATGC"/>
            </conditional>
            <section name="search_options">
                <param name="by_seq" value="true"/>
                <param name="max_mismatch" value="0"/>
            </section>
            <output name="output" ftype="fasta.gz" file="grep_output1.fasta.gz" decompress="true"/>
        </test>
        <!-- 2: patterns from a file, inverted match (gzipped FASTA) -->
        <test expect_num_outputs="1">
            <param name="input" ftype="fasta.gz" value="input1.fasta.gz"/>
            <conditional name="conditional_pattern">
                <param name="mode" value="file"/>
                <param name="pattern_file" value="grep_pattern.fasta"/>
            </conditional>
            <section name="search_options">
                <param name="invert_match" value="true"/>
            </section>
            <output name="output" ftype="fasta.gz" file="grep_output2.fasta.gz" decompress="true"/>
        </test>
        <!-- 3: regular expression matched against the full name (gzipped FASTQ) -->
        <test expect_num_outputs="1">
            <param name="input" ftype="fastq.gz" value="input1.fastq.gz"/>
            <conditional name="conditional_pattern">
                <param name="mode" value="expression"/>
                <param name="pattern" value="^5"/>
                <param name="use_regexp" value="true"/>
            </conditional>
            <section name="search_options">
                <param name="by_name" value="true"/>
            </section>
            <output name="output" ftype="fastq.gz" file="grep_output3.fastq.gz" decompress="true"/>
        </test>
        <!-- 4: degenerate-base pattern searched in the sequence (gzipped FASTA) -->
        <test expect_num_outputs="1">
            <param name="input" ftype="fasta.gz" value="input1.fasta.gz"/>
            <conditional name="conditional_pattern">
                <param name="mode" value="expression"/>
                <param name="pattern" value="NNNNATGC"/>
            </conditional>
            <section name="search_options">
                <param name="by_seq" value="true"/>
                <param name="degenerate" value="true"/>
            </section>
            <output name="output" ftype="fasta.gz" file="grep_output4.fasta.gz" decompress="true"/>
        </test>
    </tests>
    <help>
.. class:: infomark

**What it does**

Search sequences by ID, name, sequence or sequence motif; mismatches are allowed.

------

.. class:: infomark

**Attention**

  0. By default, we match sequence ID with patterns, use "-n/--by-name"
     for matching full name instead of just ID.
  1. Unlike POSIX/GNU grep, we compare the pattern to the whole target
     (ID/full header) by default. Please switch "-r/--use-regexp" on
     for partial matching.
  2. When searching by sequence, matching is partial, and both positive
     and negative strands are searched.
     Please switch on "-P/--only-positive-strand" if you would like to
     search only on the positive strand.
     Mismatch is allowed using flag "-m/--max-mismatch", you can increase
     the value of "-j/--threads" to accelerate processing.
  3. Degenerate bases/residues like "RYMM.." are also supported by flag -d.
     But do not use degenerate bases/residues in regular expressions; you need
     to convert them to regular expression syntax, e.g., change "N" or "X" to ".".
  4. When providing search patterns (motifs) via flag '-p',
     please use double quotation marks for patterns containing comma,
     e.g., -p '"A{2,}"' or -p "\"A{2,}\"". Because the command line argument
     parser accepts comma-separated-values (CSV) for multiple values (motifs).
     Patterns in file do not follow this rule.
  5. The order of sequences in result is consistent with that in original
     file, not the order of the query patterns.
     But for FASTA file, you can use:
     seqkit faidx seqs.fasta --infile-list IDs.txt
  6. For multiple patterns, you can either set "-p" multiple times, i.e.,
     -p pattern1 -p pattern2, or give a file of patterns via "-f/--pattern-file".
    </help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Fri, 26 Sep 2025 16:49:16 +0000
parents	731f3256c2b3
children