Mercurial > repos > iuc > seqkit_locate
diff seqkit_locate.xml @ 0:642d73815dd1 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/seqkit commit 202bb1229cb0b8e8040a87d140edb6fdf7654628
author | iuc |
---|---|
date | Thu, 03 Nov 2022 19:35:37 +0000 |
parents | |
children | 6510652376b1 |
line wrap: on
line diff
<tool id="seqkit_locate" name="SeqKit locate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!--
        Galaxy wrapper for "seqkit locate": reports where subsequences/motifs occur in FASTA input.
        This is the content of seqkit_locate.xml as introduced by this changeset
        (new file, previously /dev/null; Thu Nov 03 19:35:37 2022 +0000; 177 lines in the original hunk),
        re-indented, with typos in user-facing labels corrected.
        Version tokens, requirements, bio.tools reference and citations come from macros.xml.
    -->
    <description>subsequences/motifs, mismatch allowed</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
#import re

## Derive a working file name from the dataset's element identifier: every
## character that is not whitespace, a word character or a hyphen becomes '_'.
## The input dataset is then symlinked under that name in the job directory.
#set input_identifier = re.sub('[^\s\w\-]', '_', str($input.element_identifier))
ln -s '${input}' '${input_identifier}' &&

seqkit locate
## The backslash keeps the shell variable literal in the rendered command;
## 4 threads are used when GALAXY_SLOTS is unset.
--threads \${GALAXY_SLOTS:-4}
#if $conditional_pattern.mode == 'expression'
    ## The inner double quotes are intentional: seqkit parses the pattern option
    ## as comma-separated values, so patterns containing a comma, e.g. A{2,},
    ## must be double-quoted (see item 3 of the help text).
    --pattern '"$conditional_pattern.pattern"'
    $conditional_pattern.use_regexp
#else
    --pattern-file '$conditional_pattern.pattern_file'
#end if
## output_mode is empty (tabular), --gtf or --bed.
$output_mode
--validate-seq-length $advanced_options.validate_seq_length
$advanced_options.circular
$advanced_options.degenerate
$advanced_options.hide_matched
$advanced_options.ignore_case
## --max-mismatch and --use-fmi are only passed when --degenerate is off;
## the FM-index parameter help states it is not compatible with --degenerate.
#if not $advanced_options.degenerate
    --max-mismatch $advanced_options.max_mismatch
    $advanced_options.use_fmi
#end if
$advanced_options.non_greedy
$advanced_options.only_positive_strand
$advanced_options.id_ncbi
'${input_identifier}'
> '$output'
]]></command>
    <!-- Inputs: the sequences to search, the pattern source (typed text or FASTA file), the output layout and tuning flags. -->
    <inputs>
        <param name="input" type="data" format="fasta,fasta.gz" label="Input file"/>
        <conditional name="conditional_pattern">
            <param name="mode" type="select" label="Pattern mode"
                help="Specify a pattern/motif sequence or a FASTA file with the motif of interest. Motifs could be EITHER plain sequence containing 'ACTGN' OR regular expression like 'A[TU]G(?:.{3})+?[TU](?:AG|AA|GA)' for ORFs">
                <option value="expression">Pattern/motif sequence</option>
                <option value="file">FASTA file with the pattern/motif of interest</option>
            </param>
            <when value="expression">
                <param argument="--pattern" type="text" value="" label="Pattern/motif sequence" help="Perl regular expressions are allowed">
                    <sanitizer invalid_char="">
                        <valid initial="string.letters,string.digits">
                            <add value="^"/>
                            <add value="$"/>
                            <add value="("/>
                            <add value=")"/>
                            <add value="|"/>
                            <add value="?"/>
                            <add value="*"/>
                            <add value="+"/>
                            <add value="{"/>
                            <add value="}"/>
                            <add value="\"/>
                            <add value="["/>
                            <add value="]"/>
                            <add value="."/>
                            <add value=","/>
                        </valid>
                    </sanitizer>
                    <validator type="regex" message="Pattern must not end with backslash.">.*[^\\]$</validator>
                </param>
                <param argument="--use-regexp" type="boolean" truevalue="--use-regexp" falsevalue="" checked="false" label="Pattern/motifs are regular expressions"/>
            </when>
            <when value="file">
                <param argument="--pattern-file" type="data" format="fasta" label="Pattern/motif file"/>
            </when>
        </conditional>
        <param name="output_mode" type="select" label="Output mode">
            <option value="">Tabular (default format)</option>
            <option value="--gtf">GTF</option>
            <option value="--bed">BED6</option>
        </param>
        <section name="advanced_options" title="Advanced options">
            <param argument="--validate-seq-length" type="integer" min="0" value="10000" label="Length of the sequence to validate" help="Default: 10000"/>
            <param argument="--circular" type="boolean" truevalue="--circular" falsevalue="" checked="false" label="Circular genome"
                help="When using flag --circular, end position of matched subsequence that crosses genome sequence end would be greater than sequence length"/>
            <param argument="--degenerate" type="boolean" truevalue="--degenerate" falsevalue="" checked="false" label="Pattern/motif contains degenerate bases"
                help="Do not use degenerate bases/residues in regular expression, you need to convert them to regular expression, e.g., change 'N' or 'X' to '.'"/>
            <param argument="--hide-matched" type="boolean" truevalue="--hide-matched" falsevalue="" checked="false" label="Do not show matched sequences"/>
            <param argument="--ignore-case" type="boolean" truevalue="--ignore-case" falsevalue="" checked="false" label="Ignore case"/>
            <param argument="--max-mismatch" type="integer" min="0" value="0" label="Maximum mismatch" help="For large genomes like human genome, using mapping/alignment tools would be faster"/>
            <param argument="--non-greedy" type="boolean" truevalue="--non-greedy" falsevalue="" checked="false" label="Non-greedy mode" help="Faster, but may miss motifs overlapping with others"/>
            <param argument="--only-positive-strand" type="boolean" truevalue="--only-positive-strand" falsevalue="" checked="false" label="Only search on positive strand"/>
            <param argument="--use-fmi" type="boolean" truevalue="--use-fmi" falsevalue="" checked="false" label="FM-index"
                help="Use FM-index for much faster search of lots of sequence patterns. This option is not compatible with the --degenerate option"/>
            <param argument="--id-ncbi" type="boolean" truevalue="--id-ncbi" falsevalue="" checked="false" label="FASTA header is NCBI style" help="Example: >gi|110645304|ref|NC_002516.2| Pseud..."/>
        </section>
    </inputs>
    <!-- The single output defaults to tabular and switches datatype to match the selected output mode. -->
    <outputs>
        <data name="output" format="tabular" label="${tool.name} on ${on_string}">
            <change_format>
                <when input="output_mode" value="--gtf" format="gtf"/>
                <when input="output_mode" value="--bed" format="bed"/>
            </change_format>
        </data>
    </outputs>
    <!-- Tests: (1) plain pattern with one mismatch, tabular; (2) regular expression with several flags, BED; (3) pattern file with FM-index, GTF. -->
    <tests>
        <test expect_num_outputs="1">
            <param name="input" value="input1.fasta.gz" ftype="fasta.gz"/>
            <conditional name="conditional_pattern">
                <param name="mode" value="expression"/>
                <param name="pattern" value="ATAGAT"/>
            </conditional>
            <section name="advanced_options">
                <param name="max_mismatch" value="1"/>
            </section>
            <output name="output" file="locate_output1.tabular" ftype="tabular"/>
        </test>
        <test expect_num_outputs="1">
            <param name="input" value="input1.fasta.gz" ftype="fasta.gz"/>
            <conditional name="conditional_pattern">
                <param name="mode" value="expression"/>
                <param name="pattern" value="A[TU]G"/>
                <param name="use_regexp" value="true"/>
            </conditional>
            <param name="output_mode" value="--bed"/>
            <section name="advanced_options">
                <param name="circular" value="true"/>
                <param name="hide_matched" value="true"/>
                <param name="ignore_case" value="true"/>
                <param name="only_positive_strand" value="true"/>
                <param name="id_ncbi" value="true"/>
            </section>
            <output name="output" file="locate_output2.bed" ftype="bed"/>
        </test>
        <test expect_num_outputs="1">
            <param name="input" value="input1.fasta.gz" ftype="fasta.gz"/>
            <conditional name="conditional_pattern">
                <param name="mode" value="file"/>
                <param name="pattern_file" value="motif_sequence.fasta"/>
            </conditional>
            <param name="output_mode" value="--gtf"/>
            <section name="advanced_options">
                <param name="use_fmi" value="true"/>
            </section>
            <output name="output" file="locate_output3.gtf" ftype="gtf"/>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Purpose**

Locate subsequences/motifs, mismatch allowed.

------

.. class:: infomark

**Attention**

    1. Motifs could be EITHER plain sequence containing "ACTGN" OR regular
       expression like "A[TU]G(?:.{3})+?[TU](?:AG|AA|GA)" for ORFs.
    2. Degenerate bases/residues like "RYMM.." are also supported by flag -d.
       But do not use degenerate bases/residues in regular expression, you need
       to convert them to regular expression, e.g., change "N" or "X" to ".".
    3. When providing search patterns (motifs) via flag '-p',
       please use double quotation marks for patterns containing comma,
       e.g., -p '"A{2,}"' or -p "\"A{2,}\"". Because the command line argument
       parser accepts comma-separated-values (CSV) for multiple values (motifs).
       Patterns in file do not follow this rule.
    4. Mismatch is allowed using flag "-m/--max-mismatch",
       you can increase the value of "-j/--threads" to accelerate processing.
    5. When using flag --circular, end position of matched subsequence that
       crosses genome sequence end would be greater than sequence length.
    ]]></help>
    <expand macro="citations"/>
</tool>