view fastaregexfinder.xml @ 0:269c627ae9f4 draft default tip

planemo upload for repository commit 8e118a4d24047e2c62912b962e854f789d6ff559
author mbernt
date Wed, 20 Jun 2018 11:06:57 -0400
line wrap: on
line source

<tool id="fasta_regex_finder" name="fasta_regex_finder" version="0.1.0">
        Search in fasta for regexp match
    <version_command>python $__tool_directory__/ --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
## Search the input fasta for the regex; the BED result is written to the
## output dataset through stdout redirection.
python $__tool_directory__/ 
--fasta '$input'
## Single quotes stop the shell from interpreting dollar signs, backticks or
## double quotes inside the pattern. The regex sanitizer maps the single
## quote to __sq__, so the value cannot terminate the quoting.
--regex '$regex'
#if $settings.advanced == "advanced"
    ## Boolean parameters expand to their truevalue flag or to an empty string.
    $settings.matchcase
    $settings.noreverse
    --maxstr $settings.maxstr
    #if $settings.seqnames != ""
        ## Presumably left unquoted on purpose so that the space separated
        ## names reach the script as separate arguments.
        --seqnames $settings.seqnames
    #end if
#end if
> $output
        <!-- Fasta dataset to search. -->
        <param type="data" name="input" format="fasta" />
        <!-- Pattern to search for; the default value looks for G-quadruplexes.
             The sanitizer accepts every printable character except the single
             quote, which is replaced by __sq__. -->
        <param name="regex" size="30" type="text" value="([gG]{3,}\w{1,7}){3,}[gG]{3,}" label="Regular expression" help="(--regex)">
                <valid initial="string.printable">
                    <remove value="&apos;"/>
                <mapping initial="none">
                    <add source="&apos;" target="__sq__"/>
        <!-- Optional settings: the "simple" branch defines no parameters, the
             "advanced" branch exposes the remaining options of the script. -->
        <conditional name="settings">
            <param name="advanced" type="select" label="Specify advanced parameters">
                <option value="simple" selected="true">No, use program defaults.</option>
                <option value="advanced">Yes, see full parameter list.</option>
            <when value="simple">
            <when value="advanced">
                <param name="matchcase" type="boolean" label="Match case" truevalue="--matchcase" falsevalue="" help="(--matchcase)" />
                <param name="noreverse" type="boolean" label="Do not search the reverse complement" truevalue="--noreverse" falsevalue="" help="(--noreverse)" />
                <param name="maxstr" type="integer" label="Maximum length of the match to report" value="10000" min="1" help="(--maxstr)" />
                <param name="seqnames" size="30" type="text" value="" label="Space separated list of fasta sequences to search" help="--seqnames"/>
        <!-- BED result; the command fills this dataset by redirecting stdout to $output, so no from_work_dir lookup is needed. -->
        <data name="output" format="bed" />
            <!-- Case 1: default regex on the G4 example fasta. -->
            <param name="input" value="TestSeqGroup-G4.fasta"/>
            <output name="output" file="TestSeqGroup-G4.bed"/>
            <!-- Case 2: literal regex ACTG with default settings. -->
            <param name="input" value="test.fas"/>
            <param name="regex" value="ACTG"/>
            <output name="output" file="test-1.bed"/>
            <!-- Case 3: regex ACTG with the match case option switched on. -->
            <param name="input" value="test.fas"/>
            <param name="regex" value="ACTG"/>
            <param name="advanced" value="advanced"/>
            <param name="matchcase" value="--matchcase"/>
            <output name="output" file="test-2.bed"/>
            <!-- Case 4: regex ACTG without searching the reverse complement. -->
            <param name="input" value="test.fas"/>
            <param name="regex" value="ACTG"/>
            <param name="advanced" value="advanced"/>
            <param name="noreverse" value="--noreverse"/>
            <output name="output" file="test-3.bed"/>
            <!-- Case 5: regex ACTG with the reported match limited to 3 characters. -->
            <param name="input" value="test.fas"/>
            <param name="regex" value="ACTG"/>
            <param name="advanced" value="advanced"/>
            <param name="maxstr" value="3"/>
            <output name="output" file="test-4.bed"/>
            <!-- Case 6: default regex restricted to two named sequences. -->
            <param name="input" value="TestSeqGroup-G4.fasta"/>
            <param name="advanced" value="advanced"/>
            <param name="seqnames" value="HJ24-Shp2_oncogenicProtein2 HJ24-Shp2_oncogenicProtein"/>
            <output name="output" file="TestSeqGroup-G4-sub.bed"/>
Search a fasta file for matches to a regular expression and return a bed file with the
coordinates of the match and the matched sequence itself. 
Output bed file has columns:

1. Name of fasta sequence (e.g. chromosome)
2. Start of the match
3. End of the match
4. ID of the match
5. Length of the match
6. Strand 
7. Matched sequence as it appears on the forward strand

For matches on the reverse strand, the reported start and end positions and the
matched string all refer to the forward strand (so the G4 'GGGAGGGT'
present on the reverse strand is reported as ACCCTCCC).

Note: Fasta sequences (chroms) are read in memory one at a time along with the
matches for that chromosome.
The order of the output is: chroms as they are found in the input fasta, matches
sorted within chroms by position.


- regex Regex to be searched in the fasta input. Matches to the reverse complement will have - strand. The default regex is '([gG]{3,}\w{1,7}){3,}[gG]{3,}' which searches for G-quadruplexes.
- matchcase Match case while searching for matches. Default is to ignore case (i.e. 'ACTG' will match 'actg').
- noreverse           Do not search the reverse complement of the input fasta. Use this flag to search protein sequences.                                   
- maxstr Maximum length of the match to report in the 7th column of the output. Default is to report up to 10000nt. Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>]
- seqnames List of fasta sequences in the input to search. E.g. use --seqnames chr1 chr2 chrM to search only these chromosomes. Default is to search all the sequences in input.


Test data::

Example1 regex=ACTG::

        mychr	0	4	mychr_0_4_for	4	+	ACTG
        mychr	5	9	mychr_5_9_for	4	+	ACTG
        mychr	10	14	mychr_10_14_for	4	+	ACTG

Example2 regex=ACTG maxstr=3::

        mychr	0	4	mychr_0_4_for	4	+	ACT[3,4]
        mychr	5	9	mychr_5_9_for	4	+	ACT[3,4]
        mychr	10	14	mychr_10_14_for	4	+	ACT[3,4]

Example3 regex=A\w\wG::

        mychr	0	5	mychr_0_5_for	5	+	ACTGn
        mychr	5	10	mychr_5_10_for	5	+	ACTGn
        mychr	10	15	mychr_10_15_for	5	+	ACTGn

        <citation type="bibtex">
  author = {Dario Beraldi},
  year = {2017},
  title = {fastaRegexFinder},
  publisher = {GitHub},
  journal = {GitHub repository},
  url = {},