<!--
view fasta_merge_files_and_filter_unique_sequences.xml @ 3:9ad0d336e5ed draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 96128b1b32e31c88f08201fd59a07fb1057aafbe
author galaxyp
date Fri, 03 Feb 2017 14:27:56 -0500
parents 379c41d859aa
children 8462a4e9f86e
line wrap: on
line source
-->

<tool id="fasta_merge_files_and_filter_unique_sequences" name="FASTA Merge Files and Filter Unique Sequences" version="1.1">
    <description>Concatenate FASTA database files together</description>
    <!--
        Declares the tool's single package dependency: Python, pinned to
        version 2.7.12. The command section invokes the bundled script
        through the `python` executable.
    -->
    <requirements>
        <requirement type="package" version="2.7.12">python</requirement>
    </requirements>
    <!--
        Runs the bundled merge/filter script. Positional arguments, in order:
        the output path, the uniqueness criterion, the accession regex, and
        then every selected input FASTA file. Every argument is single-quoted
        so the shell passes it through as exactly one word, and the template
        is wrapped in CDATA so Cheetah and shell characters never need XML
        escaping.
    -->
    <command><![CDATA[
        python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py'
        '$output' '$uniqueness_criterion' '$accession_parser'
        #for $input in $inputs:
            '$input'
        #end for
    ]]></command>
    <inputs>
        <!-- One or more FASTA datasets to merge. -->
        <param name="inputs"
               type="data"
               format="fasta"
               multiple="True"
               label="Input FASTA files"/>

        <!-- Chooses the deduplication mode; "sequence" is preselected. -->
        <param name="uniqueness_criterion"
               type="select"
               label="How are sequences judged to be unique?">
            <option value="sequence" selected="true">Accession and Sequence</option>
            <option value="accession">Accession Only</option>
        </param>

        <!--
            Regex whose single capture group yields the accession of a header
            line. The sanitizer accepts every printable character except the
            backslash and the single quote; those two are rewritten to the
            placeholder tokens __backslash__ and __sq__ instead of being
            passed through literally (the value is placed inside single
            quotes on the command line).
            NOTE(review): the script presumably converts the placeholders back
            before using the regex; confirm against the Python source.
        -->
        <param name="accession_parser"
               type="text"
               value="^&gt;([^ ]+).*$"
               label="Accession Parsing Regex"
               help="Regular expression with 1 capture group; the capture group is the accession (which must be unique)">
            <sanitizer>
                <valid>
                    <add preset="string.printable"/>
                    <remove value="&#92;"/>
                    <remove value="&apos;"/>
                </valid>
                <mapping initial="none">
                    <add source="&#92;" target="__backslash__"/>
                    <add source="&apos;" target="__sq__"/>
                </mapping>
            </sanitizer>
        </param>
    </inputs>
    <outputs>
        <!-- Single FASTA result dataset, referenced in the command as $output. -->
        <data name="output"
              format="fasta"
              label="Merged and Filtered FASTA from ${on_string}"/>
    </outputs>
    <tests>
        <!--
            "Accession and Sequence" mode. The regex used here also stops the
            accession at a pipe character, unlike the tool's default value.
            Stdout must report both duplicate-sequence and duplicate-accession
            skips.
        -->
        <test>
            <param name="inputs" value="1.fa,2.fa" ftype="fasta"/>
            <param name="uniqueness_criterion" value="sequence"/>
            <param name="accession_parser" value="^&gt;([^ |]+).*$"/>
            <output name="output" file="res-sequence.fa" ftype="fasta"/>
            <assert_stdout>
                <has_line line="Skipping protein '&gt;one_2' with duplicate sequence (first seen as '&gt;one')"/>
                <has_line line="Skipping protein '&gt;two_2' with duplicate sequence (first seen as '&gt;two')"/>
                <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession"/>
                <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession"/>
            </assert_stdout>
        </test>

        <!--
            "Accession Only" mode with the same inputs and regex. Only the
            duplicate-accession messages are asserted.
        -->
        <test>
            <param name="inputs" value="1.fa,2.fa" ftype="fasta"/>
            <param name="uniqueness_criterion" value="accession"/>
            <param name="accession_parser" value="^&gt;([^ |]+).*$"/>
            <output name="output" file="res-accession.fa" ftype="fasta"/>
            <assert_stdout>
                <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession"/>
                <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession"/>
            </assert_stdout>
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

Concatenate FASTA database files together.

If the uniqueness criterion is "Accession and Sequence", only the first appearance of each unique accession and of each unique sequence will appear in the output.
Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output.

The default accession parser will treat everything in the header before the first space as the accession.

------

**Citation**

If you use this tool in Galaxy, please cite the GalaxyP developers at: https://github.com/galaxyproteomics/

]]>
    </help>
</tool>