Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences

<tool id="fasta_merge_files_and_filter_unique_sequences" name="FASTA Merge Files and Filter Unique Sequences" version="1.1">
    <description>Concatenate FASTA database files together</description>
    <requirements>
        <requirement type="package" version="2.7.12">python</requirement>
    </requirements>
    <command>
        python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py'
        '$output' $uniqueness_criterion '$accession_parser'
        #for $input in $inputs:
            '$input'
        #end for
    </command>
    <inputs>
        <param name="inputs" format="fasta" multiple="True" type="data" label="Input FASTA files"/>
        <param name="uniqueness_criterion" type="select" label="How are sequences judged to be unique?">
            <option value="sequence" selected="true">Accession and Sequence</option>
            <option value="accession">Accession Only</option>
        </param>
        <param name="accession_parser" type="text" label="Accession Parsing Regex" value="^&gt;([^ ]+).*$" help="Regular expression with 1 capture group; the capture group is the accession (which must be unique)">
          <sanitizer>
            <valid>
              <add preset="string.printable"/>
              <remove value="&#92;" />
              <remove value="&apos;" />
            </valid>
            <mapping initial="none">
              <add source="&#92;" target="__backslash__" />
              <add source="&apos;" target="__sq__"/>
            </mapping>
          </sanitizer>
        </param>
    </inputs>
    <outputs>
        <data format="fasta" name="output" label="Merged and Filtered FASTA from ${on_string}"/>
    </outputs>
    <tests>
        <test>
          <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
          <param name="uniqueness_criterion" value="sequence" />
          <param name="accession_parser" value="^&gt;([^ |]+).*$" />
          <output name="output" file="res-sequence.fa" ftype="fasta" />
          <assert_stdout>
            <has_line line="Skipping protein '&gt;one_2' with duplicate sequence (first seen as '&gt;one')" />
            <has_line line="Skipping protein '&gt;two_2' with duplicate sequence (first seen as '&gt;two')" />
            <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession" />
            <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession" />
          </assert_stdout>
        </test>
        <test>
          <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
          <param name="uniqueness_criterion" value="accession" />
          <param name="accession_parser" value="^&gt;([^ |]+).*$" />
          <output name="output" file="res-accession.fa" ftype="fasta" />
          <assert_stdout>
            <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession" />
            <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession" />
          </assert_stdout>
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

Concatenate FASTA database files together.

If the uniqueness criterion is "Accession and Sequence", only the first appearence of each unique sequence will appear in the output.
Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output.

The default accession parser will treat everything in the header before the first space as the accession.

------

**Citation**

If you use this tool in Galaxy, please the GalaxyP developers at: https://github.com/galaxyproteomics/

]]>
    </help>
</tool>
author	galaxyp
date	Fri, 03 Feb 2017 14:27:56 -0500
parents	379c41d859aa
children	8462a4e9f86e