view dereplication/dereplicate.xml @ 0:88fc52f1c5db draft default tip

Uploaded
author qfab
date Wed, 28 May 2014 20:34:11 -0400
parents
children
line wrap: on
line source

<tool id="usearch_derep_full" name="Dereplicate" version="1.0.0">
  <description>Remove duplicate sequences</description>
  <!--
    Command template (Cheetah, rendered to a shell command line).
    Runs the usearch dereplication command matching the selected $mode on
    $input and writes the result to $output; -sizeout is passed so the output
    carries size annotations.

    Notes:
    * Cheetah directives take Python expressions, not shell "test" syntax.
      A bracketed condition such as [ $mode == "x" ] is a one-element list,
      which is always true, so the comparison is written on str($mode).
    * 2>&1 merges stderr into stdout; a bare "2>1" would instead create a
      file named "1" in the working directory.
    * The fallback branch should be unreachable for a select parameter; it
      reports on stderr and exits non-zero so a bad value cannot pass silently.
  -->
  <command>
    #if str($mode) == "fulllength"
      usearch -derep_fulllength "$input" -output "$output" -sizeout 2&gt;&amp;1
    #elif str($mode) == "prefix"
      usearch -derep_prefix "$input" -output "$output" -sizeout 2&gt;&amp;1
    #else
      echo 'Unrecognised mode:' "$mode" 1&gt;&amp;2;
      echo '  [fulllength|prefix] only' 1&gt;&amp;2;
      exit 1
    #end if
  </command>
  <inputs>
    <!-- Sequence file to dereplicate; referenced as $input in the command template. -->
    <param name="input" type="data" format="fasta" label="Input sequence file"/>
    <!-- Duplicate-detection criterion; referenced as $mode in the command template. -->
    <param name="mode" type="select" label="Criteria used for duplicate detection">
      <option value="fulllength">Full length</option>
      <option value="prefix">Prefix</option>
    </param>
  </inputs>

  <outputs>
    <!-- Dereplicated sequences; referenced as $output in the command template. -->
    <data name="output" format="fasta"/>
  </outputs>

  <help>
===========
Description
===========

Removes duplicate sequences using one of two modes (below), from the Usearch-Tool-Suite_.

.. _Usearch-Tool-Suite: http://www.drive5.com/usearch/

-----

-----
Input
-----

File of reads in FASTA format.

----------
Parameters
----------

Full length
  Matching is performed over the full length of the sequences; of each set of identical sequences, all except one are removed.
Prefix
  A sequence (A) is discarded if it is a prefix of another sequence (B), i.e. the first part of B is identical to the whole of A.

------
Output
------

A FASTA file containing only unique sequences according to the criteria chosen for the duplicate detection. The identifier line for each sequence states the representative sequence followed by the number of identical sequences found.

e.g. >sequenceXXXX;size=1443;

sequenceXXXX is the representative of 1443 identical sequences.

-----

=========
Resources
=========

Dereplication_

.. _Dereplication: http://drive5.com/usearch/manual/dereplication.html

**Author**

Robert C. Edgar (bob@drive5.com)

**Wrapper Author**

QFAB Bioinformatics (support@qfab.org)
  </help>
  <tests>
    <!-- Full-length dereplication of seqs.fasta, compared against seqs_derep.fasta. -->
    <test>
      <param name="input" value="seqs.fasta"/>
      <param name="mode" value="fulllength"/>
      <output name="output" file="seqs_derep.fasta" ftype="fasta" lines_diff="10"/>
    </test>
  </tests>
</tool>