<!--
view filter_assembly.xml @ 3:faadbce1bbd2 draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ee0948efcef5f8732f7f32ce37a8127d891d0c64
author abims-sbr
date Thu, 11 Sep 2025 08:27:22 +0000
parents 000dbfafe31d
children
line wrap: on
line source
-->

<tool name="Filter assemblies" id="filter_assemblies" version="2.0.6">

    <!-- One-line summary of what the tool does. -->
    <description>Filter the outputs of Spades, Velvet or Trinity assemblies</description>

    <!-- Pulls in shared macro definitions from macros.xml (not visible in this file);
         the python3_required expansion and the @HELP_AUTHORS@ token used further down
         presumably come from there (TODO confirm against macros.xml). -->
    <macros>
        <import>macros.xml</import>
    </macros>

    <!-- Software dependencies resolved before the command runs. -->
    <requirements>
        <!-- Expands the python3_required macro; its content is defined outside this file
             (presumably a Python 3 package requirement, to be confirmed in macros.xml). -->
        <expand macro="python3_required" />
        <!-- CAP3 assembler, pinned to version 10.2011. -->
        <requirement type="package" version="10.2011">cap3</requirement>
    </requirements>

    <!--
        Command template (Cheetah).

        1. Every selected input dataset is symlinked into the working directory under a
           name derived from its element identifier.
        2. S01_script_to_choose.py is called with, in order: the comma-separated list of
           link names, the sequence length threshold, the CAP3 percent identity cutoff and
           the CAP3 overlap length cutoff. Its standard output is redirected to the "log"
           output dataset.

        Element identifiers are user-controlled text. They are reduced to a safe character
        set before being placed inside single-quoted shell words and inside the
        comma-separated list, so that a quote cannot break the shell quoting and a comma
        cannot corrupt the list. The steps are chained with "and" semantics so the script
        is not started when a symlink could not be created.
    -->
    <command>
    <![CDATA[
        #import re
        #set $link_names = []
        #for $input in $inputs
            ## Keep letters, digits, underscore, dot and hyphen; replace anything else.
            #set $link_name = re.sub('[^A-Za-z0-9_.-]', '_', str($input.element_identifier))
            ln -s '$input' '$link_name' &&
            #silent $link_names.append($link_name)
        #end for
        #set $infiles = ','.join($link_names)

        python '$__tool_directory__/scripts/S01_script_to_choose.py'

        '$infiles'
        $length_seq_max
        $percent_identity
        $overlap_length
        > '${log}'
    ]]>
    </command>

    <!--
        User-facing parameters.

        The numeric bounds below enforce the limits already stated in the help texts
        (percent identity above 65, overlap length above 15 base pairs), so that
        out-of-range values are rejected in the tool form instead of being passed on
        to CAP3.
    -->
    <inputs>
        <!-- One or more FASTA datasets; all of them are processed in a single job. -->
        <param name="inputs" type="data" format="fasta" multiple="true" label="Input files" />
        <!-- CAP3 -p: strictly greater than 65, and a percentage cannot exceed 100. -->
        <param name="percent_identity" type="integer" value="90" min="66" max="100" label="Overlap percent identity cutoff" help="Cap3 parameter (-p N); minimum percent identity of an overlap. The specified value should be more than 65%." />
        <!-- CAP3 -o: strictly greater than 15 base pairs. -->
        <param name="overlap_length" type="integer" value="60" min="16" label="Overlap length cutoff" help="Cap3 parameter (-o N); minimum length of an overlap (in base pairs). The specified value should be more than 15 base pairs." />
        <!-- Despite the parameter name, the label and help present this as a MINIMUM length. -->
        <param name="length_seq_max" type="integer" value="100" label="Minimum sequence length" help="Keep sequences which length is higher than the minimum sequence length  " />
    </inputs>

    <!-- Tool outputs: a list collection of result files plus one text summary. -->
    <outputs>
        <!-- Collection elements are discovered from the "outputs" directory of the job;
             the __name_and_ext__ pattern takes the element name and the datatype
             from each file name. -->
        <collection name="output_fasta" type="list" label="Filter Assemblies outputs">
            <discover_datasets directory="outputs" pattern="__name_and_ext__" />
        </collection>
        <!-- Receives the standard output redirected by the command template. -->
        <data name="log" format="txt" label="Filter Assemblies Summary" />
    </outputs>

	<tests>
        <!-- Test 1: four Trinity and three Velvet assemblies processed together. -->
        <test>
            <param name="inputs"
                   ftype="fasta"
                   value="trinity/Pfiji_trinity.fasta,trinity/Apomp_trinity.fasta,trinity/Amphi_trinity.fasta,trinity/Acaud_trinity.fasta,velvet/Pg_transcriptome_90109.fasta,velvet/Ap_transcriptome_35099.fasta,velvet/Ac_transcriptome_25591.fasta" />
            <param name="percent_identity" value="100" />
            <param name="overlap_length" value="60" />
            <param name="length_seq_max" value="100" />
            <output name="log" value="trinity_and_velvet_up.output" />
            <output_collection name="output_fasta" type="list">
                <element name="AcAc_transcriptome_25591"
                         value="velvet_out/AcAc_transcriptome_25591.fasta" />
                <element name="ApAp_transcriptome_35099"
                         value="velvet_out/ApAp_transcriptome_35099.fasta" />
                <element name="PgPg_transcriptome_90109"
                         value="velvet_out/PgPg_transcriptome_90109.fasta" />
                <element name="AcAcaud_trinity"
                         value="trinity_out/AcAcaud_trinity.fasta" />
                <element name="AmAmphi_trinity"
                         value="trinity_out/AmAmphi_trinity.fasta" />
                <element name="ApApomp_trinity"
                         value="trinity_out/ApApomp_trinity.fasta" />
                <element name="PfPfiji_trinity"
                         value="trinity_out/PfPfiji_trinity.fasta" />
            </output_collection>
        </test>
        <!-- Test 2: the four Trinity assemblies only. -->
        <test>
            <param name="inputs"
                   ftype="fasta"
                   value="trinity/Pfiji_trinity.fasta,trinity/Apomp_trinity.fasta,trinity/Amphi_trinity.fasta,trinity/Acaud_trinity.fasta" />
            <param name="percent_identity" value="100" />
            <param name="overlap_length" value="60" />
            <param name="length_seq_max" value="100" />
            <output name="log" value="trinity_up.output" />
            <output_collection name="output_fasta" type="list">
                <element name="AcAcaud_trinity"
                         value="trinity_out/AcAcaud_trinity.fasta" />
                <element name="AmAmphi_trinity"
                         value="trinity_out/AmAmphi_trinity.fasta" />
                <element name="ApApomp_trinity"
                         value="trinity_out/ApApomp_trinity.fasta" />
                <element name="PfPfiji_trinity"
                         value="trinity_out/PfPfiji_trinity.fasta" />
            </output_collection>
        </test>
        <!-- Test 3: the three Velvet assemblies only. -->
        <test>
            <param name="inputs"
                   ftype="fasta"
                   value="velvet/Pg_transcriptome_90109.fasta,velvet/Ap_transcriptome_35099.fasta,velvet/Ac_transcriptome_25591.fasta" />
            <param name="percent_identity" value="100" />
            <param name="overlap_length" value="60" />
            <param name="length_seq_max" value="100" />
            <output name="log" value="velvet_up.output" />
            <output_collection name="output_fasta" type="list">
                <element name="AcAc_transcriptome_25591"
                         value="velvet_out/AcAc_transcriptome_25591.fasta" />
                <element name="ApAp_transcriptome_35099"
                         value="velvet_out/ApAp_transcriptome_35099.fasta" />
                <element name="PgPg_transcriptome_90109"
                         value="velvet_out/PgPg_transcriptome_90109.fasta" />
            </output_collection>
        </test>
        
    </tests>

	<help>

@HELP_AUTHORS@

<![CDATA[

**Description**

This tool runs the CAP3 software on assembly FASTA data, merges the resulting singlets and contigs, and then reformats the headers so that the output of any assembly tool can be used.

---------

**Input format**

Sequences are in the FASTA format:

| >seqname1
| AAAGAGAGACCACATGTCAGTAGC -on one or several lines -
| >seqname2
| AAGGCCTGACCACATGAGTTAAGC -on one or several lines -
| etc ...
|

---------

**Parameters**

    - 'Input files' : a collection of fasta files (one file per species).
    - 'Overlap percent identity cutoff' : cap3 -p parameter : minimum percent identity of an overlap.
        must be > 65 ; default : 90.
    - 'Overlap length cutoff' (integer) : cap3 -o parameter : minimum length of an overlap (in base pairs).
        must be > 15 ; default : 60.
    - 'Minimum sequence length' (integer) : only keep sequences which are longer than the specified value.
        default : 100.

---------

**Steps**:
    
The tool:
    1) Runs CAP3 on the full set of ORFs to minimize redundancy
    2) Merges singlets and contigs identified by CAP3
    3) Reformats headers of the FASTA records by adding a specified prefix (defined from the original filename) and ensures that sequences are on a single line

**Outputs**

    - 'Filter Assemblies Summary' : the log file.
    - 'Filter Assemblies outputs' : the main results.

---------

**The AdaptSearch Pipeline**

.. image:: adaptsearch_picture_helps.png

---------

Changelog
---------

		
**Version 2.2 - 07/10/2024**

    - Input files can come from any assembly tool
		
**Version 2.1 - 15/01/2018**

    - Input files can be a mix of files coming from either the Trinity or the Velvet Oases assembler

**Version 2.0 - 14/04/2017**

    - NEW: Replace the zip between tools by Dataset Collection

**Version 1.0 - 13/04/2017**

    - TEST: Add functional test with planemo
    - IMPROVEMENT: Use conda dependencies for cap3, fastaformatter and python

    ]]>
	</help>

</tool>