view tools/ncbi_blast_plus/blastxml_to_tabular.xml @ 13:623f727cdff1 draft

Uploaded v0.1.00, uses BLAST+ 2.2.29, allows custom column selection for tabular output - including taxonomy fields.
author peterjc
date Fri, 14 Mar 2014 07:40:46 -0400
parents 4c4a0da938ff
children 2fe07f50a41e
line wrap: on
line source

<tool id="blastxml_to_tabular" name="BLAST XML to tabular" version="0.1.00">
    <!-- One-line summary of what the tool does. -->
    <description>Convert BLAST XML output to tabular</description>
    <!-- Runs the conversion script with its version flag to obtain the tool version. -->
    <version_command interpreter="python">blastxml_to_tabular.py --version</version_command>
    <!--
      Command line template (Cheetah) for blastxml_to_tabular.py.
        -o  receives the path of the tabular output dataset.
        -c  receives either the preset name chosen in out_format ("std" or
            "ext"), or, for "cols", the chosen standard and extended column
            names. The two selections are joined, the literal text "None"
            (presumably what an empty selection renders as) is blanked out,
            doubled commas are collapsed and the remaining commas are turned
            into spaces.
      The input BLAST XML paths follow as positional arguments. Each one is
      quoted, like the output path, so that a path containing spaces or shell
      metacharacters is still passed as a single argument.
    -->
    <command interpreter="python">
blastxml_to_tabular.py -o "$tabular_file"
#if $output.out_format == "cols":
#set cols = (str($output.std_cols)+","+str($output.ext_cols)).replace("None", " ").replace(",,", ",").replace(",", " ")
-c "$cols"
#else
-c "$output.out_format"
#end if
#for i in $blastxml_file#"${i}" #end for#
    </command>
    <stdio>
        <!-- Any exit status other than zero, positive or negative, is an error -->
        <exit_code range="1:"/>
        <exit_code range=":-1"/>
    </stdio>
    <inputs>
        <!-- One or more BLAST XML result datasets to convert -->
        <param name="blastxml_file"
               type="data"
               format="blastxml"
               multiple="true"
               label="BLAST results as XML"/>
        <!-- Column layout: one of two presets, or a hand-picked set of columns -->
        <conditional name="output">
            <param name="out_format" type="select" label="Output format">
                <option value="std" selected="True">Tabular (standard 12 columns)</option>
                <option value="ext">Tabular (extended 25 columns)</option>
                <option value="cols">Tabular (select columns to output)</option>
            </param>
            <!-- The two presets need no further parameters -->
            <when value="std"/>
            <when value="ext"/>
            <when value="cols">
                <!-- The twelve standard columns, all ticked initially -->
                <param name="std_cols"
                       type="select"
                       multiple="true"
                       display="checkboxes"
                       label="Standard columns">
                    <option value="qseqid" selected="true">qseqid = Query Seq-id (ID of your sequence)</option>
                    <option value="sseqid" selected="true">sseqid = Subject Seq-id (ID of the database hit)</option>
                    <option value="pident" selected="true">pident = Percentage of identical matches</option>
                    <option value="length" selected="true">length = Alignment length</option>
                    <option value="mismatch" selected="true">mismatch = Number of mismatches</option>
                    <option value="gapopen" selected="true">gapopen = Number of gap openings</option>
                    <option value="qstart" selected="true">qstart = Start of alignment in query</option>
                    <option value="qend" selected="true">qend = End of alignment in query</option>
                    <option value="sstart" selected="true">sstart = Start of alignment in subject (database hit)</option>
                    <option value="send" selected="true">send = End of alignment in subject (database hit)</option>
                    <option value="evalue" selected="true">evalue = Expectation value (E-value)</option>
                    <option value="bitscore" selected="true">bitscore = Bit score</option>
                </param>
                <!-- The thirteen extended columns, none ticked initially -->
                <param name="ext_cols"
                       type="select"
                       multiple="true"
                       display="checkboxes"
                       label="Extended columns">
                    <option value="sallseqid">sallseqid = All subject Seq-id(s), separated by a ';'</option>
                    <option value="score">score = Raw score</option>
                    <option value="nident">nident = Number of identical matches</option>
                    <option value="positive">positive = Number of positive-scoring matches</option>
                    <option value="gaps">gaps = Total number of gaps</option>
                    <option value="ppos">ppos = Percentage of positive-scoring matches</option>
                    <option value="qframe">qframe = Query frame</option>
                    <option value="sframe">sframe = Subject frame</option>
                    <option value="qseq">qseq = Aligned part of query sequence</option>
                    <option value="sseq">sseq = Aligned part of subject sequence</option>
                    <option value="qlen">qlen = Query sequence length</option>
                    <option value="slen">slen = Subject sequence length</option>
                    <option value="salltitles">salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- The converted table; this is the dataset the command writes via -o -->
        <data label="$on_string (as tabular)"
              format="tabular"
              name="tabular_file"/>
    </outputs>
    <!-- No requirement entries are declared for this tool -->
    <requirements/>
    <!--
      Functional tests. Each test converts one sample BLAST XML file with a
      given output format and compares the result with a stored tabular file.
    -->
    <tests>
        <!-- BLASTP: four human proteins vs rhodopsin, standard and extended columns -->
        <test>
            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
            <param name="out_format" value="std" />
            <!-- Note this has some white space differences from the actual blastp output blast_four_human_vs_rhodopsin.tabular -->
            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted.tabular" ftype="tabular" />
        </test>
        <test>
            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
            <param name="out_format" value="ext" />
            <!-- Note this has some white space differences from the actual blastp output blast_four_human_vs_rhodopsin_22c.tabular -->
            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted_ext.tabular" ftype="tabular" />
        </test>
        <!-- BLASTP sample file, standard columns -->
        <test>
            <param name="blastxml_file" value="blastp_sample.xml" ftype="blastxml" />
            <param name="out_format" value="std" />
            <!-- Note this has some white space differences from the actual blastp output -->
            <output name="tabular_file" file="blastp_sample_converted.tabular" ftype="tabular" />
        </test>
        <!-- BLASTX: rhodopsin vs four human proteins, standard and extended columns -->
        <test>
            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
            <param name="out_format" value="std" />
            <!-- Note this has some white space differences from the actual blastx output -->
            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted.tabular" ftype="tabular" />
        </test>
        <test>
            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
            <param name="out_format" value="ext" />
            <!-- Note this has some white space and XXXX masking differences from the actual blastx output -->
            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted_ext.tabular" ftype="tabular" />
        </test>
        <!-- BLASTX sample file, standard columns -->
        <test>
            <param name="blastxml_file" value="blastx_sample.xml" ftype="blastxml" />
            <param name="out_format" value="std" />
            <!-- Note this has some white space differences from the actual blastx output -->
            <output name="tabular_file" file="blastx_sample_converted.tabular" ftype="tabular" />
        </test>
        <!-- BLASTP: human vs PDB (seg_no input file), standard and extended columns -->
        <test>
            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
            <param name="out_format" value="std" />
            <!-- Note this has some white space differences from the actual blastp output -->
            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_std.tabular" ftype="tabular" />
        </test>
        <test>
            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
            <param name="out_format" value="ext" />
            <!-- Note this has some white space differences from the actual blastp output -->
            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_ext.tabular" ftype="tabular" />
        </test>
        <!-- BLASTN: Arabidopsis input file, standard and extended columns -->
        <test>
            <param name="blastxml_file" value="blastn_arabidopsis.xml" ftype="blastxml" />
            <param name="out_format" value="std" />
            <output name="tabular_file" file="blastn_arabidopsis.standard.tabular" ftype="tabular" />
        </test>
        <test>
            <param name="blastxml_file" value="blastn_arabidopsis.xml" ftype="blastxml" />
            <param name="out_format" value="ext" />
            <output name="tabular_file" file="blastn_arabidopsis.extended.tabular" ftype="tabular" />
        </test>
        <!-- There are some harmless white space differences in our conversion to the BLAST+ output here: -->
        <test>
            <param name="blastxml_file" value="blastn_rhodopsin_vs_three_human.xml" ftype="blastxml" />
            <param name="out_format" value="std" />
            <output name="tabular_file" file="blastn_rhodopsin_vs_three_human_converted.tabular" ftype="tabular" />
        </test>
        <!-- Custom column selection: three standard columns plus two extended columns -->
        <test>
            <param name="blastxml_file" value="blastn_rhodopsin_vs_three_human.xml" ftype="blastxml" />
            <param name="out_format" value="cols" />
            <param name="std_cols" value="qseqid,sseqid,pident" />
            <param name="ext_cols" value="qlen,slen" />
            <output name="tabular_file" file="blastn_rhodopsin_vs_three_human.columns.tabular" ftype="tabular" />
        </test>
    </tests>
    <help>
    
**What it does**

NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of
formats including tabular and a more detailed XML format. A complex workflow
may need both the XML and the tabular output - but running BLAST twice is
slow and wasteful.

This tool takes the BLAST XML output and can convert it into the
standard 12 column tabular equivalent:

====== ========= ============================================
Column NCBI name Description
------ --------- --------------------------------------------
     1 qseqid    Query Seq-id (ID of your sequence)
     2 sseqid    Subject Seq-id (ID of the database hit)
     3 pident    Percentage of identical matches
     4 length    Alignment length
     5 mismatch  Number of mismatches
     6 gapopen   Number of gap openings
     7 qstart    Start of alignment in query
     8 qend      End of alignment in query
     9 sstart    Start of alignment in subject (database hit)
    10 send      End of alignment in subject (database hit)
    11 evalue    Expectation value (E-value)
    12 bitscore  Bit score
====== ========= ============================================

The BLAST+ tools can optionally output additional columns of information,
but this takes longer to calculate. Most (but not all) of these columns are
included by selecting the extended tabular output. The extra columns are
included *after* the standard 12 columns. This is so that you can write
workflow filtering steps that accept either the 12 or 25 column tabular
BLAST output. This tool uses the standard 12 column output by default.

====== ============= ===========================================
Column NCBI name     Description
------ ------------- -------------------------------------------
    13 sallseqid     All subject Seq-id(s), separated by a ';'
    14 score         Raw score
    15 nident        Number of identical matches
    16 positive      Number of positive-scoring matches
    17 gaps          Total number of gaps
    18 ppos          Percentage of positive-scoring matches
    19 qframe        Query frame
    20 sframe        Subject frame
    21 qseq          Aligned part of query sequence
    22 sseq          Aligned part of subject sequence
    23 qlen          Query sequence length
    24 slen          Subject sequence length
    25 salltitles    All subject title(s), separated by a '&lt;&gt;'
====== ============= ===========================================

Beware that the XML file (and thus the conversion) and the tabular output
direct from BLAST+ may differ in the presence of XXXX masking on regions
of low complexity (columns 21 and 22), and thus also in calculated figures
like the percentage identity (column 3).

**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite:

Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
Galaxy tools and workflows for sequence analysis with applications
in molecular plant pathology. PeerJ 1:e167
http://dx.doi.org/10.7717/peerj.167

This wrapper is available to install into other Galaxy Instances via the Galaxy
Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus
    </help>
</tool>