changeset 0:075fe5424c32 draft

Uploaded v0.0.1
author peterjc
date Thu, 07 Feb 2013 14:56:18 -0500
parents
children dff89c7e4308
files tools/ncbi_blast_plus/blastxml_to_top_descr.py tools/ncbi_blast_plus/blastxml_to_top_descr.txt tools/ncbi_blast_plus/blastxml_to_top_descr.xml
diffstat 3 files changed, 241 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastxml_to_top_descr.py	Thu Feb 07 14:56:18 2013 -0500
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+"""Convert a BLAST XML file to a top hits description table.
+
+Takes three command line options, input BLAST XML filename, output tabular
+BLAST filename, number of hits to collect the descriptions of.
+"""
+import sys
+import re
+
+if sys.version_info[:2] >= ( 2, 5 ):
+    import xml.etree.cElementTree as ElementTree
+else:
+    from galaxy import eggs
+    import pkg_resources; pkg_resources.require( "elementtree" )
+    from elementtree import ElementTree
+
+def stop_err( msg ):
+    sys.stderr.write("%s\n" % msg)
+    sys.exit(1)
+
+#Parse Command Line
+try:
+    in_file, out_file, topN = sys.argv[1:]
+except:
+    stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, number of hits")
+
+
+try:
+    topN = int(topN)
+except ValueError:
+    stop_err("Number of hits  argument should be an integer (at least 1)")
+if topN < 1:
+    stop_err("Number of hits  argument should be an integer (at least 1)")
+
+# get an iterable
+try: 
+    context = ElementTree.iterparse(in_file, events=("start", "end"))
+except:
+    stop_err("Invalid data format.")
+# turn it into an iterator
+context = iter(context)
+# get the root element
+try:
+    event, root = context.next()
+except:
+    stop_err( "Invalid data format." )
+
+
+re_default_query_id = re.compile("^Query_\d+$")
+assert re_default_query_id.match("Query_101")
+assert not re_default_query_id.match("Query_101a")
+assert not re_default_query_id.match("MyQuery_101")
+re_default_subject_id = re.compile("^Subject_\d+$")
+assert re_default_subject_id.match("Subject_1")
+assert not re_default_subject_id.match("Subject_")
+assert not re_default_subject_id.match("Subject_12a")
+assert not re_default_subject_id.match("TheSubject_1")
+
+
+count = 0
+outfile = open(out_file, 'w')
+outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN)))
+for event, elem in context:
+    # for every <Iteration> tag
+    if event == "end" and elem.tag == "Iteration":
+        #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
+        # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
+        # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
+        # <Iteration_query-len>406</Iteration_query-len>
+        # <Iteration_hits></Iteration_hits>
+        #
+        #Or, from BLAST 2.2.24+ run online
+        # <Iteration_query-ID>Query_1</Iteration_query-ID>
+        # <Iteration_query-def>Sample</Iteration_query-def>
+        # <Iteration_query-len>516</Iteration_query-len>
+        # <Iteration_hits>...
+        qseqid = elem.findtext("Iteration_query-ID")
+        if qseqid is None:
+            stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)")
+        if re_default_query_id.match(qseqid):
+            #Place holder ID, take the first word of the query definition
+            qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
+        # for every <Hit> within <Iteration>
+        hit_descrs = []
+        for hit in elem.findall("Iteration_hits/Hit"):
+            #Expecting either this,
+            # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
+            # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
+            # <Hit_accession>P56514</Hit_accession>
+            #or,
+            # <Hit_id>Subject_1</Hit_id>
+            # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
+            # <Hit_accession>Subject_1</Hit_accession>
+            #
+            #apparently depending on the parse_deflines switch
+            sseqid = hit.findtext("Hit_id").split(None,1)[0]
+            hit_def = sseqid + " " + hit.findtext("Hit_def")
+            if re_default_subject_id.match(sseqid) \
+            and sseqid == hit.findtext("Hit_accession"):
+                #Place holder ID, take the first word of the subject definition
+                hit_def = hit.findtext("Hit_def")
+                sseqid = hit_def.split(None,1)[0]
+            assert hit_def not in hit_descrs
+            hit_descrs.append(hit_def)
+        #print "%r has %i hits" % (qseqid, len(hit_descrs))
+        hit_descrs = hit_descrs[:topN]
+        while len(hit_descrs) < topN:
+            hit_descrs.append("")
+        outfile.write("%s\t%s\n" % (qseqid, "\t".join(hit_descrs)))
+        count += 1
+        # prevents ElementTree from growing large datastructure
+        root.clear()
+        elem.clear()
+outfile.close()
+print "%i BLAST results" % count
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastxml_to_top_descr.txt	Thu Feb 07 14:56:18 2013 -0500
@@ -0,0 +1,79 @@
+Galaxy tool to extract top BLAST hit descriptions from BLAST XML
+================================================================
+
+This tool is copyright 2012 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below.
+
+This tool is a short Python script to parse a BLAST XML file, and extract the
+identifiers with description for the top matches (by default the top 3), and
+output these as a simple tabular file along with the query identifiers.
+
+There are no additional dependancies.
+
+
+Manual Installation
+===================
+
+There are just two files to install (if doing this manually):
+
+* blastxml_to_top_descr.py (the Python script)
+* blastxml_to_top_descr.xml (the Galaxy tool definition)
+
+The suggested location is in the Galaxy folder tools/ncbi_blast_plus next to
+the NCBI BLAST+ tool wrappers.
+
+You will also need to modify the tools_conf.xml file to tell Galaxy to offer
+the tool. e.g. next to the NCBI BLAST+ tools. Simply add the line:
+
+<tool file="filters/seq_select_by_id.xml" />
+
+To run the tool's tests, also add this line to tools_conf.xml.sample then:
+
+$ sh run_functional_tests.sh -id blastxml_to_top_descr
+
+
+History
+=======
+
+v0.0.1 - Initial version.
+
+
+Developers
+==========
+
+This script and related tools are being developed on the following hg branch:
+http://bitbucket.org/peterjc/galaxy-central/src/tools
+
+For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder:
+
+$ tar -czf blastxml_to_top_descr.tar.gz tools/ncbi_blast_plus/blastxml_to_top_descr.*
+
+Check this worked:
+
+$ tar -tzf blastxml_to_top_descr.tar.gz
+tools/ncbi_blast_plus/blastxml_to_top_descr.py
+tools/ncbi_blast_plus/blastxml_to_top_descr.txt
+tools/ncbi_blast_plus/blastxml_to_top_descr.xml
+
+Licence (MIT/BSD style)
+=======================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastxml_to_top_descr.xml	Thu Feb 07 14:56:18 2013 -0500
@@ -0,0 +1,47 @@
+<tool id="blastxml_to_top_descr" name="BLAST top hit descriptions" version="0.0.1">
+    <description>Make a table from BLAST XML</description>
+    <command interpreter="python">
+      blastxml_to_top_descr.py $blastxml_file $tabular_file $topN
+    </command>
+    <inputs>
+        <param name="blastxml_file" type="data" format="blastxml" label="BLAST results as XML"/> 
+	<param name="topN" type="integer" min="1" max="100" optional="false" label="Number of descriptions" value="3"/>
+    </inputs>
+    <outputs>
+        <data name="tabular_file" format="tabular" label="Top $topN descriptions from $blastxml_file.name" />
+    </outputs>
+    <requirements>
+    </requirements>
+    <tests>
+        <test>
+            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
+            <param name="topN" value="3" />
+            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_top3.tabular" ftype="tabular" />
+        </test>
+    </tests>
+    <help>
+    
+**What it does**
+
+NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of
+formats including text, tabular and a more detailed XML format. You can
+do a lot of things with tabular files in Galaxy (sorting, filtering, joins,
+etc) however currently the BLAST tabular output omits the hit descriptions
+found in the other output formats.
+
+This tool turns a BLAST XML file into a simple tabular file containing
+one row per query sequence, containing the query identifier and then
+the three (by default) top hit descriptions. If a query doesn't have
+that many hits, then these entries are left blank.
+
+**Example Usage**
+
+One simple usage would be to take a transcriptome assembly or set of
+gene predictions, run a BLAST search against the NCBI NR database, and
+then use this tool to make a table of the top three BLAST hits. This
+can give you a 'quick and dirty' crude annotation, potentially enough
+to spot some problems (e.g. bacterial contaimination could be very
+obvious).
+
+    </help>
+</tool>