ncbi_blast_plus: tools/ncbi_blast_plus/blastxml_to

comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 21:7538e2bfcd41 draft

v0.2.00, for NCBI BLAST+ 2.5.0 via bioconda or tool_dependencies.xml

author	peterjc
date	Wed, 19 Apr 2017 05:27:19 -0400
parents	3034ce97dd33
children	6f386c5dc4fb

comparison

equal deleted inserted replaced

-:3034ce97dd33
+:7538e2bfcd41
 This script attempts to produce identical output to what BLAST+ would have done.
 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
 space character (probably a bug).
 """
+import os
+import re
 import sys
-import re
-import os
 from optparse import OptionParser
 if "-v" in sys.argv or "--version" in sys.argv:
-print "v0.1.08"
+print "v0.2.00"
 sys.exit(0)
 if sys.version_info[:2] >= (2, 5):
 try:
 from xml.etree import cElementTree as ElementTree
 except ImportError:
 from xml.etree import ElementTree as ElementTree
 else:
-from galaxy import eggs
+from galaxy import eggs  # noqa - ignore flake8 F401
 import pkg_resources
 pkg_resources.require("elementtree")
 from elementtree import ElementTree
 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]:
 The columns option can be 'std' (standard 12 columns), 'ext'
 (extended 25 columns), or a list of BLAST+ column names like
 'qseqid,sseqid,pident' (space or comma separated).
 """
 parser = OptionParser(usage=usage)
-parser.add_option('-o', '--output', dest='output', default=None, help='output filename (defaults to stdout)', metavar="FILE")
+parser.add_option('-o', '--output', dest='output', default=None,
-parser.add_option("-c", "--columns", dest="columns", default='std', help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names")
+help='output filename (defaults to stdout)',
+metavar="FILE")
+parser.add_option("-c", "--columns", dest="columns", default='std',
+help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names")
 (options, args) = parser.parse_args()
-colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',')
+colnames = ('qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,'
+'sstart,send,evalue,bitscore,sallseqid,score,nident,positive,'
+'gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles').split(',')
 if len(args) < 1:
 sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.")
 out_fmt = options.columns
 blast_program = elem.text
 # for every <Iteration> tag
 if event == "end" and elem.tag == "Iteration":
 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
-# <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
+# <Iteration_query-def>Endoplasmic reticulum resident protein 44
+# OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
 # <Iteration_query-len>406</Iteration_query-len>
 # <Iteration_hits></Iteration_hits>
 #
 # Or, from BLAST 2.2.24+ run online
 # <Iteration_query-ID>Query_1</Iteration_query-ID>
 #
 # apparently depending on the parse_deflines switch
 #
 # Or, with a local database not using -parse_seqids can get this,
 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
-# <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
+# <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis
+# thaliana chromosome 3, complete sequence</Hit_def>
 # <Hit_accession>2</Hit_accession>
 sseqid = hit.findtext("Hit_id").split(None, 1)[0]
 hit_def = sseqid + " " + hit.findtext("Hit_def")
-if re_default_subject_id.match(sseqid) \
+if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"):
-and sseqid == hit.findtext("Hit_accession"):
 # Place holder ID, take the first word of the subject definition
 hit_def = hit.findtext("Hit_def")
 sseqid = hit_def.split(None, 1)[0]
-if sseqid.startswith("gnl|BL_ORD_ID|") \
+if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
-and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
 # Alternative place holder ID, again take the first word of hit_def
 hit_def = hit.findtext("Hit_def")
 sseqid = hit_def.split(None, 1)[0]
 # for every <Hsp> within <Hit>
 for hsp in hit.findall("Hit_hsps/Hsp"):
 nident = hsp.findtext("Hsp_identity")
 length = hsp.findtext("Hsp_align-len")
-pident = "%0.2f" % (100 * float(nident) / float(length))
+# As of NCBI BLAST+ 2.4.0 this is given to 3dp (not 2dp)
+pident = "%0.3f" % (100 * float(nident) / float(length))
 q_seq = hsp.findtext("Hsp_qseq")
 h_seq = hsp.findtext("Hsp_hseq")
 m_seq = hsp.findtext("Hsp_midline")
 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
 gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 +
 len(h_seq.replace('-', ' ').split()) - 1)
-mismatch = m_seq.count(' ') + m_seq.count('+') \
+mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-')
-- q_seq.count('-') - h_seq.count('-')
 # TODO - Remove this alternative mismatch calculation and test
 # once satisifed there are no problems
-expected_mismatch = len(q_seq) \
+expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq)
-- sum(1 for q, h in zip(q_seq, h_seq)
+if q == h or q == "-" or h == "-")
-if q == h or q == "-" or h == "-")
 xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X")
 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
 sys.exit("%s vs %s mismatches, expected %i <= %i <= %i"
 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
 int(mismatch), expected_mismatch))

Mercurial > repos > devteam > ncbi_blast_plus

comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 21:7538e2bfcd41 draft