comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 21:7538e2bfcd41 draft

v0.2.00, for NCBI BLAST+ 2.5.0 via bioconda or tool_dependencies.xml
author peterjc
date Wed, 19 Apr 2017 05:27:19 -0400
parents 3034ce97dd33
children 6f386c5dc4fb
comparison
equal deleted inserted replaced
20:3034ce97dd33 21:7538e2bfcd41
58 58
59 This script attempts to produce identical output to what BLAST+ would have done. 59 This script attempts to produce identical output to what BLAST+ would have done.
60 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra 60 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
61 space character (probably a bug). 61 space character (probably a bug).
62 """ 62 """
63
64
65 import os
66 import re
63 import sys 67 import sys
64 import re 68
65 import os
66 from optparse import OptionParser 69 from optparse import OptionParser
67 70
68 if "-v" in sys.argv or "--version" in sys.argv: 71 if "-v" in sys.argv or "--version" in sys.argv:
69 print "v0.1.08" 72 print "v0.2.00"
70 sys.exit(0) 73 sys.exit(0)
71 74
72 if sys.version_info[:2] >= (2, 5): 75 if sys.version_info[:2] >= (2, 5):
73 try: 76 try:
74 from xml.etree import cElementTree as ElementTree 77 from xml.etree import cElementTree as ElementTree
75 except ImportError: 78 except ImportError:
76 from xml.etree import ElementTree as ElementTree 79 from xml.etree import ElementTree as ElementTree
77 else: 80 else:
78 from galaxy import eggs 81 from galaxy import eggs # noqa - ignore flake8 F401
79 import pkg_resources 82 import pkg_resources
80 pkg_resources.require("elementtree") 83 pkg_resources.require("elementtree")
81 from elementtree import ElementTree 84 from elementtree import ElementTree
82 85
83 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]: 86 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]:
104 The columns option can be 'std' (standard 12 columns), 'ext' 107 The columns option can be 'std' (standard 12 columns), 'ext'
105 (extended 25 columns), or a list of BLAST+ column names like 108 (extended 25 columns), or a list of BLAST+ column names like
106 'qseqid,sseqid,pident' (space or comma separated). 109 'qseqid,sseqid,pident' (space or comma separated).
107 """ 110 """
108 parser = OptionParser(usage=usage) 111 parser = OptionParser(usage=usage)
109 parser.add_option('-o', '--output', dest='output', default=None, help='output filename (defaults to stdout)', metavar="FILE") 112 parser.add_option('-o', '--output', dest='output', default=None,
110 parser.add_option("-c", "--columns", dest="columns", default='std', help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names") 113 help='output filename (defaults to stdout)',
114 metavar="FILE")
115 parser.add_option("-c", "--columns", dest="columns", default='std',
116 help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names")
111 (options, args) = parser.parse_args() 117 (options, args) = parser.parse_args()
112 118
113 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',') 119 colnames = ('qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,'
120 'sstart,send,evalue,bitscore,sallseqid,score,nident,positive,'
121 'gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles').split(',')
114 122
115 if len(args) < 1: 123 if len(args) < 1:
116 sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.") 124 sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.")
117 125
118 out_fmt = options.columns 126 out_fmt = options.columns
174 blast_program = elem.text 182 blast_program = elem.text
175 # for every <Iteration> tag 183 # for every <Iteration> tag
176 if event == "end" and elem.tag == "Iteration": 184 if event == "end" and elem.tag == "Iteration":
177 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA 185 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
178 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> 186 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
179 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> 187 # <Iteration_query-def>Endoplasmic reticulum resident protein 44
188 # OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
180 # <Iteration_query-len>406</Iteration_query-len> 189 # <Iteration_query-len>406</Iteration_query-len>
181 # <Iteration_hits></Iteration_hits> 190 # <Iteration_hits></Iteration_hits>
182 # 191 #
183 # Or, from BLAST 2.2.24+ run online 192 # Or, from BLAST 2.2.24+ run online
184 # <Iteration_query-ID>Query_1</Iteration_query-ID> 193 # <Iteration_query-ID>Query_1</Iteration_query-ID>
204 # 213 #
205 # apparently depending on the parse_deflines switch 214 # apparently depending on the parse_deflines switch
206 # 215 #
207 # Or, with a local database not using -parse_seqids can get this, 216 # Or, with a local database not using -parse_seqids can get this,
208 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> 217 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
209 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> 218 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis
219 # thaliana chromosome 3, complete sequence</Hit_def>
210 # <Hit_accession>2</Hit_accession> 220 # <Hit_accession>2</Hit_accession>
211 sseqid = hit.findtext("Hit_id").split(None, 1)[0] 221 sseqid = hit.findtext("Hit_id").split(None, 1)[0]
212 hit_def = sseqid + " " + hit.findtext("Hit_def") 222 hit_def = sseqid + " " + hit.findtext("Hit_def")
213 if re_default_subject_id.match(sseqid) \ 223 if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"):
214 and sseqid == hit.findtext("Hit_accession"):
215 # Place holder ID, take the first word of the subject definition 224 # Place holder ID, take the first word of the subject definition
216 hit_def = hit.findtext("Hit_def") 225 hit_def = hit.findtext("Hit_def")
217 sseqid = hit_def.split(None, 1)[0] 226 sseqid = hit_def.split(None, 1)[0]
218 if sseqid.startswith("gnl|BL_ORD_ID|") \ 227 if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
219 and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
220 # Alternative place holder ID, again take the first word of hit_def 228 # Alternative place holder ID, again take the first word of hit_def
221 hit_def = hit.findtext("Hit_def") 229 hit_def = hit.findtext("Hit_def")
222 sseqid = hit_def.split(None, 1)[0] 230 sseqid = hit_def.split(None, 1)[0]
223 # for every <Hsp> within <Hit> 231 # for every <Hsp> within <Hit>
224 for hsp in hit.findall("Hit_hsps/Hsp"): 232 for hsp in hit.findall("Hit_hsps/Hsp"):
225 nident = hsp.findtext("Hsp_identity") 233 nident = hsp.findtext("Hsp_identity")
226 length = hsp.findtext("Hsp_align-len") 234 length = hsp.findtext("Hsp_align-len")
227 pident = "%0.2f" % (100 * float(nident) / float(length)) 235 # As of NCBI BLAST+ 2.4.0 this is given to 3dp (not 2dp)
236 pident = "%0.3f" % (100 * float(nident) / float(length))
228 237
229 q_seq = hsp.findtext("Hsp_qseq") 238 q_seq = hsp.findtext("Hsp_qseq")
230 h_seq = hsp.findtext("Hsp_hseq") 239 h_seq = hsp.findtext("Hsp_hseq")
231 m_seq = hsp.findtext("Hsp_midline") 240 m_seq = hsp.findtext("Hsp_midline")
232 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) 241 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
233 gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 + 242 gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 +
234 len(h_seq.replace('-', ' ').split()) - 1) 243 len(h_seq.replace('-', ' ').split()) - 1)
235 244
236 mismatch = m_seq.count(' ') + m_seq.count('+') \ 245 mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-')
237 - q_seq.count('-') - h_seq.count('-')
238 # TODO - Remove this alternative mismatch calculation and test 246 # TODO - Remove this alternative mismatch calculation and test
239 # once satisifed there are no problems 247 # once satisifed there are no problems
240 expected_mismatch = len(q_seq) \ 248 expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq)
241 - sum(1 for q, h in zip(q_seq, h_seq) 249 if q == h or q == "-" or h == "-")
242 if q == h or q == "-" or h == "-")
243 xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X") 250 xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X")
244 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): 251 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
245 sys.exit("%s vs %s mismatches, expected %i <= %i <= %i" 252 sys.exit("%s vs %s mismatches, expected %i <= %i <= %i"
246 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), 253 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
247 int(mismatch), expected_mismatch)) 254 int(mismatch), expected_mismatch))