Mercurial > repos > devteam > ncbi_blast_plus
comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 21:7538e2bfcd41 draft
v0.2.00, for NCBI BLAST+ 2.5.0 via bioconda or tool_dependencies.xml
author | peterjc |
---|---|
date | Wed, 19 Apr 2017 05:27:19 -0400 |
parents | 3034ce97dd33 |
children | 6f386c5dc4fb |
comparison
equal
deleted
inserted
replaced
20:3034ce97dd33 | 21:7538e2bfcd41 |
---|---|
58 | 58 |
59 This script attempts to produce identical output to what BLAST+ would have done. | 59 This script attempts to produce identical output to what BLAST+ would have done. |
60 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra | 60 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra |
61 space character (probably a bug). | 61 space character (probably a bug). |
62 """ | 62 """ |
63 | |
64 | |
65 import os | |
66 import re | |
63 import sys | 67 import sys |
64 import re | 68 |
65 import os | |
66 from optparse import OptionParser | 69 from optparse import OptionParser |
67 | 70 |
68 if "-v" in sys.argv or "--version" in sys.argv: | 71 if "-v" in sys.argv or "--version" in sys.argv: |
69 print "v0.1.08" | 72 print "v0.2.00" |
70 sys.exit(0) | 73 sys.exit(0) |
71 | 74 |
72 if sys.version_info[:2] >= (2, 5): | 75 if sys.version_info[:2] >= (2, 5): |
73 try: | 76 try: |
74 from xml.etree import cElementTree as ElementTree | 77 from xml.etree import cElementTree as ElementTree |
75 except ImportError: | 78 except ImportError: |
76 from xml.etree import ElementTree as ElementTree | 79 from xml.etree import ElementTree as ElementTree |
77 else: | 80 else: |
78 from galaxy import eggs | 81 from galaxy import eggs # noqa - ignore flake8 F401 |
79 import pkg_resources | 82 import pkg_resources |
80 pkg_resources.require("elementtree") | 83 pkg_resources.require("elementtree") |
81 from elementtree import ElementTree | 84 from elementtree import ElementTree |
82 | 85 |
83 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]: | 86 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]: |
104 The columns option can be 'std' (standard 12 columns), 'ext' | 107 The columns option can be 'std' (standard 12 columns), 'ext' |
105 (extended 25 columns), or a list of BLAST+ column names like | 108 (extended 25 columns), or a list of BLAST+ column names like |
106 'qseqid,sseqid,pident' (space or comma separated). | 109 'qseqid,sseqid,pident' (space or comma separated). |
107 """ | 110 """ |
108 parser = OptionParser(usage=usage) | 111 parser = OptionParser(usage=usage) |
109 parser.add_option('-o', '--output', dest='output', default=None, help='output filename (defaults to stdout)', metavar="FILE") | 112 parser.add_option('-o', '--output', dest='output', default=None, |
110 parser.add_option("-c", "--columns", dest="columns", default='std', help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names") | 113 help='output filename (defaults to stdout)', |
114 metavar="FILE") | |
115 parser.add_option("-c", "--columns", dest="columns", default='std', | |
116 help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names") | |
111 (options, args) = parser.parse_args() | 117 (options, args) = parser.parse_args() |
112 | 118 |
113 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',') | 119 colnames = ('qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,' |
120 'sstart,send,evalue,bitscore,sallseqid,score,nident,positive,' | |
121 'gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles').split(',') | |
114 | 122 |
115 if len(args) < 1: | 123 if len(args) < 1: |
116 sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.") | 124 sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.") |
117 | 125 |
118 out_fmt = options.columns | 126 out_fmt = options.columns |
174 blast_program = elem.text | 182 blast_program = elem.text |
175 # for every <Iteration> tag | 183 # for every <Iteration> tag |
176 if event == "end" and elem.tag == "Iteration": | 184 if event == "end" and elem.tag == "Iteration": |
177 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA | 185 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA |
178 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> | 186 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> |
179 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | 187 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 |
188 # OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | |
180 # <Iteration_query-len>406</Iteration_query-len> | 189 # <Iteration_query-len>406</Iteration_query-len> |
181 # <Iteration_hits></Iteration_hits> | 190 # <Iteration_hits></Iteration_hits> |
182 # | 191 # |
183 # Or, from BLAST 2.2.24+ run online | 192 # Or, from BLAST 2.2.24+ run online |
184 # <Iteration_query-ID>Query_1</Iteration_query-ID> | 193 # <Iteration_query-ID>Query_1</Iteration_query-ID> |
204 # | 213 # |
205 # apparently depending on the parse_deflines switch | 214 # apparently depending on the parse_deflines switch |
206 # | 215 # |
207 # Or, with a local database not using -parse_seqids can get this, | 216 # Or, with a local database not using -parse_seqids can get this, |
208 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> | 217 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> |
209 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> | 218 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis |
219 # thaliana chromosome 3, complete sequence</Hit_def> | |
210 # <Hit_accession>2</Hit_accession> | 220 # <Hit_accession>2</Hit_accession> |
211 sseqid = hit.findtext("Hit_id").split(None, 1)[0] | 221 sseqid = hit.findtext("Hit_id").split(None, 1)[0] |
212 hit_def = sseqid + " " + hit.findtext("Hit_def") | 222 hit_def = sseqid + " " + hit.findtext("Hit_def") |
213 if re_default_subject_id.match(sseqid) \ | 223 if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"): |
214 and sseqid == hit.findtext("Hit_accession"): | |
215 # Place holder ID, take the first word of the subject definition | 224 # Place holder ID, take the first word of the subject definition |
216 hit_def = hit.findtext("Hit_def") | 225 hit_def = hit.findtext("Hit_def") |
217 sseqid = hit_def.split(None, 1)[0] | 226 sseqid = hit_def.split(None, 1)[0] |
218 if sseqid.startswith("gnl|BL_ORD_ID|") \ | 227 if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): |
219 and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): | |
220 # Alternative place holder ID, again take the first word of hit_def | 228 # Alternative place holder ID, again take the first word of hit_def |
221 hit_def = hit.findtext("Hit_def") | 229 hit_def = hit.findtext("Hit_def") |
222 sseqid = hit_def.split(None, 1)[0] | 230 sseqid = hit_def.split(None, 1)[0] |
223 # for every <Hsp> within <Hit> | 231 # for every <Hsp> within <Hit> |
224 for hsp in hit.findall("Hit_hsps/Hsp"): | 232 for hsp in hit.findall("Hit_hsps/Hsp"): |
225 nident = hsp.findtext("Hsp_identity") | 233 nident = hsp.findtext("Hsp_identity") |
226 length = hsp.findtext("Hsp_align-len") | 234 length = hsp.findtext("Hsp_align-len") |
227 pident = "%0.2f" % (100 * float(nident) / float(length)) | 235 # As of NCBI BLAST+ 2.4.0 this is given to 3dp (not 2dp) |
236 pident = "%0.3f" % (100 * float(nident) / float(length)) | |
228 | 237 |
229 q_seq = hsp.findtext("Hsp_qseq") | 238 q_seq = hsp.findtext("Hsp_qseq") |
230 h_seq = hsp.findtext("Hsp_hseq") | 239 h_seq = hsp.findtext("Hsp_hseq") |
231 m_seq = hsp.findtext("Hsp_midline") | 240 m_seq = hsp.findtext("Hsp_midline") |
232 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) | 241 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) |
233 gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 + | 242 gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 + |
234 len(h_seq.replace('-', ' ').split()) - 1) | 243 len(h_seq.replace('-', ' ').split()) - 1) |
235 | 244 |
236 mismatch = m_seq.count(' ') + m_seq.count('+') \ | 245 mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-') |
237 - q_seq.count('-') - h_seq.count('-') | |
238 # TODO - Remove this alternative mismatch calculation and test | 246 # TODO - Remove this alternative mismatch calculation and test |
239 # once satisifed there are no problems | 247 # once satisifed there are no problems |
240 expected_mismatch = len(q_seq) \ | 248 expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq) |
241 - sum(1 for q, h in zip(q_seq, h_seq) | 249 if q == h or q == "-" or h == "-") |
242 if q == h or q == "-" or h == "-") | |
243 xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X") | 250 xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X") |
244 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): | 251 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): |
245 sys.exit("%s vs %s mismatches, expected %i <= %i <= %i" | 252 sys.exit("%s vs %s mismatches, expected %i <= %i <= %i" |
246 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), | 253 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), |
247 int(mismatch), expected_mismatch)) | 254 int(mismatch), expected_mismatch)) |