Mercurial > repos > galaxyp > blastxml_to_tabular_selectable
diff blastxml_to_tabular_selectable.py @ 0:2bd0cbccb3c6
Uploaded
author | galaxyp |
---|---|
date | Wed, 08 Oct 2014 19:38:28 -0400 |
parents | |
children | 5da5dcc5e13a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blastxml_to_tabular_selectable.py Wed Oct 08 19:38:28 2014 -0400 @@ -0,0 +1,329 @@ +#!/usr/bin/env python +"""Convert a BLAST XML file to 12 column tabular output + +Takes three command line options, input BLAST XML filename, output tabular +BLAST filename, output format (std for standard 12 columns, or ext for the +extended 25 columns offered in the BLAST+ wrappers). + +The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart +qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which +mean: + +====== ========= ============================================ +Column NCBI name Description +------ --------- -------------------------------------------- + 1 qseqid Query Seq-id (ID of your sequence) + 2 sseqid Subject Seq-id (ID of the database hit) + 3 pident Percentage of identical matches + 4 length Alignment length + 5 mismatch Number of mismatches + 6 gapopen Number of gap openings + 7 qstart Start of alignment in query + 8 qend End of alignment in query + 9 sstart Start of alignment in subject (database hit) + 10 send End of alignment in subject (database hit) + 11 evalue Expectation value (E-value) + 12 bitscore Bit score +====== ========= ============================================ + +The additional columns offered in the Galaxy BLAST+ wrappers are: + +====== ============= =========================================== +Column NCBI name Description +------ ------------- ------------------------------------------- + 13 sallseqid All subject Seq-id(s), separated by a ';' + 14 score Raw score + 15 nident Number of identical matches + 16 positive Number of positive-scoring matches + 17 gaps Total number of gaps + 18 ppos Percentage of positive-scoring matches + 19 qframe Query frame + 20 sframe Subject frame + 21 qseq Aligned part of query sequence + 22 sseq Aligned part of subject sequence + 23 qlen Query sequence length + 24 slen Subject sequence length + 25 salltitles All subject titles, separated by '<>' +====== ============= =========================================== + +Most of these fields are given explicitly in the XML file, others some like +the percentage identity and the number of gap openings must be calculated. + +Be aware that the sequence in the extended tabular output or XML direct from +BLAST+ may or may not use XXXX masking on regions of low complexity. This +can throw the off the calculation of percentage identity and gap openings. +[In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard, +with these numbers changing depending on whether or not the low complexity +filter is used.] + +This script attempts to produce identical output to what BLAST+ would have done. +However, check this with "diff -b ..." since BLAST+ sometimes includes an extra +space character (probably a bug). +""" +import sys +import re +import os +from optparse import OptionParser + +if "-v" in sys.argv or "--version" in sys.argv: + print "v0.0.12" + sys.exit(0) + +if sys.version_info[:2] >= ( 2, 5 ): + try: + from xml.etree import cElementTree as ElementTree + except ImportError: + from xml.etree import ElementTree as ElementTree +else: + from galaxy import eggs + import pkg_resources; pkg_resources.require( "elementtree" ) + from elementtree import ElementTree + +def stop_err( msg ): + sys.stderr.write("%s\n" % msg) + sys.exit(1) + +usage = "usage: %prog [options] blastxml[,...]" +parser = OptionParser(usage=usage) +parser.add_option('-o','--output', dest='output', default = None, help='output file path', metavar="FILE") +parser.add_option("-c", "--columns", dest="columns", default='std', help="[std|ext|colname[,colname,...]] std: 12 column, ext: 25 column, or user specified columns") +parser.add_option("-a", "--allqueries", action="store_true", dest="allqueries", default=False, help="ouptut row for each query, including those with no hits") +parser.add_option("-d", "--qdef", action="store_true", dest="qdef", default=False, help="use Iteration_query-def for qseqid") +parser.add_option('-u', '--unmatched', dest='unmatched', default = None, help='unmatched queries file path', metavar="FILE") +parser.add_option('-m', '--maxhits', dest='maxhits', type="int", default = 1, help='maximum number of HITs for output for a Query') +parser.add_option('-p', '--maxhsps', dest='maxhsps', type="int", default = 1, help='maximum number of HSPs for output for a Hit') +(options, args) = parser.parse_args() + +colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',') + +if len(args) < 1: + parser.error("no BLASTXML input files") + +if options.columns == "std": + out_fmt = 'std' + extended = False + columns = colnames[0:12] +elif options.columns == "x22": + stop_err("Format argument x22 has been replaced with ext (extended 25 columns)") +elif options.columns == "ext": + out_fmt = 'ext' + extended = True + columns = colnames[0:25] +else: + out_fmt = 'cols' + extended = True + sel_cols = options.columns.split(',') + columns = [] + for col in sel_cols: + if col in colnames: + columns.append(col) + +for in_file in args: + if not os.path.isfile(in_file): + stop_err("Input BLAST XML file not found: %s" % in_file) + +unhitfile = open(options.unmatched,'w') if options.unmatched else None + +re_default_query_id = re.compile("^Query_\d+$") +assert re_default_query_id.match("Query_101") +assert not re_default_query_id.match("Query_101a") +assert not re_default_query_id.match("MyQuery_101") +re_default_subject_id = re.compile("^Subject_\d+$") +assert re_default_subject_id.match("Subject_1") +assert not re_default_subject_id.match("Subject_") +assert not re_default_subject_id.match("Subject_12a") +assert not re_default_subject_id.match("TheSubject_1") + +header = "#%s\n" % "\t".join(columns) +outfile = open(options.output, 'w') if options.output else sys.stdout +outfile.write(header) + +def handle_event(event, elem): + blast_program = None + if event == "end" and elem.tag == "BlastOutput_program": + blast_program = elem.text + # for every <Iteration> tag + if event == "end" and elem.tag == "Iteration": + #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA + # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> + # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> + # <Iteration_query-len>406</Iteration_query-len> + # <Iteration_hits></Iteration_hits> + # + #Or, from BLAST 2.2.24+ run online + # <Iteration_query-ID>Query_1</Iteration_query-ID> + # <Iteration_query-def>Sample</Iteration_query-def> + # <Iteration_query-len>516</Iteration_query-len> + # <Iteration_hits>... + qseqid = elem.findtext("Iteration_query-ID") + if options.qdef or re_default_query_id.match(qseqid): + #Place holder ID, take the first word of the query definition + qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] + qlen = int(elem.findtext("Iteration_query-len")) + # If no hits in this iteration + if not elem.find("Iteration_hits/Hit"): + if unhitfile: + unhitfile.write("%s\n" % qseqid) + if options.allqueries: + if columns and len(columns) > 0: + v = [] + for name in columns: + if name == 'qseqid': + v.append(qseqid) + elif name == 'qlen': + v.append(str(qlen)) + else: + v.append('') + outfile.write("\t".join(v) + "\n") + # for every <Hit> within <Iteration> + for hitn, hit in enumerate(elem.findall("Iteration_hits/Hit")): + if hitn >= options.maxhits: + break + #Expecting either this, + # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> + # <Hit_def>RecName: Full=Rhodopsin</Hit_def> + # <Hit_accession>P56514</Hit_accession> + #or, + # <Hit_id>Subject_1</Hit_id> + # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> + # <Hit_accession>Subject_1</Hit_accession> + # + #apparently depending on the parse_deflines switch + # + #Or, with a local database not using -parse_seqids can get this, + # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> + # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> + # <Hit_accession>2</Hit_accession> + sseqid = hit.findtext("Hit_id").split(None,1)[0] + hit_def = sseqid + " " + hit.findtext("Hit_def") + if re_default_subject_id.match(sseqid) \ + and sseqid == hit.findtext("Hit_accession"): + #Place holder ID, take the first word of the subject definition + hit_def = hit.findtext("Hit_def") + sseqid = hit_def.split(None,1)[0] + # for every <Hsp> within <Hit> + for hspn,hsp in enumerate(hit.findall("Hit_hsps/Hsp")): + if hspn >= options.maxhsps: + break + nident = hsp.findtext("Hsp_identity") + length = hsp.findtext("Hsp_align-len") + pident = "%0.2f" % (100*float(nident)/float(length)) + + q_seq = hsp.findtext("Hsp_qseq") + h_seq = hsp.findtext("Hsp_hseq") + m_seq = hsp.findtext("Hsp_midline") + assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) + gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \ + len(h_seq.replace('-', ' ').split())-1) + + mismatch = m_seq.count(' ') + m_seq.count('+') \ + - q_seq.count('-') - h_seq.count('-') + #TODO - Remove this alternative mismatch calculation and test + #once satisifed there are no problems + expected_mismatch = len(q_seq) \ + - sum(1 for q,h in zip(q_seq, h_seq) \ + if q == h or q == "-" or h == "-") + xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X") + if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): + stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \ + % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), + int(mismatch), expected_mismatch)) + + #TODO - Remove this alternative identity calculation and test + #once satisifed there are no problems + expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h) + if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): + stop_err("%s vs %s identities, expected %i <= %i <= %i" \ + % (qseqid, sseqid, expected_identity, int(nident), + expected_identity + q_seq.count("X"))) + + + evalue = hsp.findtext("Hsp_evalue") + if evalue == "0": + evalue = "0.0" + else: + evalue = "%0.0e" % float(evalue) + + bitscore = float(hsp.findtext("Hsp_bit-score")) + if bitscore < 100: + #Seems to show one decimal place for lower scores + bitscore = "%0.1f" % bitscore + else: + #Note BLAST does not round to nearest int, it truncates + bitscore = "%i" % bitscore + + values = [qseqid, + sseqid, + pident, + length, #hsp.findtext("Hsp_align-len") + str(mismatch), + gapopen, + hsp.findtext("Hsp_query-from"), #qstart, + hsp.findtext("Hsp_query-to"), #qend, + hsp.findtext("Hsp_hit-from"), #sstart, + hsp.findtext("Hsp_hit-to"), #send, + evalue, #hsp.findtext("Hsp_evalue") in scientific notation + bitscore, #hsp.findtext("Hsp_bit-score") rounded + ] + + if extended: + sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">")) + salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >")) + #print hit_def, "-->", sallseqid + positive = hsp.findtext("Hsp_positive") + ppos = "%0.2f" % (100*float(positive)/float(length)) + qframe = hsp.findtext("Hsp_query-frame") + sframe = hsp.findtext("Hsp_hit-frame") + if blast_program == "blastp": + #Probably a bug in BLASTP that they use 0 or 1 depending on format + if qframe == "0": qframe = "1" + if sframe == "0": sframe = "1" + slen = int(hit.findtext("Hit_len")) + values.extend([sallseqid, + hsp.findtext("Hsp_score"), #score, + nident, + positive, + hsp.findtext("Hsp_gaps"), #gaps, + ppos, + qframe, + sframe, + #NOTE - for blastp, XML shows original seq, tabular uses XXX masking + q_seq, + h_seq, + str(qlen), + str(slen), + salltitles, + ]) + if out_fmt == 'cols': + if columns and len(columns) > 0: + v = [] + for name in columns: + v.append(values[colnames.index(name)]) + values = v + #print "\t".join(values) + outfile.write("\t".join(values) + "\n") + # prevents ElementTree from growing large datastructure + root.clear() + elem.clear() + + +for in_file in args: + # get an iterable + try: + context = ElementTree.iterparse(in_file, events=("start", "end")) + except: + stop_err("Invalid data format.") + # turn it into an iterator + context = iter(context) + # get the root element + try: + event, root = context.next() + except: + stop_err( "Invalid data format." ) + for event, elem in context: + handle_event(event, elem) + +if unhitfile: + unhitfile.close() +if options.output: + outfile.close()