ncbi_blast_plus: tools/ncbi_blast_plus/blastxml_to

comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 20:3034ce97dd33 draft

Uploaded v0.1.08, can search multiple local databases, fixes a pipe problem in blastdbcmd, and minor internal changes.

author	peterjc
date	Mon, 07 Nov 2016 11:31:37 -0500
parents	c16c30e9ad5b
children	7538e2bfcd41

comparison

equal deleted inserted replaced

-:7f3c448e119b
+:3034ce97dd33
 in the Galaxy BLAST+ wrappers).
 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
 mean:
 ====== ========= ============================================
 Column NCBI name Description
 ------ --------- --------------------------------------------
 1 qseqid    Query Seq-id (ID of your sequence)
 2 sseqid    Subject Seq-id (ID of the database hit)
 import re
 import os
 from optparse import OptionParser
 if "-v" in sys.argv or "--version" in sys.argv:
-print "v0.1.04"
+print "v0.1.08"
 sys.exit(0)
-if sys.version_info[:2] >= ( 2, 5 ):
+if sys.version_info[:2] >= (2, 5):
 try:
 from xml.etree import cElementTree as ElementTree
 except ImportError:
 from xml.etree import ElementTree as ElementTree
 else:
 from galaxy import eggs
-import pkg_resources; pkg_resources.require( "elementtree" )
+import pkg_resources
+pkg_resources.require("elementtree")
 from elementtree import ElementTree
-def stop_err( msg ):
-sys.stderr.write("%s\n" % msg)
-sys.exit(1)
 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]:
-#False positive if user really has a BLAST XML file called 'std' or 'ext'...
+# False positive if user really has a BLAST XML file called 'std' or 'ext'...
-stop_err("""ERROR: The script API has changed, sorry.
+sys.exit("""ERROR: The script API has changed, sorry.
 Instead of the old style:
 $ python blastxml_to_tabular.py input.xml output.tabular std
 (options, args) = parser.parse_args()
 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',')
 if len(args) < 1:
-stop_err("ERROR: No BLASTXML input files given; run with --help to see options.")
+sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.")
 out_fmt = options.columns
 if out_fmt == "std":
 extended = False
 cols = None
 elif out_fmt == "x22":
-stop_err("Format argument x22 has been replaced with ext (extended 25 columns)")
+sys.exit("Format argument x22 has been replaced with ext (extended 25 columns)")
 elif out_fmt == "ext":
 extended = True
 cols = None
 else:
-cols = out_fmt.replace(" ", ",").split(",") #Allow space or comma separated
+cols = out_fmt.replace(" ", ",").split(",")  # Allow space or comma separated
-#Remove any blank entries due to trailing comma,
+# Remove any blank entries due to trailing comma,
-#or annoying "None" dummy value from Galaxy if no columns
+# or annoying "None" dummy value from Galaxy if no columns
 cols = [c for c in cols if c and c != "None"]
 extra = set(cols).difference(colnames)
 if extra:
-stop_err("These are not recognised column names: %s" % ",".join(sorted(extra)))
+sys.exit("These are not recognised column names: %s" % ",".join(sorted(extra)))
 del extra
 assert set(colnames).issuperset(cols), cols
 if not cols:
-stop_err("No columns selected!")
+sys.exit("No columns selected!")
-extended = max(colnames.index(c) for c in cols) >= 12 #Do we need any higher columns?
+extended = max(colnames.index(c) for c in cols) >= 12  # Do we need any higher columns?
 del out_fmt
 for in_file in args:
 if not os.path.isfile(in_file):
-stop_err("Input BLAST XML file not found: %s" % in_file)
+sys.exit("Input BLAST XML file not found: %s" % in_file)
 re_default_query_id = re.compile("^Query_\d+$")
 assert re_default_query_id.match("Query_101")
 assert not re_default_query_id.match("Query_101a")
 def convert(blastxml_filename, output_handle):
 blast_program = None
 # get an iterable
 try:
 context = ElementTree.iterparse(blastxml_filename, events=("start", "end"))
-except:
+except Exception:
-stop_err("Invalid data format.")
+sys.exit("Invalid data format.")
 # turn it into an iterator
 context = iter(context)
 # get the root element
 try:
 event, root = context.next()
-except:
+except Exception:
-stop_err( "Invalid data format." )
+sys.exit("Invalid data format.")
 for event, elem in context:
 if event == "end" and elem.tag == "BlastOutput_program":
 blast_program = elem.text
 # for every <Iteration> tag
 if event == "end" and elem.tag == "Iteration":
-#Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
+# Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
 # <Iteration_query-len>406</Iteration_query-len>
 # <Iteration_hits></Iteration_hits>
 #
-#Or, from BLAST 2.2.24+ run online
+# Or, from BLAST 2.2.24+ run online
 # <Iteration_query-ID>Query_1</Iteration_query-ID>
 # <Iteration_query-def>Sample</Iteration_query-def>
 # <Iteration_query-len>516</Iteration_query-len>
 # <Iteration_hits>...
 qseqid = elem.findtext("Iteration_query-ID")
 if re_default_query_id.match(qseqid):
-#Place holder ID, take the first word of the query definition
+# Place holder ID, take the first word of the query definition
-qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
+qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0]
 qlen = int(elem.findtext("Iteration_query-len"))
 # for every <Hit> within <Iteration>
 for hit in elem.findall("Iteration_hits/Hit"):
-#Expecting either this,
+# Expecting either this,
 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
 # <Hit_accession>P56514</Hit_accession>
-#or,
+# or,
 # <Hit_id>Subject_1</Hit_id>
 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
 # <Hit_accession>Subject_1</Hit_accession>
 #
-#apparently depending on the parse_deflines switch
+# apparently depending on the parse_deflines switch
 #
-#Or, with a local database not using -parse_seqids can get this,
+# Or, with a local database not using -parse_seqids can get this,
 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
 # <Hit_accession>2</Hit_accession>
-sseqid = hit.findtext("Hit_id").split(None,1)[0]
+sseqid = hit.findtext("Hit_id").split(None, 1)[0]
 hit_def = sseqid + " " + hit.findtext("Hit_def")
 if re_default_subject_id.match(sseqid) \
 and sseqid == hit.findtext("Hit_accession"):
-#Place holder ID, take the first word of the subject definition
+# Place holder ID, take the first word of the subject definition
 hit_def = hit.findtext("Hit_def")
-sseqid = hit_def.split(None,1)[0]
+sseqid = hit_def.split(None, 1)[0]
 if sseqid.startswith("gnl|BL_ORD_ID|") \
 and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
-#Alternative place holder ID, again take the first word of hit_def
+# Alternative place holder ID, again take the first word of hit_def
 hit_def = hit.findtext("Hit_def")
-sseqid = hit_def.split(None,1)[0]
+sseqid = hit_def.split(None, 1)[0]
 # for every <Hsp> within <Hit>
 for hsp in hit.findall("Hit_hsps/Hsp"):
 nident = hsp.findtext("Hsp_identity")
 length = hsp.findtext("Hsp_align-len")
-pident = "%0.2f" % (100*float(nident)/float(length))
+pident = "%0.2f" % (100 * float(nident) / float(length))
 q_seq = hsp.findtext("Hsp_qseq")
 h_seq = hsp.findtext("Hsp_hseq")
 m_seq = hsp.findtext("Hsp_midline")
 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
-gapopen = str(len(q_seq.replace('-', ' ').split())-1  + \
+gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 +
-len(h_seq.replace('-', ' ').split())-1)
+len(h_seq.replace('-', ' ').split()) - 1)
 mismatch = m_seq.count(' ') + m_seq.count('+') \
 - q_seq.count('-') - h_seq.count('-')
-#TODO - Remove this alternative mismatch calculation and test
+# TODO - Remove this alternative mismatch calculation and test
-#once satisifed there are no problems
+# once satisifed there are no problems
 expected_mismatch = len(q_seq) \
-- sum(1 for q,h in zip(q_seq, h_seq) \
+- sum(1 for q, h in zip(q_seq, h_seq)
 if q == h or q == "-" or h == "-")
-xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
+xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X")
 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
-stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
+sys.exit("%s vs %s mismatches, expected %i <= %i <= %i"
 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
 int(mismatch), expected_mismatch))
-#TODO - Remove this alternative identity calculation and test
+# TODO - Remove this alternative identity calculation and test
-#once satisifed there are no problems
+# once satisifed there are no problems
-expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
+expected_identity = sum(1 for q, h in zip(q_seq, h_seq) if q == h)
 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
-stop_err("%s vs %s identities, expected %i <= %i <= %i" \
+sys.exit("%s vs %s identities, expected %i <= %i <= %i"
 % (qseqid, sseqid, expected_identity, int(nident),
 expected_identity + q_seq.count("X")))
 evalue = hsp.findtext("Hsp_evalue")
 if evalue == "0":
 evalue = "0.0"
 else:
 evalue = "%0.0e" % float(evalue)
 bitscore = float(hsp.findtext("Hsp_bit-score"))
 if bitscore < 100:
-#Seems to show one decimal place for lower scores
+# Seems to show one decimal place for lower scores
 bitscore = "%0.1f" % bitscore
 else:
-#Note BLAST does not round to nearest int, it truncates
+# Note BLAST does not round to nearest int, it truncates
 bitscore = "%i" % bitscore
 values = [qseqid,
 sseqid,
 pident,
-length, #hsp.findtext("Hsp_align-len")
+length,  # hsp.findtext("Hsp_align-len")
 str(mismatch),
 gapopen,
-hsp.findtext("Hsp_query-from"), #qstart,
+hsp.findtext("Hsp_query-from"),  # qstart,
-hsp.findtext("Hsp_query-to"), #qend,
+hsp.findtext("Hsp_query-to"),  # qend,
-hsp.findtext("Hsp_hit-from"), #sstart,
+hsp.findtext("Hsp_hit-from"),  # sstart,
-hsp.findtext("Hsp_hit-to"), #send,
+hsp.findtext("Hsp_hit-to"),  # send,
-evalue, #hsp.findtext("Hsp_evalue") in scientific notation
+evalue,  # hsp.findtext("Hsp_evalue") in scientific notation
-bitscore, #hsp.findtext("Hsp_bit-score") rounded
+bitscore,  # hsp.findtext("Hsp_bit-score") rounded
 ]
 if extended:
 try:
-sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(" >"))
+sallseqid = ";".join(name.split(None, 1)[0] for name in hit_def.split(" >"))
-salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >"))
+salltitles = "<>".join(name.split(None, 1)[1] for name in hit_def.split(" >"))
 except IndexError as e:
-stop_err("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
+sys.exit("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
-#print hit_def, "-->", sallseqid
+# print hit_def, "-->", sallseqid
 positive = hsp.findtext("Hsp_positive")
-ppos = "%0.2f" % (100*float(positive)/float(length))
+ppos = "%0.2f" % (100 * float(positive) / float(length))
 qframe = hsp.findtext("Hsp_query-frame")
 sframe = hsp.findtext("Hsp_hit-frame")
 if blast_program == "blastp":
-#Probably a bug in BLASTP that they use 0 or 1 depending on format
+# Probably a bug in BLASTP that they use 0 or 1 depending on format
-if qframe == "0": qframe = "1"
+if qframe == "0":
-if sframe == "0": sframe = "1"
+qframe = "1"
+if sframe == "0":
+sframe = "1"
 slen = int(hit.findtext("Hit_len"))
 values.extend([sallseqid,
-hsp.findtext("Hsp_score"), #score,
+hsp.findtext("Hsp_score"),  # score,
 nident,
 positive,
-hsp.findtext("Hsp_gaps"), #gaps,
+hsp.findtext("Hsp_gaps"),  # gaps,
 ppos,
 qframe,
 sframe,
-#NOTE - for blastp, XML shows original seq, tabular uses XXX masking
+# NOTE - for blastp, XML shows original seq, tabular uses XXX masking
 q_seq,
 h_seq,
 str(qlen),
 str(slen),
 salltitles,
 ])
 if cols:
-#Only a subset of the columns are needed
+# Only a subset of the columns are needed
 values = [values[colnames.index(c)] for c in cols]
-#print "\t".join(values)
+# print "\t".join(values)
 output_handle.write("\t".join(values) + "\n")
 # prevents ElementTree from growing large datastructure
 root.clear()
 elem.clear()
 convert(in_file, outfile)
 if options.output:
 outfile.close()
 else:
-#Using stdout
+# Using stdout
 pass

Mercurial > repos > devteam > ncbi_blast_plus

comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 20:3034ce97dd33 draft