Mercurial > repos > devteam > ncbi_blast_plus
diff tools/ncbi_blast_plus/blastxml_to_tabular.py @ 21:7538e2bfcd41 draft
v0.2.00, for NCBI BLAST+ 2.5.0 via bioconda or tool_dependencies.xml
author | peterjc |
---|---|
date | Wed, 19 Apr 2017 05:27:19 -0400 |
parents | 3034ce97dd33 |
children | 6f386c5dc4fb |
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Mon Nov 07 11:31:37 2016 -0500 +++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Wed Apr 19 05:27:19 2017 -0400 @@ -60,13 +60,16 @@ However, check this with "diff -b ..." since BLAST+ sometimes includes an extra space character (probably a bug). """ -import sys + + +import os import re -import os +import sys + from optparse import OptionParser if "-v" in sys.argv or "--version" in sys.argv: - print "v0.1.08" + print "v0.2.00" sys.exit(0) if sys.version_info[:2] >= (2, 5): @@ -75,7 +78,7 @@ except ImportError: from xml.etree import ElementTree as ElementTree else: - from galaxy import eggs + from galaxy import eggs # noqa - ignore flake8 F401 import pkg_resources pkg_resources.require("elementtree") from elementtree import ElementTree @@ -106,11 +109,16 @@ 'qseqid,sseqid,pident' (space or comma separated). """ parser = OptionParser(usage=usage) -parser.add_option('-o', '--output', dest='output', default=None, help='output filename (defaults to stdout)', metavar="FILE") -parser.add_option("-c", "--columns", dest="columns", default='std', help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names") +parser.add_option('-o', '--output', dest='output', default=None, + help='output filename (defaults to stdout)', + metavar="FILE") +parser.add_option("-c", "--columns", dest="columns", default='std', + help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names") (options, args) = parser.parse_args() -colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',') +colnames = ('qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,' + 'sstart,send,evalue,bitscore,sallseqid,score,nident,positive,' + 'gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles').split(',') if len(args) < 1: sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.") @@ -176,7 +184,8 @@ if event == "end" and elem.tag == "Iteration": # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> - # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> + # <Iteration_query-def>Endoplasmic reticulum resident protein 44 + # OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> # <Iteration_query-len>406</Iteration_query-len> # <Iteration_hits></Iteration_hits> # @@ -206,17 +215,16 @@ # # Or, with a local database not using -parse_seqids can get this, # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> - # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> + # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis + # thaliana chromosome 3, complete sequence</Hit_def> # <Hit_accession>2</Hit_accession> sseqid = hit.findtext("Hit_id").split(None, 1)[0] hit_def = sseqid + " " + hit.findtext("Hit_def") - if re_default_subject_id.match(sseqid) \ - and sseqid == hit.findtext("Hit_accession"): + if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"): # Place holder ID, take the first word of the subject definition hit_def = hit.findtext("Hit_def") sseqid = hit_def.split(None, 1)[0] - if sseqid.startswith("gnl|BL_ORD_ID|") \ - and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): + if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): # Alternative place holder ID, again take the first word of hit_def hit_def = hit.findtext("Hit_def") sseqid = hit_def.split(None, 1)[0] @@ -224,7 +232,8 @@ for hsp in hit.findall("Hit_hsps/Hsp"): nident = hsp.findtext("Hsp_identity") length = hsp.findtext("Hsp_align-len") - pident = "%0.2f" % (100 * float(nident) / float(length)) + # As of NCBI BLAST+ 2.4.0 this is given to 3dp (not 2dp) + pident = "%0.3f" % (100 * float(nident) / float(length)) q_seq = hsp.findtext("Hsp_qseq") h_seq = hsp.findtext("Hsp_hseq") @@ -233,13 +242,11 @@ gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 + len(h_seq.replace('-', ' ').split()) - 1) - mismatch = m_seq.count(' ') + m_seq.count('+') \ - - q_seq.count('-') - h_seq.count('-') + mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-') # TODO - Remove this alternative mismatch calculation and test # once satisifed there are no problems - expected_mismatch = len(q_seq) \ - - sum(1 for q, h in zip(q_seq, h_seq) - if q == h or q == "-" or h == "-") + expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq) + if q == h or q == "-" or h == "-") xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X") if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): sys.exit("%s vs %s mismatches, expected %i <= %i <= %i"