Mercurial > repos > devteam > ncbi_blast_plus
diff tools/ncbi_blast_plus/blastxml_to_tabular.py @ 11:4c4a0da938ff draft
Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25.
Supports $GALAXY_SLOTS.
Includes more tests and heavy use of macros.
author | peterjc |
---|---|
date | Thu, 05 Dec 2013 06:55:59 -0500 |
parents | 70e7dcbf6573 |
children | 623f727cdff1 |
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Mon Sep 23 06:14:13 2013 -0400 +++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Thu Dec 05 06:55:59 2013 -0500 @@ -31,7 +31,7 @@ ====== ============= =========================================== Column NCBI name Description ------ ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' + 13 sallseqid All subject Seq-id(s), separated by ';' 14 score Raw score 15 nident Number of identical matches 16 positive Number of positive-scoring matches @@ -43,6 +43,7 @@ 22 sseq Aligned part of subject sequence 23 qlen Query sequence length 24 slen Subject sequence length + 25 salltitles All subject titles, separated by '<>' ====== ============= =========================================== Most of these fields are given explicitly in the XML file, others some like @@ -63,7 +64,7 @@ import re if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.12" + print "v0.0.22" sys.exit(0) if sys.version_info[:2] >= ( 2, 5 ): @@ -89,11 +90,11 @@ if out_fmt == "std": extended = False elif out_fmt == "x22": - stop_err("Format argument x22 has been replaced with ext (extended 24 columns)") + stop_err("Format argument x22 has been replaced with ext (extended 25 columns)") elif out_fmt == "ext": extended = True else: - stop_err("Format argument should be std (12 column) or ext (extended 24 columns)") + stop_err("Format argument should be std (12 column) or ext (extended 25 columns), not: %r" % out_fmt) # get an iterable @@ -157,6 +158,11 @@ # <Hit_accession>Subject_1</Hit_accession> # #apparently depending on the parse_deflines switch + # + #Or, with BLAST 2.2.28+ can get this, + # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> + # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> + # <Hit_accession>2</Hit_accession> sseqid = hit.findtext("Hit_id").split(None,1)[0] hit_def = sseqid + " " + hit.findtext("Hit_def") if re_default_subject_id.match(sseqid) \ @@ -164,6 +170,11 @@ #Place holder ID, take the first word of the subject definition hit_def = hit.findtext("Hit_def") sseqid = hit_def.split(None,1)[0] + if sseqid.startswith("gnl|BL_ORD_ID|") \ + and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): + #Alternative place holder ID, again take the first word of hit_def + hit_def = hit.findtext("Hit_def") + sseqid = hit_def.split(None,1)[0] # for every <Hsp> within <Hit> for hsp in hit.findall("Hit_hsps/Hsp"): nident = hsp.findtext("Hsp_identity") @@ -228,7 +239,11 @@ ] if extended: - sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">")) + try: + sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(" >")) + salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >")) + except IndexError as e: + stop_err("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e)) #print hit_def, "-->", sallseqid positive = hsp.findtext("Hsp_positive") ppos = "%0.2f" % (100*float(positive)/float(length)) @@ -252,6 +267,7 @@ h_seq, str(qlen), str(slen), + salltitles, ]) #print "\t".join(values) outfile.write("\t".join(values) + "\n")