orf_tools: blast_tool.py comparison

comparison blast_tool.py @ 0:ec8fe63afdb8 draft

Uploaded

author	nedias
date	Wed, 12 Oct 2016 00:03:16 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:ec8fe63afdb8
+"""
+Util class of BLAST API
+Use Biopython's BLAST interface to access BLAST API
+BLAST call may take more than 5 min to complete.
+Author Nedias  Sept, 2016
+"""
+# BLAST Web api, use to calling BLAST using url
+from Bio.Blast import NCBIWWW
+# BLAST XML util, use to read data from BLAST's response
+from Bio.Blast import NCBIXML
+def exec_tool(options):
+if options.format and options.format == "fasta":
+blast_call(options.input, options.output)
+# Call BLAST protein API
+# input: an fasta format file with polypeptide data
+# output: the searching result file from BLAST in XML format
+# return: execute status code(developing)
+def blast_call(in_file, out_file):
+# Open input file(read only)
+input_file = open(in_file, "rU")
+# Create an output file(create or override)
+output_file = open(out_file, "w+")
+# Read all data from the input fasta file
+fasta_string = input_file.read()
+# Call BLAST to search similar protein data
+# blastp's "p" means using the protein search API
+# "nr" means not specific database, which will perform a search base on all possible database
+# TODO:May consider parametrize these options
+result_handle = NCBIWWW.qblast("blastp", "nr", fasta_string)
+# Read result form responded data and save them in output file
+result = result_handle.read()
+output_file.write(result)
+return 0
+# Call BLAST protein API (text version)
+# input: polypeptide data text
+# return: the searching result from BLAST in StringIO format
+def blast_call_text(in_text):
+# Call BLAST to search similar protein data
+result_handle = NCBIWWW.qblast("blastp", "nr", in_text)
+return result_handle
+# Abstract data from BLAST responded XML
+# input: the searching result file from BLAST in XML format
+# return: an dictionary include all data from BLAST result.
+#      the structure of the dictionary is:
+#      dict -- hits: Integer, number of total hits during searching
+#           -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file)
+#                |          (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry)
+#                |
+#                -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain
+#                    |
+#                    hps: Dictionary, contains all data of the high-scoring pair
+def read_blast_result(blast_xml):
+# Open the BLAST result
+result_file = open(blast_xml)
+# Read all result using biopython
+blast_record = NCBIXML.read(result_file)
+# E-value limit, every data with a less-than 96% e-value will be filtered
+E_VALUE_THRESH = 0.04
+# Initialize the result dictionary
+result_dict = dict()
+# Store the hits number
+result_dict['hits': len(blast_record.alignments)]
+# Initialize the alignment list
+aligns = list()
+# Scan through all alignments
+for alignment in blast_record.alignments:
+# Initialize the high-score pairs list
+hsps = list()
+# Scan through all hsps
+for hsp in alignment.hsps:
+# filter all hsp has less e-value than E_VALUE_THRESH
+if hsp.expect < E_VALUE_THRESH:
+# Initialize high-score pair dictionary
+hsp_dict = dict()
+# Store all data obtained from BLAST into the dictionary
+hsp_dict['seq'] = alignment.title
+hsp_dict['len'] = alignment.length
+hsp_dict['eval'] = hsp.expect
+# Identity is calculated as the percentage of identities in the alignment
+hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
+hsp_dict['query'] = hsp.query
+hsp_dict['match'] = hsp.match
+hsp_dict['sbjct'] = hsp.sbjct
+hsps.append(hsp_dict)
+# Store the hsps into the alignment data
+aligns.append(hsps)
+# Store alignment data into the dictionary
+result_dict['alignments', aligns]
+return result_dict
+# Abstract data from BLAST responded XML (text version)
+# input: the searching result file from BLAST in StringIO
+# return: an dictionary include all data from BLAST result.
+def read_blast_result_text(blast_handle):
+blast_record = NCBIXML.read(blast_handle)
+E_VALUE_THRESH = 0.04
+result_dict = dict()
+result_dict['hits'] =  len(blast_record.alignments)
+aligns = list()
+for alignment in blast_record.alignments:
+hsps = list()
+for hsp in alignment.hsps:
+if hsp.expect < E_VALUE_THRESH:
+hsp_dict = dict()
+hsp_dict['seq'] = alignment.title
+hsp_dict['len'] = alignment.length
+hsp_dict['eval'] = hsp.expect
+hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
+hsp_dict['query'] = hsp.query
+hsp_dict['match'] = hsp.match
+hsp_dict['sbjct'] = hsp.sbjct
+hsps.append(hsp_dict)
+aligns.append(hsps)
+result_dict['alignments'] = aligns
+return result_dict
+# Sample of usage (text version)
+def text():
+result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\
+DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\
+AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\
+EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\
+APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\
+FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\
+TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\
+AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\
+PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI'
+)
+re_dict = read_blast_result_text(result)
+print(re_dict['hits'])
+for alignments in re_dict['alignments']:
+for hsp in alignments:
+print(hsp['seq'])
+print(hsp['len'])
+print(hsp['eval'])
+print(hsp['ident'])
+print(hsp['query'][0:75] + "...")
+print(hsp['match'][0:75] + "...")
+print(hsp['sbjct'][0:75] + "...")

Mercurial > repos > nedias > orf_tools

comparison blast_tool.py @ 0:ec8fe63afdb8 draft