Mercurial > repos > nedias > orf_tools
comparison blast_tool.py @ 0:ec8fe63afdb8 draft
Uploaded
author | nedias |
---|---|
date | Wed, 12 Oct 2016 00:03:16 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ec8fe63afdb8 |
---|---|
1 """ | |
2 Util class of BLAST API | |
3 Use Biopython's BLAST interface to access BLAST API | |
4 BLAST call may take more than 5 min to complete. | |
5 | |
6 Author Nedias Sept, 2016 | |
7 | |
8 """ | |
9 | |
10 | |
11 # BLAST Web api, use to calling BLAST using url | |
12 from Bio.Blast import NCBIWWW | |
13 # BLAST XML util, use to read data from BLAST's response | |
14 from Bio.Blast import NCBIXML | |
15 | |
16 | |
17 def exec_tool(options): | |
18 if options.format and options.format == "fasta": | |
19 blast_call(options.input, options.output) | |
20 | |
21 | |
22 | |
23 | |
24 # Call BLAST protein API | |
25 # input: an fasta format file with polypeptide data | |
26 # output: the searching result file from BLAST in XML format | |
27 # return: execute status code(developing) | |
28 def blast_call(in_file, out_file): | |
29 | |
30 # Open input file(read only) | |
31 input_file = open(in_file, "rU") | |
32 # Create an output file(create or override) | |
33 output_file = open(out_file, "w+") | |
34 | |
35 # Read all data from the input fasta file | |
36 fasta_string = input_file.read() | |
37 | |
38 # Call BLAST to search similar protein data | |
39 # blastp's "p" means using the protein search API | |
40 # "nr" means not specific database, which will perform a search base on all possible database | |
41 # TODO:May consider parametrize these options | |
42 result_handle = NCBIWWW.qblast("blastp", "nr", fasta_string) | |
43 | |
44 # Read result form responded data and save them in output file | |
45 result = result_handle.read() | |
46 output_file.write(result) | |
47 | |
48 return 0 | |
49 | |
50 | |
51 # Call BLAST protein API (text version) | |
52 # input: polypeptide data text | |
53 # return: the searching result from BLAST in StringIO format | |
54 def blast_call_text(in_text): | |
55 | |
56 # Call BLAST to search similar protein data | |
57 result_handle = NCBIWWW.qblast("blastp", "nr", in_text) | |
58 | |
59 return result_handle | |
60 | |
61 | |
62 # Abstract data from BLAST responded XML | |
63 # input: the searching result file from BLAST in XML format | |
64 # return: an dictionary include all data from BLAST result. | |
65 # the structure of the dictionary is: | |
66 # dict -- hits: Integer, number of total hits during searching | |
67 # -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file) | |
68 # | (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry) | |
69 # | | |
70 # -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain | |
71 # | | |
72 # hps: Dictionary, contains all data of the high-scoring pair | |
73 def read_blast_result(blast_xml): | |
74 | |
75 # Open the BLAST result | |
76 result_file = open(blast_xml) | |
77 # Read all result using biopython | |
78 blast_record = NCBIXML.read(result_file) | |
79 # E-value limit, every data with a less-than 96% e-value will be filtered | |
80 E_VALUE_THRESH = 0.04 | |
81 | |
82 # Initialize the result dictionary | |
83 result_dict = dict() | |
84 | |
85 # Store the hits number | |
86 result_dict['hits': len(blast_record.alignments)] | |
87 # Initialize the alignment list | |
88 aligns = list() | |
89 # Scan through all alignments | |
90 for alignment in blast_record.alignments: | |
91 # Initialize the high-score pairs list | |
92 hsps = list() | |
93 # Scan through all hsps | |
94 for hsp in alignment.hsps: | |
95 # filter all hsp has less e-value than E_VALUE_THRESH | |
96 if hsp.expect < E_VALUE_THRESH: | |
97 # Initialize high-score pair dictionary | |
98 hsp_dict = dict() | |
99 # Store all data obtained from BLAST into the dictionary | |
100 hsp_dict['seq'] = alignment.title | |
101 hsp_dict['len'] = alignment.length | |
102 hsp_dict['eval'] = hsp.expect | |
103 # Identity is calculated as the percentage of identities in the alignment | |
104 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length)) | |
105 hsp_dict['query'] = hsp.query | |
106 hsp_dict['match'] = hsp.match | |
107 hsp_dict['sbjct'] = hsp.sbjct | |
108 hsps.append(hsp_dict) | |
109 # Store the hsps into the alignment data | |
110 aligns.append(hsps) | |
111 | |
112 # Store alignment data into the dictionary | |
113 result_dict['alignments', aligns] | |
114 return result_dict | |
115 | |
116 | |
117 # Abstract data from BLAST responded XML (text version) | |
118 # input: the searching result file from BLAST in StringIO | |
119 # return: an dictionary include all data from BLAST result. | |
120 def read_blast_result_text(blast_handle): | |
121 | |
122 blast_record = NCBIXML.read(blast_handle) | |
123 | |
124 E_VALUE_THRESH = 0.04 | |
125 | |
126 result_dict = dict() | |
127 | |
128 result_dict['hits'] = len(blast_record.alignments) | |
129 aligns = list() | |
130 for alignment in blast_record.alignments: | |
131 hsps = list() | |
132 for hsp in alignment.hsps: | |
133 | |
134 if hsp.expect < E_VALUE_THRESH: | |
135 hsp_dict = dict() | |
136 hsp_dict['seq'] = alignment.title | |
137 hsp_dict['len'] = alignment.length | |
138 hsp_dict['eval'] = hsp.expect | |
139 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length)) | |
140 hsp_dict['query'] = hsp.query | |
141 hsp_dict['match'] = hsp.match | |
142 hsp_dict['sbjct'] = hsp.sbjct | |
143 hsps.append(hsp_dict) | |
144 aligns.append(hsps) | |
145 | |
146 result_dict['alignments'] = aligns | |
147 return result_dict | |
148 | |
149 | |
150 # Sample of usage (text version) | |
151 def text(): | |
152 result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\ | |
153 DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\ | |
154 AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\ | |
155 EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\ | |
156 APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\ | |
157 FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\ | |
158 TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\ | |
159 AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\ | |
160 PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI' | |
161 ) | |
162 re_dict = read_blast_result_text(result) | |
163 print(re_dict['hits']) | |
164 for alignments in re_dict['alignments']: | |
165 for hsp in alignments: | |
166 print(hsp['seq']) | |
167 print(hsp['len']) | |
168 print(hsp['eval']) | |
169 print(hsp['ident']) | |
170 print(hsp['query'][0:75] + "...") | |
171 print(hsp['match'][0:75] + "...") | |
172 print(hsp['sbjct'][0:75] + "...") | |
173 | |
174 | |
175 |