0
|
1 #!/usr/bin/env python
|
|
2 import sys
|
|
3 import argparse
|
|
4 import json
|
|
5 import logging
|
|
6 from Bio.Blast import NCBIXML
|
|
7
|
|
8 logging.basicConfig(level=logging.DEBUG)
|
|
9 log = logging.getLogger()
|
|
10
|
|
11 def parse_blast(blast, isXML = False):
|
|
12 res = []
|
|
13 finalRes = []
|
|
14 if isXML:
|
|
15 for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1):
|
|
16 for alignment in blast_record.alignments:
|
|
17 tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:]
|
|
18 tempID = tempID[:tempID.find("|")]
|
|
19 tempDesc = alignment.title
|
|
20 while tempDesc.find("|") >= 0:
|
|
21 tempDesc = tempDesc[tempDesc.find("|") + 1:]
|
|
22 tempDesc = tempDesc.strip()
|
|
23 tempID = tempID.strip()
|
|
24 #for hsp in alignment.hsps:
|
|
25 line = [str(blast_record.query)[:str(blast_record.query).find("[")].strip()]
|
|
26 line.append(alignment.hit_id)
|
|
27 line.append(tempDesc)
|
|
28 line.append(alignment.accession)
|
|
29 res.append(line)
|
|
30 blast.seek(0)
|
|
31 resInd = -1
|
|
32 taxLine = blast.readline()
|
|
33 while taxLine:
|
|
34 if "<Hit>" in taxLine:
|
|
35 resInd += 1
|
|
36 taxSlice = ""
|
|
37 elif "<taxid>" in taxLine:
|
|
38 taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")]
|
|
39 finalRes.append(res[resInd])
|
|
40 finalRes[-1].append(taxSlice)
|
|
41 taxLine = blast.readline()
|
|
42 return finalRes
|
|
43 else:
|
|
44 for line in blast:
|
|
45 finalRes.append(line.strip("\n").split("\t"))
|
|
46 return finalRes
|
|
47
|
|
48 def with_dice(blast):
|
|
49 for data in blast:
|
|
50 dice = 2 * int(data[14]) / (float(data[22]) + float(data[23]))
|
|
51 yield data + [dice]
|
|
52
|
|
53
|
|
54 def filter_dice(blast, threshold=0.5):
|
|
55 for data in blast:
|
|
56 if data[-1] > threshold:
|
|
57 yield data
|
|
58
|
|
59
|
|
60 def split_identifiers_nucl(_, ident):
|
|
61 if "<>" in ident:
|
|
62 idents = ident.split("<>")
|
|
63 else:
|
|
64 idents = [ident]
|
|
65 return idents
|
|
66
|
|
67
|
|
68 def split_identifiers_prot(_, ident):
|
|
69 if "<>" in ident:
|
|
70 idents = ident.split("<>")
|
|
71 else:
|
|
72 idents = [ident]
|
|
73 return [
|
|
74 x[x.index("[") + 1 : x.rindex("]")]
|
|
75 for x in idents
|
|
76 # MULTISPECIES: recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638]
|
|
77 if "[" in x and "]" in x
|
|
78 ]
|
|
79
|
|
80
|
|
81 def split_identifiers_phage(par, ident):
|
|
82 par = par.replace("lcl|", "")
|
|
83 par = par[0 : par.index("_prot_")]
|
|
84 return [par]
|
|
85
|
|
86
|
|
87 def important_only(blast, split_identifiers):
|
|
88 for data in blast:
|
|
89 yield [
|
|
90 data[0], # 01 Query Seq-id (ID of your sequence)
|
|
91 data[1], # 13 All subject Seq-id(s), separated by a ';'
|
|
92 split_identifiers(
|
|
93 data[1], data[2]
|
|
94 ), # 25 All subject title(s), separated by a '<>'
|
|
95 data[3].split(";"), # Extra: All Subject Accessions
|
|
96 data[4].split(";"), # Extra: All TaxIDs
|
|
97 ]
|
|
98
|
|
99
|
|
100 def deform_scores(blast):
|
|
101 for data in blast:
|
|
102 for org in data[2]:
|
|
103 yield [data[0], data[1], org, data[3], data[4]]
|
|
104
|
|
105
|
|
106 def expand_fields(blast):
|
|
107 for data in blast:
|
|
108 for x in range(0, len(data[4])):
|
|
109 yield [data[0], data[1], data[2][x], data[3], int(data[4][x])]
|
|
110
|
|
111 def expand_taxIDs(blast, taxFilter):
|
|
112 for data in blast:
|
|
113 # if(len(data[4]) > 0):
|
|
114 # print(data[0])
|
|
115 for ID in data[4]:
|
|
116 if ID != "N/A":
|
|
117 filterOut = False
|
|
118 for tax in taxFilter:
|
|
119 if str(ID).strip() == tax:
|
|
120 filterOut = True
|
|
121 if not filterOut:
|
|
122 yield [data[0], data[1], data[2], data[3], int(ID)]
|
|
123
|
|
124
|
|
125 def expand_titles(blast):
|
|
126 for data in blast:
|
|
127 for title in data[2]:
|
|
128 yield [data[0], data[1], title, data[3], data[4]]
|
|
129
|
|
130
|
|
131 def filter_phage(blast, phageTaxLookup, phageSciNames):
|
|
132 for data in blast:
|
|
133 for x in range(0, len(phageTaxLookup)):
|
|
134 if (data[4]) == phageTaxLookup[x]:
|
|
135 yield [data[0], data[1], phageSciNames[x], data[3], data[4]]
|
|
136 break
|
|
137
|
|
138
|
|
139 def remove_dupes(data):
|
|
140 has_seen = {}
|
|
141 for row in data:
|
|
142 # qseqid, sseqid
|
|
143 key = (row[0], row[4])
|
|
144 # If we've seen the key before, we can exit
|
|
145 if key in has_seen:
|
|
146 continue
|
|
147
|
|
148 # Otherwise, continue on
|
|
149 has_seen[key] = True
|
|
150 # Pretty simple
|
|
151 yield row
|
|
152
|
|
153 def scoreMap(blast):
|
|
154 c = {}
|
|
155 m = {}
|
|
156 for (qseq, subID, subTitle, access, ID) in blast:
|
|
157 if (str(subTitle), ID) not in c:
|
|
158 m[(str(subTitle), ID)] = access
|
|
159 c[(str(subTitle), ID)] = 0
|
|
160
|
|
161 c[(str(subTitle), ID)] += 1
|
|
162 return c, m
|
|
163
|
|
164
|
|
165 if __name__ == "__main__":
|
|
166 parser = argparse.ArgumentParser(description="Top related genomes")
|
|
167 parser.add_argument(
|
|
168 "blast", type=argparse.FileType("r"), help="Blast 25 Column Results"
|
|
169 )
|
|
170 parser.add_argument("phagedb", type=argparse.FileType("r"))
|
|
171 parser.add_argument("--access", action="store_true")
|
|
172 parser.add_argument("--protein", action="store_true")
|
|
173 parser.add_argument("--canonical", action="store_true")
|
|
174 parser.add_argument("--noFilter", action="store_true")
|
|
175 #parser.add_argument("--title", action="store_true") # Add when ready to update XML after semester
|
|
176 parser.add_argument("--hits", type=int, default=5)
|
|
177 parser.add_argument("--xmlMode", action="store_true")
|
|
178 parser.add_argument("--taxFilter", type=str)
|
|
179
|
|
180 args = parser.parse_args()
|
|
181
|
|
182 phageDb = args.phagedb
|
|
183 phageTaxLookup = []
|
|
184 sciName = []
|
|
185 line = phageDb.readline()
|
|
186
|
|
187 taxList = []
|
|
188 if args.taxFilter and args.taxFilter != "" :
|
|
189 args.taxFilter = args.taxFilter.split(" ")
|
|
190 for ind in args.taxFilter:
|
|
191 taxList.append(ind.strip())
|
|
192
|
|
193 while line:
|
|
194 line = line.split("\t")
|
|
195 phageTaxLookup.append(int(line[0]))
|
|
196 line[1] = line[1].strip()
|
|
197 if (line[1] == ""):
|
|
198 line[1] = "Novel Genome"
|
|
199 sciName.append(line[1])
|
|
200 line = phageDb.readline()
|
|
201
|
|
202 if args.protein:
|
|
203 splitId = split_identifiers_prot
|
|
204 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
|
|
205 elif args.canonical:
|
|
206 splitId = split_identifiers_phage
|
|
207 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
|
|
208 else:
|
|
209 splitId = split_identifiers_nucl
|
|
210 # phageNameLookup = {k['desc'].rstrip('.'): k['id'] for k in phageDb}
|
|
211
|
|
212 data = parse_blast(args.blast, args.xmlMode)
|
|
213 # data = with_dice(data)
|
|
214 # data = filter_dice(data, threshold=0.0)
|
|
215 data = important_only(data, splitId)
|
|
216
|
|
217 data = expand_taxIDs(data, taxList)
|
|
218 data = remove_dupes(data)
|
|
219 if not args.noFilter:
|
|
220 data = filter_phage(data, phageTaxLookup, sciName)
|
|
221 listify = []
|
|
222 for x in data:
|
|
223 listify.append(x)
|
|
224 #listify = greatest_taxID(listify)
|
|
225
|
|
226 count_label = "Similar Unique Proteins"
|
|
227
|
|
228 counts, accessions = scoreMap(listify)
|
|
229
|
|
230 nameRec = listify[0][0]
|
|
231 sys.stdout.write(
|
|
232 "Top %d matches for BLASTp results of %s\n"
|
|
233 % (args.hits, nameRec)
|
|
234 )
|
|
235 header = "# TaxID\t"
|
|
236 #if args.title:
|
|
237 header += "Name\t"
|
|
238 if args.access:
|
|
239 header += "Accessions\t"
|
|
240 header += "Similar Unique Proteins\n"
|
|
241 sys.stdout.write(header)
|
|
242
|
|
243 for idx, ((name, ID), num) in enumerate(
|
|
244 sorted(counts.items(), key=lambda item: -item[1])
|
|
245 ):
|
|
246 if idx > args.hits - 1:
|
|
247 break
|
|
248 line = str(ID) + "\t"
|
|
249 #if args.title:
|
|
250 line += str(name) + "\t"
|
|
251 if args.access:
|
|
252 line += str(accessions[(name, ID)][0]) + "\t"
|
|
253 line += str(num) + "\n"
|
|
254 sys.stdout.write(line)
|