comparison cpt_related_genome_prot/relatedness_prot.py @ 0:ebcc87a27f9c draft default tip

Uploaded
author cpt
date Fri, 10 Jun 2022 08:46:28 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ebcc87a27f9c
1 #!/usr/bin/env python
2 import sys
3 import argparse
4 import json
5 import logging
6 from Bio.Blast import NCBIXML
7
8 logging.basicConfig(level=logging.DEBUG)
9 log = logging.getLogger()
10
11 def parse_blast(blast, isXML = False):
12 res = []
13 finalRes = []
14 if isXML:
15 for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1):
16 for alignment in blast_record.alignments:
17 tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:]
18 tempID = tempID[:tempID.find("|")]
19 tempDesc = alignment.title
20 while tempDesc.find("|") >= 0:
21 tempDesc = tempDesc[tempDesc.find("|") + 1:]
22 tempDesc = tempDesc.strip()
23 tempID = tempID.strip()
24 #for hsp in alignment.hsps:
25 line = [str(blast_record.query)[:str(blast_record.query).find("[")].strip()]
26 line.append(alignment.hit_id)
27 line.append(tempDesc)
28 line.append(alignment.accession)
29 res.append(line)
30 blast.seek(0)
31 resInd = -1
32 taxLine = blast.readline()
33 while taxLine:
34 if "<Hit>" in taxLine:
35 resInd += 1
36 taxSlice = ""
37 elif "<taxid>" in taxLine:
38 taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")]
39 finalRes.append(res[resInd])
40 finalRes[-1].append(taxSlice)
41 taxLine = blast.readline()
42 return finalRes
43 else:
44 for line in blast:
45 finalRes.append(line.strip("\n").split("\t"))
46 return finalRes
47
48 def with_dice(blast):
49 for data in blast:
50 dice = 2 * int(data[14]) / (float(data[22]) + float(data[23]))
51 yield data + [dice]
52
53
54 def filter_dice(blast, threshold=0.5):
55 for data in blast:
56 if data[-1] > threshold:
57 yield data
58
59
60 def split_identifiers_nucl(_, ident):
61 if "<>" in ident:
62 idents = ident.split("<>")
63 else:
64 idents = [ident]
65 return idents
66
67
68 def split_identifiers_prot(_, ident):
69 if "<>" in ident:
70 idents = ident.split("<>")
71 else:
72 idents = [ident]
73 return [
74 x[x.index("[") + 1 : x.rindex("]")]
75 for x in idents
76 # MULTISPECIES: recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638]
77 if "[" in x and "]" in x
78 ]
79
80
81 def split_identifiers_phage(par, ident):
82 par = par.replace("lcl|", "")
83 par = par[0 : par.index("_prot_")]
84 return [par]
85
86
87 def important_only(blast, split_identifiers):
88 for data in blast:
89 yield [
90 data[0], # 01 Query Seq-id (ID of your sequence)
91 data[1], # 13 All subject Seq-id(s), separated by a ';'
92 split_identifiers(
93 data[1], data[2]
94 ), # 25 All subject title(s), separated by a '<>'
95 data[3].split(";"), # Extra: All Subject Accessions
96 data[4].split(";"), # Extra: All TaxIDs
97 ]
98
99
100 def deform_scores(blast):
101 for data in blast:
102 for org in data[2]:
103 yield [data[0], data[1], org, data[3], data[4]]
104
105
106 def expand_fields(blast):
107 for data in blast:
108 for x in range(0, len(data[4])):
109 yield [data[0], data[1], data[2][x], data[3], int(data[4][x])]
110
111 def expand_taxIDs(blast, taxFilter):
112 for data in blast:
113 # if(len(data[4]) > 0):
114 # print(data[0])
115 for ID in data[4]:
116 if ID != "N/A":
117 filterOut = False
118 for tax in taxFilter:
119 if str(ID).strip() == tax:
120 filterOut = True
121 if not filterOut:
122 yield [data[0], data[1], data[2], data[3], int(ID)]
123
124
125 def expand_titles(blast):
126 for data in blast:
127 for title in data[2]:
128 yield [data[0], data[1], title, data[3], data[4]]
129
130
131 def filter_phage(blast, phageTaxLookup, phageSciNames):
132 for data in blast:
133 for x in range(0, len(phageTaxLookup)):
134 if (data[4]) == phageTaxLookup[x]:
135 yield [data[0], data[1], phageSciNames[x], data[3], data[4]]
136 break
137
138
139 def remove_dupes(data):
140 has_seen = {}
141 for row in data:
142 # qseqid, sseqid
143 key = (row[0], row[4])
144 # If we've seen the key before, we can exit
145 if key in has_seen:
146 continue
147
148 # Otherwise, continue on
149 has_seen[key] = True
150 # Pretty simple
151 yield row
152
153 def scoreMap(blast):
154 c = {}
155 m = {}
156 for (qseq, subID, subTitle, access, ID) in blast:
157 if (str(subTitle), ID) not in c:
158 m[(str(subTitle), ID)] = access
159 c[(str(subTitle), ID)] = 0
160
161 c[(str(subTitle), ID)] += 1
162 return c, m
163
164
165 if __name__ == "__main__":
166 parser = argparse.ArgumentParser(description="Top related genomes")
167 parser.add_argument(
168 "blast", type=argparse.FileType("r"), help="Blast 25 Column Results"
169 )
170 parser.add_argument("phagedb", type=argparse.FileType("r"))
171 parser.add_argument("--access", action="store_true")
172 parser.add_argument("--protein", action="store_true")
173 parser.add_argument("--canonical", action="store_true")
174 parser.add_argument("--noFilter", action="store_true")
175 #parser.add_argument("--title", action="store_true") # Add when ready to update XML after semester
176 parser.add_argument("--hits", type=int, default=5)
177 parser.add_argument("--xmlMode", action="store_true")
178 parser.add_argument("--taxFilter", type=str)
179
180 args = parser.parse_args()
181
182 phageDb = args.phagedb
183 phageTaxLookup = []
184 sciName = []
185 line = phageDb.readline()
186
187 taxList = []
188 if args.taxFilter and args.taxFilter != "" :
189 args.taxFilter = args.taxFilter.split(" ")
190 for ind in args.taxFilter:
191 taxList.append(ind.strip())
192
193 while line:
194 line = line.split("\t")
195 phageTaxLookup.append(int(line[0]))
196 line[1] = line[1].strip()
197 if (line[1] == ""):
198 line[1] = "Novel Genome"
199 sciName.append(line[1])
200 line = phageDb.readline()
201
202 if args.protein:
203 splitId = split_identifiers_prot
204 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
205 elif args.canonical:
206 splitId = split_identifiers_phage
207 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
208 else:
209 splitId = split_identifiers_nucl
210 # phageNameLookup = {k['desc'].rstrip('.'): k['id'] for k in phageDb}
211
212 data = parse_blast(args.blast, args.xmlMode)
213 # data = with_dice(data)
214 # data = filter_dice(data, threshold=0.0)
215 data = important_only(data, splitId)
216
217 data = expand_taxIDs(data, taxList)
218 data = remove_dupes(data)
219 if not args.noFilter:
220 data = filter_phage(data, phageTaxLookup, sciName)
221 listify = []
222 for x in data:
223 listify.append(x)
224 #listify = greatest_taxID(listify)
225
226 count_label = "Similar Unique Proteins"
227
228 counts, accessions = scoreMap(listify)
229
230 nameRec = listify[0][0]
231 sys.stdout.write(
232 "Top %d matches for BLASTp results of %s\n"
233 % (args.hits, nameRec)
234 )
235 header = "# TaxID\t"
236 #if args.title:
237 header += "Name\t"
238 if args.access:
239 header += "Accessions\t"
240 header += "Similar Unique Proteins\n"
241 sys.stdout.write(header)
242
243 for idx, ((name, ID), num) in enumerate(
244 sorted(counts.items(), key=lambda item: -item[1])
245 ):
246 if idx > args.hits - 1:
247 break
248 line = str(ID) + "\t"
249 #if args.title:
250 line += str(name) + "\t"
251 if args.access:
252 line += str(accessions[(name, ID)][0]) + "\t"
253 line += str(num) + "\n"
254 sys.stdout.write(line)