comparison cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.py @ 0:6661bb42b5a9 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 12:24:06 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6661bb42b5a9
1 #!/usr/bin/env python
2 """
3 This program is intended to identify protein coding sequences within a certain window (number of base pairs) of genes encoding recognized endolysin domains and genes encoding transmembrane domains. The goal is narrow a list of disruptin candidates by identifying the sequences close to the other lysis genes in the phage genome.
4 Inputs for this program include a .fasta file with protein sequences of lysis gene candidates from the phage genome, a .gff3 file with the tmhmm results from the genome, a .gff3 file with the results from interproscan of the genome, a .gff3 file of the genome, window size in number of base pairs, a tab separated list of endolysin domains, and optional names of output files.
5 The program outputs lists of lysis gene candidates that are close to protein codings sequences with endolysin domains or to sequences with transmembrane domains and lists of the proteins in proximity to the lysis gene candidates (one list for proteins with endolysin domains and one list for TMD-containing proteins).
6 """
7
8 from Bio import SeqIO
9 import argparse
10 import sys
11 from CPT_GFFParser import gffParse, gffWrite
12 from Bio.SeqRecord import SeqRecord
13 from Bio.SeqFeature import SeqFeature
14 from Bio.Seq import Seq
15 from intervaltree import IntervalTree, Interval
16 from gff3 import feature_lambda, feature_test_type
17
18 # Used for genome in fasta format
19 # outputs the start and end coordinates from a record in fasta format
20 # Should work for seqrecords from NCBI database
21 # def location_start_end(seqrec):
22 # F = seqrec.description
23 # description_subparts = F.split(' ')
24 # for i in range(len(description_subparts)):
25 # if description_subparts[i].startswith('[location'):
26 # location = description_subparts[i][10:-1]
27 # location_se = location.split('..')
28 # location_start = location_se[0]
29 # location_end = location_se[1]
30 #
31 # return location_start, location_end
32
33 # adapted from intersect_and_adjacent.py
34 def treeFeatures(features):
35 for feat in features:
36 # used with genome in fasta format
37 # start, end = location_start_end(feat)
38
39 # Interval(begin, end, data)
40 yield Interval(int(feat.location.start), int(feat.location.end), feat.id)
41
42
43 # Function to read enzyme domain names and ids from the enzyme list
44 # Enzyme list must be a tab separated txt file with the format with four columns: Predicted catalytic or binding domain, Abbreviation, Conserved domain, Phage example
45 # The first column is used here as the domain name, and the 3rd column is the domain id
46 def read_enzyme_list(enzyme_file=None):
47 enzyme_file.seek(0)
48 domains = []
49 #domain_names = []
50
51 for line in enzyme_file:
52 if not line.startswith("*"):
53 words = line.split("\t")
54 if len(words) > 3:
55 domains += [words[2]]
56 #domain_names += [words[0]]
57
58 return (domains[1:])
59
60
61 # adapted from intersect_and_adjacent.py
62 def intersect(rec_a, rec_b, window):
63 if len(rec_a) > 0 and len(rec_b) > 0:
64
65 # builds interval tree from Interval objects of form (start, end, id) for each feature
66 tree_a = IntervalTree(list(treeFeatures(rec_a)))
67 tree_b = IntervalTree(list(treeFeatures(rec_b)))
68
69 # Used to map ids back to features later
70 rec_a_map = {f.id: f for f in rec_a}
71 rec_b_map = {f.id: f for f in rec_b}
72
73 rec_a_hits_in_b = []
74 rec_b_hits_in_a = []
75
76 for feature in rec_a:
77 # Used with genome in fasta format
78 # start, end = location_start_end(feature)
79 # Save each feature in rec_a that overlaps a feature in rec_b
80 # hits = tree_b.find_range((int(feature.location.start), int(feature.location.end)))
81 hits = tree_b[
82 (int(feature.location.start) - window) : (
83 int(feature.location.end) + window
84 )
85 ]
86 # feature id is saved in interval result.data, use map to get full feature
87 for hit in hits:
88 rec_a_hits_in_b.append(rec_b_map[hit.data])
89
90 for feature in rec_b:
91 # Used with genome in fasta format
92 # start, end = location_start_end(feature)
93 # Save each feature in rec_a that overlaps a feature in rec_b
94 # hits = tree_a.find_range((int(feature.location.start), int(feature.location.end)))
95 hits = tree_a[
96 (int(feature.location.start) - window) : (
97 int(feature.location.end) + window
98 )
99 ]
100 # feature id is saved in interval result.data, use map to get full feature
101 for hit in hits:
102 rec_b_hits_in_a.append(rec_a_map[hit.data])
103
104 # Remove duplicate features using sets
105 rec_a = set(rec_a_hits_in_b)
106 rec_b = set(rec_b_hits_in_a)
107
108 else:
109 # If one input is empty, output two empty result files.
110 rec_a = SeqRecord(Seq(""), "none")
111 rec_b = SeqRecord(Seq(""), "none")
112 return rec_a, rec_b
113
114
115 # Function to identify enzyme domains associated with endolysin function from the interproscan results file
116 def find_endolysins(rec_ipro, enzyme_domain_ids):
117
118 # print(rec_ipro)
119
120 if len(rec_ipro) > 0:
121 endo_rec_names = []
122 endo_rec_domain_ids = []
123 rec_domain_name = []
124
125 # Check each feature in the ipro file if the domain id/feature qualifier "Name" is included in the domain list.
126 for seq in rec_ipro:
127 for i in range(len(seq.features)):
128 f = seq.features[i]
129 if f.type == "protein_match":
130 # print(f)
131 unwanted = ["TRANS", "SIGNAL", "CYTO", "Coil", "Signal"]
132 # Ignores feature with unwanted key words in the feature name
133 if all(x not in f.qualifiers["Name"][0] for x in unwanted):
134 # If feature is included in the given enzyme domain list, the protein name, domain id, and domain name are stored
135 domain_description = [str(f.qualifiers['Name'][0])]
136 if 'signature_desc' in f.qualifiers:
137 domain_description += [str(f.qualifiers['signature_desc'][0])]
138
139 for i in enzyme_domain_ids:
140 for y in domain_description:
141 if i in y:
142 endo_rec_domain_ids += [f.qualifiers["Name"][0]]
143
144 # e_index = enzyme_domain_ids.index(i)
145 # rec_domain_name += [enzyme_domain_names[e_index]]
146
147 target = f.qualifiers["Target"][0]
148 target = target.split(" ")
149 protein_name = str(target[0]) + '**'
150 endo_rec_names += [protein_name]
151
152 return endo_rec_names, endo_rec_domain_ids
153
154
155 def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window):
156 rec_lgc = list(SeqIO.parse(lgc, "fasta"))
157 rec_tmhmm = gffParse(tmhmm)
158 rec_ipro = gffParse(ipro)
159 recTemp = gffParse(genome)[0]
160 tempFeats = feature_lambda(
161 recTemp.features,
162 feature_test_type,
163 {"types": ["CDS"]},
164 subfeatures=True,
165 )
166 recTemp.features = tempFeats
167 rec_genome_ini = [recTemp]
168
169 # genome.seek(0)
170 # examiner = GFFExaminer()
171 # print(examiner.available_limits(genome))
172
173 enzyme_domain_ids = read_enzyme_list(enzyme)
174
175 if len(rec_lgc) > 0 and len(rec_tmhmm) > 0 and len(rec_genome_ini) > 0:
176
177 # find names of the proteins containing endolysin associated domains
178 endo_names, endo_domain_ids = find_endolysins(
179 rec_ipro, list(enzyme_domain_ids)
180 )
181
182 # find names of proteins containing transmembrane domains
183 tmhmm_protein_names = []
184 for seq in rec_tmhmm:
185 tmhmm_protein_names += [str(seq.id) + '**']
186
187 lgc_names = []
188 for seq in rec_lgc:
189 lgc_names += [str(seq.id) + '**']
190
191 adjacent_endo = {}
192 adjacent_lgc_to_endo = {}
193 adjacent_tm = {}
194 adjacent_lgc_to_tm = {}
195
196 # print(len(tmhmm_protein_names), len(endo_names))
197 # print(rec_genome_ini)
198 # print(len(rec_genome_ini))
199
200 for i in range(len(rec_genome_ini)):
201 rec_genome = rec_genome_ini[i]
202
203 # find records for proteins containing endolysin domains and tmds from genome fasta file
204 tm_seqrec = []
205 endolysin_seqrec = []
206 lgc_seqrec = []
207
208 # print(tmhmm_protein_names)
209 # print(endo_names)
210 # print(lgc_names)
211 # print(rec_genome)
212
213 for feat in rec_genome.features:
214 # print(feat)
215 # searches for synonyms and
216 if feat.type == "CDS":
217 feat_names = []
218 feat_names.append(str(feat.id) + '**')
219 if "locus_tag" in feat.qualifiers:
220 feat_names.append(str(feat.qualifiers["locus_tag"][0]) + '**')
221 if "protein_id" in feat.qualifiers:
222 feat_names.append(str(feat.qualifiers["protein_id"][0]) + '**')
223 if "Name" in feat.qualifiers:
224 if len(str(feat.qualifiers["Name"][0])) > 5:
225 feat_names.append(str(feat.qualifiers["Name"][0]) + '**')
226
227 # print(str(feat_names))
228 # print(str(feat.qualifiers))
229 for i in range(len(feat_names)):
230 if str(feat_names[i]) in str(lgc_names):
231 lgc_seqrec += [feat]
232 # check if gene annotated as holin using key words/synonyms
233 holin_annotations = ["holin"]
234 if "product" in feat.qualifiers:
235 if any(
236 x
237 for x in holin_annotations
238 if (x in str(feat.qualifiers))
239 ):
240 tm_seqrec += [feat]
241 # check if protein contains a TMD
242 for i in range(len(feat_names)):
243 if str(feat_names[i]) in str(tmhmm_protein_names):
244 # print(feat_names[i])
245 tm_seqrec += [feat]
246
247 # check if gene annotated as endolysin using key words/synonyms
248 endolysin_annotations = ["lysin", "lysozyme"]
249 if "product" in feat.qualifiers:
250 if any(
251 x
252 for x in endolysin_annotations
253 if (x in str(feat.qualifiers))
254 ):
255 endolysin_seqrec += [feat]
256 # check if protein contains an endolysin-associated domain
257 for i in range(len(feat_names)):
258 if str(feat_names[i]) in str(endo_names):
259 endolysin_seqrec += [feat]
260
261 # print(endolysin_seqrec, tm_seqrec, lgc_seqrec)
262 # find possible endolysins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates
263 # if len(endolysin_seqrec) > 0:
264 adjacent_lgc_to_endo_i, adjacent_endo_i = intersect(
265 endolysin_seqrec, lgc_seqrec, window
266 )
267 # find TMD-containing proteins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates
268 # if len(tm_seqrec) > 0:
269 adjacent_lgc_to_tm_i, adjacent_tm_i = intersect(
270 tm_seqrec, lgc_seqrec, window
271 )
272
273 # print(len(endolysin_seqrec), len(lgc_seqrec), len(tm_seqrec))
274 adjacent_endo[rec_genome.id] = adjacent_endo_i
275 adjacent_lgc_to_endo[rec_genome.id] = adjacent_lgc_to_endo_i
276 adjacent_tm[rec_genome.id] = adjacent_tm_i
277 adjacent_lgc_to_tm[rec_genome.id] = adjacent_lgc_to_tm_i
278 # print(rec_genome.id)
279 else:
280 return 0, 0, 0, 0
281 # print(adjacent_endo)
282 return adjacent_endo, adjacent_lgc_to_endo, adjacent_tm, adjacent_lgc_to_tm
283
284
285 if __name__ == "__main__":
286 parser = argparse.ArgumentParser(
287 description="Identify lysis gene candidates next to possible endolysin or holin genes",
288 epilog="",
289 )
290 parser.add_argument(
291 "lgc",
292 type=argparse.FileType("r"),
293 help="fasta file with protein coding sequences of lysis gene candidates",
294 )
295 parser.add_argument(
296 "tmhmm",
297 type=argparse.FileType("r"),
298 help="gff3 file with tmhmm results from the coding sequences in the genome",
299 )
300 parser.add_argument(
301 "ipro",
302 type=argparse.FileType("r"),
303 help="gff3 file with interpro results from protein sequences in the genome",
304 )
305 parser.add_argument(
306 "genome",
307 type=argparse.FileType("r"),
308 help="fasta file with protein coding sequences for all genes in the genome",
309 )
310 parser.add_argument(
311 "window",
312 type=int,
313 default=1000,
314 help="Allows features this far away to still be considered 'adjacent'",
315 )
316 parser.add_argument(
317 "enzyme",
318 type=argparse.FileType("r"),
319 help="tab delimited text file including list of conserved protein domains linked to endolysin function",
320 )
321 parser.add_argument("--oa", type=str, default="possible_endolysin.gff3")
322 parser.add_argument(
323 "--ob", type=str, default="lysis_gene_candidates_near_endolysin.gff3"
324 )
325 parser.add_argument("--oc", type=str, default="possible_holin.gff3")
326 parser.add_argument(
327 "--od", type=str, default="lysis_gene_candidates_near_holin.gff3"
328 )
329 args = parser.parse_args()
330
331 endo, lgc_endo, tm, lgc_tm = adjacent_lgc(
332 args.lgc, args.tmhmm, args.ipro, args.genome, args.enzyme, args.window
333 )
334
335 if endo == 0:
336 with open(args.oa, "w") as handle:
337 handle.write("##gff-version 3")
338 with open(args.ob, "w") as handle:
339 handle.write("##gff-version 3")
340 with open(args.oc, "w") as handle:
341 handle.write("##gff-version 3")
342 with open(args.od, "w") as handle:
343 handle.write("##gff-version 3")
344 return
345
346 args.genome.seek(0)
347 rec = list(gffParse(args.genome))
348
349 with open(args.oa, "w") as handle:
350 for i in range(len(rec)):
351 rec_i = rec[i]
352 if endo.get(rec_i.id, "") is not "":
353 rec_i.features = endo[rec_i.id]
354 gffWrite([rec_i], handle)
355
356 with open(args.ob, "w") as handle:
357 for i in range(len(rec)):
358 rec_i = rec[i]
359 if lgc_endo.get(rec_i.id, "") is not "":
360 rec_i.features = lgc_endo[rec_i.id]
361 gffWrite([rec_i], handle)
362
363 with open(args.oc, "w") as handle:
364 for i in range(len(rec)):
365 rec_i = rec[i]
366 if tm.get(rec_i.id, "") is not "":
367 rec_i.features = tm[rec_i.id]
368 gffWrite([rec_i], handle)
369
370 with open(args.od, "w") as handle:
371 for i in range(len(rec)):
372 rec_i = rec[i]
373 if lgc_tm.get(rec_i.id, "") is not "":
374 rec_i.features = lgc_tm[rec_i.id]
375 gffWrite([rec_i], handle)