annotate cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.py @ 0:6661bb42b5a9 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 12:24:06 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
2 """
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
3 This program is intended to identify protein coding sequences within a certain window (number of base pairs) of genes encoding recognized endolysin domains and genes encoding transmembrane domains. The goal is narrow a list of disruptin candidates by identifying the sequences close to the other lysis genes in the phage genome.
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
4 Inputs for this program include a .fasta file with protein sequences of lysis gene candidates from the phage genome, a .gff3 file with the tmhmm results from the genome, a .gff3 file with the results from interproscan of the genome, a .gff3 file of the genome, window size in number of base pairs, a tab separated list of endolysin domains, and optional names of output files.
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
5 The program outputs lists of lysis gene candidates that are close to protein codings sequences with endolysin domains or to sequences with transmembrane domains and lists of the proteins in proximity to the lysis gene candidates (one list for proteins with endolysin domains and one list for TMD-containing proteins).
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
6 """
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
7
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
8 from Bio import SeqIO
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
9 import argparse
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
10 import sys
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
11 from CPT_GFFParser import gffParse, gffWrite
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
12 from Bio.SeqRecord import SeqRecord
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
13 from Bio.SeqFeature import SeqFeature
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
14 from Bio.Seq import Seq
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
15 from intervaltree import IntervalTree, Interval
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
16 from gff3 import feature_lambda, feature_test_type
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
17
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
18 # Used for genome in fasta format
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
19 # outputs the start and end coordinates from a record in fasta format
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
20 # Should work for seqrecords from NCBI database
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
21 # def location_start_end(seqrec):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
22 # F = seqrec.description
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
23 # description_subparts = F.split(' ')
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
24 # for i in range(len(description_subparts)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
25 # if description_subparts[i].startswith('[location'):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
26 # location = description_subparts[i][10:-1]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
27 # location_se = location.split('..')
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
28 # location_start = location_se[0]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
29 # location_end = location_se[1]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
30 #
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
31 # return location_start, location_end
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
32
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
33 # adapted from intersect_and_adjacent.py
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
34 def treeFeatures(features):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
35 for feat in features:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
36 # used with genome in fasta format
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
37 # start, end = location_start_end(feat)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
38
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
39 # Interval(begin, end, data)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
40 yield Interval(int(feat.location.start), int(feat.location.end), feat.id)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
41
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
42
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
43 # Function to read enzyme domain names and ids from the enzyme list
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
44 # Enzyme list must be a tab separated txt file with the format with four columns: Predicted catalytic or binding domain, Abbreviation, Conserved domain, Phage example
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
45 # The first column is used here as the domain name, and the 3rd column is the domain id
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
46 def read_enzyme_list(enzyme_file=None):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
47 enzyme_file.seek(0)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
48 domains = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
49 #domain_names = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
50
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
51 for line in enzyme_file:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
52 if not line.startswith("*"):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
53 words = line.split("\t")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
54 if len(words) > 3:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
55 domains += [words[2]]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
56 #domain_names += [words[0]]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
57
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
58 return (domains[1:])
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
59
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
60
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
61 # adapted from intersect_and_adjacent.py
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
62 def intersect(rec_a, rec_b, window):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
63 if len(rec_a) > 0 and len(rec_b) > 0:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
64
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
65 # builds interval tree from Interval objects of form (start, end, id) for each feature
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
66 tree_a = IntervalTree(list(treeFeatures(rec_a)))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
67 tree_b = IntervalTree(list(treeFeatures(rec_b)))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
68
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
69 # Used to map ids back to features later
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
70 rec_a_map = {f.id: f for f in rec_a}
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
71 rec_b_map = {f.id: f for f in rec_b}
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
72
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
73 rec_a_hits_in_b = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
74 rec_b_hits_in_a = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
75
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
76 for feature in rec_a:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
77 # Used with genome in fasta format
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
78 # start, end = location_start_end(feature)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
79 # Save each feature in rec_a that overlaps a feature in rec_b
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
80 # hits = tree_b.find_range((int(feature.location.start), int(feature.location.end)))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
81 hits = tree_b[
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
82 (int(feature.location.start) - window) : (
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
83 int(feature.location.end) + window
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
84 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
85 ]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
86 # feature id is saved in interval result.data, use map to get full feature
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
87 for hit in hits:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
88 rec_a_hits_in_b.append(rec_b_map[hit.data])
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
89
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
90 for feature in rec_b:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
91 # Used with genome in fasta format
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
92 # start, end = location_start_end(feature)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
93 # Save each feature in rec_a that overlaps a feature in rec_b
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
94 # hits = tree_a.find_range((int(feature.location.start), int(feature.location.end)))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
95 hits = tree_a[
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
96 (int(feature.location.start) - window) : (
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
97 int(feature.location.end) + window
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
98 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
99 ]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
100 # feature id is saved in interval result.data, use map to get full feature
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
101 for hit in hits:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
102 rec_b_hits_in_a.append(rec_a_map[hit.data])
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
103
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
104 # Remove duplicate features using sets
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
105 rec_a = set(rec_a_hits_in_b)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
106 rec_b = set(rec_b_hits_in_a)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
107
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
108 else:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
109 # If one input is empty, output two empty result files.
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
110 rec_a = SeqRecord(Seq(""), "none")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
111 rec_b = SeqRecord(Seq(""), "none")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
112 return rec_a, rec_b
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
113
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
114
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
115 # Function to identify enzyme domains associated with endolysin function from the interproscan results file
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
116 def find_endolysins(rec_ipro, enzyme_domain_ids):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
117
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
118 # print(rec_ipro)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
119
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
120 if len(rec_ipro) > 0:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
121 endo_rec_names = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
122 endo_rec_domain_ids = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
123 rec_domain_name = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
124
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
125 # Check each feature in the ipro file if the domain id/feature qualifier "Name" is included in the domain list.
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
126 for seq in rec_ipro:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
127 for i in range(len(seq.features)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
128 f = seq.features[i]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
129 if f.type == "protein_match":
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
130 # print(f)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
131 unwanted = ["TRANS", "SIGNAL", "CYTO", "Coil", "Signal"]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
132 # Ignores feature with unwanted key words in the feature name
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
133 if all(x not in f.qualifiers["Name"][0] for x in unwanted):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
134 # If feature is included in the given enzyme domain list, the protein name, domain id, and domain name are stored
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
135 domain_description = [str(f.qualifiers['Name'][0])]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
136 if 'signature_desc' in f.qualifiers:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
137 domain_description += [str(f.qualifiers['signature_desc'][0])]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
138
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
139 for i in enzyme_domain_ids:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
140 for y in domain_description:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
141 if i in y:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
142 endo_rec_domain_ids += [f.qualifiers["Name"][0]]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
143
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
144 # e_index = enzyme_domain_ids.index(i)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
145 # rec_domain_name += [enzyme_domain_names[e_index]]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
146
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
147 target = f.qualifiers["Target"][0]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
148 target = target.split(" ")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
149 protein_name = str(target[0]) + '**'
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
150 endo_rec_names += [protein_name]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
151
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
152 return endo_rec_names, endo_rec_domain_ids
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
153
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
154
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
155 def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
156 rec_lgc = list(SeqIO.parse(lgc, "fasta"))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
157 rec_tmhmm = gffParse(tmhmm)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
158 rec_ipro = gffParse(ipro)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
159 recTemp = gffParse(genome)[0]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
160 tempFeats = feature_lambda(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
161 recTemp.features,
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
162 feature_test_type,
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
163 {"types": ["CDS"]},
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
164 subfeatures=True,
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
165 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
166 recTemp.features = tempFeats
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
167 rec_genome_ini = [recTemp]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
168
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
169 # genome.seek(0)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
170 # examiner = GFFExaminer()
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
171 # print(examiner.available_limits(genome))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
172
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
173 enzyme_domain_ids = read_enzyme_list(enzyme)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
174
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
175 if len(rec_lgc) > 0 and len(rec_tmhmm) > 0 and len(rec_genome_ini) > 0:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
176
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
177 # find names of the proteins containing endolysin associated domains
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
178 endo_names, endo_domain_ids = find_endolysins(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
179 rec_ipro, list(enzyme_domain_ids)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
180 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
181
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
182 # find names of proteins containing transmembrane domains
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
183 tmhmm_protein_names = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
184 for seq in rec_tmhmm:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
185 tmhmm_protein_names += [str(seq.id) + '**']
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
186
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
187 lgc_names = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
188 for seq in rec_lgc:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
189 lgc_names += [str(seq.id) + '**']
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
190
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
191 adjacent_endo = {}
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
192 adjacent_lgc_to_endo = {}
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
193 adjacent_tm = {}
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
194 adjacent_lgc_to_tm = {}
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
195
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
196 # print(len(tmhmm_protein_names), len(endo_names))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
197 # print(rec_genome_ini)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
198 # print(len(rec_genome_ini))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
199
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
200 for i in range(len(rec_genome_ini)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
201 rec_genome = rec_genome_ini[i]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
202
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
203 # find records for proteins containing endolysin domains and tmds from genome fasta file
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
204 tm_seqrec = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
205 endolysin_seqrec = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
206 lgc_seqrec = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
207
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
208 # print(tmhmm_protein_names)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
209 # print(endo_names)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
210 # print(lgc_names)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
211 # print(rec_genome)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
212
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
213 for feat in rec_genome.features:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
214 # print(feat)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
215 # searches for synonyms and
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
216 if feat.type == "CDS":
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
217 feat_names = []
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
218 feat_names.append(str(feat.id) + '**')
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
219 if "locus_tag" in feat.qualifiers:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
220 feat_names.append(str(feat.qualifiers["locus_tag"][0]) + '**')
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
221 if "protein_id" in feat.qualifiers:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
222 feat_names.append(str(feat.qualifiers["protein_id"][0]) + '**')
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
223 if "Name" in feat.qualifiers:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
224 if len(str(feat.qualifiers["Name"][0])) > 5:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
225 feat_names.append(str(feat.qualifiers["Name"][0]) + '**')
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
226
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
227 # print(str(feat_names))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
228 # print(str(feat.qualifiers))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
229 for i in range(len(feat_names)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
230 if str(feat_names[i]) in str(lgc_names):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
231 lgc_seqrec += [feat]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
232 # check if gene annotated as holin using key words/synonyms
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
233 holin_annotations = ["holin"]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
234 if "product" in feat.qualifiers:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
235 if any(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
236 x
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
237 for x in holin_annotations
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
238 if (x in str(feat.qualifiers))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
239 ):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
240 tm_seqrec += [feat]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
241 # check if protein contains a TMD
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
242 for i in range(len(feat_names)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
243 if str(feat_names[i]) in str(tmhmm_protein_names):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
244 # print(feat_names[i])
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
245 tm_seqrec += [feat]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
246
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
247 # check if gene annotated as endolysin using key words/synonyms
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
248 endolysin_annotations = ["lysin", "lysozyme"]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
249 if "product" in feat.qualifiers:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
250 if any(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
251 x
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
252 for x in endolysin_annotations
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
253 if (x in str(feat.qualifiers))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
254 ):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
255 endolysin_seqrec += [feat]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
256 # check if protein contains an endolysin-associated domain
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
257 for i in range(len(feat_names)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
258 if str(feat_names[i]) in str(endo_names):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
259 endolysin_seqrec += [feat]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
260
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
261 # print(endolysin_seqrec, tm_seqrec, lgc_seqrec)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
262 # find possible endolysins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
263 # if len(endolysin_seqrec) > 0:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
264 adjacent_lgc_to_endo_i, adjacent_endo_i = intersect(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
265 endolysin_seqrec, lgc_seqrec, window
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
266 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
267 # find TMD-containing proteins that are adjacent to (or within window length away from) the lysis gene, or disruptin, candidates
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
268 # if len(tm_seqrec) > 0:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
269 adjacent_lgc_to_tm_i, adjacent_tm_i = intersect(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
270 tm_seqrec, lgc_seqrec, window
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
271 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
272
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
273 # print(len(endolysin_seqrec), len(lgc_seqrec), len(tm_seqrec))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
274 adjacent_endo[rec_genome.id] = adjacent_endo_i
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
275 adjacent_lgc_to_endo[rec_genome.id] = adjacent_lgc_to_endo_i
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
276 adjacent_tm[rec_genome.id] = adjacent_tm_i
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
277 adjacent_lgc_to_tm[rec_genome.id] = adjacent_lgc_to_tm_i
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
278 # print(rec_genome.id)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
279 else:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
280 return 0, 0, 0, 0
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
281 # print(adjacent_endo)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
282 return adjacent_endo, adjacent_lgc_to_endo, adjacent_tm, adjacent_lgc_to_tm
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
283
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
284
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
285 if __name__ == "__main__":
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
286 parser = argparse.ArgumentParser(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
287 description="Identify lysis gene candidates next to possible endolysin or holin genes",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
288 epilog="",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
289 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
290 parser.add_argument(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
291 "lgc",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
292 type=argparse.FileType("r"),
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
293 help="fasta file with protein coding sequences of lysis gene candidates",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
294 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
295 parser.add_argument(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
296 "tmhmm",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
297 type=argparse.FileType("r"),
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
298 help="gff3 file with tmhmm results from the coding sequences in the genome",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
299 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
300 parser.add_argument(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
301 "ipro",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
302 type=argparse.FileType("r"),
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
303 help="gff3 file with interpro results from protein sequences in the genome",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
304 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
305 parser.add_argument(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
306 "genome",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
307 type=argparse.FileType("r"),
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
308 help="fasta file with protein coding sequences for all genes in the genome",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
309 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
310 parser.add_argument(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
311 "window",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
312 type=int,
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
313 default=1000,
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
314 help="Allows features this far away to still be considered 'adjacent'",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
315 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
316 parser.add_argument(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
317 "enzyme",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
318 type=argparse.FileType("r"),
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
319 help="tab delimited text file including list of conserved protein domains linked to endolysin function",
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
320 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
321 parser.add_argument("--oa", type=str, default="possible_endolysin.gff3")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
322 parser.add_argument(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
323 "--ob", type=str, default="lysis_gene_candidates_near_endolysin.gff3"
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
324 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
325 parser.add_argument("--oc", type=str, default="possible_holin.gff3")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
326 parser.add_argument(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
327 "--od", type=str, default="lysis_gene_candidates_near_holin.gff3"
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
328 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
329 args = parser.parse_args()
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
330
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
331 endo, lgc_endo, tm, lgc_tm = adjacent_lgc(
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
332 args.lgc, args.tmhmm, args.ipro, args.genome, args.enzyme, args.window
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
333 )
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
334
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
335 if endo == 0:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
336 with open(args.oa, "w") as handle:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
337 handle.write("##gff-version 3")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
338 with open(args.ob, "w") as handle:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
339 handle.write("##gff-version 3")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
340 with open(args.oc, "w") as handle:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
341 handle.write("##gff-version 3")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
342 with open(args.od, "w") as handle:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
343 handle.write("##gff-version 3")
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
344 return
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
345
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
346 args.genome.seek(0)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
347 rec = list(gffParse(args.genome))
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
348
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
349 with open(args.oa, "w") as handle:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
350 for i in range(len(rec)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
351 rec_i = rec[i]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
352 if endo.get(rec_i.id, "") is not "":
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
353 rec_i.features = endo[rec_i.id]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
354 gffWrite([rec_i], handle)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
355
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
356 with open(args.ob, "w") as handle:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
357 for i in range(len(rec)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
358 rec_i = rec[i]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
359 if lgc_endo.get(rec_i.id, "") is not "":
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
360 rec_i.features = lgc_endo[rec_i.id]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
361 gffWrite([rec_i], handle)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
362
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
363 with open(args.oc, "w") as handle:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
364 for i in range(len(rec)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
365 rec_i = rec[i]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
366 if tm.get(rec_i.id, "") is not "":
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
367 rec_i.features = tm[rec_i.id]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
368 gffWrite([rec_i], handle)
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
369
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
370 with open(args.od, "w") as handle:
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
371 for i in range(len(rec)):
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
372 rec_i = rec[i]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
373 if lgc_tm.get(rec_i.id, "") is not "":
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
374 rec_i.features = lgc_tm[rec_i.id]
6661bb42b5a9 Uploaded
cpt
parents:
diff changeset
375 gffWrite([rec_i], handle)