Mercurial > repos > cpt > cpt_phageqc_annotations
comparison cpt_phageqc_annotation/phage_annotation_validator.py @ 0:c3140b08d703 draft default tip
Uploaded
author | cpt |
---|---|
date | Fri, 17 Jun 2022 13:00:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c3140b08d703 |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 # vim: set fileencoding=utf-8 | |
4 import os | |
5 import sys | |
6 import json | |
7 import math | |
8 import numpy | |
9 import argparse | |
10 import itertools | |
11 import logging | |
12 from gff3 import ( | |
13 feature_lambda, | |
14 coding_genes, | |
15 genes, | |
16 get_gff3_id, | |
17 feature_test_location, | |
18 get_rbs_from, | |
19 nice_name, | |
20 ) | |
21 from shinefind import NaiveSDCaller | |
22 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature | |
23 from Bio import SeqIO | |
24 from Bio.SeqRecord import SeqRecord | |
25 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
26 from jinja2 import Environment, FileSystemLoader | |
27 from cpt import MGAFinder | |
28 | |
29 logging.basicConfig(level=logging.DEBUG) | |
30 log = logging.getLogger(name="pav") | |
31 | |
32 # Path to script, required because of Galaxy. | |
33 SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__)) | |
34 # Path to the HTML template for the report | |
35 | |
36 ENCOURAGEMENT = ( | |
37 (100, "Perfection itself!"), | |
38 (90, "Amazing!"), | |
39 (80, "Not too bad, a few minor things to fix..."), | |
40 (70, "Some issues to address"), | |
41 ( | |
42 50, | |
43 """Issues detected! </p><p class="text-muted">Have you heard of the | |
44 <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation | |
45 Pipeline?""", | |
46 ), | |
47 ( | |
48 0, | |
49 """<b>MAJOR</b> issues detected! Please consider using the | |
50 <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation Pipeline""", | |
51 ), | |
52 ) | |
53 | |
54 | |
55 def gen_qc_feature(start, end, message, strand=0, id_src=None, type_src="gene"): | |
56 kwargs = {"qualifiers": {"note": [message]}} | |
57 kwargs["type"] = type_src | |
58 kwargs["strand"] = strand | |
59 kwargs["phase"]=0 | |
60 kwargs["score"]=0.0 | |
61 kwargs["source"]="feature" | |
62 if id_src is not None: | |
63 kwargs["id"] = id_src.id | |
64 kwargs["qualifiers"]["ID"] = [id_src.id] | |
65 kwargs["qualifiers"]["Name"] = id_src.qualifiers.get("Name", []) | |
66 | |
67 | |
68 if end >= start: | |
69 return gffSeqFeature(FeatureLocation(start, end, strand=strand), **kwargs) | |
70 else: | |
71 return gffSeqFeature(FeatureLocation(end, start, strand=strand), **kwargs) | |
72 | |
73 | |
74 def __ensure_location_in_bounds(start=0, end=0, parent_length=0): | |
75 # This prevents frameshift errors | |
76 while start < 0: | |
77 start += 3 | |
78 while end < 0: | |
79 end += 3 | |
80 while start > parent_length: | |
81 start -= 3 | |
82 while end > parent_length: | |
83 end -= 3 | |
84 return (start, end) | |
85 | |
86 | |
87 def missing_rbs(record, lookahead_min=5, lookahead_max=15): | |
88 """ | |
89 Identify gene features with missing RBSs | |
90 | |
91 This "looks ahead" 5-15 bases ahead of each gene feature, and checks if | |
92 there's an RBS feature in those bounds. | |
93 | |
94 The returned data is a set of genes with the RBS sequence in the __upstream | |
95 attribute, and a message in the __message attribute. | |
96 """ | |
97 results = [] | |
98 good = 0 | |
99 bad = 0 | |
100 qc_features = [] | |
101 sd_finder = NaiveSDCaller() | |
102 | |
103 any_rbss = False | |
104 | |
105 for gene in coding_genes(record.features): | |
106 # Check if there are RBSs, TODO: make this recursive. Each feature in | |
107 # gene.sub_features can also have sub_features. | |
108 rbss = get_rbs_from(gene) | |
109 # No RBS found | |
110 if len(rbss) == 0: | |
111 # Get the sequence lookahead_min to lookahead_max upstream | |
112 if gene.strand > 0: | |
113 start = gene.location.start - lookahead_max | |
114 end = gene.location.start - lookahead_min | |
115 else: | |
116 start = gene.location.end + lookahead_min | |
117 end = gene.location.end + lookahead_max | |
118 # We have to ensure the feature is ON the genome, otherwise we may | |
119 # be trying to access a location outside of the length of the | |
120 # genome, which would be bad. | |
121 (start, end) = __ensure_location_in_bounds( | |
122 start=start, end=end, parent_length=len(record) | |
123 ) | |
124 # Temporary feature to extract sequence | |
125 tmp = gffSeqFeature( | |
126 FeatureLocation(start, end, strand=gene.strand), type="domain" | |
127 ) | |
128 # Get the sequence | |
129 seq = str(tmp.extract(record.seq)) | |
130 # Set the default properties | |
131 gene.__upstream = seq.lower() | |
132 gene.__message = "No RBS annotated, None found" | |
133 | |
134 # Try and do an automated shinefind call | |
135 sds = sd_finder.list_sds(seq) | |
136 if len(sds) > 0: | |
137 sd = sds[0] | |
138 gene.__upstream = sd_finder.highlight_sd( | |
139 seq.lower(), sd["start"], sd["end"] | |
140 ) | |
141 gene.__message = "Unannotated but valid RBS" | |
142 | |
143 qc_features.append( | |
144 gen_qc_feature( | |
145 start, end, "Missing RBS", strand=gene.strand, id_src=gene, type_src="gene" | |
146 ) | |
147 ) | |
148 | |
149 bad += 1 | |
150 results.append(gene) | |
151 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand) | |
152 else: | |
153 if len(rbss) > 1: | |
154 log.warn("%s RBSs found for gene %s", rbss[0].id, get_gff3_id(gene)) | |
155 any_rbss = True | |
156 # get first RBS/CDS | |
157 cds = list(genes(gene.sub_features, feature_type="CDS"))[0] | |
158 rbs = rbss[0] | |
159 | |
160 # Get the distance between the two | |
161 if gene.strand > 0: | |
162 distance = cds.location.start - rbs.location.end | |
163 else: | |
164 distance = rbs.location.start - cds.location.end | |
165 | |
166 # If the RBS is too far away, annotate that | |
167 if distance > lookahead_max: | |
168 gene.__message = "RBS too far away (%s nt)" % distance | |
169 | |
170 qc_features.append( | |
171 gen_qc_feature( | |
172 rbs.location.start, | |
173 rbs.location.end, | |
174 gene.__message, | |
175 strand=gene.strand, | |
176 id_src=gene, | |
177 type_src="gene" | |
178 ) | |
179 ) | |
180 | |
181 bad += 1 | |
182 results.append(gene) | |
183 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand) | |
184 else: | |
185 good += 1 | |
186 | |
187 return good, bad, results, qc_features, any_rbss | |
188 | |
189 | |
190 # modified from get_orfs_or_cdss.py | |
191 # ----------------------------------------------------------- | |
192 | |
193 | |
194 def require_sd(data, record, chrom_start, sd_min, sd_max): | |
195 sd_finder = NaiveSDCaller() | |
196 for putative_gene in data: | |
197 if putative_gene[2] > 0: # strand | |
198 start = chrom_start + putative_gene[0] - sd_max | |
199 end = chrom_start + putative_gene[0] - sd_min | |
200 else: | |
201 start = chrom_start + putative_gene[1] + sd_min | |
202 end = chrom_start + putative_gene[1] + sd_max | |
203 | |
204 (start, end) = __ensure_location_in_bounds( | |
205 start=start, end=end, parent_length=len(record) | |
206 ) | |
207 tmp = gffSeqFeature( | |
208 FeatureLocation(start, end, strand=putative_gene[2]), type="domain" | |
209 ) | |
210 # Get the sequence | |
211 seq = str(tmp.extract(record.seq)) | |
212 sds = sd_finder.list_sds(seq) | |
213 if len(sds) > 0: | |
214 yield putative_gene + (start, end) | |
215 | |
216 | |
217 def excessive_gap( | |
218 record, | |
219 excess=50, | |
220 excess_divergent=200, | |
221 min_gene=30, | |
222 slop=30, | |
223 lookahead_min=5, | |
224 lookahead_max=15, | |
225 ): | |
226 """ | |
227 Identify excessive gaps between gene features. | |
228 | |
229 Default "excessive" gap size is 10, but that should likely be larger. | |
230 """ | |
231 results = [] | |
232 good = 0 | |
233 bad = 0 | |
234 | |
235 contiguous_regions = [] | |
236 | |
237 sorted_genes = sorted( | |
238 genes(record.features), key=lambda feature: feature.location.start | |
239 ) | |
240 if len(sorted_genes) == 0: | |
241 log.warn("NO GENES FOUND") | |
242 return good, bad, results, [] | |
243 | |
244 current_gene = None | |
245 for gene in sorted_genes: | |
246 # If the gene's start is contiguous to the "current_gene", then we | |
247 # extend current_gene | |
248 for cds in genes(gene.sub_features, feature_type="CDS"): | |
249 if current_gene is None: | |
250 current_gene = [int(cds.location.start), int(cds.location.end)] | |
251 | |
252 if cds.location.start <= current_gene[1] + excess: | |
253 # Don't want to decrease size | |
254 if int(cds.location.end) >= current_gene[1]: | |
255 current_gene[1] = int(cds.location.end) | |
256 else: | |
257 # If it's discontiguous, we append the region and clear. | |
258 contiguous_regions.append(current_gene) | |
259 current_gene = [int(cds.location.start), int(cds.location.end)] | |
260 | |
261 # This generally expected that annotations would NOT continue unto the end | |
262 # of the genome, however that's a bug, and we can make it here with an | |
263 # empty contiguous_regions list | |
264 contiguous_regions.append(current_gene) | |
265 | |
266 for i in range(len(contiguous_regions) + 1): | |
267 if i == 0: | |
268 a = (1, 1) | |
269 b = contiguous_regions[i] | |
270 elif i >= len(contiguous_regions): | |
271 a = contiguous_regions[i - 1] | |
272 b = (len(record.seq), None) | |
273 else: | |
274 a = contiguous_regions[i - 1] | |
275 b = contiguous_regions[i] | |
276 | |
277 gap_size = abs(b[0] - a[1]) | |
278 | |
279 if gap_size > min(excess, excess_divergent): | |
280 a_feat_l = itertools.islice( | |
281 feature_lambda( | |
282 sorted_genes, | |
283 feature_test_location, | |
284 {"loc": a[1]}, | |
285 subfeatures=False, | |
286 ), | |
287 1, | |
288 ) | |
289 b_feat_l = itertools.islice( | |
290 feature_lambda( | |
291 sorted_genes, | |
292 feature_test_location, | |
293 {"loc": b[0]}, | |
294 subfeatures=False, | |
295 ), | |
296 1, | |
297 ) | |
298 | |
299 try: | |
300 a_feat = next(a_feat_l) | |
301 except StopIteration: | |
302 # Triggers on end of genome | |
303 a_feat = None | |
304 try: | |
305 b_feat = next(b_feat_l) | |
306 except StopIteration: | |
307 # Triggers on end of genome | |
308 b_feat = None | |
309 | |
310 result_obj = [ | |
311 a[1], | |
312 b[0], | |
313 None if not a_feat else a_feat.location.strand, | |
314 None if not b_feat else b_feat.location.strand, | |
315 ] | |
316 | |
317 if a_feat is None or b_feat is None: | |
318 if gap_size > excess_divergent: | |
319 results.append(result_obj) | |
320 else: | |
321 if ( | |
322 a_feat.location.strand == b_feat.location.strand | |
323 and gap_size > excess | |
324 ): | |
325 results.append(result_obj) | |
326 elif ( | |
327 a_feat.location.strand != b_feat.location.strand | |
328 and gap_size > excess_divergent | |
329 ): | |
330 results.append(result_obj) | |
331 | |
332 better_results = [] | |
333 qc_features = [] | |
334 of = MGAFinder(11, "CDS", "closed", min_gene) | |
335 # of = OrfFinder(11, 'CDS', 'closed', min_gene) | |
336 | |
337 for result_obj in results: | |
338 start = result_obj[0] | |
339 end = result_obj[1] | |
340 f = gen_qc_feature(start, end, "Excessive gap, %s bases" % abs(end - start), type_src="gene") | |
341 qc_features.append(f) | |
342 putative_genes = of.putative_genes_in_sequence( | |
343 str(record[start - slop : end + slop].seq) | |
344 ) | |
345 putative_genes = list( | |
346 require_sd(putative_genes, record, start, lookahead_min, lookahead_max) | |
347 ) | |
348 for putative_gene in putative_genes: | |
349 # (0, 33, 1, 'ATTATTTTATCAAAACGCTTTACAATCTTTTAG', 'MILSKRFTIF', 123123, 124324) | |
350 possible_gene_start = start + putative_gene[0] | |
351 possible_gene_end = start + putative_gene[1] | |
352 | |
353 if possible_gene_start <= possible_gene_end: | |
354 possible_cds = gffSeqFeature( | |
355 FeatureLocation( | |
356 possible_gene_start, possible_gene_end, strand=putative_gene[2] | |
357 ), | |
358 type="CDS", | |
359 ) | |
360 else: | |
361 possible_cds = gffSeqFeature( | |
362 FeatureLocation( | |
363 possible_gene_end, possible_gene_start, strand=putative_gene[2], | |
364 ), | |
365 type="CDS", | |
366 ) | |
367 | |
368 # Now we adjust our boundaries for the RBS that's required | |
369 # There are only two cases, the rbs is upstream of it, or downstream | |
370 if putative_gene[5] < possible_gene_start: | |
371 possible_gene_start = putative_gene[5] | |
372 else: | |
373 possible_gene_end = putative_gene[6] | |
374 | |
375 if putative_gene[5] <= putative_gene[6]: | |
376 possible_rbs = gffSeqFeature( | |
377 FeatureLocation( | |
378 putative_gene[5], putative_gene[6], strand=putative_gene[2] | |
379 ), | |
380 type="Shine_Dalgarno_sequence", | |
381 ) | |
382 else: | |
383 possible_rbs = gffSeqFeature( | |
384 FeatureLocation( | |
385 putative_gene[6], putative_gene[5], strand=putative_gene[2], | |
386 ), | |
387 type="Shine_Dalgarno_sequence", | |
388 ) | |
389 | |
390 if possible_gene_start <= possible_gene_end: | |
391 possible_gene = gffSeqFeature( | |
392 FeatureLocation( | |
393 possible_gene_start, possible_gene_end, strand=putative_gene[2] | |
394 ), | |
395 type="gene", | |
396 qualifiers={"note": ["Possible gene"]}, | |
397 ) | |
398 else: | |
399 possible_gene = gffSeqFeature( | |
400 FeatureLocation( | |
401 possible_gene_end, possible_gene_start, strand=putative_gene[2], | |
402 ), | |
403 type="gene", | |
404 qualifiers={"note": ["Possible gene"]}, | |
405 ) | |
406 possible_gene.sub_features = [possible_rbs, possible_cds] | |
407 qc_features.append(possible_gene) | |
408 | |
409 better_results.append(result_obj + [len(putative_genes)]) | |
410 | |
411 # Bad gaps are those with more than zero possible genes found | |
412 bad = len([x for x in better_results if x[2] > 0]) | |
413 # Generally taking "good" here as every possible gap in the genome | |
414 # Thus, good is TOTAL - gaps | |
415 good = len(sorted_genes) + 1 - bad | |
416 # and bad is just gaps | |
417 return good, bad, better_results, qc_features | |
418 | |
419 | |
420 def phi(x): | |
421 """Standard phi function used in calculation of normal distribution""" | |
422 return math.exp(-1 * math.pi * x * x) | |
423 | |
424 | |
425 def norm(x, mean=0, sd=1): | |
426 """ | |
427 Normal distribution. Given an x position, a mean, and a standard | |
428 deviation, calculate the "y" value. Useful for score scaling | |
429 | |
430 Modified to multiply by SD. This means even at sd=5, norm(x, mean) where x = mean => 1, rather than 1/5. | |
431 """ | |
432 return (1 / float(sd)) * phi(float(x - mean) / float(sd)) * sd | |
433 | |
434 | |
435 def coding_density(record, mean=92.5, sd=20): | |
436 """ | |
437 Find coding density in the genome | |
438 """ | |
439 feature_lengths = 0 | |
440 | |
441 for gene_a in coding_genes(record.features): | |
442 feature_lengths += sum( | |
443 [len(x) for x in genes(gene_a.sub_features, feature_type="CDS")] | |
444 ) | |
445 | |
446 avgFeatLen = float(feature_lengths) / float(len(record.seq)) | |
447 return int(norm(100 * avgFeatLen, mean=mean, sd=sd) * 100), int(100 * avgFeatLen) | |
448 | |
449 | |
450 def exact_coding_density(record, mean=92.5, sd=20): | |
451 """ | |
452 Find exact coding density in the genome | |
453 """ | |
454 data = numpy.zeros(len(record.seq)) | |
455 | |
456 for gene_a in coding_genes(record.features): | |
457 for cds in genes(gene_a.sub_features, feature_type="CDS"): | |
458 for i in range(cds.location.start, cds.location.end + 1): | |
459 data[i - 1] = 1 | |
460 | |
461 return float(sum(data)) / len(data) | |
462 | |
463 | |
464 def excessive_overlap(record, excess=15, excess_divergent=30): | |
465 """ | |
466 Find excessive overlaps in the genome, where excessive is defined as 15 | |
467 bases for same strand, and 30 for divergent translation. | |
468 | |
469 Does a product of all the top-level features in the genome, and calculates | |
470 gaps. | |
471 """ | |
472 results = [] | |
473 bad = 0 | |
474 qc_features = [] | |
475 | |
476 for (gene_a, gene_b) in itertools.combinations(coding_genes(record.features), 2): | |
477 # Get the CDS from the subfeature list. | |
478 # TODO: not recursive. | |
479 cds_a = [x for x in genes(gene_a.sub_features, feature_type="CDS")] | |
480 cds_b = [x for x in genes(gene_b.sub_features, feature_type="CDS")] | |
481 | |
482 if len(cds_a) == 0: | |
483 log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_a)) | |
484 continue | |
485 | |
486 if len(cds_b) == 0: | |
487 log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_b)) | |
488 continue | |
489 | |
490 cds_a = cds_a[0] | |
491 cds_b = cds_b[0] | |
492 | |
493 # Set of locations that are included in the CDS of A and the | |
494 # CDS of B | |
495 cas = set(range(cds_a.location.start, cds_a.location.end)) | |
496 cbs = set(range(cds_b.location.start, cds_b.location.end)) | |
497 | |
498 # Here we calculate the intersection between the two sets, and | |
499 # if it's larger than our excessive size, we know that they're | |
500 # overlapped | |
501 ix = cas.intersection(cbs) | |
502 | |
503 if (cds_a.location.strand == cds_b.location.strand and len(ix) >= excess) or ( | |
504 cds_a.location.strand != cds_b.location.strand | |
505 and len(ix) >= excess_divergent | |
506 ): | |
507 bad += float(len(ix)) / float(min(excess, excess_divergent)) | |
508 qc_features.append( | |
509 gen_qc_feature(min(ix), max(ix), "Excessive Overlap", id_src=gene_a, type_src="gene") | |
510 ) | |
511 results.append((gene_a, gene_b, min(ix), max(ix))) | |
512 | |
513 # Good isn't accurate here. It's a triangle number and just ugly, but we | |
514 # don't care enough to fix it. | |
515 good = len(list(coding_genes(record.features))) | |
516 good = int(good - bad) | |
517 if good < 0: | |
518 good = 0 | |
519 return good, int(bad), results, qc_features | |
520 | |
521 | |
522 def get_encouragement(score): | |
523 """Some text telling the user how they did | |
524 """ | |
525 for encouragement in ENCOURAGEMENT: | |
526 if score > encouragement[0]: | |
527 return encouragement[1] | |
528 return ENCOURAGEMENT[-1][1] | |
529 | |
530 | |
531 def genome_overview(record): | |
532 """Genome overview | |
533 """ | |
534 data = { | |
535 "genes": { | |
536 "count": 0, | |
537 "bases": len(record.seq), | |
538 "density": 0, # genes / kb | |
539 "avg_len": [], | |
540 "comp": {"A": 0, "C": 0, "G": 0, "T": 0}, | |
541 }, | |
542 "overall": { | |
543 "comp": { | |
544 "A": record.seq.count("A") + record.seq.count("a"), | |
545 "C": record.seq.count("C") + record.seq.count("c"), | |
546 "G": record.seq.count("G") + record.seq.count("g"), | |
547 "T": record.seq.count("T") + record.seq.count("t"), | |
548 }, | |
549 "gc": 0, | |
550 }, | |
551 } | |
552 gene_features = list(coding_genes(record.features)) | |
553 data["genes"]["count"] = len(gene_features) | |
554 | |
555 for feat in gene_features: | |
556 data["genes"]["comp"]["A"] += feat.extract(record).seq.count("A") + feat.extract(record).seq.count("a") | |
557 data["genes"]["comp"]["C"] += feat.extract(record).seq.count("C") + feat.extract(record).seq.count("c") | |
558 data["genes"]["comp"]["T"] += feat.extract(record).seq.count("T") + feat.extract(record).seq.count("t") | |
559 data["genes"]["comp"]["G"] += feat.extract(record).seq.count("G") + feat.extract(record).seq.count("g") | |
560 #data["genes"]["bases"] += len(feat) | |
561 data["genes"]["avg_len"].append(len(feat)) | |
562 | |
563 data["genes"]["avg_len"] = float(sum(data["genes"]["avg_len"])) / len(gene_features) | |
564 data["overall"]["gc"] = float( | |
565 data["overall"]["comp"]["G"] + data["overall"]["comp"]["C"] | |
566 ) / len(record.seq) | |
567 return data | |
568 | |
569 | |
570 def find_morons(record): | |
571 """Locate morons in the genome | |
572 | |
573 Don't even know why... | |
574 | |
575 TODO: remove? Idk. | |
576 """ | |
577 results = [] | |
578 good = 0 | |
579 bad = 0 | |
580 | |
581 gene_features = list(coding_genes(record.features)) | |
582 for i, gene in enumerate(gene_features): | |
583 two_left = gene_features[i - 2 : i] | |
584 two_right = gene_features[i + 1 : i + 1 + 2] | |
585 strands = [x.strand for x in two_left] + [x.strand for x in two_right] | |
586 anticon = [x for x in strands if x != gene.strand] | |
587 | |
588 if len(anticon) == 4: | |
589 has_rbs = [x.type == "Shine_Dalgarno_sequence" for x in gene.sub_features] | |
590 if any(has_rbs): | |
591 rbs = [ | |
592 x for x in gene.sub_features if x.type == "Shine_Dalgarno_sequence" | |
593 ][0] | |
594 rbs_msg = str(rbs.extract(record.seq)) | |
595 else: | |
596 rbs_msg = "No RBS Available" | |
597 results.append((gene, two_left, two_right, rbs_msg)) | |
598 bad += 1 | |
599 else: | |
600 good += 1 | |
601 return good, bad, results, [] | |
602 | |
603 | |
604 def bad_gene_model(record): | |
605 """Find features without product | |
606 """ | |
607 results = [] | |
608 good = 0 | |
609 bad = 0 | |
610 qc_features = [] | |
611 | |
612 for gene in coding_genes(record.features): | |
613 exons = [ | |
614 x for x in genes(gene.sub_features, feature_type="exon") if len(x) > 10 | |
615 ] | |
616 CDSs = [x for x in genes(gene.sub_features, feature_type="CDS")] | |
617 if len(exons) >= 1 and len(CDSs) >= 1: | |
618 if len(exons) != len(CDSs): | |
619 results.append( | |
620 ( | |
621 get_gff3_id(gene), | |
622 None, | |
623 None, | |
624 "Mismatched number of exons and CDSs in gff3 representation", | |
625 ) | |
626 ) | |
627 qc_features.append( | |
628 gen_qc_feature( | |
629 gene.location.start, | |
630 gene.location.end, | |
631 "Mismatched number of exons and CDSs in gff3 representation", | |
632 strand=gene.strand, | |
633 id_src=gene, | |
634 type_src="gene" | |
635 ) | |
636 ) | |
637 bad += 1 | |
638 else: | |
639 for (exon, cds) in zip( | |
640 sorted(exons, key=lambda x: x.location.start), | |
641 sorted(CDSs, key=lambda x: x.location.start), | |
642 ): | |
643 if len(exon) != len(cds): | |
644 results.append( | |
645 ( | |
646 get_gff3_id(gene), | |
647 exon, | |
648 cds, | |
649 "CDS does not extend to full length of gene", | |
650 ) | |
651 ) | |
652 qc_features.append( | |
653 gen_qc_feature( | |
654 exon.location.start, | |
655 exon.location.end, | |
656 "CDS does not extend to full length of gene", | |
657 strand=exon.strand, | |
658 id_src=gene, | |
659 type_src="CDS" | |
660 ) | |
661 ) | |
662 bad += 1 | |
663 else: | |
664 good += 1 | |
665 else: | |
666 log.warn("Could not handle %s, %s", exons, CDSs) | |
667 results.append( | |
668 ( | |
669 get_gff3_id(gene), | |
670 None, | |
671 None, | |
672 "{0} exons, {1} CDSs".format(len(exons), len(CDSs)), | |
673 ) | |
674 ) | |
675 | |
676 return good, len(results) + bad, results, qc_features | |
677 | |
678 | |
679 def weird_starts(record): | |
680 """Find features without product | |
681 """ | |
682 good = 0 | |
683 bad = 0 | |
684 qc_features = [] | |
685 results = [] | |
686 | |
687 overall = {} | |
688 for gene in coding_genes(record.features): | |
689 seq = [x for x in genes(gene.sub_features, feature_type="CDS")] | |
690 if len(seq) == 0: | |
691 log.warn("No CDS for gene %s", get_gff3_id(gene)) | |
692 continue | |
693 else: | |
694 seq = seq[0] | |
695 | |
696 seq_str = str(seq.extract(record.seq)) | |
697 start_codon = seq_str[0:3] | |
698 if len(seq_str) < 3: | |
699 sys.stderr.write("Fatal Error: CDS of length less than 3 at " + str(seq.location) + '\n') | |
700 exit(2) | |
701 # if len(seq_str) % 3 != 0: | |
702 # if len(seq_str) < 3: | |
703 # stop_codon = seq_str[-(len(seq_str))] | |
704 # else: | |
705 # stop_codon = seq_str[-3] | |
706 # | |
707 # log.warn("CDS at %s length is not a multiple of three (Length = %d)", get_gff3_id(gene), len(seq_str)) | |
708 # seq.__error = "Bad CDS Length" | |
709 # results.append(seq) | |
710 # qc_features.append( | |
711 # gen_qc_feature( | |
712 # s, e, "Bad Length", strand=seq.strand, id_src=gene | |
713 # ) | |
714 # ) | |
715 # bad += 1 | |
716 # seq.__start = start_codon | |
717 # seq.__stop = stop_codon | |
718 # continue | |
719 | |
720 stop_codon = seq_str[-3] | |
721 seq.__start = start_codon | |
722 seq.__stop = stop_codon | |
723 if start_codon not in overall: | |
724 overall[start_codon] = 1 | |
725 else: | |
726 overall[start_codon] += 1 | |
727 | |
728 if start_codon not in ("ATG", "TTG", "GTG"): | |
729 log.warn("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene)) | |
730 seq.__error = "Unusual start codon %s" % start_codon | |
731 | |
732 s = 0 | |
733 e = 0 | |
734 if seq.strand > 0: | |
735 s = seq.location.start | |
736 e = seq.location.start + 3 | |
737 else: | |
738 s = seq.location.end | |
739 e = seq.location.end - 3 | |
740 | |
741 results.append(seq) | |
742 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand) | |
743 qc_features.append( | |
744 gen_qc_feature( | |
745 s, e, "Weird start codon", strand=seq.strand, id_src=gene, type_src="gene" | |
746 ) | |
747 ) | |
748 bad += 1 | |
749 else: | |
750 good += 1 | |
751 | |
752 return good, bad, results, qc_features, overall | |
753 | |
754 | |
755 def missing_genes(record): | |
756 """Find features without product | |
757 """ | |
758 results = [] | |
759 good = 0 | |
760 bad = 0 | |
761 qc_features = [] | |
762 | |
763 for gene in coding_genes(record.features): | |
764 if gene.qualifiers.get("cpt_source", [None])[0] == "CPT_GENE_MODEL_CORRECTION": | |
765 results.append(gene) | |
766 bad += 1 | |
767 else: | |
768 good += 1 | |
769 | |
770 return good, bad, results, qc_features | |
771 | |
772 | |
773 def gene_model_correction_issues(record): | |
774 """Find features that have issues from the gene model correction step. | |
775 These have qualifiers beginning with CPT_GMS | |
776 """ | |
777 results = [] | |
778 good = 0 | |
779 bad = 0 | |
780 qc_features = [] | |
781 | |
782 # For each gene | |
783 for gene in coding_genes(record.features): | |
784 # Get the list of child CDSs | |
785 cdss = [x for x in genes(gene.sub_features, feature_type="CDS")] | |
786 # And our matching qualifiers | |
787 gene_data = [(k, v) for (k, v) in gene.qualifiers.items() if k == "cpt_gmc"] | |
788 # If there are problems with ONLY the parent, let's complain | |
789 local_results = [] | |
790 local_qc_features = [] | |
791 for x in gene_data: | |
792 if "Missing Locus Tag" in x[1]: | |
793 # Missing locus tag is an either or thing, if it hits here | |
794 # there shouldn't be anything else wrong with it. | |
795 | |
796 # Obviously missing so we remove it | |
797 gene.qualifiers["locus_tag"] = [""] | |
798 # Translation from bp_genbank2gff3.py | |
799 cdss[0].qualifiers["locus_tag"] = cdss[0].qualifiers["Name"] | |
800 # Append our results | |
801 local_results.append((gene, cdss[0], "Gene is missing a locus_tag")) | |
802 local_qc_features.append( | |
803 gen_qc_feature( | |
804 gene.location.start, | |
805 gene.location.end, | |
806 "Gene is missing a locus_tag", | |
807 strand=gene.strand, | |
808 type_src="gene" | |
809 ) | |
810 ) | |
811 | |
812 # We need to alert on any child issues as well. | |
813 for cds in cdss: | |
814 cds_data = [ | |
815 (k, v[0]) for (k, v) in cds.qualifiers.items() if k == "cpt_gmc" | |
816 ] | |
817 if len(gene_data) == 0 and len(cds_data) == 0: | |
818 # Alles gut | |
819 pass | |
820 else: | |
821 for _, problem in cds_data: | |
822 if problem == "BOTH Missing Locus Tag": | |
823 gene.qualifiers["locus_tag"] = [""] | |
824 cds.qualifiers["locus_tag"] = [""] | |
825 local_results.append( | |
826 (gene, cds, "Both gene and CDS are missing locus tags") | |
827 ) | |
828 local_qc_features.append( | |
829 gen_qc_feature( | |
830 cds.location.start, | |
831 cds.location.end, | |
832 "CDS is missing a locus_tag", | |
833 strand=cds.strand, | |
834 type_src="CDS" | |
835 ) | |
836 ) | |
837 local_qc_features.append( | |
838 gen_qc_feature( | |
839 gene.location.start, | |
840 gene.location.end, | |
841 "Gene is missing a locus_tag", | |
842 strand=gene.strand, | |
843 type_src="gene" | |
844 ) | |
845 ) | |
846 elif problem == "Different locus tag from associated gene.": | |
847 gene.qualifiers["locus_tag"] = gene.qualifiers["Name"] | |
848 cds.qualifiers["locus_tag"] = cds.qualifiers["cpt_gmc_locus"] | |
849 local_results.append( | |
850 (gene, cds, "Gene and CDS have differing locus tags") | |
851 ) | |
852 local_qc_features.append( | |
853 gen_qc_feature( | |
854 gene.location.start, | |
855 gene.location.end, | |
856 "Gene and CDS have differing locus tags", | |
857 strand=gene.strand, | |
858 type_src="gene" | |
859 ) | |
860 ) | |
861 elif problem == "Missing Locus Tag": | |
862 # Copy this over | |
863 gene.qualifiers["locus_tag"] = gene.qualifiers["Name"] | |
864 # This one is missing | |
865 cds.qualifiers["locus_tag"] = [""] | |
866 local_results.append((gene, cds, "CDS is missing a locus_tag")) | |
867 local_qc_features.append( | |
868 gen_qc_feature( | |
869 cds.location.start, | |
870 cds.location.end, | |
871 "CDS is missing a locus_tag", | |
872 strand=cds.strand, | |
873 type_src="CDS" | |
874 ) | |
875 ) | |
876 else: | |
877 log.warn("Cannot handle %s", problem) | |
878 | |
879 if len(local_results) > 0: | |
880 bad += 1 | |
881 else: | |
882 good += 1 | |
883 | |
884 qc_features.extend(local_qc_features) | |
885 results.extend(local_results) | |
886 return good, bad, results, qc_features | |
887 | |
888 | |
889 def missing_tags(record): | |
890 """Find features without product | |
891 """ | |
892 results = [] | |
893 good = 0 | |
894 bad = 0 | |
895 qc_features = [] | |
896 | |
897 for gene in coding_genes(record.features): | |
898 cds = [x for x in genes(gene.sub_features, feature_type="CDS")] | |
899 if len(cds) == 0: | |
900 log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene)) | |
901 continue | |
902 | |
903 cds = cds[0] | |
904 | |
905 if "product" not in cds.qualifiers: | |
906 log.info("Missing product tag on %s", get_gff3_id(gene)) | |
907 qc_features.append( | |
908 gen_qc_feature( | |
909 cds.location.start, | |
910 cds.location.end, | |
911 "Missing product tag", | |
912 strand=cds.strand, | |
913 type_src="CDS" | |
914 ) | |
915 ) | |
916 results.append(cds) | |
917 bad += 1 | |
918 else: | |
919 good += 1 | |
920 | |
921 return good, bad, results, qc_features | |
922 | |
923 | |
924 def evaluate_and_report( | |
925 annotations, | |
926 genome, | |
927 gff3=None, | |
928 tbl=None, | |
929 sd_min=5, | |
930 sd_max=15, | |
931 min_gene_length=30, | |
932 excessive_gap_dist=50, | |
933 excessive_gap_divergent_dist=200, | |
934 excessive_overlap_dist=25, | |
935 excessive_overlap_divergent_dist=50, | |
936 reportTemplateName="phage_annotation_validator.html", | |
937 ): | |
938 """ | |
939 Generate our HTML evaluation of the genome | |
940 """ | |
941 # Get features from GFF file | |
942 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta")) | |
943 # Get the first GFF3 record | |
944 # TODO: support multiple GFF3 files. | |
945 mostFeat = 0 | |
946 for rec in list(gffParse(annotations, base_dict=seq_dict)): | |
947 if len(rec.features) > mostFeat: | |
948 mostFeat = len(rec.features) | |
949 record = rec | |
950 | |
951 gff3_qc_record = SeqRecord(record.id, id=record.id) | |
952 gff3_qc_record.features = [] | |
953 gff3_qc_features = [] | |
954 | |
955 log.info("Locating missing RBSs") | |
956 # mb_any = "did they annotate ANY rbss? if so, take off from score." | |
957 mb_good, mb_bad, mb_results, mb_annotations, mb_any = missing_rbs( | |
958 record, lookahead_min=sd_min, lookahead_max=sd_max | |
959 ) | |
960 gff3_qc_features += mb_annotations | |
961 | |
962 log.info("Locating excessive gaps") | |
963 eg_good, eg_bad, eg_results, eg_annotations = excessive_gap( | |
964 record, | |
965 excess=excessive_gap_dist, | |
966 excess_divergent=excessive_gap_divergent_dist, | |
967 min_gene=min_gene_length, | |
968 slop=excessive_overlap_dist, | |
969 lookahead_min=sd_min, | |
970 lookahead_max=sd_max, | |
971 ) | |
972 gff3_qc_features += eg_annotations | |
973 | |
974 log.info("Locating excessive overlaps") | |
975 eo_good, eo_bad, eo_results, eo_annotations = excessive_overlap( | |
976 record, | |
977 excess=excessive_overlap_dist, | |
978 excess_divergent=excessive_overlap_divergent_dist, | |
979 ) | |
980 gff3_qc_features += eo_annotations | |
981 | |
982 log.info("Locating morons") | |
983 mo_good, mo_bad, mo_results, mo_annotations = find_morons(record) | |
984 gff3_qc_features += mo_annotations | |
985 | |
986 log.info("Locating missing tags") | |
987 mt_good, mt_bad, mt_results, mt_annotations = missing_tags(record) | |
988 gff3_qc_features += mt_annotations | |
989 | |
990 log.info("Locating missing gene features") | |
991 mg_good, mg_bad, mg_results, mg_annotations = missing_genes(record) | |
992 gff3_qc_features += mg_annotations | |
993 | |
994 log.info("Determining coding density") | |
995 cd, cd_real = coding_density(record) | |
996 | |
997 log.info("Locating weird starts") | |
998 ws_good, ws_bad, ws_results, ws_annotations, ws_overall = weird_starts(record) | |
999 gff3_qc_features += ws_annotations | |
1000 | |
1001 log.info("Locating bad gene models") | |
1002 gm_good, gm_bad, gm_results, gm_annotations = bad_gene_model(record) | |
1003 if gm_good + gm_bad == 0: | |
1004 gm_bad = 1 | |
1005 | |
1006 log.info("Locating more bad gene models") | |
1007 gmc_good, gmc_bad, gmc_results, gmc_annotations = gene_model_correction_issues( | |
1008 record | |
1009 ) | |
1010 if gmc_good + gmc_bad == 0: | |
1011 gmc_bad = 1 | |
1012 | |
1013 good_scores = [eg_good, eo_good, mt_good, ws_good, gm_good, gmc_good] | |
1014 bad_scores = [eg_bad, eo_bad, mt_bad, ws_bad, gm_bad, gmc_bad] | |
1015 | |
1016 # Only if they tried to annotate RBSs do we consider them. | |
1017 if mb_any: | |
1018 good_scores.append(mb_good) | |
1019 bad_scores.append(mb_bad) | |
1020 subscores = [] | |
1021 | |
1022 for (g, b) in zip(good_scores, bad_scores): | |
1023 if g + b == 0: | |
1024 s = 0 | |
1025 else: | |
1026 s = int(100 * float(g) / (float(b) + float(g))) | |
1027 subscores.append(s) | |
1028 subscores.append(cd) | |
1029 | |
1030 score = int(float(sum(subscores)) / float(len(subscores))) | |
1031 | |
1032 # This is data that will go into our HTML template | |
1033 kwargs = { | |
1034 "upstream_min": sd_min, | |
1035 "upstream_max": sd_max, | |
1036 "record_name": record.id, | |
1037 "record_nice_name": nice_name(record), | |
1038 "params": { | |
1039 "sd_min": sd_min, | |
1040 "sd_max": sd_max, | |
1041 "min_gene_length": min_gene_length, | |
1042 "excessive_gap_dist": excessive_gap_dist, | |
1043 "excessive_gap_divergent_dist": excessive_gap_divergent_dist, | |
1044 "excessive_overlap_dist": excessive_overlap_dist, | |
1045 "excessive_overlap_divergent_dist": excessive_overlap_divergent_dist, | |
1046 }, | |
1047 "score": score, | |
1048 "encouragement": get_encouragement(score), | |
1049 "genome_overview": genome_overview(record), | |
1050 "rbss_annotated": mb_any, | |
1051 "missing_rbs": mb_results, | |
1052 "missing_rbs_good": mb_good, | |
1053 "missing_rbs_bad": mb_bad, | |
1054 "missing_rbs_score": 0 | |
1055 if mb_good + mb_bad == 0 | |
1056 else (100 * mb_good / (mb_good + mb_bad)), | |
1057 "excessive_gap": eg_results, | |
1058 "excessive_gap_good": eg_good, | |
1059 "excessive_gap_bad": eg_bad, | |
1060 "excessive_gap_score": 0 | |
1061 if eo_good + eo_bad == 0 | |
1062 else (100 * eo_good / (eo_good + eo_bad)), | |
1063 "excessive_overlap": eo_results, | |
1064 "excessive_overlap_good": eo_good, | |
1065 "excessive_overlap_bad": eo_bad, | |
1066 "excessive_overlap_score": 0 | |
1067 if eo_good + eo_bad == 0 | |
1068 else (100 * eo_good / (eo_good + eo_bad)), | |
1069 "morons": mo_results, | |
1070 "morons_good": mo_good, | |
1071 "morons_bad": mo_bad, | |
1072 "morons_score": 0 | |
1073 if mo_good + mo_bad == 0 | |
1074 else (100 * mo_good / (mo_good + mo_bad)), | |
1075 "missing_tags": mt_results, | |
1076 "missing_tags_good": mt_good, | |
1077 "missing_tags_bad": mt_bad, | |
1078 "missing_tags_score": 0 | |
1079 if mt_good + mt_bad == 0 | |
1080 else (100 * mt_good / (mt_good + mt_bad)), | |
1081 "missing_genes": mg_results, | |
1082 "missing_genes_good": mg_good, | |
1083 "missing_genes_bad": mg_bad, | |
1084 "missing_genes_score": 0 | |
1085 if mg_good + mg_bad == 0 | |
1086 else (100 * mg_good / (mg_good + mg_bad)), | |
1087 "weird_starts": ws_results, | |
1088 "weird_starts_good": ws_good, | |
1089 "weird_starts_bad": ws_bad, | |
1090 "weird_starts_overall": ws_overall, | |
1091 "weird_starts_overall_sorted_keys": sorted( | |
1092 ws_overall, reverse=True, key=lambda x: ws_overall[x] | |
1093 ), | |
1094 "weird_starts_score": 0 | |
1095 if ws_good + ws_bad == 0 | |
1096 else (100 * ws_good / (ws_good + ws_bad)), | |
1097 "gene_model": gm_results, | |
1098 "gene_model_good": gm_good, | |
1099 "gene_model_bad": gm_bad, | |
1100 "gene_model_score": 0 | |
1101 if gm_good + gm_bad == 0 | |
1102 else (100 * gm_good / (gm_good + gm_bad)), | |
1103 "gene_model_correction": gmc_results, | |
1104 "gene_model_correction_good": gmc_good, | |
1105 "gene_model_correction_bad": gmc_bad, | |
1106 "gene_model_correction_score": 0 | |
1107 if gmc_good + gmc_bad == 0 | |
1108 else (100 * gmc_good / (gmc_good + gmc_bad)), | |
1109 "coding_density": cd, | |
1110 "coding_density_exact": exact_coding_density(record), | |
1111 "coding_density_real": cd_real, | |
1112 "coding_density_score": cd, | |
1113 } | |
1114 | |
1115 with open(tbl, "w") as handle: | |
1116 kw_subset = {} | |
1117 for key in kwargs: | |
1118 if ( | |
1119 key in ("score", "record_name") | |
1120 or "_good" in key | |
1121 or "_bad" in key | |
1122 or "_overall" in key | |
1123 ): | |
1124 kw_subset[key] = kwargs[key] | |
1125 json.dump(kw_subset, handle) | |
1126 | |
1127 with open(gff3, "w") as handle: | |
1128 gff3_qc_record.features = gff3_qc_features | |
1129 gff3_qc_record.annotations = {} | |
1130 gffWrite([gff3_qc_record], handle) | |
1131 | |
1132 def nice_strand(direction): | |
1133 # It is somehow possible for whole gffSeqFeature objects to end up in here, apparently at the gene level | |
1134 if "SeqFeature" in str(type(direction)): | |
1135 direction = direction.location.strand | |
1136 if direction > 0: | |
1137 return "→"#.decode("utf-8") | |
1138 else: | |
1139 return "←"#.decode("utf-8") | |
1140 | |
1141 def nice_strand_tex(direction): | |
1142 if "SeqFeature" in str(type(direction)): | |
1143 direction = direction.location.strand | |
1144 if direction > 0: | |
1145 return "$\\rightarrow$" | |
1146 else: | |
1147 return "$\\leftarrow$" | |
1148 | |
1149 def texify(data): | |
1150 return data.replace("_", "\\_").replace("$", "\\$") | |
1151 | |
1152 def length(data): | |
1153 return len(data) | |
1154 | |
1155 def my_encode(data): | |
1156 return str(data)#.encode("utf-8") | |
1157 | |
1158 def my_decode(data): | |
1159 # For production | |
1160 return str(data)#.decode("utf-8") | |
1161 # For local testing. No, I do not understand. | |
1162 return str(data)#.encode("utf-8")).decode("utf-8") | |
1163 | |
1164 env = Environment( | |
1165 loader=FileSystemLoader(SCRIPT_PATH), trim_blocks=True, lstrip_blocks=True | |
1166 ) | |
1167 env.filters.update( | |
1168 { | |
1169 "nice_id": get_gff3_id, | |
1170 "nice_strand": nice_strand, | |
1171 "nice_strand_tex": nice_strand_tex, | |
1172 "texify": texify, | |
1173 "length": length, | |
1174 "encode": my_encode, | |
1175 "decode": my_decode, | |
1176 } | |
1177 ) | |
1178 tpl = env.get_template(reportTemplateName) | |
1179 return tpl.render(**kwargs)#.encode("utf-8") | |
1180 | |
1181 | |
1182 if __name__ == "__main__": | |
1183 parser = argparse.ArgumentParser( | |
1184 description="rebase gff3 features against parent locations", epilog="" | |
1185 ) | |
1186 parser.add_argument( | |
1187 "annotations", type=argparse.FileType("r"), help="Parent GFF3 annotations" | |
1188 ) | |
1189 parser.add_argument("genome", type=argparse.FileType("r"), help="Genome Sequence") | |
1190 parser.add_argument( | |
1191 "--gff3", type=str, help="GFF3 Annotations", default="qc_annotations.gff3" | |
1192 ) | |
1193 parser.add_argument( | |
1194 "--tbl", | |
1195 type=str, | |
1196 help="Table for noninteractive parsing", | |
1197 default="qc_results.json", | |
1198 ) | |
1199 | |
1200 parser.add_argument( | |
1201 "--sd_min", | |
1202 type=int, | |
1203 help="Minimum distance from gene start for an SD to be", | |
1204 default=5, | |
1205 ) | |
1206 parser.add_argument( | |
1207 "--sd_max", | |
1208 type=int, | |
1209 help="Maximum distance from gene start for an SD to be", | |
1210 default=15, | |
1211 ) | |
1212 | |
1213 parser.add_argument( | |
1214 "--min_gene_length", | |
1215 type=int, | |
1216 help="Minimum length for a putative gene call (AAs)", | |
1217 default=30, | |
1218 ) | |
1219 | |
1220 parser.add_argument( | |
1221 "--excessive_overlap_dist", | |
1222 type=int, | |
1223 help="Excessive overlap for genes in same direction", | |
1224 default=25, | |
1225 ) | |
1226 parser.add_argument( | |
1227 "--excessive_overlap_divergent_dist", | |
1228 type=int, | |
1229 help="Excessive overlap for genes in diff directions", | |
1230 default=50, | |
1231 ) | |
1232 | |
1233 parser.add_argument( | |
1234 "--excessive_gap_dist", | |
1235 type=int, | |
1236 help="Maximum distance between two genes", | |
1237 default=40, | |
1238 ) | |
1239 parser.add_argument( | |
1240 "--excessive_gap_divergent_dist", | |
1241 type=int, | |
1242 help="Maximum distance between two divergent genes", | |
1243 default=200, | |
1244 ) | |
1245 | |
1246 parser.add_argument( | |
1247 "--reportTemplateName", | |
1248 help="Report template file name", | |
1249 default="phageqc_report_full.html", | |
1250 ) | |
1251 | |
1252 args = parser.parse_args() | |
1253 | |
1254 sys.stdout.write(evaluate_and_report(**vars(args))) |