annotate cpt_phageqc_annotation/phage_annotation_validator.py @ 0:c3140b08d703 draft default tip

Uploaded
author cpt
date Fri, 17 Jun 2022 13:00:50 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
c3140b08d703 Uploaded
cpt
parents:
diff changeset
2 # -*- coding: utf-8 -*-
c3140b08d703 Uploaded
cpt
parents:
diff changeset
3 # vim: set fileencoding=utf-8
c3140b08d703 Uploaded
cpt
parents:
diff changeset
4 import os
c3140b08d703 Uploaded
cpt
parents:
diff changeset
5 import sys
c3140b08d703 Uploaded
cpt
parents:
diff changeset
6 import json
c3140b08d703 Uploaded
cpt
parents:
diff changeset
7 import math
c3140b08d703 Uploaded
cpt
parents:
diff changeset
8 import numpy
c3140b08d703 Uploaded
cpt
parents:
diff changeset
9 import argparse
c3140b08d703 Uploaded
cpt
parents:
diff changeset
10 import itertools
c3140b08d703 Uploaded
cpt
parents:
diff changeset
11 import logging
c3140b08d703 Uploaded
cpt
parents:
diff changeset
12 from gff3 import (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
13 feature_lambda,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
14 coding_genes,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
15 genes,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
16 get_gff3_id,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
17 feature_test_location,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
18 get_rbs_from,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
19 nice_name,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
20 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
21 from shinefind import NaiveSDCaller
c3140b08d703 Uploaded
cpt
parents:
diff changeset
22 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
c3140b08d703 Uploaded
cpt
parents:
diff changeset
23 from Bio import SeqIO
c3140b08d703 Uploaded
cpt
parents:
diff changeset
24 from Bio.SeqRecord import SeqRecord
c3140b08d703 Uploaded
cpt
parents:
diff changeset
25 from Bio.SeqFeature import SeqFeature, FeatureLocation
c3140b08d703 Uploaded
cpt
parents:
diff changeset
26 from jinja2 import Environment, FileSystemLoader
c3140b08d703 Uploaded
cpt
parents:
diff changeset
27 from cpt import MGAFinder
c3140b08d703 Uploaded
cpt
parents:
diff changeset
28
c3140b08d703 Uploaded
cpt
parents:
diff changeset
29 logging.basicConfig(level=logging.DEBUG)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
30 log = logging.getLogger(name="pav")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
31
c3140b08d703 Uploaded
cpt
parents:
diff changeset
32 # Path to script, required because of Galaxy.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
33 SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
34 # Path to the HTML template for the report
c3140b08d703 Uploaded
cpt
parents:
diff changeset
35
c3140b08d703 Uploaded
cpt
parents:
diff changeset
36 ENCOURAGEMENT = (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
37 (100, "Perfection itself!"),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
38 (90, "Amazing!"),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
39 (80, "Not too bad, a few minor things to fix..."),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
40 (70, "Some issues to address"),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
41 (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
42 50,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
43 """Issues detected! </p><p class="text-muted">Have you heard of the
c3140b08d703 Uploaded
cpt
parents:
diff changeset
44 <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation
c3140b08d703 Uploaded
cpt
parents:
diff changeset
45 Pipeline?""",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
46 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
47 (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
48 0,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
49 """<b>MAJOR</b> issues detected! Please consider using the
c3140b08d703 Uploaded
cpt
parents:
diff changeset
50 <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation Pipeline""",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
51 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
52 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
53
c3140b08d703 Uploaded
cpt
parents:
diff changeset
54
c3140b08d703 Uploaded
cpt
parents:
diff changeset
55 def gen_qc_feature(start, end, message, strand=0, id_src=None, type_src="gene"):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
56 kwargs = {"qualifiers": {"note": [message]}}
c3140b08d703 Uploaded
cpt
parents:
diff changeset
57 kwargs["type"] = type_src
c3140b08d703 Uploaded
cpt
parents:
diff changeset
58 kwargs["strand"] = strand
c3140b08d703 Uploaded
cpt
parents:
diff changeset
59 kwargs["phase"]=0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
60 kwargs["score"]=0.0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
61 kwargs["source"]="feature"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
62 if id_src is not None:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
63 kwargs["id"] = id_src.id
c3140b08d703 Uploaded
cpt
parents:
diff changeset
64 kwargs["qualifiers"]["ID"] = [id_src.id]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
65 kwargs["qualifiers"]["Name"] = id_src.qualifiers.get("Name", [])
c3140b08d703 Uploaded
cpt
parents:
diff changeset
66
c3140b08d703 Uploaded
cpt
parents:
diff changeset
67
c3140b08d703 Uploaded
cpt
parents:
diff changeset
68 if end >= start:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
69 return gffSeqFeature(FeatureLocation(start, end, strand=strand), **kwargs)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
70 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
71 return gffSeqFeature(FeatureLocation(end, start, strand=strand), **kwargs)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
72
c3140b08d703 Uploaded
cpt
parents:
diff changeset
73
c3140b08d703 Uploaded
cpt
parents:
diff changeset
74 def __ensure_location_in_bounds(start=0, end=0, parent_length=0):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
75 # This prevents frameshift errors
c3140b08d703 Uploaded
cpt
parents:
diff changeset
76 while start < 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
77 start += 3
c3140b08d703 Uploaded
cpt
parents:
diff changeset
78 while end < 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
79 end += 3
c3140b08d703 Uploaded
cpt
parents:
diff changeset
80 while start > parent_length:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
81 start -= 3
c3140b08d703 Uploaded
cpt
parents:
diff changeset
82 while end > parent_length:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
83 end -= 3
c3140b08d703 Uploaded
cpt
parents:
diff changeset
84 return (start, end)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
85
c3140b08d703 Uploaded
cpt
parents:
diff changeset
86
c3140b08d703 Uploaded
cpt
parents:
diff changeset
87 def missing_rbs(record, lookahead_min=5, lookahead_max=15):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
88 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
89 Identify gene features with missing RBSs
c3140b08d703 Uploaded
cpt
parents:
diff changeset
90
c3140b08d703 Uploaded
cpt
parents:
diff changeset
91 This "looks ahead" 5-15 bases ahead of each gene feature, and checks if
c3140b08d703 Uploaded
cpt
parents:
diff changeset
92 there's an RBS feature in those bounds.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
93
c3140b08d703 Uploaded
cpt
parents:
diff changeset
94 The returned data is a set of genes with the RBS sequence in the __upstream
c3140b08d703 Uploaded
cpt
parents:
diff changeset
95 attribute, and a message in the __message attribute.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
96 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
97 results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
98 good = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
99 bad = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
100 qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
101 sd_finder = NaiveSDCaller()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
102
c3140b08d703 Uploaded
cpt
parents:
diff changeset
103 any_rbss = False
c3140b08d703 Uploaded
cpt
parents:
diff changeset
104
c3140b08d703 Uploaded
cpt
parents:
diff changeset
105 for gene in coding_genes(record.features):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
106 # Check if there are RBSs, TODO: make this recursive. Each feature in
c3140b08d703 Uploaded
cpt
parents:
diff changeset
107 # gene.sub_features can also have sub_features.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
108 rbss = get_rbs_from(gene)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
109 # No RBS found
c3140b08d703 Uploaded
cpt
parents:
diff changeset
110 if len(rbss) == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
111 # Get the sequence lookahead_min to lookahead_max upstream
c3140b08d703 Uploaded
cpt
parents:
diff changeset
112 if gene.strand > 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
113 start = gene.location.start - lookahead_max
c3140b08d703 Uploaded
cpt
parents:
diff changeset
114 end = gene.location.start - lookahead_min
c3140b08d703 Uploaded
cpt
parents:
diff changeset
115 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
116 start = gene.location.end + lookahead_min
c3140b08d703 Uploaded
cpt
parents:
diff changeset
117 end = gene.location.end + lookahead_max
c3140b08d703 Uploaded
cpt
parents:
diff changeset
118 # We have to ensure the feature is ON the genome, otherwise we may
c3140b08d703 Uploaded
cpt
parents:
diff changeset
119 # be trying to access a location outside of the length of the
c3140b08d703 Uploaded
cpt
parents:
diff changeset
120 # genome, which would be bad.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
121 (start, end) = __ensure_location_in_bounds(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
122 start=start, end=end, parent_length=len(record)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
123 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
124 # Temporary feature to extract sequence
c3140b08d703 Uploaded
cpt
parents:
diff changeset
125 tmp = gffSeqFeature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
126 FeatureLocation(start, end, strand=gene.strand), type="domain"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
127 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
128 # Get the sequence
c3140b08d703 Uploaded
cpt
parents:
diff changeset
129 seq = str(tmp.extract(record.seq))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
130 # Set the default properties
c3140b08d703 Uploaded
cpt
parents:
diff changeset
131 gene.__upstream = seq.lower()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
132 gene.__message = "No RBS annotated, None found"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
133
c3140b08d703 Uploaded
cpt
parents:
diff changeset
134 # Try and do an automated shinefind call
c3140b08d703 Uploaded
cpt
parents:
diff changeset
135 sds = sd_finder.list_sds(seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
136 if len(sds) > 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
137 sd = sds[0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
138 gene.__upstream = sd_finder.highlight_sd(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
139 seq.lower(), sd["start"], sd["end"]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
140 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
141 gene.__message = "Unannotated but valid RBS"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
142
c3140b08d703 Uploaded
cpt
parents:
diff changeset
143 qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
144 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
145 start, end, "Missing RBS", strand=gene.strand, id_src=gene, type_src="gene"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
146 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
147 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
148
c3140b08d703 Uploaded
cpt
parents:
diff changeset
149 bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
150 results.append(gene)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
151 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
152 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
153 if len(rbss) > 1:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
154 log.warn("%s RBSs found for gene %s", rbss[0].id, get_gff3_id(gene))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
155 any_rbss = True
c3140b08d703 Uploaded
cpt
parents:
diff changeset
156 # get first RBS/CDS
c3140b08d703 Uploaded
cpt
parents:
diff changeset
157 cds = list(genes(gene.sub_features, feature_type="CDS"))[0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
158 rbs = rbss[0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
159
c3140b08d703 Uploaded
cpt
parents:
diff changeset
160 # Get the distance between the two
c3140b08d703 Uploaded
cpt
parents:
diff changeset
161 if gene.strand > 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
162 distance = cds.location.start - rbs.location.end
c3140b08d703 Uploaded
cpt
parents:
diff changeset
163 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
164 distance = rbs.location.start - cds.location.end
c3140b08d703 Uploaded
cpt
parents:
diff changeset
165
c3140b08d703 Uploaded
cpt
parents:
diff changeset
166 # If the RBS is too far away, annotate that
c3140b08d703 Uploaded
cpt
parents:
diff changeset
167 if distance > lookahead_max:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
168 gene.__message = "RBS too far away (%s nt)" % distance
c3140b08d703 Uploaded
cpt
parents:
diff changeset
169
c3140b08d703 Uploaded
cpt
parents:
diff changeset
170 qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
171 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
172 rbs.location.start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
173 rbs.location.end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
174 gene.__message,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
175 strand=gene.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
176 id_src=gene,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
177 type_src="gene"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
178 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
179 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
180
c3140b08d703 Uploaded
cpt
parents:
diff changeset
181 bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
182 results.append(gene)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
183 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
184 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
185 good += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
186
c3140b08d703 Uploaded
cpt
parents:
diff changeset
187 return good, bad, results, qc_features, any_rbss
c3140b08d703 Uploaded
cpt
parents:
diff changeset
188
c3140b08d703 Uploaded
cpt
parents:
diff changeset
189
c3140b08d703 Uploaded
cpt
parents:
diff changeset
190 # modified from get_orfs_or_cdss.py
c3140b08d703 Uploaded
cpt
parents:
diff changeset
191 # -----------------------------------------------------------
c3140b08d703 Uploaded
cpt
parents:
diff changeset
192
c3140b08d703 Uploaded
cpt
parents:
diff changeset
193
c3140b08d703 Uploaded
cpt
parents:
diff changeset
194 def require_sd(data, record, chrom_start, sd_min, sd_max):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
195 sd_finder = NaiveSDCaller()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
196 for putative_gene in data:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
197 if putative_gene[2] > 0: # strand
c3140b08d703 Uploaded
cpt
parents:
diff changeset
198 start = chrom_start + putative_gene[0] - sd_max
c3140b08d703 Uploaded
cpt
parents:
diff changeset
199 end = chrom_start + putative_gene[0] - sd_min
c3140b08d703 Uploaded
cpt
parents:
diff changeset
200 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
201 start = chrom_start + putative_gene[1] + sd_min
c3140b08d703 Uploaded
cpt
parents:
diff changeset
202 end = chrom_start + putative_gene[1] + sd_max
c3140b08d703 Uploaded
cpt
parents:
diff changeset
203
c3140b08d703 Uploaded
cpt
parents:
diff changeset
204 (start, end) = __ensure_location_in_bounds(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
205 start=start, end=end, parent_length=len(record)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
206 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
207 tmp = gffSeqFeature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
208 FeatureLocation(start, end, strand=putative_gene[2]), type="domain"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
209 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
210 # Get the sequence
c3140b08d703 Uploaded
cpt
parents:
diff changeset
211 seq = str(tmp.extract(record.seq))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
212 sds = sd_finder.list_sds(seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
213 if len(sds) > 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
214 yield putative_gene + (start, end)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
215
c3140b08d703 Uploaded
cpt
parents:
diff changeset
216
c3140b08d703 Uploaded
cpt
parents:
diff changeset
217 def excessive_gap(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
218 record,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
219 excess=50,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
220 excess_divergent=200,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
221 min_gene=30,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
222 slop=30,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
223 lookahead_min=5,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
224 lookahead_max=15,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
225 ):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
226 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
227 Identify excessive gaps between gene features.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
228
c3140b08d703 Uploaded
cpt
parents:
diff changeset
229 Default "excessive" gap size is 10, but that should likely be larger.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
230 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
231 results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
232 good = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
233 bad = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
234
c3140b08d703 Uploaded
cpt
parents:
diff changeset
235 contiguous_regions = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
236
c3140b08d703 Uploaded
cpt
parents:
diff changeset
237 sorted_genes = sorted(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
238 genes(record.features), key=lambda feature: feature.location.start
c3140b08d703 Uploaded
cpt
parents:
diff changeset
239 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
240 if len(sorted_genes) == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
241 log.warn("NO GENES FOUND")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
242 return good, bad, results, []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
243
c3140b08d703 Uploaded
cpt
parents:
diff changeset
244 current_gene = None
c3140b08d703 Uploaded
cpt
parents:
diff changeset
245 for gene in sorted_genes:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
246 # If the gene's start is contiguous to the "current_gene", then we
c3140b08d703 Uploaded
cpt
parents:
diff changeset
247 # extend current_gene
c3140b08d703 Uploaded
cpt
parents:
diff changeset
248 for cds in genes(gene.sub_features, feature_type="CDS"):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
249 if current_gene is None:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
250 current_gene = [int(cds.location.start), int(cds.location.end)]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
251
c3140b08d703 Uploaded
cpt
parents:
diff changeset
252 if cds.location.start <= current_gene[1] + excess:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
253 # Don't want to decrease size
c3140b08d703 Uploaded
cpt
parents:
diff changeset
254 if int(cds.location.end) >= current_gene[1]:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
255 current_gene[1] = int(cds.location.end)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
256 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
257 # If it's discontiguous, we append the region and clear.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
258 contiguous_regions.append(current_gene)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
259 current_gene = [int(cds.location.start), int(cds.location.end)]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
260
c3140b08d703 Uploaded
cpt
parents:
diff changeset
261 # This generally expected that annotations would NOT continue unto the end
c3140b08d703 Uploaded
cpt
parents:
diff changeset
262 # of the genome, however that's a bug, and we can make it here with an
c3140b08d703 Uploaded
cpt
parents:
diff changeset
263 # empty contiguous_regions list
c3140b08d703 Uploaded
cpt
parents:
diff changeset
264 contiguous_regions.append(current_gene)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
265
c3140b08d703 Uploaded
cpt
parents:
diff changeset
266 for i in range(len(contiguous_regions) + 1):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
267 if i == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
268 a = (1, 1)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
269 b = contiguous_regions[i]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
270 elif i >= len(contiguous_regions):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
271 a = contiguous_regions[i - 1]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
272 b = (len(record.seq), None)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
273 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
274 a = contiguous_regions[i - 1]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
275 b = contiguous_regions[i]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
276
c3140b08d703 Uploaded
cpt
parents:
diff changeset
277 gap_size = abs(b[0] - a[1])
c3140b08d703 Uploaded
cpt
parents:
diff changeset
278
c3140b08d703 Uploaded
cpt
parents:
diff changeset
279 if gap_size > min(excess, excess_divergent):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
280 a_feat_l = itertools.islice(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
281 feature_lambda(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
282 sorted_genes,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
283 feature_test_location,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
284 {"loc": a[1]},
c3140b08d703 Uploaded
cpt
parents:
diff changeset
285 subfeatures=False,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
286 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
287 1,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
288 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
289 b_feat_l = itertools.islice(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
290 feature_lambda(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
291 sorted_genes,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
292 feature_test_location,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
293 {"loc": b[0]},
c3140b08d703 Uploaded
cpt
parents:
diff changeset
294 subfeatures=False,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
295 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
296 1,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
297 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
298
c3140b08d703 Uploaded
cpt
parents:
diff changeset
299 try:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
300 a_feat = next(a_feat_l)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
301 except StopIteration:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
302 # Triggers on end of genome
c3140b08d703 Uploaded
cpt
parents:
diff changeset
303 a_feat = None
c3140b08d703 Uploaded
cpt
parents:
diff changeset
304 try:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
305 b_feat = next(b_feat_l)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
306 except StopIteration:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
307 # Triggers on end of genome
c3140b08d703 Uploaded
cpt
parents:
diff changeset
308 b_feat = None
c3140b08d703 Uploaded
cpt
parents:
diff changeset
309
c3140b08d703 Uploaded
cpt
parents:
diff changeset
310 result_obj = [
c3140b08d703 Uploaded
cpt
parents:
diff changeset
311 a[1],
c3140b08d703 Uploaded
cpt
parents:
diff changeset
312 b[0],
c3140b08d703 Uploaded
cpt
parents:
diff changeset
313 None if not a_feat else a_feat.location.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
314 None if not b_feat else b_feat.location.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
315 ]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
316
c3140b08d703 Uploaded
cpt
parents:
diff changeset
317 if a_feat is None or b_feat is None:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
318 if gap_size > excess_divergent:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
319 results.append(result_obj)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
320 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
321 if (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
322 a_feat.location.strand == b_feat.location.strand
c3140b08d703 Uploaded
cpt
parents:
diff changeset
323 and gap_size > excess
c3140b08d703 Uploaded
cpt
parents:
diff changeset
324 ):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
325 results.append(result_obj)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
326 elif (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
327 a_feat.location.strand != b_feat.location.strand
c3140b08d703 Uploaded
cpt
parents:
diff changeset
328 and gap_size > excess_divergent
c3140b08d703 Uploaded
cpt
parents:
diff changeset
329 ):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
330 results.append(result_obj)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
331
c3140b08d703 Uploaded
cpt
parents:
diff changeset
332 better_results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
333 qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
334 of = MGAFinder(11, "CDS", "closed", min_gene)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
335 # of = OrfFinder(11, 'CDS', 'closed', min_gene)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
336
c3140b08d703 Uploaded
cpt
parents:
diff changeset
337 for result_obj in results:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
338 start = result_obj[0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
339 end = result_obj[1]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
340 f = gen_qc_feature(start, end, "Excessive gap, %s bases" % abs(end - start), type_src="gene")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
341 qc_features.append(f)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
342 putative_genes = of.putative_genes_in_sequence(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
343 str(record[start - slop : end + slop].seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
344 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
345 putative_genes = list(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
346 require_sd(putative_genes, record, start, lookahead_min, lookahead_max)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
347 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
348 for putative_gene in putative_genes:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
349 # (0, 33, 1, 'ATTATTTTATCAAAACGCTTTACAATCTTTTAG', 'MILSKRFTIF', 123123, 124324)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
350 possible_gene_start = start + putative_gene[0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
351 possible_gene_end = start + putative_gene[1]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
352
c3140b08d703 Uploaded
cpt
parents:
diff changeset
353 if possible_gene_start <= possible_gene_end:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
354 possible_cds = gffSeqFeature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
355 FeatureLocation(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
356 possible_gene_start, possible_gene_end, strand=putative_gene[2]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
357 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
358 type="CDS",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
359 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
360 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
361 possible_cds = gffSeqFeature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
362 FeatureLocation(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
363 possible_gene_end, possible_gene_start, strand=putative_gene[2],
c3140b08d703 Uploaded
cpt
parents:
diff changeset
364 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
365 type="CDS",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
366 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
367
c3140b08d703 Uploaded
cpt
parents:
diff changeset
368 # Now we adjust our boundaries for the RBS that's required
c3140b08d703 Uploaded
cpt
parents:
diff changeset
369 # There are only two cases, the rbs is upstream of it, or downstream
c3140b08d703 Uploaded
cpt
parents:
diff changeset
370 if putative_gene[5] < possible_gene_start:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
371 possible_gene_start = putative_gene[5]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
372 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
373 possible_gene_end = putative_gene[6]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
374
c3140b08d703 Uploaded
cpt
parents:
diff changeset
375 if putative_gene[5] <= putative_gene[6]:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
376 possible_rbs = gffSeqFeature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
377 FeatureLocation(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
378 putative_gene[5], putative_gene[6], strand=putative_gene[2]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
379 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
380 type="Shine_Dalgarno_sequence",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
381 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
382 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
383 possible_rbs = gffSeqFeature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
384 FeatureLocation(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
385 putative_gene[6], putative_gene[5], strand=putative_gene[2],
c3140b08d703 Uploaded
cpt
parents:
diff changeset
386 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
387 type="Shine_Dalgarno_sequence",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
388 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
389
c3140b08d703 Uploaded
cpt
parents:
diff changeset
390 if possible_gene_start <= possible_gene_end:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
391 possible_gene = gffSeqFeature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
392 FeatureLocation(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
393 possible_gene_start, possible_gene_end, strand=putative_gene[2]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
394 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
395 type="gene",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
396 qualifiers={"note": ["Possible gene"]},
c3140b08d703 Uploaded
cpt
parents:
diff changeset
397 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
398 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
399 possible_gene = gffSeqFeature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
400 FeatureLocation(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
401 possible_gene_end, possible_gene_start, strand=putative_gene[2],
c3140b08d703 Uploaded
cpt
parents:
diff changeset
402 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
403 type="gene",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
404 qualifiers={"note": ["Possible gene"]},
c3140b08d703 Uploaded
cpt
parents:
diff changeset
405 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
406 possible_gene.sub_features = [possible_rbs, possible_cds]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
407 qc_features.append(possible_gene)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
408
c3140b08d703 Uploaded
cpt
parents:
diff changeset
409 better_results.append(result_obj + [len(putative_genes)])
c3140b08d703 Uploaded
cpt
parents:
diff changeset
410
c3140b08d703 Uploaded
cpt
parents:
diff changeset
411 # Bad gaps are those with more than zero possible genes found
c3140b08d703 Uploaded
cpt
parents:
diff changeset
412 bad = len([x for x in better_results if x[2] > 0])
c3140b08d703 Uploaded
cpt
parents:
diff changeset
413 # Generally taking "good" here as every possible gap in the genome
c3140b08d703 Uploaded
cpt
parents:
diff changeset
414 # Thus, good is TOTAL - gaps
c3140b08d703 Uploaded
cpt
parents:
diff changeset
415 good = len(sorted_genes) + 1 - bad
c3140b08d703 Uploaded
cpt
parents:
diff changeset
416 # and bad is just gaps
c3140b08d703 Uploaded
cpt
parents:
diff changeset
417 return good, bad, better_results, qc_features
c3140b08d703 Uploaded
cpt
parents:
diff changeset
418
c3140b08d703 Uploaded
cpt
parents:
diff changeset
419
c3140b08d703 Uploaded
cpt
parents:
diff changeset
420 def phi(x):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
421 """Standard phi function used in calculation of normal distribution"""
c3140b08d703 Uploaded
cpt
parents:
diff changeset
422 return math.exp(-1 * math.pi * x * x)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
423
c3140b08d703 Uploaded
cpt
parents:
diff changeset
424
c3140b08d703 Uploaded
cpt
parents:
diff changeset
425 def norm(x, mean=0, sd=1):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
426 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
427 Normal distribution. Given an x position, a mean, and a standard
c3140b08d703 Uploaded
cpt
parents:
diff changeset
428 deviation, calculate the "y" value. Useful for score scaling
c3140b08d703 Uploaded
cpt
parents:
diff changeset
429
c3140b08d703 Uploaded
cpt
parents:
diff changeset
430 Modified to multiply by SD. This means even at sd=5, norm(x, mean) where x = mean => 1, rather than 1/5.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
431 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
432 return (1 / float(sd)) * phi(float(x - mean) / float(sd)) * sd
c3140b08d703 Uploaded
cpt
parents:
diff changeset
433
c3140b08d703 Uploaded
cpt
parents:
diff changeset
434
c3140b08d703 Uploaded
cpt
parents:
diff changeset
435 def coding_density(record, mean=92.5, sd=20):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
436 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
437 Find coding density in the genome
c3140b08d703 Uploaded
cpt
parents:
diff changeset
438 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
439 feature_lengths = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
440
c3140b08d703 Uploaded
cpt
parents:
diff changeset
441 for gene_a in coding_genes(record.features):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
442 feature_lengths += sum(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
443 [len(x) for x in genes(gene_a.sub_features, feature_type="CDS")]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
444 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
445
c3140b08d703 Uploaded
cpt
parents:
diff changeset
446 avgFeatLen = float(feature_lengths) / float(len(record.seq))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
447 return int(norm(100 * avgFeatLen, mean=mean, sd=sd) * 100), int(100 * avgFeatLen)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
448
c3140b08d703 Uploaded
cpt
parents:
diff changeset
449
c3140b08d703 Uploaded
cpt
parents:
diff changeset
450 def exact_coding_density(record, mean=92.5, sd=20):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
451 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
452 Find exact coding density in the genome
c3140b08d703 Uploaded
cpt
parents:
diff changeset
453 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
454 data = numpy.zeros(len(record.seq))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
455
c3140b08d703 Uploaded
cpt
parents:
diff changeset
456 for gene_a in coding_genes(record.features):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
457 for cds in genes(gene_a.sub_features, feature_type="CDS"):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
458 for i in range(cds.location.start, cds.location.end + 1):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
459 data[i - 1] = 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
460
c3140b08d703 Uploaded
cpt
parents:
diff changeset
461 return float(sum(data)) / len(data)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
462
c3140b08d703 Uploaded
cpt
parents:
diff changeset
463
c3140b08d703 Uploaded
cpt
parents:
diff changeset
464 def excessive_overlap(record, excess=15, excess_divergent=30):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
465 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
466 Find excessive overlaps in the genome, where excessive is defined as 15
c3140b08d703 Uploaded
cpt
parents:
diff changeset
467 bases for same strand, and 30 for divergent translation.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
468
c3140b08d703 Uploaded
cpt
parents:
diff changeset
469 Does a product of all the top-level features in the genome, and calculates
c3140b08d703 Uploaded
cpt
parents:
diff changeset
470 gaps.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
471 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
472 results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
473 bad = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
474 qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
475
c3140b08d703 Uploaded
cpt
parents:
diff changeset
476 for (gene_a, gene_b) in itertools.combinations(coding_genes(record.features), 2):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
477 # Get the CDS from the subfeature list.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
478 # TODO: not recursive.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
479 cds_a = [x for x in genes(gene_a.sub_features, feature_type="CDS")]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
480 cds_b = [x for x in genes(gene_b.sub_features, feature_type="CDS")]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
481
c3140b08d703 Uploaded
cpt
parents:
diff changeset
482 if len(cds_a) == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
483 log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_a))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
484 continue
c3140b08d703 Uploaded
cpt
parents:
diff changeset
485
c3140b08d703 Uploaded
cpt
parents:
diff changeset
486 if len(cds_b) == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
487 log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_b))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
488 continue
c3140b08d703 Uploaded
cpt
parents:
diff changeset
489
c3140b08d703 Uploaded
cpt
parents:
diff changeset
490 cds_a = cds_a[0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
491 cds_b = cds_b[0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
492
c3140b08d703 Uploaded
cpt
parents:
diff changeset
493 # Set of locations that are included in the CDS of A and the
c3140b08d703 Uploaded
cpt
parents:
diff changeset
494 # CDS of B
c3140b08d703 Uploaded
cpt
parents:
diff changeset
495 cas = set(range(cds_a.location.start, cds_a.location.end))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
496 cbs = set(range(cds_b.location.start, cds_b.location.end))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
497
c3140b08d703 Uploaded
cpt
parents:
diff changeset
498 # Here we calculate the intersection between the two sets, and
c3140b08d703 Uploaded
cpt
parents:
diff changeset
499 # if it's larger than our excessive size, we know that they're
c3140b08d703 Uploaded
cpt
parents:
diff changeset
500 # overlapped
c3140b08d703 Uploaded
cpt
parents:
diff changeset
501 ix = cas.intersection(cbs)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
502
c3140b08d703 Uploaded
cpt
parents:
diff changeset
503 if (cds_a.location.strand == cds_b.location.strand and len(ix) >= excess) or (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
504 cds_a.location.strand != cds_b.location.strand
c3140b08d703 Uploaded
cpt
parents:
diff changeset
505 and len(ix) >= excess_divergent
c3140b08d703 Uploaded
cpt
parents:
diff changeset
506 ):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
507 bad += float(len(ix)) / float(min(excess, excess_divergent))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
508 qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
509 gen_qc_feature(min(ix), max(ix), "Excessive Overlap", id_src=gene_a, type_src="gene")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
510 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
511 results.append((gene_a, gene_b, min(ix), max(ix)))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
512
c3140b08d703 Uploaded
cpt
parents:
diff changeset
513 # Good isn't accurate here. It's a triangle number and just ugly, but we
c3140b08d703 Uploaded
cpt
parents:
diff changeset
514 # don't care enough to fix it.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
515 good = len(list(coding_genes(record.features)))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
516 good = int(good - bad)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
517 if good < 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
518 good = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
519 return good, int(bad), results, qc_features
c3140b08d703 Uploaded
cpt
parents:
diff changeset
520
c3140b08d703 Uploaded
cpt
parents:
diff changeset
521
c3140b08d703 Uploaded
cpt
parents:
diff changeset
522 def get_encouragement(score):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
523 """Some text telling the user how they did
c3140b08d703 Uploaded
cpt
parents:
diff changeset
524 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
525 for encouragement in ENCOURAGEMENT:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
526 if score > encouragement[0]:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
527 return encouragement[1]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
528 return ENCOURAGEMENT[-1][1]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
529
c3140b08d703 Uploaded
cpt
parents:
diff changeset
530
c3140b08d703 Uploaded
cpt
parents:
diff changeset
531 def genome_overview(record):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
532 """Genome overview
c3140b08d703 Uploaded
cpt
parents:
diff changeset
533 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
534 data = {
c3140b08d703 Uploaded
cpt
parents:
diff changeset
535 "genes": {
c3140b08d703 Uploaded
cpt
parents:
diff changeset
536 "count": 0,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
537 "bases": len(record.seq),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
538 "density": 0, # genes / kb
c3140b08d703 Uploaded
cpt
parents:
diff changeset
539 "avg_len": [],
c3140b08d703 Uploaded
cpt
parents:
diff changeset
540 "comp": {"A": 0, "C": 0, "G": 0, "T": 0},
c3140b08d703 Uploaded
cpt
parents:
diff changeset
541 },
c3140b08d703 Uploaded
cpt
parents:
diff changeset
542 "overall": {
c3140b08d703 Uploaded
cpt
parents:
diff changeset
543 "comp": {
c3140b08d703 Uploaded
cpt
parents:
diff changeset
544 "A": record.seq.count("A") + record.seq.count("a"),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
545 "C": record.seq.count("C") + record.seq.count("c"),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
546 "G": record.seq.count("G") + record.seq.count("g"),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
547 "T": record.seq.count("T") + record.seq.count("t"),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
548 },
c3140b08d703 Uploaded
cpt
parents:
diff changeset
549 "gc": 0,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
550 },
c3140b08d703 Uploaded
cpt
parents:
diff changeset
551 }
c3140b08d703 Uploaded
cpt
parents:
diff changeset
552 gene_features = list(coding_genes(record.features))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
553 data["genes"]["count"] = len(gene_features)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
554
c3140b08d703 Uploaded
cpt
parents:
diff changeset
555 for feat in gene_features:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
556 data["genes"]["comp"]["A"] += feat.extract(record).seq.count("A") + feat.extract(record).seq.count("a")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
557 data["genes"]["comp"]["C"] += feat.extract(record).seq.count("C") + feat.extract(record).seq.count("c")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
558 data["genes"]["comp"]["T"] += feat.extract(record).seq.count("T") + feat.extract(record).seq.count("t")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
559 data["genes"]["comp"]["G"] += feat.extract(record).seq.count("G") + feat.extract(record).seq.count("g")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
560 #data["genes"]["bases"] += len(feat)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
561 data["genes"]["avg_len"].append(len(feat))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
562
c3140b08d703 Uploaded
cpt
parents:
diff changeset
563 data["genes"]["avg_len"] = float(sum(data["genes"]["avg_len"])) / len(gene_features)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
564 data["overall"]["gc"] = float(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
565 data["overall"]["comp"]["G"] + data["overall"]["comp"]["C"]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
566 ) / len(record.seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
567 return data
c3140b08d703 Uploaded
cpt
parents:
diff changeset
568
c3140b08d703 Uploaded
cpt
parents:
diff changeset
569
c3140b08d703 Uploaded
cpt
parents:
diff changeset
570 def find_morons(record):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
571 """Locate morons in the genome
c3140b08d703 Uploaded
cpt
parents:
diff changeset
572
c3140b08d703 Uploaded
cpt
parents:
diff changeset
573 Don't even know why...
c3140b08d703 Uploaded
cpt
parents:
diff changeset
574
c3140b08d703 Uploaded
cpt
parents:
diff changeset
575 TODO: remove? Idk.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
576 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
577 results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
578 good = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
579 bad = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
580
c3140b08d703 Uploaded
cpt
parents:
diff changeset
581 gene_features = list(coding_genes(record.features))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
582 for i, gene in enumerate(gene_features):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
583 two_left = gene_features[i - 2 : i]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
584 two_right = gene_features[i + 1 : i + 1 + 2]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
585 strands = [x.strand for x in two_left] + [x.strand for x in two_right]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
586 anticon = [x for x in strands if x != gene.strand]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
587
c3140b08d703 Uploaded
cpt
parents:
diff changeset
588 if len(anticon) == 4:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
589 has_rbs = [x.type == "Shine_Dalgarno_sequence" for x in gene.sub_features]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
590 if any(has_rbs):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
591 rbs = [
c3140b08d703 Uploaded
cpt
parents:
diff changeset
592 x for x in gene.sub_features if x.type == "Shine_Dalgarno_sequence"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
593 ][0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
594 rbs_msg = str(rbs.extract(record.seq))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
595 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
596 rbs_msg = "No RBS Available"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
597 results.append((gene, two_left, two_right, rbs_msg))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
598 bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
599 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
600 good += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
601 return good, bad, results, []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
602
c3140b08d703 Uploaded
cpt
parents:
diff changeset
603
c3140b08d703 Uploaded
cpt
parents:
diff changeset
604 def bad_gene_model(record):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
605 """Find features without product
c3140b08d703 Uploaded
cpt
parents:
diff changeset
606 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
607 results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
608 good = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
609 bad = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
610 qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
611
c3140b08d703 Uploaded
cpt
parents:
diff changeset
612 for gene in coding_genes(record.features):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
613 exons = [
c3140b08d703 Uploaded
cpt
parents:
diff changeset
614 x for x in genes(gene.sub_features, feature_type="exon") if len(x) > 10
c3140b08d703 Uploaded
cpt
parents:
diff changeset
615 ]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
616 CDSs = [x for x in genes(gene.sub_features, feature_type="CDS")]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
617 if len(exons) >= 1 and len(CDSs) >= 1:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
618 if len(exons) != len(CDSs):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
619 results.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
620 (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
621 get_gff3_id(gene),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
622 None,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
623 None,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
624 "Mismatched number of exons and CDSs in gff3 representation",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
625 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
626 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
627 qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
628 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
629 gene.location.start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
630 gene.location.end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
631 "Mismatched number of exons and CDSs in gff3 representation",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
632 strand=gene.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
633 id_src=gene,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
634 type_src="gene"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
635 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
636 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
637 bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
638 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
639 for (exon, cds) in zip(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
640 sorted(exons, key=lambda x: x.location.start),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
641 sorted(CDSs, key=lambda x: x.location.start),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
642 ):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
643 if len(exon) != len(cds):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
644 results.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
645 (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
646 get_gff3_id(gene),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
647 exon,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
648 cds,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
649 "CDS does not extend to full length of gene",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
650 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
651 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
652 qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
653 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
654 exon.location.start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
655 exon.location.end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
656 "CDS does not extend to full length of gene",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
657 strand=exon.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
658 id_src=gene,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
659 type_src="CDS"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
660 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
661 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
662 bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
663 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
664 good += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
665 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
666 log.warn("Could not handle %s, %s", exons, CDSs)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
667 results.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
668 (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
669 get_gff3_id(gene),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
670 None,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
671 None,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
672 "{0} exons, {1} CDSs".format(len(exons), len(CDSs)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
673 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
674 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
675
c3140b08d703 Uploaded
cpt
parents:
diff changeset
676 return good, len(results) + bad, results, qc_features
c3140b08d703 Uploaded
cpt
parents:
diff changeset
677
c3140b08d703 Uploaded
cpt
parents:
diff changeset
678
c3140b08d703 Uploaded
cpt
parents:
diff changeset
679 def weird_starts(record):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
680 """Find features without product
c3140b08d703 Uploaded
cpt
parents:
diff changeset
681 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
682 good = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
683 bad = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
684 qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
685 results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
686
c3140b08d703 Uploaded
cpt
parents:
diff changeset
687 overall = {}
c3140b08d703 Uploaded
cpt
parents:
diff changeset
688 for gene in coding_genes(record.features):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
689 seq = [x for x in genes(gene.sub_features, feature_type="CDS")]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
690 if len(seq) == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
691 log.warn("No CDS for gene %s", get_gff3_id(gene))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
692 continue
c3140b08d703 Uploaded
cpt
parents:
diff changeset
693 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
694 seq = seq[0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
695
c3140b08d703 Uploaded
cpt
parents:
diff changeset
696 seq_str = str(seq.extract(record.seq))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
697 start_codon = seq_str[0:3]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
698 if len(seq_str) < 3:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
699 sys.stderr.write("Fatal Error: CDS of length less than 3 at " + str(seq.location) + '\n')
c3140b08d703 Uploaded
cpt
parents:
diff changeset
700 exit(2)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
701 # if len(seq_str) % 3 != 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
702 # if len(seq_str) < 3:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
703 # stop_codon = seq_str[-(len(seq_str))]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
704 # else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
705 # stop_codon = seq_str[-3]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
706 #
c3140b08d703 Uploaded
cpt
parents:
diff changeset
707 # log.warn("CDS at %s length is not a multiple of three (Length = %d)", get_gff3_id(gene), len(seq_str))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
708 # seq.__error = "Bad CDS Length"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
709 # results.append(seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
710 # qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
711 # gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
712 # s, e, "Bad Length", strand=seq.strand, id_src=gene
c3140b08d703 Uploaded
cpt
parents:
diff changeset
713 # )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
714 # )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
715 # bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
716 # seq.__start = start_codon
c3140b08d703 Uploaded
cpt
parents:
diff changeset
717 # seq.__stop = stop_codon
c3140b08d703 Uploaded
cpt
parents:
diff changeset
718 # continue
c3140b08d703 Uploaded
cpt
parents:
diff changeset
719
c3140b08d703 Uploaded
cpt
parents:
diff changeset
720 stop_codon = seq_str[-3]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
721 seq.__start = start_codon
c3140b08d703 Uploaded
cpt
parents:
diff changeset
722 seq.__stop = stop_codon
c3140b08d703 Uploaded
cpt
parents:
diff changeset
723 if start_codon not in overall:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
724 overall[start_codon] = 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
725 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
726 overall[start_codon] += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
727
c3140b08d703 Uploaded
cpt
parents:
diff changeset
728 if start_codon not in ("ATG", "TTG", "GTG"):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
729 log.warn("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
730 seq.__error = "Unusual start codon %s" % start_codon
c3140b08d703 Uploaded
cpt
parents:
diff changeset
731
c3140b08d703 Uploaded
cpt
parents:
diff changeset
732 s = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
733 e = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
734 if seq.strand > 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
735 s = seq.location.start
c3140b08d703 Uploaded
cpt
parents:
diff changeset
736 e = seq.location.start + 3
c3140b08d703 Uploaded
cpt
parents:
diff changeset
737 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
738 s = seq.location.end
c3140b08d703 Uploaded
cpt
parents:
diff changeset
739 e = seq.location.end - 3
c3140b08d703 Uploaded
cpt
parents:
diff changeset
740
c3140b08d703 Uploaded
cpt
parents:
diff changeset
741 results.append(seq)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
742 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
743 qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
744 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
745 s, e, "Weird start codon", strand=seq.strand, id_src=gene, type_src="gene"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
746 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
747 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
748 bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
749 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
750 good += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
751
c3140b08d703 Uploaded
cpt
parents:
diff changeset
752 return good, bad, results, qc_features, overall
c3140b08d703 Uploaded
cpt
parents:
diff changeset
753
c3140b08d703 Uploaded
cpt
parents:
diff changeset
754
c3140b08d703 Uploaded
cpt
parents:
diff changeset
755 def missing_genes(record):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
756 """Find features without product
c3140b08d703 Uploaded
cpt
parents:
diff changeset
757 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
758 results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
759 good = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
760 bad = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
761 qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
762
c3140b08d703 Uploaded
cpt
parents:
diff changeset
763 for gene in coding_genes(record.features):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
764 if gene.qualifiers.get("cpt_source", [None])[0] == "CPT_GENE_MODEL_CORRECTION":
c3140b08d703 Uploaded
cpt
parents:
diff changeset
765 results.append(gene)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
766 bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
767 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
768 good += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
769
c3140b08d703 Uploaded
cpt
parents:
diff changeset
770 return good, bad, results, qc_features
c3140b08d703 Uploaded
cpt
parents:
diff changeset
771
c3140b08d703 Uploaded
cpt
parents:
diff changeset
772
c3140b08d703 Uploaded
cpt
parents:
diff changeset
773 def gene_model_correction_issues(record):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
774 """Find features that have issues from the gene model correction step.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
775 These have qualifiers beginning with CPT_GMS
c3140b08d703 Uploaded
cpt
parents:
diff changeset
776 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
777 results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
778 good = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
779 bad = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
780 qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
781
c3140b08d703 Uploaded
cpt
parents:
diff changeset
782 # For each gene
c3140b08d703 Uploaded
cpt
parents:
diff changeset
783 for gene in coding_genes(record.features):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
784 # Get the list of child CDSs
c3140b08d703 Uploaded
cpt
parents:
diff changeset
785 cdss = [x for x in genes(gene.sub_features, feature_type="CDS")]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
786 # And our matching qualifiers
c3140b08d703 Uploaded
cpt
parents:
diff changeset
787 gene_data = [(k, v) for (k, v) in gene.qualifiers.items() if k == "cpt_gmc"]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
788 # If there are problems with ONLY the parent, let's complain
c3140b08d703 Uploaded
cpt
parents:
diff changeset
789 local_results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
790 local_qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
791 for x in gene_data:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
792 if "Missing Locus Tag" in x[1]:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
793 # Missing locus tag is an either or thing, if it hits here
c3140b08d703 Uploaded
cpt
parents:
diff changeset
794 # there shouldn't be anything else wrong with it.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
795
c3140b08d703 Uploaded
cpt
parents:
diff changeset
796 # Obviously missing so we remove it
c3140b08d703 Uploaded
cpt
parents:
diff changeset
797 gene.qualifiers["locus_tag"] = [""]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
798 # Translation from bp_genbank2gff3.py
c3140b08d703 Uploaded
cpt
parents:
diff changeset
799 cdss[0].qualifiers["locus_tag"] = cdss[0].qualifiers["Name"]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
800 # Append our results
c3140b08d703 Uploaded
cpt
parents:
diff changeset
801 local_results.append((gene, cdss[0], "Gene is missing a locus_tag"))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
802 local_qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
803 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
804 gene.location.start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
805 gene.location.end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
806 "Gene is missing a locus_tag",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
807 strand=gene.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
808 type_src="gene"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
809 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
810 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
811
c3140b08d703 Uploaded
cpt
parents:
diff changeset
812 # We need to alert on any child issues as well.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
813 for cds in cdss:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
814 cds_data = [
c3140b08d703 Uploaded
cpt
parents:
diff changeset
815 (k, v[0]) for (k, v) in cds.qualifiers.items() if k == "cpt_gmc"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
816 ]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
817 if len(gene_data) == 0 and len(cds_data) == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
818 # Alles gut
c3140b08d703 Uploaded
cpt
parents:
diff changeset
819 pass
c3140b08d703 Uploaded
cpt
parents:
diff changeset
820 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
821 for _, problem in cds_data:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
822 if problem == "BOTH Missing Locus Tag":
c3140b08d703 Uploaded
cpt
parents:
diff changeset
823 gene.qualifiers["locus_tag"] = [""]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
824 cds.qualifiers["locus_tag"] = [""]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
825 local_results.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
826 (gene, cds, "Both gene and CDS are missing locus tags")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
827 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
828 local_qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
829 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
830 cds.location.start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
831 cds.location.end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
832 "CDS is missing a locus_tag",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
833 strand=cds.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
834 type_src="CDS"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
835 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
836 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
837 local_qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
838 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
839 gene.location.start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
840 gene.location.end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
841 "Gene is missing a locus_tag",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
842 strand=gene.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
843 type_src="gene"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
844 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
845 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
846 elif problem == "Different locus tag from associated gene.":
c3140b08d703 Uploaded
cpt
parents:
diff changeset
847 gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
848 cds.qualifiers["locus_tag"] = cds.qualifiers["cpt_gmc_locus"]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
849 local_results.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
850 (gene, cds, "Gene and CDS have differing locus tags")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
851 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
852 local_qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
853 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
854 gene.location.start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
855 gene.location.end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
856 "Gene and CDS have differing locus tags",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
857 strand=gene.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
858 type_src="gene"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
859 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
860 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
861 elif problem == "Missing Locus Tag":
c3140b08d703 Uploaded
cpt
parents:
diff changeset
862 # Copy this over
c3140b08d703 Uploaded
cpt
parents:
diff changeset
863 gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
864 # This one is missing
c3140b08d703 Uploaded
cpt
parents:
diff changeset
865 cds.qualifiers["locus_tag"] = [""]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
866 local_results.append((gene, cds, "CDS is missing a locus_tag"))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
867 local_qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
868 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
869 cds.location.start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
870 cds.location.end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
871 "CDS is missing a locus_tag",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
872 strand=cds.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
873 type_src="CDS"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
874 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
875 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
876 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
877 log.warn("Cannot handle %s", problem)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
878
c3140b08d703 Uploaded
cpt
parents:
diff changeset
879 if len(local_results) > 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
880 bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
881 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
882 good += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
883
c3140b08d703 Uploaded
cpt
parents:
diff changeset
884 qc_features.extend(local_qc_features)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
885 results.extend(local_results)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
886 return good, bad, results, qc_features
c3140b08d703 Uploaded
cpt
parents:
diff changeset
887
c3140b08d703 Uploaded
cpt
parents:
diff changeset
888
c3140b08d703 Uploaded
cpt
parents:
diff changeset
889 def missing_tags(record):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
890 """Find features without product
c3140b08d703 Uploaded
cpt
parents:
diff changeset
891 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
892 results = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
893 good = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
894 bad = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
895 qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
896
c3140b08d703 Uploaded
cpt
parents:
diff changeset
897 for gene in coding_genes(record.features):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
898 cds = [x for x in genes(gene.sub_features, feature_type="CDS")]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
899 if len(cds) == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
900 log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
901 continue
c3140b08d703 Uploaded
cpt
parents:
diff changeset
902
c3140b08d703 Uploaded
cpt
parents:
diff changeset
903 cds = cds[0]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
904
c3140b08d703 Uploaded
cpt
parents:
diff changeset
905 if "product" not in cds.qualifiers:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
906 log.info("Missing product tag on %s", get_gff3_id(gene))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
907 qc_features.append(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
908 gen_qc_feature(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
909 cds.location.start,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
910 cds.location.end,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
911 "Missing product tag",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
912 strand=cds.strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
913 type_src="CDS"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
914 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
915 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
916 results.append(cds)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
917 bad += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
918 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
919 good += 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
920
c3140b08d703 Uploaded
cpt
parents:
diff changeset
921 return good, bad, results, qc_features
c3140b08d703 Uploaded
cpt
parents:
diff changeset
922
c3140b08d703 Uploaded
cpt
parents:
diff changeset
923
c3140b08d703 Uploaded
cpt
parents:
diff changeset
924 def evaluate_and_report(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
925 annotations,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
926 genome,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
927 gff3=None,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
928 tbl=None,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
929 sd_min=5,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
930 sd_max=15,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
931 min_gene_length=30,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
932 excessive_gap_dist=50,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
933 excessive_gap_divergent_dist=200,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
934 excessive_overlap_dist=25,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
935 excessive_overlap_divergent_dist=50,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
936 reportTemplateName="phage_annotation_validator.html",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
937 ):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
938 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
939 Generate our HTML evaluation of the genome
c3140b08d703 Uploaded
cpt
parents:
diff changeset
940 """
c3140b08d703 Uploaded
cpt
parents:
diff changeset
941 # Get features from GFF file
c3140b08d703 Uploaded
cpt
parents:
diff changeset
942 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
943 # Get the first GFF3 record
c3140b08d703 Uploaded
cpt
parents:
diff changeset
944 # TODO: support multiple GFF3 files.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
945 mostFeat = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
946 for rec in list(gffParse(annotations, base_dict=seq_dict)):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
947 if len(rec.features) > mostFeat:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
948 mostFeat = len(rec.features)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
949 record = rec
c3140b08d703 Uploaded
cpt
parents:
diff changeset
950
c3140b08d703 Uploaded
cpt
parents:
diff changeset
951 gff3_qc_record = SeqRecord(record.id, id=record.id)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
952 gff3_qc_record.features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
953 gff3_qc_features = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
954
c3140b08d703 Uploaded
cpt
parents:
diff changeset
955 log.info("Locating missing RBSs")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
956 # mb_any = "did they annotate ANY rbss? if so, take off from score."
c3140b08d703 Uploaded
cpt
parents:
diff changeset
957 mb_good, mb_bad, mb_results, mb_annotations, mb_any = missing_rbs(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
958 record, lookahead_min=sd_min, lookahead_max=sd_max
c3140b08d703 Uploaded
cpt
parents:
diff changeset
959 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
960 gff3_qc_features += mb_annotations
c3140b08d703 Uploaded
cpt
parents:
diff changeset
961
c3140b08d703 Uploaded
cpt
parents:
diff changeset
962 log.info("Locating excessive gaps")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
963 eg_good, eg_bad, eg_results, eg_annotations = excessive_gap(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
964 record,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
965 excess=excessive_gap_dist,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
966 excess_divergent=excessive_gap_divergent_dist,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
967 min_gene=min_gene_length,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
968 slop=excessive_overlap_dist,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
969 lookahead_min=sd_min,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
970 lookahead_max=sd_max,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
971 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
972 gff3_qc_features += eg_annotations
c3140b08d703 Uploaded
cpt
parents:
diff changeset
973
c3140b08d703 Uploaded
cpt
parents:
diff changeset
974 log.info("Locating excessive overlaps")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
975 eo_good, eo_bad, eo_results, eo_annotations = excessive_overlap(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
976 record,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
977 excess=excessive_overlap_dist,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
978 excess_divergent=excessive_overlap_divergent_dist,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
979 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
980 gff3_qc_features += eo_annotations
c3140b08d703 Uploaded
cpt
parents:
diff changeset
981
c3140b08d703 Uploaded
cpt
parents:
diff changeset
982 log.info("Locating morons")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
983 mo_good, mo_bad, mo_results, mo_annotations = find_morons(record)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
984 gff3_qc_features += mo_annotations
c3140b08d703 Uploaded
cpt
parents:
diff changeset
985
c3140b08d703 Uploaded
cpt
parents:
diff changeset
986 log.info("Locating missing tags")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
987 mt_good, mt_bad, mt_results, mt_annotations = missing_tags(record)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
988 gff3_qc_features += mt_annotations
c3140b08d703 Uploaded
cpt
parents:
diff changeset
989
c3140b08d703 Uploaded
cpt
parents:
diff changeset
990 log.info("Locating missing gene features")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
991 mg_good, mg_bad, mg_results, mg_annotations = missing_genes(record)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
992 gff3_qc_features += mg_annotations
c3140b08d703 Uploaded
cpt
parents:
diff changeset
993
c3140b08d703 Uploaded
cpt
parents:
diff changeset
994 log.info("Determining coding density")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
995 cd, cd_real = coding_density(record)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
996
c3140b08d703 Uploaded
cpt
parents:
diff changeset
997 log.info("Locating weird starts")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
998 ws_good, ws_bad, ws_results, ws_annotations, ws_overall = weird_starts(record)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
999 gff3_qc_features += ws_annotations
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1000
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1001 log.info("Locating bad gene models")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1002 gm_good, gm_bad, gm_results, gm_annotations = bad_gene_model(record)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1003 if gm_good + gm_bad == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1004 gm_bad = 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1005
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1006 log.info("Locating more bad gene models")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1007 gmc_good, gmc_bad, gmc_results, gmc_annotations = gene_model_correction_issues(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1008 record
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1009 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1010 if gmc_good + gmc_bad == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1011 gmc_bad = 1
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1012
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1013 good_scores = [eg_good, eo_good, mt_good, ws_good, gm_good, gmc_good]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1014 bad_scores = [eg_bad, eo_bad, mt_bad, ws_bad, gm_bad, gmc_bad]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1015
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1016 # Only if they tried to annotate RBSs do we consider them.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1017 if mb_any:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1018 good_scores.append(mb_good)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1019 bad_scores.append(mb_bad)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1020 subscores = []
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1021
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1022 for (g, b) in zip(good_scores, bad_scores):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1023 if g + b == 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1024 s = 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1025 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1026 s = int(100 * float(g) / (float(b) + float(g)))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1027 subscores.append(s)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1028 subscores.append(cd)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1029
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1030 score = int(float(sum(subscores)) / float(len(subscores)))
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1031
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1032 # This is data that will go into our HTML template
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1033 kwargs = {
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1034 "upstream_min": sd_min,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1035 "upstream_max": sd_max,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1036 "record_name": record.id,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1037 "record_nice_name": nice_name(record),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1038 "params": {
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1039 "sd_min": sd_min,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1040 "sd_max": sd_max,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1041 "min_gene_length": min_gene_length,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1042 "excessive_gap_dist": excessive_gap_dist,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1043 "excessive_gap_divergent_dist": excessive_gap_divergent_dist,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1044 "excessive_overlap_dist": excessive_overlap_dist,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1045 "excessive_overlap_divergent_dist": excessive_overlap_divergent_dist,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1046 },
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1047 "score": score,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1048 "encouragement": get_encouragement(score),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1049 "genome_overview": genome_overview(record),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1050 "rbss_annotated": mb_any,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1051 "missing_rbs": mb_results,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1052 "missing_rbs_good": mb_good,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1053 "missing_rbs_bad": mb_bad,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1054 "missing_rbs_score": 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1055 if mb_good + mb_bad == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1056 else (100 * mb_good / (mb_good + mb_bad)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1057 "excessive_gap": eg_results,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1058 "excessive_gap_good": eg_good,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1059 "excessive_gap_bad": eg_bad,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1060 "excessive_gap_score": 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1061 if eo_good + eo_bad == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1062 else (100 * eo_good / (eo_good + eo_bad)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1063 "excessive_overlap": eo_results,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1064 "excessive_overlap_good": eo_good,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1065 "excessive_overlap_bad": eo_bad,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1066 "excessive_overlap_score": 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1067 if eo_good + eo_bad == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1068 else (100 * eo_good / (eo_good + eo_bad)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1069 "morons": mo_results,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1070 "morons_good": mo_good,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1071 "morons_bad": mo_bad,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1072 "morons_score": 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1073 if mo_good + mo_bad == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1074 else (100 * mo_good / (mo_good + mo_bad)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1075 "missing_tags": mt_results,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1076 "missing_tags_good": mt_good,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1077 "missing_tags_bad": mt_bad,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1078 "missing_tags_score": 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1079 if mt_good + mt_bad == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1080 else (100 * mt_good / (mt_good + mt_bad)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1081 "missing_genes": mg_results,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1082 "missing_genes_good": mg_good,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1083 "missing_genes_bad": mg_bad,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1084 "missing_genes_score": 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1085 if mg_good + mg_bad == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1086 else (100 * mg_good / (mg_good + mg_bad)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1087 "weird_starts": ws_results,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1088 "weird_starts_good": ws_good,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1089 "weird_starts_bad": ws_bad,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1090 "weird_starts_overall": ws_overall,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1091 "weird_starts_overall_sorted_keys": sorted(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1092 ws_overall, reverse=True, key=lambda x: ws_overall[x]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1093 ),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1094 "weird_starts_score": 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1095 if ws_good + ws_bad == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1096 else (100 * ws_good / (ws_good + ws_bad)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1097 "gene_model": gm_results,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1098 "gene_model_good": gm_good,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1099 "gene_model_bad": gm_bad,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1100 "gene_model_score": 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1101 if gm_good + gm_bad == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1102 else (100 * gm_good / (gm_good + gm_bad)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1103 "gene_model_correction": gmc_results,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1104 "gene_model_correction_good": gmc_good,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1105 "gene_model_correction_bad": gmc_bad,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1106 "gene_model_correction_score": 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1107 if gmc_good + gmc_bad == 0
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1108 else (100 * gmc_good / (gmc_good + gmc_bad)),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1109 "coding_density": cd,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1110 "coding_density_exact": exact_coding_density(record),
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1111 "coding_density_real": cd_real,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1112 "coding_density_score": cd,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1113 }
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1114
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1115 with open(tbl, "w") as handle:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1116 kw_subset = {}
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1117 for key in kwargs:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1118 if (
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1119 key in ("score", "record_name")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1120 or "_good" in key
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1121 or "_bad" in key
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1122 or "_overall" in key
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1123 ):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1124 kw_subset[key] = kwargs[key]
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1125 json.dump(kw_subset, handle)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1126
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1127 with open(gff3, "w") as handle:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1128 gff3_qc_record.features = gff3_qc_features
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1129 gff3_qc_record.annotations = {}
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1130 gffWrite([gff3_qc_record], handle)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1131
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1132 def nice_strand(direction):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1133 # It is somehow possible for whole gffSeqFeature objects to end up in here, apparently at the gene level
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1134 if "SeqFeature" in str(type(direction)):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1135 direction = direction.location.strand
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1136 if direction > 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1137 return "→"#.decode("utf-8")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1138 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1139 return "←"#.decode("utf-8")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1140
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1141 def nice_strand_tex(direction):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1142 if "SeqFeature" in str(type(direction)):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1143 direction = direction.location.strand
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1144 if direction > 0:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1145 return "$\\rightarrow$"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1146 else:
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1147 return "$\\leftarrow$"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1148
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1149 def texify(data):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1150 return data.replace("_", "\\_").replace("$", "\\$")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1151
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1152 def length(data):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1153 return len(data)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1154
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1155 def my_encode(data):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1156 return str(data)#.encode("utf-8")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1157
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1158 def my_decode(data):
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1159 # For production
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1160 return str(data)#.decode("utf-8")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1161 # For local testing. No, I do not understand.
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1162 return str(data)#.encode("utf-8")).decode("utf-8")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1163
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1164 env = Environment(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1165 loader=FileSystemLoader(SCRIPT_PATH), trim_blocks=True, lstrip_blocks=True
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1166 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1167 env.filters.update(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1168 {
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1169 "nice_id": get_gff3_id,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1170 "nice_strand": nice_strand,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1171 "nice_strand_tex": nice_strand_tex,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1172 "texify": texify,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1173 "length": length,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1174 "encode": my_encode,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1175 "decode": my_decode,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1176 }
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1177 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1178 tpl = env.get_template(reportTemplateName)
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1179 return tpl.render(**kwargs)#.encode("utf-8")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1180
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1181
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1182 if __name__ == "__main__":
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1183 parser = argparse.ArgumentParser(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1184 description="rebase gff3 features against parent locations", epilog=""
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1185 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1186 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1187 "annotations", type=argparse.FileType("r"), help="Parent GFF3 annotations"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1188 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1189 parser.add_argument("genome", type=argparse.FileType("r"), help="Genome Sequence")
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1190 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1191 "--gff3", type=str, help="GFF3 Annotations", default="qc_annotations.gff3"
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1192 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1193 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1194 "--tbl",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1195 type=str,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1196 help="Table for noninteractive parsing",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1197 default="qc_results.json",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1198 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1199
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1200 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1201 "--sd_min",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1202 type=int,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1203 help="Minimum distance from gene start for an SD to be",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1204 default=5,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1205 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1206 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1207 "--sd_max",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1208 type=int,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1209 help="Maximum distance from gene start for an SD to be",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1210 default=15,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1211 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1212
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1213 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1214 "--min_gene_length",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1215 type=int,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1216 help="Minimum length for a putative gene call (AAs)",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1217 default=30,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1218 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1219
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1220 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1221 "--excessive_overlap_dist",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1222 type=int,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1223 help="Excessive overlap for genes in same direction",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1224 default=25,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1225 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1226 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1227 "--excessive_overlap_divergent_dist",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1228 type=int,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1229 help="Excessive overlap for genes in diff directions",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1230 default=50,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1231 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1232
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1233 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1234 "--excessive_gap_dist",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1235 type=int,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1236 help="Maximum distance between two genes",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1237 default=40,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1238 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1239 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1240 "--excessive_gap_divergent_dist",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1241 type=int,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1242 help="Maximum distance between two divergent genes",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1243 default=200,
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1244 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1245
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1246 parser.add_argument(
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1247 "--reportTemplateName",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1248 help="Report template file name",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1249 default="phageqc_report_full.html",
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1250 )
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1251
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1252 args = parser.parse_args()
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1253
c3140b08d703 Uploaded
cpt
parents:
diff changeset
1254 sys.stdout.write(evaluate_and_report(**vars(args)))