# HG changeset patch
# User cpt
# Date 1655470850 0
# Node ID c3140b08d703e19f56ca95e2bc6becea01da49b3
Uploaded
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/cpt-macros.xml Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,115 @@
+
+
+
+
+ python
+ biopython
+ requests
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/cpt.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/cpt.py Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,342 @@
+#!/usr/bin/env python
+import regex as re
+from Bio.Seq import Seq, reverse_complement, translate
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import logging
+
+logging.basicConfig()
+log = logging.getLogger()
+
+# Patterns used by phage_name_parser() to split a free-text organism name.
+# The named groups "host" and "phage" are the ones phage_name_parser() reads
+# through m.group(...); without them the patterns do not even compile.
+PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
+BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
+STARTS_WITH_PHAGE = re.compile(
+    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
+)
+NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
+
+
+def phage_name_parser(name):
+    """Split an organism name into a ``(host, phage)`` tuple.
+
+    Every occurrence of ", complete genome." / ", complete genome" is removed
+    from ``name`` first.  The cleaned name is then tried against the
+    module-level patterns in a fixed order and the first one that matches
+    decides the result.  A part that the matching pattern does not capture is
+    ``None``; ``(None, None)`` is returned when no pattern matches.
+    """
+    for boilerplate in (", complete genome.", ", complete genome"):
+        name = name.replace(boilerplate, "")
+
+    # Patterns that capture both the host and the phage name.
+    for pattern in (BACTERIOPHAGE_IN_MIDDLE, PHAGE_IN_MIDDLE):
+        hit = pattern.match(name)
+        if hit:
+            return (hit.group("host"), hit.group("phage"))
+
+    # Patterns that capture only the phage name; the host stays unknown.
+    for pattern in (STARTS_WITH_PHAGE, NEW_STYLE_NAMES):
+        hit = pattern.match(name)
+        if hit:
+            return (None, hit.group("phage"))
+
+    return (None, None)
+
+
+class OrfFinder(object):
+    """Locate open reading frames for one genetic code table.
+
+    Start and stop codons come from Biopython's ambiguous generic codon table
+    with id ``table``.  ``strand`` filters get_all_peptides(): "forward" skips
+    the reverse strand, "reverse" skips the forward strand, and any other
+    value scans both.  ``ends`` is stored but not read by this class.
+    """
+
+    def __init__(self, table, ftype, ends, min_len, strand):
+        codon_table = CodonTable.ambiguous_generic_by_id[table]
+        self.table = table
+        self.table_obj = codon_table
+        self.ends = ends
+        self.ftype = ftype
+        self.min_len = min_len
+        self.starts = sorted(codon_table.start_codons)
+        self.stops = sorted(codon_table.stop_codons)
+        # Alternations of the literal codons, e.g. "ATG|GTG|TTG".
+        self.re_starts = re.compile("|".join(self.starts))
+        self.re_stops = re.compile("|".join(self.stops))
+        self.strand = strand
+
+    def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+        """Write every ORF found in ``fasta_file`` to the four output handles.
+
+        For each hit from get_all_peptides() this writes a nucleotide FASTA
+        record to ``out_nuc``, a protein FASTA record to ``out_prot``, one
+        tab-separated BED line to ``out_bed`` and one GFF3 "CDS" line (start
+        shifted by +1) to ``out_gff3``.  The GFF3 version header is written
+        once, before any record is read.
+        """
+        log.debug("Genetic code table %i" % self.table)
+        log.debug("Minimum length %i aa" % self.min_len)
+
+        found = 0
+
+        out_gff3.write("##gff-version 3\n")
+
+        for rec_idx, record in enumerate(SeqIO.parse(fasta_file, "fasta")):
+            hits = self.get_all_peptides(str(record.seq).upper())
+            for hit_idx, (f_start, f_end, f_strand, nuc, prot) in enumerate(hits):
+                found += 1
+                ordinal = hit_idx + 1
+
+                descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+                    len(prot),
+                    len(nuc),
+                    f_start,
+                    f_end,
+                    f_strand,
+                    record.description,
+                )
+                fid = record.id + "|%s%i" % (self.ftype, ordinal)
+
+                nuc_record = SeqRecord(Seq(nuc), id=fid, name="", description=descr)
+                prot_record = SeqRecord(Seq(prot), id=fid, name="", description=descr)
+                SeqIO.write(nuc_record, out_nuc, "fasta")
+                SeqIO.write(prot_record, out_prot, "fasta")
+
+                nice_strand = "+" if f_strand == +1 else "-"
+
+                bed_fields = [record.id, f_start, f_end, fid, 0, nice_strand]
+                out_bed.write("\t".join(map(str, bed_fields)) + "\n")
+
+                gff_fields = [
+                    record.id,
+                    "getOrfsOrCds",
+                    "CDS",
+                    f_start + 1,
+                    f_end,
+                    ".",
+                    nice_strand,
+                    0,
+                    "ID=%s.%s.%s" % (self.ftype, rec_idx, ordinal),
+                ]
+                out_gff3.write("\t".join(map(str, gff_fields)) + "\n")
+        log.info("Found %i %ss", found, self.ftype)
+
+    def start_chop_and_trans(self, s, strict=True):
+        """Yield ``(offset, trimmed nuc, protein)`` per in-frame start codon.
+
+        ``s`` must have a length divisible by three and, when ``strict`` is
+        true, must end in one of the table's stop codons (both are asserted).
+        For every start-codon match whose position is a multiple of three,
+        the sequence from that position to the end of ``s`` is translated
+        and yielded together with the match offset.
+        """
+        if strict:
+            assert s[-3:] in self.stops, s
+        assert len(s) % 3 == 0
+        for hit in self.re_starts.finditer(s, overlapped=True):
+            offset = hit.start()
+            if offset % 3:
+                # Start codon is not in frame with the segment; ignore it.
+                continue
+            trimmed = s[offset:]
+            assert len(trimmed) % 3 == 0, "%s is len %i" % (trimmed, len(trimmed))
+            if strict:
+                protein = translate(trimmed, self.table)
+            else:
+                # Use when missing stop codon,
+                protein = "M" + translate(trimmed[3:], self.table, to_stop=True)
+            yield offset, trimmed, protein
+
+    def break_up_frame(self, s):
+        """Yield ``(offset, nuc, protein)`` for the ORFs of one reading frame.
+
+        ``s`` is cut after every in-frame stop codon; each resulting segment
+        is handed to start_chop_and_trans() and hits whose protein has at
+        least ``self.min_len`` characters are yielded.  Offsets are relative
+        to the start of ``s``.
+        """
+        segment_start = 0
+        for hit in self.re_stops.finditer(s, overlapped=True):
+            segment_end = hit.start() + 3
+            if segment_end % 3 != 0:
+                continue
+            segment = s[segment_start:segment_end]
+            for offset, nuc, prot in self.start_chop_and_trans(segment):
+                if nuc and len(prot) >= self.min_len:
+                    yield segment_start + offset, nuc, prot
+            segment_start = segment_end
+
+    def putative_genes_in_sequence(self, nuc_seq):
+        """Return a sorted list of ``(start, end, strand, nuc, protein)``.
+
+        Co-ordinates are Python style zero-based.  All three frames of both
+        strands are always scanned here (``self.strand`` is not consulted).
+        Reverse-strand tuples are built as ``(start, start - len(nuc), -1,
+        ...)``, so for them the first coordinate is the larger one.
+        """
+        nuc_seq = nuc_seq.upper()
+        # TODO - Refactor to use a generator function (in start order)
+        # rather than making a list and sorting?
+        full_len = len(nuc_seq)
+        hits = []
+
+        for frame in range(0, 3):
+            for offset, nuc, prot in self.break_up_frame(nuc_seq[frame:]):
+                begin = frame + offset  # zero based
+                hits.append((begin, begin + len(nuc), +1, nuc, prot))
+
+        rc = reverse_complement(nuc_seq)
+        for frame in range(0, 3):
+            for offset, nuc, prot in self.break_up_frame(rc[frame:]):
+                begin = full_len - frame - offset  # zero based
+                hits.append((begin, begin - len(nuc), -1, nuc, prot))
+        hits.sort()
+        return hits
+
+    def get_all_peptides(self, nuc_seq):
+        """Yield ``(start, end, strand, nuc, protein)`` for each ORF.
+
+        Co-ordinates are Python style zero-based and the tuples are built so
+        the first coordinate is the smaller one on both strands.  The forward
+        strand is scanned unless ``self.strand == "reverse"``; the reverse
+        strand is scanned unless ``self.strand == "forward"``.
+        """
+        # Refactored into generator by CPT
+        full_len = len(nuc_seq)
+        scan_forward = self.strand != "reverse"
+        scan_reverse = self.strand != "forward"
+
+        if scan_forward:
+            for frame in range(0, 3):
+                for offset, nuc, prot in self.break_up_frame(nuc_seq[frame:]):
+                    begin = frame + offset  # zero based
+                    yield (begin, begin + len(nuc), +1, nuc, prot)
+        if scan_reverse:
+            rc = reverse_complement(nuc_seq)
+            for frame in range(0, 3):
+                for offset, nuc, prot in self.break_up_frame(rc[frame:]):
+                    upper = full_len - frame - offset  # zero based
+                    yield (upper - len(nuc), upper, -1, nuc, prot)
+
+
+class MGAFinder(OrfFinder):
+    """ORF finder that always scans both strands.
+
+    Behaves as an OrfFinder whose strand filter is disabled: the constructor
+    takes no ``strand`` argument, and locate(), start_chop_and_trans(),
+    break_up_frame(), putative_genes_in_sequence() and get_all_peptides() are
+    all inherited instead of being duplicated here.
+    """
+
+    def __init__(self, table, ftype, ends, min_len):
+        # Any strand value other than "forward" or "reverse" makes
+        # OrfFinder.get_all_peptides() scan both strands.
+        OrfFinder.__init__(self, table, ftype, ends, min_len, strand="both")
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/gff3.py Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,346 @@
+import copy
+import logging
+
+# NOTE: getLogger() without a name returns the root logger, so the level set
+# below applies to the root logger rather than to a module-specific one.
+log = logging.getLogger()
+log.setLevel(logging.WARN)
+
+
+def feature_lambda(
+    feature_list,
+    test,
+    test_kwargs,
+    subfeatures=True,
+    parent=None,
+    invert=False,
+    recurse=True,
+):
+    """Recursively search through features, testing each with a test function, yielding matches.
+
+    GFF3 is a hierarchical data structure, so we need to be able to recursively
+    search through features. E.g. if you're looking for a feature with
+    ID='bob.42', you can't just do a simple list comprehension with a test
+    case. You don't know how deeply buried bob.42 will be in the feature tree.
+    This is where feature_lambda steps in.
+
+    Every visited feature gets a ``_parent`` attribute: the feature it was
+    reached through, or the feature itself for top-level features.
+
+    :type feature_list: list
+    :param feature_list: an iterable of features
+
+    :type test: function reference
+    :param test: a closure with the method signature (feature, **kwargs) where
+                 the kwargs are those passed in the next argument. Its result
+                 is interpreted by truthiness: truthy if the feature is to be
+                 yielded, falsy if it is to be ignored. This function CAN
+                 mutate the features passed to it (think "apply").
+
+    :type test_kwargs: dictionary
+    :param test_kwargs: kwargs to pass to your closure when it is called.
+
+    :type subfeatures: boolean
+    :param subfeatures: when True, the matched feature object itself is
+                        yielded, with its sub_feature tree intact. When
+                        False, a deep copy whose ``sub_features`` is an empty
+                        list is yielded instead.
+
+    :type parent: feature or None
+    :param parent: the feature whose sub_features are being searched; used
+                   for the recursive calls, callers normally leave it unset.
+
+    :type invert: boolean
+    :param invert: Negate/invert the result of the filter.
+
+    :type recurse: boolean
+    :param recurse: when True, the ``sub_features`` of every feature (matched
+                    or not) are searched as well.
+
+    :rtype: generator
+    :return: Yields the matching features one at a time.
+    """
+    # Either the top level set of [features] or the subfeature attribute
+    for feature in feature_list:
+        feature._parent = parent
+        if not parent:
+            # Set to self so we cannot go above root.
+            feature._parent = feature
+
+        # Coerce to bool so tests returning None / 0 / 1 combine correctly
+        # with ``invert`` (``False ^ None`` would raise TypeError).
+        matched = bool(test(feature, **test_kwargs))
+        if bool(invert) != matched:
+            if not subfeatures:
+                feature_copy = copy.deepcopy(feature)
+                feature_copy.sub_features = list()
+                yield feature_copy
+            else:
+                yield feature
+
+        if recurse and hasattr(feature, "sub_features"):
+            for x in feature_lambda(
+                feature.sub_features,
+                test,
+                test_kwargs,
+                subfeatures=subfeatures,
+                parent=feature,
+                invert=invert,
+                recurse=recurse,
+            ):
+                yield x
+
+
+def fetchParent(feature):
+    """Return the top-most ancestor of ``feature`` by following ``_parent``.
+
+    The walk stops at a feature that has no ``_parent`` attribute, whose
+    ``_parent`` is None, or whose ``_parent`` is the feature itself.  The last
+    case is how feature_lambda() marks top-level features; without it the
+    walk would never terminate on such a root.
+    """
+    current = feature
+    while True:
+        parent = getattr(current, "_parent", None)
+        if parent is None or parent is current:
+            return current
+        current = parent
+
+
+def feature_test_true(feature, **kwargs):
+    """Feature test that accepts everything; both arguments are ignored."""
+    accepted = True
+    return accepted
+
+
+def feature_test_type(feature, **kwargs):
+    """Case-insensitively compare ``feature.type`` with the requested type(s).
+
+    Pass ``type=<value>`` for a single comparison or ``types=<iterable>`` to
+    accept any of several; ``type`` wins when both are given.  Both sides are
+    converted with ``str()`` and upper-cased before comparing.
+
+    :raises Exception: when neither ``type`` nor ``types`` is supplied.
+    """
+    if "type" in kwargs:
+        wanted = [kwargs["type"]]
+    elif "types" in kwargs:
+        wanted = kwargs["types"]
+    else:
+        raise Exception("Incorrect feature_test_type call, need type or types")
+
+    return any(
+        str(feature.type).upper() == str(candidate).upper() for candidate in wanted
+    )
+
+
+def feature_test_qual_value(feature, **kwargs):
+    """Test qualifier values.
+
+    Return True when at least one value stored under
+    ``kwargs['qualifier']`` in ``feature.qualifiers`` is contained in
+    ``kwargs['attribute_list']``.  ``kwargs['qualifier']`` may be a single
+    qualifier name or a list of names; missing qualifiers count as empty.
+    """
+    requested = kwargs["qualifier"]
+    qualifier_names = requested if isinstance(requested, list) else [requested]
+
+    for qualifier_name in qualifier_names:
+        for value in feature.qualifiers.get(qualifier_name, []):
+            if value in kwargs["attribute_list"]:
+                return True
+    return False
+
+
+def feature_test_location(feature, **kwargs):
+    """Return True when ``kwargs['loc']`` lies within the feature's location.
+
+    Both ends are inclusive (``start <= loc <= end``).  When ``strand`` is
+    supplied, a feature on a different strand never matches.
+    """
+    strand_mismatch = (
+        "strand" in kwargs and feature.location.strand != kwargs["strand"]
+    )
+    if strand_mismatch:
+        return False
+
+    return feature.location.start <= kwargs["loc"] <= feature.location.end
+
+
+def feature_test_quals(feature, **kwargs):
+    """Test that a feature carries every requested qualifier.
+
+    Each keyword names a qualifier that must be present.  A value of None
+    only requires presence; otherwise the value is an iterable of fragments
+    and at least one fragment must occur (``in``) in at least one of the
+    feature's values for that qualifier.  All keywords must be satisfied.
+
+    Example::
+
+        a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']})
+
+        # Check if a contains a Note
+        feature_test_quals(a, Note=None)  # Returns True
+        feature_test_quals(a, Product=None)  # Returns False
+
+        # Check if a contains a note with specific value
+        feature_test_quals(a, Note=['ome'])  # Returns True
+
+        # Check if a contains a note with specific value
+        feature_test_quals(a, Note=['other'])  # Returns False
+    """
+    for key, wanted in kwargs.items():
+        if key not in feature.qualifiers:
+            return False
+
+        # Key is present, no value specified: this key is satisfied, but the
+        # remaining keys still have to be checked.
+        if wanted is None:
+            continue
+
+        # Otherwise at least one requested fragment has to occur in at least
+        # one of the qualifier's values.
+        found = any(
+            fragment in value
+            for value in feature.qualifiers[key]
+            for fragment in wanted
+        )
+        if not found:
+            return False
+
+    return True
+
+
+def feature_test_contains(feature, **kwargs):
+    """Return True when a position or a range lies strictly inside the feature.
+
+    Use ``index=<position>`` for a single position, or
+    ``range={'start': ..., 'end': ...}`` to require both ends to be inside.
+    The comparison excludes the feature's own start and end.
+
+    :raises RuntimeError: when neither ``index`` nor ``range`` is supplied.
+    """
+
+    def strictly_inside(position):
+        return feature.location.start < position < feature.location.end
+
+    if "index" in kwargs:
+        return strictly_inside(kwargs["index"])
+    if "range" in kwargs:
+        span = kwargs["range"]
+        return strictly_inside(span["start"]) and strictly_inside(span["end"])
+    raise RuntimeError("Must use index or range keyword")
+
+
+def get_id(feature=None, parent_prefix=None):
+    """Build a display id for ``feature`` from its naming qualifiers.
+
+    The first value of the first qualifier present among locus_tag, gene,
+    Gene, product, Product and Name is used, prefixed with
+    ``parent_prefix + "|"`` when a prefix is given.  When none of those
+    qualifiers exists, ``feature.id`` is returned as-is, without the prefix.
+    """
+    prefix = ""
+    if parent_prefix is not None:
+        prefix = parent_prefix + "|"
+
+    for key in ("locus_tag", "gene", "Gene", "product", "Product", "Name"):
+        if key in feature.qualifiers:
+            return prefix + feature.qualifiers[key][0]
+
+    return feature.id
+
+
+def get_gff3_id(gene):
+    """Return the first "Name" qualifier of ``gene``, or ``gene.id`` if absent."""
+    names = gene.qualifiers.get("Name", [gene.id])
+    return names[0]
+
+
+def ensure_location_in_bounds(start=0, end=0, parent_length=0):
+    """Shift ``start`` and ``end`` in steps of three towards ``[0, parent_length]``.
+
+    Each coordinate is first raised by 3 while it is negative and then
+    lowered by 3 while it exceeds ``parent_length``.  Moving in whole codons
+    prevents frameshift errors.  Returns the ``(start, end)`` tuple.
+    """
+
+    def shift_into_range(position):
+        while position < 0:
+            position += 3
+        while position > parent_length:
+            position -= 3
+        return position
+
+    return (shift_into_range(start), shift_into_range(end))
+
+
+def coding_genes(feature_list):
+    """Yield the gene features that have at least one CDS below them.
+
+    Genes are collected with genes(); a gene is kept when a feature_lambda()
+    search of its ``sub_features`` for type "CDS" returns anything.
+    """
+    for gene in genes(feature_list):
+        cds_hits = list(
+            feature_lambda(
+                gene.sub_features,
+                feature_test_type,
+                {"type": "CDS"},
+                subfeatures=False,
+            )
+        )
+        if cds_hits:
+            yield gene
+
+
+def genes(feature_list, feature_type="gene", sort=False):
+    """
+    Simple filter to extract features of one type from the feature set.
+
+    Yields every feature (at any depth) whose type matches ``feature_type``
+    ("gene" by default), with its sub_features intact.  With ``sort=True``
+    all matches are collected first and yielded ordered by
+    ``location.start``.
+    """
+    matches = feature_lambda(
+        feature_list, feature_test_type, {"type": feature_type}, subfeatures=True
+    )
+    if sort:
+        matches = sorted(matches, key=lambda feature: feature.location.start)
+
+    for match in matches:
+        yield match
+
+
+def wa_unified_product_name(feature):
+    """
+    Try and figure out a product name for ``feature``.
+
+    The first "product" value is preferred, then the first "Product" value.
+    When neither qualifier exists, the first "Name" value is used unless
+    is_uuid() considers it a UUID.  Returns None when nothing usable is
+    found.
+    """
+    quals = feature.qualifiers
+
+    # Manually applied tags.
+    capitalised = quals.get("Product", [None])
+    protein_product = quals.get("product", capitalised)[0]
+    if protein_product is not None:
+        return protein_product
+
+    # Neither tag is available: fall back to a non-UUID name, if any.
+    if "Name" in quals:
+        candidate = quals["Name"][0]
+        if not is_uuid(candidate):
+            protein_product = candidate
+
+    return protein_product
+
+
+def is_uuid(name):
+    """Return True when ``name`` has exactly four dashes and 36 characters."""
+    if name.count("-") != 4:
+        return False
+    return len(name) == 36
+
+
+def get_rbs_from(gene):
+    """Collect the features below ``gene`` that may represent its RBS.
+
+    The result concatenates, in this order: "RBS" features,
+    "Shine_Dalgarno_sequence" features, "regulatory" features whose
+    ``regulatory_class`` qualifier contains "ribosome_binding_site", and
+    "exon" features shorter than 10.  All of them are copies produced by
+    feature_lambda() with ``subfeatures=False``.
+    """
+
+    def children_of_type(type_name):
+        return list(
+            feature_lambda(
+                gene.sub_features,
+                feature_test_type,
+                {"type": type_name},
+                subfeatures=False,
+            )
+        )
+
+    # Normal RBS annotation types
+    annotated_rbs = children_of_type("RBS")
+    shine_dalgarno = children_of_type("Shine_Dalgarno_sequence")
+    # Short exons are treated as RBSs too (the original comment blames Apollo).
+    short_exons = [exon for exon in children_of_type("exon") if len(exon) < 10]
+    # These are more NCBI's style
+    regulatory = children_of_type("regulatory")
+    regulatory_rbs = list(
+        feature_lambda(
+            regulatory,
+            feature_test_quals,
+            {"regulatory_class": ["ribosome_binding_site"]},
+            subfeatures=False,
+        )
+    )
+    # Here's hoping you find just one ;)
+    return annotated_rbs + shine_dalgarno + regulatory_rbs + short_exons
+
+
+def nice_name(record):
+    """
+    Get the real name rather than NCBI IDs and so on.
+
+    When the record has exactly one "contig" feature, the first value of its
+    "organism" qualifier is returned; in every other case (including a
+    contig without that qualifier) the result is ``record.id``.
+    """
+    fallback = record.id
+    contigs = list(genes(record.features, feature_type="contig"))
+    if len(contigs) != 1:
+        return fallback
+    return contigs[0].qualifiers.get("organism", [fallback])[0]
+
+
+def fsort(it):
+    """Yield the features of ``it`` ordered by ``int(location.start)``."""
+    ordered = sorted(it, key=lambda feature: int(feature.location.start))
+    for feature in ordered:
+        yield feature
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/macros.xml Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,23 @@
+
+
+
+
+ python
+ biopython
+ cpt_gffparser
+
+
+
+
+
+
+
+
+
+
+ ln -s $genome_fasta genomeref.fa;
+
+
+ genomeref.fa
+
+
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phage_annotation_validator.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/phage_annotation_validator.py Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,1254 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim: set fileencoding=utf-8
+import os
+import sys
+import json
+import math
+import numpy
+import argparse
+import itertools
+import logging
+from gff3 import (
+ feature_lambda,
+ coding_genes,
+ genes,
+ get_gff3_id,
+ feature_test_location,
+ get_rbs_from,
+ nice_name,
+)
+from shinefind import NaiveSDCaller
+from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+from jinja2 import Environment, FileSystemLoader
+from cpt import MGAFinder
+
+# Root logging is configured at DEBUG; this script logs through the "pav" logger.
+logging.basicConfig(level=logging.DEBUG)
+log = logging.getLogger(name="pav")
+
+# Path to script, required because of Galaxy.
+SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
+# Path to the HTML template for the report
+
+# (threshold, message) pairs, listed from the highest threshold (100) down to 0.
+# NOTE(review): presumably the first entry whose threshold the report score
+# reaches is the one shown - the lookup is outside this chunk, confirm there.
+ENCOURAGEMENT = (
+ (100, "Perfection itself!"),
+ (90, "Amazing!"),
+ (80, "Not too bad, a few minor things to fix..."),
+ (70, "Some issues to address"),
+ (
+ 50,
+ """Issues detected!
Have you heard of the
+ CPT\'s Automated Phage Annotation
+ Pipeline?""",
+ ),
+ (
+ 0,
+ """MAJOR issues detected! Please consider using the
+ CPT\'s Automated Phage Annotation Pipeline""",
+ ),
+)
+
+
+def gen_qc_feature(start, end, message, strand=0, id_src=None, type_src="gene"):
+    """Build a gffSeqFeature that flags a QC finding between two coordinates.
+
+    ``message`` becomes the feature's "note" qualifier and ``type_src`` its
+    type; phase 0, score 0.0 and source "feature" are always set.  When
+    ``id_src`` is given, its id and "Name" qualifier are copied onto the new
+    feature.  ``start`` and ``end`` may be passed in either order; the
+    smaller one becomes the location start.
+    """
+    qualifiers = {"note": [message]}
+    kwargs = {
+        "qualifiers": qualifiers,
+        "type": type_src,
+        "strand": strand,
+        "phase": 0,
+        "score": 0.0,
+        "source": "feature",
+    }
+    if id_src is not None:
+        kwargs["id"] = id_src.id
+        qualifiers["ID"] = [id_src.id]
+        qualifiers["Name"] = id_src.qualifiers.get("Name", [])
+
+    low, high = (start, end) if end >= start else (end, start)
+    return gffSeqFeature(FeatureLocation(low, high, strand=strand), **kwargs)
+
+
+def __ensure_location_in_bounds(start=0, end=0, parent_length=0):
+    """Move ``start`` and ``end`` towards ``[0, parent_length]`` three at a time.
+
+    A negative coordinate is increased by 3 until it is no longer negative;
+    after that a coordinate above ``parent_length`` is decreased by 3 until
+    it no longer exceeds it.  Stepping by three prevents frameshift errors.
+    Returns ``(start, end)``.
+    """
+    adjusted = []
+    for coordinate in (start, end):
+        while coordinate < 0:
+            coordinate += 3
+        while coordinate > parent_length:
+            coordinate -= 3
+        adjusted.append(coordinate)
+    return (adjusted[0], adjusted[1])
+
+
+def missing_rbs(record, lookahead_min=5, lookahead_max=15):
+    """
+    Identify gene features with missing RBSs
+
+    For every coding gene without an annotated RBS, the region
+    ``lookahead_min``..``lookahead_max`` bases upstream of the gene is
+    extracted and searched for a Shine-Dalgarno sequence.  For genes with an
+    annotated RBS, the distance between the first RBS and the first CDS is
+    checked against ``lookahead_max``.
+
+    The returned genes carry the upstream sequence in the __upstream
+    attribute (only when no RBS was annotated) and a message in the
+    __message attribute.  Their location start is shifted by +1 before they
+    are returned.
+
+    :param record: sequence record; its ``features`` and ``seq`` are read.
+    :param lookahead_min: nearest upstream offset of the search window.
+    :param lookahead_max: farthest upstream offset of the search window, and
+                          the largest accepted RBS-to-CDS distance.
+    :return: tuple ``(good, bad, results, qc_features, any_rbss)``: counts of
+             acceptable and flagged genes, the flagged genes, the QC
+             features describing them, and whether any gene had an
+             annotated RBS.
+    """
+    results = []
+    good = 0
+    bad = 0
+    qc_features = []
+    sd_finder = NaiveSDCaller()
+
+    any_rbss = False
+
+    for gene in coding_genes(record.features):
+        # Check if there are RBSs, TODO: make this recursive. Each feature in
+        # gene.sub_features can also have sub_features.
+        rbss = get_rbs_from(gene)
+        # No RBS found
+        if len(rbss) == 0:
+            # Get the sequence lookahead_min to lookahead_max upstream
+            if gene.strand > 0:
+                start = gene.location.start - lookahead_max
+                end = gene.location.start - lookahead_min
+            else:
+                start = gene.location.end + lookahead_min
+                end = gene.location.end + lookahead_max
+            # We have to ensure the feature is ON the genome, otherwise we may
+            # be trying to access a location outside of the length of the
+            # genome, which would be bad.
+            (start, end) = __ensure_location_in_bounds(
+                start=start, end=end, parent_length=len(record)
+            )
+            # Temporary feature to extract sequence
+            tmp = gffSeqFeature(
+                FeatureLocation(start, end, strand=gene.strand), type="domain"
+            )
+            # Get the sequence
+            seq = str(tmp.extract(record.seq))
+            # Set the default properties
+            gene.__upstream = seq.lower()
+            gene.__message = "No RBS annotated, None found"
+
+            # Try and do an automated shinefind call
+            sds = sd_finder.list_sds(seq)
+            if len(sds) > 0:
+                sd = sds[0]
+                gene.__upstream = sd_finder.highlight_sd(
+                    seq.lower(), sd["start"], sd["end"]
+                )
+                gene.__message = "Unannotated but valid RBS"
+
+            qc_features.append(
+                gen_qc_feature(
+                    start, end, "Missing RBS", strand=gene.strand, id_src=gene, type_src="gene"
+                )
+            )
+
+            bad += 1
+            results.append(gene)
+            results[-1].location = FeatureLocation(
+                results[-1].location.start + 1,
+                results[-1].location.end,
+                results[-1].location.strand,
+            )
+        else:
+            if len(rbss) > 1:
+                # Report how many RBSs were found; the message previously
+                # printed the id of the first RBS in place of the count.
+                log.warning("%s RBSs found for gene %s", len(rbss), get_gff3_id(gene))
+            any_rbss = True
+            # get first RBS/CDS
+            cds = list(genes(gene.sub_features, feature_type="CDS"))[0]
+            rbs = rbss[0]
+
+            # Get the distance between the two
+            if gene.strand > 0:
+                distance = cds.location.start - rbs.location.end
+            else:
+                distance = rbs.location.start - cds.location.end
+
+            # If the RBS is too far away, annotate that
+            if distance > lookahead_max:
+                gene.__message = "RBS too far away (%s nt)" % distance
+
+                qc_features.append(
+                    gen_qc_feature(
+                        rbs.location.start,
+                        rbs.location.end,
+                        gene.__message,
+                        strand=gene.strand,
+                        id_src=gene,
+                        type_src="gene",
+                    )
+                )
+
+                bad += 1
+                results.append(gene)
+                results[-1].location = FeatureLocation(
+                    results[-1].location.start + 1,
+                    results[-1].location.end,
+                    results[-1].location.strand,
+                )
+            else:
+                good += 1
+
+    return good, bad, results, qc_features, any_rbss
+
+
+# modified from get_orfs_or_cdss.py
+# -----------------------------------------------------------
+
+
+def require_sd(data, record, chrom_start, sd_min, sd_max):
+    """Yield the putative genes that have a Shine-Dalgarno hit next to them.
+
+    For each tuple in ``data`` (start, end, strand, ...) a window of
+    ``sd_min``..``sd_max`` bases is taken before ``chrom_start + start`` on
+    the plus strand, or after ``chrom_start + end`` otherwise.  The window is
+    pulled onto the record, extracted and searched with NaiveSDCaller.  Genes
+    with at least one hit are yielded with the window's ``(start, end)``
+    appended to the tuple.
+    """
+    sd_finder = NaiveSDCaller()
+    for putative_gene in data:
+        strand = putative_gene[2]
+        if strand > 0:
+            anchor = chrom_start + putative_gene[0]
+            window = (anchor - sd_max, anchor - sd_min)
+        else:
+            anchor = chrom_start + putative_gene[1]
+            window = (anchor + sd_min, anchor + sd_max)
+
+        (start, end) = __ensure_location_in_bounds(
+            start=window[0], end=window[1], parent_length=len(record)
+        )
+        probe = gffSeqFeature(FeatureLocation(start, end, strand=strand), type="domain")
+        # Get the sequence
+        upstream = str(probe.extract(record.seq))
+        if len(sd_finder.list_sds(upstream)) > 0:
+            yield putative_gene + (start, end)
+
+
+def excessive_gap(
+    record,
+    excess=50,
+    excess_divergent=200,
+    min_gene=30,
+    slop=30,
+    lookahead_min=5,
+    lookahead_max=15,
+):
+    """
+    Identify excessive gaps between gene features.
+
+    CDSs closer together than ``excess`` are merged into contiguous regions.
+    The gaps between those regions, and between the regions and the two ends
+    of the record, are then reported when they are too large.  Each reported
+    gap (widened by ``slop`` on both sides) is searched for ORFs of at least
+    ``min_gene`` that also have a Shine-Dalgarno hit ``lookahead_min`` to
+    ``lookahead_max`` bases away.
+
+    :param record: sequence record; its ``features`` and ``seq`` are read.
+    :param excess: gap size (default 50) above which a gap between features
+                   on the same strand is reported.
+    :param excess_divergent: gap size (default 200) above which a gap is
+                   reported when the flanking features are on different
+                   strands or one flank has no feature.
+    :return: tuple ``(good, bad, better_results, qc_features)``.  Each entry
+             of ``better_results`` is ``[gap_start, gap_end, strand_before,
+             strand_after, number_of_putative_genes]``; ``bad`` counts the
+             gaps with at least one putative gene.
+    """
+
+    def ordered_feature(pos_a, pos_b, strand, **feature_kwargs):
+        # FeatureLocation needs start <= end, while the coordinates handled
+        # below may arrive with the larger one first.
+        low, high = (pos_a, pos_b) if pos_a <= pos_b else (pos_b, pos_a)
+        return gffSeqFeature(
+            FeatureLocation(low, high, strand=strand), **feature_kwargs
+        )
+
+    def first_gene_at(position):
+        # First gene whose location contains ``position``, or None.
+        return next(
+            feature_lambda(
+                sorted_genes,
+                feature_test_location,
+                {"loc": position},
+                subfeatures=False,
+            ),
+            None,
+        )
+
+    results = []
+    good = 0
+    bad = 0
+
+    contiguous_regions = []
+
+    sorted_genes = sorted(
+        genes(record.features), key=lambda feature: feature.location.start
+    )
+    if len(sorted_genes) == 0:
+        log.warning("NO GENES FOUND")
+        return good, bad, results, []
+
+    current_gene = None
+    for gene in sorted_genes:
+        # If the gene's start is contiguous to the "current_gene", then we
+        # extend current_gene
+        for cds in genes(gene.sub_features, feature_type="CDS"):
+            if current_gene is None:
+                current_gene = [int(cds.location.start), int(cds.location.end)]
+
+            if cds.location.start <= current_gene[1] + excess:
+                # Don't want to decrease size
+                if int(cds.location.end) >= current_gene[1]:
+                    current_gene[1] = int(cds.location.end)
+            else:
+                # If it's discontiguous, we append the region and clear.
+                contiguous_regions.append(current_gene)
+                current_gene = [int(cds.location.start), int(cds.location.end)]
+
+    if current_gene is None:
+        # Genes exist but none has a CDS: there is no region to measure gaps
+        # against, and indexing a None region below would raise.
+        log.warning("NO CDS FOUND")
+        return good, bad, results, []
+
+    # This generally expected that annotations would NOT continue unto the end
+    # of the genome, however that's a bug, and we can make it here with an
+    # empty contiguous_regions list
+    contiguous_regions.append(current_gene)
+
+    for i in range(len(contiguous_regions) + 1):
+        if i == 0:
+            a = (1, 1)
+            b = contiguous_regions[i]
+        elif i >= len(contiguous_regions):
+            a = contiguous_regions[i - 1]
+            b = (len(record.seq), None)
+        else:
+            a = contiguous_regions[i - 1]
+            b = contiguous_regions[i]
+
+        gap_size = abs(b[0] - a[1])
+
+        if gap_size > min(excess, excess_divergent):
+            # Either lookup is None when no gene covers the position, which
+            # triggers at the ends of the genome.
+            a_feat = first_gene_at(a[1])
+            b_feat = first_gene_at(b[0])
+
+            result_obj = [
+                a[1],
+                b[0],
+                None if not a_feat else a_feat.location.strand,
+                None if not b_feat else b_feat.location.strand,
+            ]
+
+            if a_feat is None or b_feat is None:
+                if gap_size > excess_divergent:
+                    results.append(result_obj)
+            else:
+                if (
+                    a_feat.location.strand == b_feat.location.strand
+                    and gap_size > excess
+                ):
+                    results.append(result_obj)
+                elif (
+                    a_feat.location.strand != b_feat.location.strand
+                    and gap_size > excess_divergent
+                ):
+                    results.append(result_obj)
+
+    better_results = []
+    qc_features = []
+    of = MGAFinder(11, "CDS", "closed", min_gene)
+    # of = OrfFinder(11, 'CDS', 'closed', min_gene)
+
+    for result_obj in results:
+        start = result_obj[0]
+        end = result_obj[1]
+        f = gen_qc_feature(
+            start, end, "Excessive gap, %s bases" % abs(end - start), type_src="gene"
+        )
+        qc_features.append(f)
+        # NOTE(review): when start < slop the lower slice bound is negative;
+        # confirm how the record slice treats it.  The ORF offsets below are
+        # relative to this slice but are added to ``start`` - confirm intended.
+        putative_genes = of.putative_genes_in_sequence(
+            str(record[start - slop : end + slop].seq)
+        )
+        putative_genes = list(
+            require_sd(putative_genes, record, start, lookahead_min, lookahead_max)
+        )
+        for putative_gene in putative_genes:
+            # (0, 33, 1, 'ATTATTTTATCAAAACGCTTTACAATCTTTTAG', 'MILSKRFTIF', 123123, 124324)
+            strand = putative_gene[2]
+            possible_gene_start = start + putative_gene[0]
+            possible_gene_end = start + putative_gene[1]
+
+            possible_cds = ordered_feature(
+                possible_gene_start, possible_gene_end, strand, type="CDS"
+            )
+
+            # Now we adjust our boundaries for the RBS that's required
+            # There are only two cases, the rbs is upstream of it, or downstream
+            if putative_gene[5] < possible_gene_start:
+                possible_gene_start = putative_gene[5]
+            else:
+                possible_gene_end = putative_gene[6]
+
+            possible_rbs = ordered_feature(
+                putative_gene[5],
+                putative_gene[6],
+                strand,
+                type="Shine_Dalgarno_sequence",
+            )
+
+            possible_gene = ordered_feature(
+                possible_gene_start,
+                possible_gene_end,
+                strand,
+                type="gene",
+                qualifiers={"note": ["Possible gene"]},
+            )
+            possible_gene.sub_features = [possible_rbs, possible_cds]
+            qc_features.append(possible_gene)
+
+        better_results.append(result_obj + [len(putative_genes)])
+
+    # Bad gaps are those with more than zero possible genes found.  The count
+    # is the last element (index 4); index 2 is the upstream strand, which
+    # may be None.
+    bad = len([x for x in better_results if x[4] > 0])
+    # Generally taking "good" here as every possible gap in the genome
+    # Thus, good is TOTAL - gaps
+    good = len(sorted_genes) + 1 - bad
+    # and bad is just gaps
+    return good, bad, better_results, qc_features
+
+
+def phi(x):
+ """Standard phi function used in calculation of normal distribution"""
+ return math.exp(-1 * math.pi * x * x)
+
+
+def norm(x, mean=0, sd=1):
+ """
+ Normal distribution. Given an x position, a mean, and a standard
+ deviation, calculate the "y" value. Useful for score scaling
+
+ Modified to multiply by SD. This means even at sd=5, norm(x, mean) where x = mean => 1, rather than 1/5.
+ """
+ return (1 / float(sd)) * phi(float(x - mean) / float(sd)) * sd
+
+
+def coding_density(record, mean=92.5, sd=20):
+ """
+ Find coding density in the genome
+ """
+ feature_lengths = 0
+
+ for gene_a in coding_genes(record.features):
+ feature_lengths += sum(
+ [len(x) for x in genes(gene_a.sub_features, feature_type="CDS")]
+ )
+
+ avgFeatLen = float(feature_lengths) / float(len(record.seq))
+ return int(norm(100 * avgFeatLen, mean=mean, sd=sd) * 100), int(100 * avgFeatLen)
+
+
+def exact_coding_density(record, mean=92.5, sd=20):
+ """
+ Find exact coding density in the genome
+ """
+ data = numpy.zeros(len(record.seq))
+
+ for gene_a in coding_genes(record.features):
+ for cds in genes(gene_a.sub_features, feature_type="CDS"):
+ for i in range(cds.location.start, cds.location.end + 1):
+ data[i - 1] = 1
+
+ return float(sum(data)) / len(data)
+
+
+def excessive_overlap(record, excess=15, excess_divergent=30):
+ """
+ Find excessive overlaps in the genome, where excessive is defined as 15
+ bases for same strand, and 30 for divergent translation.
+
+ Does a product of all the top-level features in the genome, and calculates
+ gaps.
+ """
+ results = []
+ bad = 0
+ qc_features = []
+
+ for (gene_a, gene_b) in itertools.combinations(coding_genes(record.features), 2):
+ # Get the CDS from the subfeature list.
+ # TODO: not recursive.
+ cds_a = [x for x in genes(gene_a.sub_features, feature_type="CDS")]
+ cds_b = [x for x in genes(gene_b.sub_features, feature_type="CDS")]
+
+ if len(cds_a) == 0:
+ log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_a))
+ continue
+
+ if len(cds_b) == 0:
+ log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_b))
+ continue
+
+ cds_a = cds_a[0]
+ cds_b = cds_b[0]
+
+ # Set of locations that are included in the CDS of A and the
+ # CDS of B
+ cas = set(range(cds_a.location.start, cds_a.location.end))
+ cbs = set(range(cds_b.location.start, cds_b.location.end))
+
+ # Here we calculate the intersection between the two sets, and
+ # if it's larger than our excessive size, we know that they're
+ # overlapped
+ ix = cas.intersection(cbs)
+
+ if (cds_a.location.strand == cds_b.location.strand and len(ix) >= excess) or (
+ cds_a.location.strand != cds_b.location.strand
+ and len(ix) >= excess_divergent
+ ):
+ bad += float(len(ix)) / float(min(excess, excess_divergent))
+ qc_features.append(
+ gen_qc_feature(min(ix), max(ix), "Excessive Overlap", id_src=gene_a, type_src="gene")
+ )
+ results.append((gene_a, gene_b, min(ix), max(ix)))
+
+ # Good isn't accurate here. It's a triangle number and just ugly, but we
+ # don't care enough to fix it.
+ good = len(list(coding_genes(record.features)))
+ good = int(good - bad)
+ if good < 0:
+ good = 0
+ return good, int(bad), results, qc_features
+
+
+def get_encouragement(score):
+ """Some text telling the user how they did
+ """
+ for encouragement in ENCOURAGEMENT:
+ if score > encouragement[0]:
+ return encouragement[1]
+ return ENCOURAGEMENT[-1][1]
+
+
+def genome_overview(record):
+ """Genome overview
+ """
+ data = {
+ "genes": {
+ "count": 0,
+ "bases": len(record.seq),
+ "density": 0, # genes / kb
+ "avg_len": [],
+ "comp": {"A": 0, "C": 0, "G": 0, "T": 0},
+ },
+ "overall": {
+ "comp": {
+ "A": record.seq.count("A") + record.seq.count("a"),
+ "C": record.seq.count("C") + record.seq.count("c"),
+ "G": record.seq.count("G") + record.seq.count("g"),
+ "T": record.seq.count("T") + record.seq.count("t"),
+ },
+ "gc": 0,
+ },
+ }
+ gene_features = list(coding_genes(record.features))
+ data["genes"]["count"] = len(gene_features)
+
+ for feat in gene_features:
+ data["genes"]["comp"]["A"] += feat.extract(record).seq.count("A") + feat.extract(record).seq.count("a")
+ data["genes"]["comp"]["C"] += feat.extract(record).seq.count("C") + feat.extract(record).seq.count("c")
+ data["genes"]["comp"]["T"] += feat.extract(record).seq.count("T") + feat.extract(record).seq.count("t")
+ data["genes"]["comp"]["G"] += feat.extract(record).seq.count("G") + feat.extract(record).seq.count("g")
+ #data["genes"]["bases"] += len(feat)
+ data["genes"]["avg_len"].append(len(feat))
+
+ data["genes"]["avg_len"] = float(sum(data["genes"]["avg_len"])) / len(gene_features)
+ data["overall"]["gc"] = float(
+ data["overall"]["comp"]["G"] + data["overall"]["comp"]["C"]
+ ) / len(record.seq)
+ return data
+
+
+def find_morons(record):
+ """Locate morons in the genome
+
+ Don't even know why...
+
+ TODO: remove? Idk.
+ """
+ results = []
+ good = 0
+ bad = 0
+
+ gene_features = list(coding_genes(record.features))
+ for i, gene in enumerate(gene_features):
+ two_left = gene_features[i - 2 : i]
+ two_right = gene_features[i + 1 : i + 1 + 2]
+ strands = [x.strand for x in two_left] + [x.strand for x in two_right]
+ anticon = [x for x in strands if x != gene.strand]
+
+ if len(anticon) == 4:
+ has_rbs = [x.type == "Shine_Dalgarno_sequence" for x in gene.sub_features]
+ if any(has_rbs):
+ rbs = [
+ x for x in gene.sub_features if x.type == "Shine_Dalgarno_sequence"
+ ][0]
+ rbs_msg = str(rbs.extract(record.seq))
+ else:
+ rbs_msg = "No RBS Available"
+ results.append((gene, two_left, two_right, rbs_msg))
+ bad += 1
+ else:
+ good += 1
+ return good, bad, results, []
+
+
+def bad_gene_model(record):
+ """Find features without product
+ """
+ results = []
+ good = 0
+ bad = 0
+ qc_features = []
+
+ for gene in coding_genes(record.features):
+ exons = [
+ x for x in genes(gene.sub_features, feature_type="exon") if len(x) > 10
+ ]
+ CDSs = [x for x in genes(gene.sub_features, feature_type="CDS")]
+ if len(exons) >= 1 and len(CDSs) >= 1:
+ if len(exons) != len(CDSs):
+ results.append(
+ (
+ get_gff3_id(gene),
+ None,
+ None,
+ "Mismatched number of exons and CDSs in gff3 representation",
+ )
+ )
+ qc_features.append(
+ gen_qc_feature(
+ gene.location.start,
+ gene.location.end,
+ "Mismatched number of exons and CDSs in gff3 representation",
+ strand=gene.strand,
+ id_src=gene,
+ type_src="gene"
+ )
+ )
+ bad += 1
+ else:
+ for (exon, cds) in zip(
+ sorted(exons, key=lambda x: x.location.start),
+ sorted(CDSs, key=lambda x: x.location.start),
+ ):
+ if len(exon) != len(cds):
+ results.append(
+ (
+ get_gff3_id(gene),
+ exon,
+ cds,
+ "CDS does not extend to full length of gene",
+ )
+ )
+ qc_features.append(
+ gen_qc_feature(
+ exon.location.start,
+ exon.location.end,
+ "CDS does not extend to full length of gene",
+ strand=exon.strand,
+ id_src=gene,
+ type_src="CDS"
+ )
+ )
+ bad += 1
+ else:
+ good += 1
+ else:
+ log.warn("Could not handle %s, %s", exons, CDSs)
+ results.append(
+ (
+ get_gff3_id(gene),
+ None,
+ None,
+ "{0} exons, {1} CDSs".format(len(exons), len(CDSs)),
+ )
+ )
+
+ return good, len(results) + bad, results, qc_features
+
+
+def weird_starts(record):
+ """Find features without product
+ """
+ good = 0
+ bad = 0
+ qc_features = []
+ results = []
+
+ overall = {}
+ for gene in coding_genes(record.features):
+ seq = [x for x in genes(gene.sub_features, feature_type="CDS")]
+ if len(seq) == 0:
+ log.warn("No CDS for gene %s", get_gff3_id(gene))
+ continue
+ else:
+ seq = seq[0]
+
+ seq_str = str(seq.extract(record.seq))
+ start_codon = seq_str[0:3]
+ if len(seq_str) < 3:
+ sys.stderr.write("Fatal Error: CDS of length less than 3 at " + str(seq.location) + '\n')
+ exit(2)
+# if len(seq_str) % 3 != 0:
+# if len(seq_str) < 3:
+# stop_codon = seq_str[-(len(seq_str))]
+# else:
+# stop_codon = seq_str[-3]
+#
+# log.warn("CDS at %s length is not a multiple of three (Length = %d)", get_gff3_id(gene), len(seq_str))
+# seq.__error = "Bad CDS Length"
+# results.append(seq)
+# qc_features.append(
+# gen_qc_feature(
+# s, e, "Bad Length", strand=seq.strand, id_src=gene
+# )
+# )
+# bad += 1
+# seq.__start = start_codon
+# seq.__stop = stop_codon
+# continue
+
+ stop_codon = seq_str[-3]
+ seq.__start = start_codon
+ seq.__stop = stop_codon
+ if start_codon not in overall:
+ overall[start_codon] = 1
+ else:
+ overall[start_codon] += 1
+
+ if start_codon not in ("ATG", "TTG", "GTG"):
+ log.warn("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene))
+ seq.__error = "Unusual start codon %s" % start_codon
+
+ s = 0
+ e = 0
+ if seq.strand > 0:
+ s = seq.location.start
+ e = seq.location.start + 3
+ else:
+ s = seq.location.end
+ e = seq.location.end - 3
+
+ results.append(seq)
+ results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
+ qc_features.append(
+ gen_qc_feature(
+ s, e, "Weird start codon", strand=seq.strand, id_src=gene, type_src="gene"
+ )
+ )
+ bad += 1
+ else:
+ good += 1
+
+ return good, bad, results, qc_features, overall
+
+
+def missing_genes(record):
+ """Find features without product
+ """
+ results = []
+ good = 0
+ bad = 0
+ qc_features = []
+
+ for gene in coding_genes(record.features):
+ if gene.qualifiers.get("cpt_source", [None])[0] == "CPT_GENE_MODEL_CORRECTION":
+ results.append(gene)
+ bad += 1
+ else:
+ good += 1
+
+ return good, bad, results, qc_features
+
+
+def gene_model_correction_issues(record):
+ """Find features that have issues from the gene model correction step.
+ These have qualifiers beginning with CPT_GMS
+ """
+ results = []
+ good = 0
+ bad = 0
+ qc_features = []
+
+ # For each gene
+ for gene in coding_genes(record.features):
+ # Get the list of child CDSs
+ cdss = [x for x in genes(gene.sub_features, feature_type="CDS")]
+ # And our matching qualifiers
+ gene_data = [(k, v) for (k, v) in gene.qualifiers.items() if k == "cpt_gmc"]
+ # If there are problems with ONLY the parent, let's complain
+ local_results = []
+ local_qc_features = []
+ for x in gene_data:
+ if "Missing Locus Tag" in x[1]:
+ # Missing locus tag is an either or thing, if it hits here
+ # there shouldn't be anything else wrong with it.
+
+ # Obviously missing so we remove it
+ gene.qualifiers["locus_tag"] = [""]
+ # Translation from bp_genbank2gff3.py
+ cdss[0].qualifiers["locus_tag"] = cdss[0].qualifiers["Name"]
+ # Append our results
+ local_results.append((gene, cdss[0], "Gene is missing a locus_tag"))
+ local_qc_features.append(
+ gen_qc_feature(
+ gene.location.start,
+ gene.location.end,
+ "Gene is missing a locus_tag",
+ strand=gene.strand,
+ type_src="gene"
+ )
+ )
+
+ # We need to alert on any child issues as well.
+ for cds in cdss:
+ cds_data = [
+ (k, v[0]) for (k, v) in cds.qualifiers.items() if k == "cpt_gmc"
+ ]
+ if len(gene_data) == 0 and len(cds_data) == 0:
+ # Alles gut
+ pass
+ else:
+ for _, problem in cds_data:
+ if problem == "BOTH Missing Locus Tag":
+ gene.qualifiers["locus_tag"] = [""]
+ cds.qualifiers["locus_tag"] = [""]
+ local_results.append(
+ (gene, cds, "Both gene and CDS are missing locus tags")
+ )
+ local_qc_features.append(
+ gen_qc_feature(
+ cds.location.start,
+ cds.location.end,
+ "CDS is missing a locus_tag",
+ strand=cds.strand,
+ type_src="CDS"
+ )
+ )
+ local_qc_features.append(
+ gen_qc_feature(
+ gene.location.start,
+ gene.location.end,
+ "Gene is missing a locus_tag",
+ strand=gene.strand,
+ type_src="gene"
+ )
+ )
+ elif problem == "Different locus tag from associated gene.":
+ gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
+ cds.qualifiers["locus_tag"] = cds.qualifiers["cpt_gmc_locus"]
+ local_results.append(
+ (gene, cds, "Gene and CDS have differing locus tags")
+ )
+ local_qc_features.append(
+ gen_qc_feature(
+ gene.location.start,
+ gene.location.end,
+ "Gene and CDS have differing locus tags",
+ strand=gene.strand,
+ type_src="gene"
+ )
+ )
+ elif problem == "Missing Locus Tag":
+ # Copy this over
+ gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
+ # This one is missing
+ cds.qualifiers["locus_tag"] = [""]
+ local_results.append((gene, cds, "CDS is missing a locus_tag"))
+ local_qc_features.append(
+ gen_qc_feature(
+ cds.location.start,
+ cds.location.end,
+ "CDS is missing a locus_tag",
+ strand=cds.strand,
+ type_src="CDS"
+ )
+ )
+ else:
+ log.warn("Cannot handle %s", problem)
+
+ if len(local_results) > 0:
+ bad += 1
+ else:
+ good += 1
+
+ qc_features.extend(local_qc_features)
+ results.extend(local_results)
+ return good, bad, results, qc_features
+
+
+def missing_tags(record):
+ """Find features without product
+ """
+ results = []
+ good = 0
+ bad = 0
+ qc_features = []
+
+ for gene in coding_genes(record.features):
+ cds = [x for x in genes(gene.sub_features, feature_type="CDS")]
+ if len(cds) == 0:
+ log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene))
+ continue
+
+ cds = cds[0]
+
+ if "product" not in cds.qualifiers:
+ log.info("Missing product tag on %s", get_gff3_id(gene))
+ qc_features.append(
+ gen_qc_feature(
+ cds.location.start,
+ cds.location.end,
+ "Missing product tag",
+ strand=cds.strand,
+ type_src="CDS"
+ )
+ )
+ results.append(cds)
+ bad += 1
+ else:
+ good += 1
+
+ return good, bad, results, qc_features
+
+
+def evaluate_and_report(
+ annotations,
+ genome,
+ gff3=None,
+ tbl=None,
+ sd_min=5,
+ sd_max=15,
+ min_gene_length=30,
+ excessive_gap_dist=50,
+ excessive_gap_divergent_dist=200,
+ excessive_overlap_dist=25,
+ excessive_overlap_divergent_dist=50,
+ reportTemplateName="phage_annotation_validator.html",
+):
+ """
+ Generate our HTML evaluation of the genome
+ """
+ # Get features from GFF file
+ seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
+ # Get the first GFF3 record
+ # TODO: support multiple GFF3 files.
+ mostFeat = 0
+ for rec in list(gffParse(annotations, base_dict=seq_dict)):
+ if len(rec.features) > mostFeat:
+ mostFeat = len(rec.features)
+ record = rec
+
+ gff3_qc_record = SeqRecord(record.id, id=record.id)
+ gff3_qc_record.features = []
+ gff3_qc_features = []
+
+ log.info("Locating missing RBSs")
+ # mb_any = "did they annotate ANY rbss? if so, take off from score."
+ mb_good, mb_bad, mb_results, mb_annotations, mb_any = missing_rbs(
+ record, lookahead_min=sd_min, lookahead_max=sd_max
+ )
+ gff3_qc_features += mb_annotations
+
+ log.info("Locating excessive gaps")
+ eg_good, eg_bad, eg_results, eg_annotations = excessive_gap(
+ record,
+ excess=excessive_gap_dist,
+ excess_divergent=excessive_gap_divergent_dist,
+ min_gene=min_gene_length,
+ slop=excessive_overlap_dist,
+ lookahead_min=sd_min,
+ lookahead_max=sd_max,
+ )
+ gff3_qc_features += eg_annotations
+
+ log.info("Locating excessive overlaps")
+ eo_good, eo_bad, eo_results, eo_annotations = excessive_overlap(
+ record,
+ excess=excessive_overlap_dist,
+ excess_divergent=excessive_overlap_divergent_dist,
+ )
+ gff3_qc_features += eo_annotations
+
+ log.info("Locating morons")
+ mo_good, mo_bad, mo_results, mo_annotations = find_morons(record)
+ gff3_qc_features += mo_annotations
+
+ log.info("Locating missing tags")
+ mt_good, mt_bad, mt_results, mt_annotations = missing_tags(record)
+ gff3_qc_features += mt_annotations
+
+ log.info("Locating missing gene features")
+ mg_good, mg_bad, mg_results, mg_annotations = missing_genes(record)
+ gff3_qc_features += mg_annotations
+
+ log.info("Determining coding density")
+ cd, cd_real = coding_density(record)
+
+ log.info("Locating weird starts")
+ ws_good, ws_bad, ws_results, ws_annotations, ws_overall = weird_starts(record)
+ gff3_qc_features += ws_annotations
+
+ log.info("Locating bad gene models")
+ gm_good, gm_bad, gm_results, gm_annotations = bad_gene_model(record)
+ if gm_good + gm_bad == 0:
+ gm_bad = 1
+
+ log.info("Locating more bad gene models")
+ gmc_good, gmc_bad, gmc_results, gmc_annotations = gene_model_correction_issues(
+ record
+ )
+ if gmc_good + gmc_bad == 0:
+ gmc_bad = 1
+
+ good_scores = [eg_good, eo_good, mt_good, ws_good, gm_good, gmc_good]
+ bad_scores = [eg_bad, eo_bad, mt_bad, ws_bad, gm_bad, gmc_bad]
+
+ # Only if they tried to annotate RBSs do we consider them.
+ if mb_any:
+ good_scores.append(mb_good)
+ bad_scores.append(mb_bad)
+ subscores = []
+
+ for (g, b) in zip(good_scores, bad_scores):
+ if g + b == 0:
+ s = 0
+ else:
+ s = int(100 * float(g) / (float(b) + float(g)))
+ subscores.append(s)
+ subscores.append(cd)
+
+ score = int(float(sum(subscores)) / float(len(subscores)))
+
+ # This is data that will go into our HTML template
+ kwargs = {
+ "upstream_min": sd_min,
+ "upstream_max": sd_max,
+ "record_name": record.id,
+ "record_nice_name": nice_name(record),
+ "params": {
+ "sd_min": sd_min,
+ "sd_max": sd_max,
+ "min_gene_length": min_gene_length,
+ "excessive_gap_dist": excessive_gap_dist,
+ "excessive_gap_divergent_dist": excessive_gap_divergent_dist,
+ "excessive_overlap_dist": excessive_overlap_dist,
+ "excessive_overlap_divergent_dist": excessive_overlap_divergent_dist,
+ },
+ "score": score,
+ "encouragement": get_encouragement(score),
+ "genome_overview": genome_overview(record),
+ "rbss_annotated": mb_any,
+ "missing_rbs": mb_results,
+ "missing_rbs_good": mb_good,
+ "missing_rbs_bad": mb_bad,
+ "missing_rbs_score": 0
+ if mb_good + mb_bad == 0
+ else (100 * mb_good / (mb_good + mb_bad)),
+ "excessive_gap": eg_results,
+ "excessive_gap_good": eg_good,
+ "excessive_gap_bad": eg_bad,
+ "excessive_gap_score": 0
+ if eo_good + eo_bad == 0
+ else (100 * eo_good / (eo_good + eo_bad)),
+ "excessive_overlap": eo_results,
+ "excessive_overlap_good": eo_good,
+ "excessive_overlap_bad": eo_bad,
+ "excessive_overlap_score": 0
+ if eo_good + eo_bad == 0
+ else (100 * eo_good / (eo_good + eo_bad)),
+ "morons": mo_results,
+ "morons_good": mo_good,
+ "morons_bad": mo_bad,
+ "morons_score": 0
+ if mo_good + mo_bad == 0
+ else (100 * mo_good / (mo_good + mo_bad)),
+ "missing_tags": mt_results,
+ "missing_tags_good": mt_good,
+ "missing_tags_bad": mt_bad,
+ "missing_tags_score": 0
+ if mt_good + mt_bad == 0
+ else (100 * mt_good / (mt_good + mt_bad)),
+ "missing_genes": mg_results,
+ "missing_genes_good": mg_good,
+ "missing_genes_bad": mg_bad,
+ "missing_genes_score": 0
+ if mg_good + mg_bad == 0
+ else (100 * mg_good / (mg_good + mg_bad)),
+ "weird_starts": ws_results,
+ "weird_starts_good": ws_good,
+ "weird_starts_bad": ws_bad,
+ "weird_starts_overall": ws_overall,
+ "weird_starts_overall_sorted_keys": sorted(
+ ws_overall, reverse=True, key=lambda x: ws_overall[x]
+ ),
+ "weird_starts_score": 0
+ if ws_good + ws_bad == 0
+ else (100 * ws_good / (ws_good + ws_bad)),
+ "gene_model": gm_results,
+ "gene_model_good": gm_good,
+ "gene_model_bad": gm_bad,
+ "gene_model_score": 0
+ if gm_good + gm_bad == 0
+ else (100 * gm_good / (gm_good + gm_bad)),
+ "gene_model_correction": gmc_results,
+ "gene_model_correction_good": gmc_good,
+ "gene_model_correction_bad": gmc_bad,
+ "gene_model_correction_score": 0
+ if gmc_good + gmc_bad == 0
+ else (100 * gmc_good / (gmc_good + gmc_bad)),
+ "coding_density": cd,
+ "coding_density_exact": exact_coding_density(record),
+ "coding_density_real": cd_real,
+ "coding_density_score": cd,
+ }
+
+ with open(tbl, "w") as handle:
+ kw_subset = {}
+ for key in kwargs:
+ if (
+ key in ("score", "record_name")
+ or "_good" in key
+ or "_bad" in key
+ or "_overall" in key
+ ):
+ kw_subset[key] = kwargs[key]
+ json.dump(kw_subset, handle)
+
+ with open(gff3, "w") as handle:
+ gff3_qc_record.features = gff3_qc_features
+ gff3_qc_record.annotations = {}
+ gffWrite([gff3_qc_record], handle)
+
+ def nice_strand(direction):
+ # It is somehow possible for whole gffSeqFeature objects to end up in here, apparently at the gene level
+ if "SeqFeature" in str(type(direction)):
+ direction = direction.location.strand
+ if direction > 0:
+ return "→"#.decode("utf-8")
+ else:
+ return "←"#.decode("utf-8")
+
+ def nice_strand_tex(direction):
+ if "SeqFeature" in str(type(direction)):
+ direction = direction.location.strand
+ if direction > 0:
+ return "$\\rightarrow$"
+ else:
+ return "$\\leftarrow$"
+
+ def texify(data):
+ return data.replace("_", "\\_").replace("$", "\\$")
+
+ def length(data):
+ return len(data)
+
+ def my_encode(data):
+ return str(data)#.encode("utf-8")
+
+ def my_decode(data):
+ # For production
+ return str(data)#.decode("utf-8")
+ # For local testing. No, I do not understand.
+ return str(data)#.encode("utf-8")).decode("utf-8")
+
+ env = Environment(
+ loader=FileSystemLoader(SCRIPT_PATH), trim_blocks=True, lstrip_blocks=True
+ )
+ env.filters.update(
+ {
+ "nice_id": get_gff3_id,
+ "nice_strand": nice_strand,
+ "nice_strand_tex": nice_strand_tex,
+ "texify": texify,
+ "length": length,
+ "encode": my_encode,
+ "decode": my_decode,
+ }
+ )
+ tpl = env.get_template(reportTemplateName)
+ return tpl.render(**kwargs)#.encode("utf-8")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="rebase gff3 features against parent locations", epilog=""
+ )
+ parser.add_argument(
+ "annotations", type=argparse.FileType("r"), help="Parent GFF3 annotations"
+ )
+ parser.add_argument("genome", type=argparse.FileType("r"), help="Genome Sequence")
+ parser.add_argument(
+ "--gff3", type=str, help="GFF3 Annotations", default="qc_annotations.gff3"
+ )
+ parser.add_argument(
+ "--tbl",
+ type=str,
+ help="Table for noninteractive parsing",
+ default="qc_results.json",
+ )
+
+ parser.add_argument(
+ "--sd_min",
+ type=int,
+ help="Minimum distance from gene start for an SD to be",
+ default=5,
+ )
+ parser.add_argument(
+ "--sd_max",
+ type=int,
+ help="Maximum distance from gene start for an SD to be",
+ default=15,
+ )
+
+ parser.add_argument(
+ "--min_gene_length",
+ type=int,
+ help="Minimum length for a putative gene call (AAs)",
+ default=30,
+ )
+
+ parser.add_argument(
+ "--excessive_overlap_dist",
+ type=int,
+ help="Excessive overlap for genes in same direction",
+ default=25,
+ )
+ parser.add_argument(
+ "--excessive_overlap_divergent_dist",
+ type=int,
+ help="Excessive overlap for genes in diff directions",
+ default=50,
+ )
+
+ parser.add_argument(
+ "--excessive_gap_dist",
+ type=int,
+ help="Maximum distance between two genes",
+ default=40,
+ )
+ parser.add_argument(
+ "--excessive_gap_divergent_dist",
+ type=int,
+ help="Maximum distance between two divergent genes",
+ default=200,
+ )
+
+ parser.add_argument(
+ "--reportTemplateName",
+ help="Report template file name",
+ default="phageqc_report_full.html",
+ )
+
+ args = parser.parse_args()
+
+ sys.stdout.write(evaluate_and_report(**vars(args)))
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phage_annotation_validator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/phage_annotation_validator.xml Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,102 @@
+
+
+ validate phage annotations
+
+ macros.xml
+ cpt-macros.xml
+
+
+ python
+ biopython
+ cpt_gffparser
+ python-levenshtein
+ regex
+ metagene_annotator
+ jinja2
+ numpy
+
+ $output;
+
+#if ".tex" in str($report_format):
+ mv $output tmp.tex;
+ docker run --rm -i --user="1002:1002" --net=none -v \$PWD:/data blang/latex pdflatex tmp.tex &&
+ docker run --rm -i --user="1002:1002" --net=none -v \$PWD:/data blang/latex pdflatex tmp.tex &&
+ mv tmp.pdf $output;
+#end if
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phageqc_report_464.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/phageqc_report_464.html Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,419 @@
+
+
+
+
+
+
+
+
+
+ [BICH464] Phage QC on {{record_name}} - {{score}}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Phage {{record_name}}
+
+
+
+
+
+
Genome Overview
+
Genes
+
+ - Count: {{ genome_overview.genes.count }}
+ - Bases: {{ genome_overview.genes.bases }}
+ - Average Length: {{ genome_overview.genes.avg_len | round | int}}
+ - Coding Density: {{ '%0.2f' % (100 * coding_density_exact) }}%
+ - Composition
+
+ - A {{ genome_overview.genes.comp.A }}
+ - C {{ genome_overview.genes.comp.C }}
+ - T {{ genome_overview.genes.comp.T }}
+ - G {{ genome_overview.genes.comp.G }}
+
+
+
+
Overall
+
+ - %GC: {{ '%0.2f' % (100 * genome_overview.overall.gc) }}%
+ - Composition
+
+ - A {{ genome_overview.overall.comp.A }}
+ - C {{ genome_overview.overall.comp.C }}
+ - T {{ genome_overview.overall.comp.T }}
+ - G {{ genome_overview.overall.comp.G }}
+
+
+
+
+
+
+
+
Genes Missing RBS {{missing_rbs_good}} / {{missing_rbs_good + missing_rbs_bad}}
+
The following genes have issues with their RBS.
+ {% if not rbss_annotated %}
+
+ Since you have not annotated any possible RBSs, this section does not count against your overall score.
+
+ {% endif %}
+
+
+
+
+ ID |
+ Location |
+ Error |
+ Upstream (-{{upstream_max}} .. -{{upstream_min}}) |
+
+
+
+ {% for row in missing_rbs %}
+ {% if "None found" in row.__message %}
+
+ {{row | nice_id | decode}} |
+ {{row.location.start}}..{{row.location.end}} [{{row.strand}}] |
+ None found |
+ {{row.__upstream }} |
+
+ {% endif %}
+ {% endfor %}
+
+
+
+
+
Start Codon Usage
+
This section covers genes with unusual start codons
+
+
+
+
+ Start Codon |
+ Count |
+
+
+
+ {% for codon_key in weird_starts_overall_sorted_keys %}
+ {{ codon_key }} | {{ weird_starts_overall[codon_key] }} |
+ {% endfor %}
+
+
+
+
+
+
+
+
+ ID |
+ Location |
+ Error |
+
+
+
+ {% for row in weird_starts %}
+
+ {{row | nice_id| decode}} |
+ {{row.location.start}}..{{row.location.end}} [{{row.strand}}] |
+ {{row.qualifiers.get('note', [])}} |
+
+ {% endfor %}
+
+
+
+
+
Intergenic Gaps
+
Phage genomes are under pressure to maintain high coding density. Large intergenic gaps may be a sign of incorrect gene starts or missing genes.
+
+
+
+
+ Region |
+ Size |
+ Bounding Gene Transcription Direction |
+ Message |
+
+
+
+ {% for row in excessive_gap %}
+
+ {{row[0]}} .. {{row[1]}} |
+ {{row[1] - row[0]}} |
+ {{row[2] | nice_strand}} {{row[3] | nice_strand}} |
+
+ {% if row[4] == 0 %}
+ {% else %}
+ {{row[4]}} possible genes found in this region
+ {% endif %}
+ |
+
+ {% endfor %}
+
+
+
+
+
Overlapping Genes
+
Large gene overlaps may indicate an incorrect gene start or miscalled gene.
+
+
+
+
+ Feature A |
+ Feature B |
+ Shared Region |
+ Overlap Length |
+
+
+
+ {% for row in excessive_overlap %}
+
+ {{row[0] | nice_id | decode}} ({{row[0].location}}) |
+ {{row[1] | nice_id | decode}} ({{row[1].location}}) |
+ {{row[2]}}..{{row[3]}} |
+ {{row[3] - row[2]}}bp |
+
+ {% endfor %}
+
+
+
+
+
+
+
These issues are mostly derived from how Apollo handles the gene model.
+
+
+
+
+ ID |
+ Exon |
+ CDS |
+ Message |
+
+
+
+ {% for row in gene_model %}
+
+ {{row[0]}} |
+ {{row[1].location}} |
+ {{row[2].location}} |
+ {{row[3]}} |
+
+ {% endfor %}
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phageqc_report_annotation_table.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/phageqc_report_annotation_table.html Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,244 @@
+
+
+
+
+
+
+
+
+
+ Annotation Table
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Data on each organism will be accessible from the tabs above.
+
+ {% for (record, data) in annotation_table_data %}
+
+
+
+
+ {% for col in annotation_table_col_names %}
+ {{ col[0] }} |
+ {% endfor %}
+
+
+
+ {% for row in data %}
+
+ {% for col in row %}
+ {% if col is not string %}{% for val in col %}- {{ val }}
{% endfor %} {% else %}{{ col }}{% endif %} |
+ {% endfor %}
+
+ {% endfor %}
+
+
+
+ {% endfor %}
+
+
+
+
+
+
+
+
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phageqc_report_full.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/phageqc_report_full.html Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,412 @@
+
+
+
+
+
+
+
+
+
+ Phage QC on {{record_name}} - {{score}}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Phage {{record_name}}
+
+
+
+
+
+
+
+
Genes missing RBS {{missing_rbs_good}} / {{missing_rbs_good + missing_rbs_bad}}
+
The following genes have issues with their RBS.
+ {% if not rbss_annotated %}
+
+ Since you have not annotated any possible RBSs, this section does not count against your overall score.
+
+ {% endif %}
+
+
+
+
+ Feature Type |
+ ID |
+ Location |
+ Error |
+ Upstream (-{{upstream_max}} .. -{{upstream_min}}) |
+
+
+
+ {% for row in missing_rbs %}
+
+ {{row.type}} |
+ {{row.id}} |
+ {{row.location.start}}..{{row.location.end}} [{{row.strand}}] |
+ {{row.__message}} |
+ {{row.__upstream}} |
+
+ {% endfor %}
+
+
+
+
+
Start Codon Usage
+
This section covers genes with unusual start codons
+
+
+
+
+ Start Codon |
+ Count |
+
+
+
+ {% for codon_key in weird_starts_overall_sorted_keys %}
+ {{ codon_key }} | {{ weird_starts_overall[codon_key] }} |
+ {% endfor %}
+
+
+
+
+
+
+
+
+ Feature Type |
+ ID |
+ Location |
+ Error |
+
+
+
+ {% for row in weird_starts %}
+
+ {{row.type}} |
+ {{row.id}} |
+ {{row.location.start}}..{{row.location.end}} [{{row.strand}}] |
+ {{row.__error}} |
+
+ {% endfor %}
+
+
+
+
+
Intergenic Gaps
+
Phage genomes are under pressure to maintain high coding density. Large intergenic gaps may be a sign of incorrect gene starts or missing genes.
+
+
+
+
+ Region |
+ Size |
+ Bounding Gene Transcription Direction |
+ Message |
+
+
+
+ {% for row in excessive_gap %}
+
+ {{row[0]}} .. {{row[1]}} |
+ {{row[1] - row[0]}} |
+ {{row[2] | nice_strand}} {{row[3] | nice_strand}} |
+
+ {% if row[4] == 0 %}
+ {% else %}
+ {{row[4]}} ORFs found in this region
+ {% endif %}
+ |
+
+ {% endfor %}
+
+
+
+
+
Overlapping Genes
+
Large gene overlaps may indicate an incorrect gene start or miscalled gene.
+
+
+
+
+ Feature A |
+ Feature B |
+ Shared Region |
+ Overlap Length |
+
+
+
+ {% for row in excessive_overlap %}
+
+ {{row[0].id}} ({{row[0].location}}) |
+ {{row[1].id}} ({{row[1].location}}) |
+ {{row[2]}}..{{row[3]}} |
+ {{row[3] - row[2]}} bp |
+
+ {% endfor %}
+
+
+
+
+
+
+
Possible Morons {{morons_good}} / {{morons_good + morons_bad}} (Doesn't count towards score)
+
+
+
+
+ Feature |
+ RBS |
+ Surrounding Features |
+
+
+
+ {% for row in morons %}
+
+ {{row[0].id}} |
+ {{row[3]}} |
+
+ {% for x in row[1] %}
+ {{ x | nice_strand }}
+ {% endfor %}
+ {{ row[0].strand | nice_strand }}
+ {% for x in row[2] %}
+ {{ x | nice_strand }}
+ {% endfor %}
+
+ |
+
+ {% endfor %}
+
+
+
+
+
+
+
Missing Product Tags {{missing_tags_good}} / {{missing_tags_good + missing_tags_bad}}
+
+
+
+
+ Feature |
+ Qualifiers |
+
+
+
+ {% for row in missing_tags %}
+
+ {{row.id}} |
+
+ {% for key in row.qualifiers %}
+ {{ key }}
+
+ {% for value in row.qualifiers[key] %}
+ - {{value}}
+ {% endfor %}
+
+ {% endfor %}
+ |
+
+ {% endfor %}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phageqc_report_genomea.tex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/phageqc_report_genomea.tex Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,235 @@
+\documentclass[]{article}
+\usepackage{lmodern}
+\usepackage{amssymb,amsmath}
+\usepackage{ifxetex,ifluatex}
+\usepackage{fixltx2e} % provides \textsubscript
+\usepackage[T1]{fontenc}
+\usepackage[utf8]{inputenc}
+
+\addtolength{\oddsidemargin}{-.875in}
+\addtolength{\evensidemargin}{-.875in}
+\addtolength{\textwidth}{1.75in}
+
+\addtolength{\topmargin}{-.875in}
+\addtolength{\textheight}{1.75in}
+
+\usepackage{fancyhdr}
+\pagestyle{fancy}
+\lhead{GenomeA Compliance Report}
+\chead{}
+\rhead{ {{record_name | texify}} }
+\lfoot{}
+\cfoot{\thepage}
+\rfoot{}
+
+
+
+\usepackage{microtype}
+\usepackage{hyperref}
+\hypersetup{unicode=true,
+ pdfborder={0 0 0},
+ breaklinks=true}
+\urlstyle{same} % don't use monospace font for urls
+\usepackage{longtable,booktabs}
+\date{Compiled \today}
+\title{GenomeA Compliance Report for {{record_nice_name | texify}}}
+
+\begin{document}
+%\pagestyle{plain}
+\maketitle
+This report details possible issues with your submitted genome annotations.
+
+\section{Required Changes}
+
+The changes detailed in this section are required for acceptance of your
+submission.
+
+\subsection{Missing Gene Features}
+
+These coding sequences (``CDS'' in your GenBank file) are missing the
+associated gene feature (``gene''). This is required for validation by NCBI's
+rules which are encoded in the sequin and tbl2asn programs.
+{%if missing_genes_bad > 0 %}
+
+{{ missing_genes_bad }} out of {{ missing_genes_good + missing_genes_bad
+}} features are lacking their associated gene feature.
+
+\begin{longtable}{ll}
+\hline
+Feature ID & Location\\
+\hline
+\endhead
+{% for row in missing_genes %}
+{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}}\tabularnewline
+{% endfor %}
+\end{longtable}
+{% else %}
+You are not missing any gene features.
+{% endif %}
+
+\subsection{Missing Product Tags}\label{missing-product-tags}
+
+{{missing_tags_good}} out of {{missing_tags_good + missing_tags_bad}} features have product tags (\texttt{/product="..."}).
+{% if missing_tags_bad > 0 %}
+The following features are missing product tags:
+\begin{longtable}{ll}
+\hline
+Feature & Location\\
+\hline
+\endhead
+{% for row in missing_tags %}
+{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}}\tabularnewline
+{% endfor %}
+\end{longtable}
+{% endif %}
+
+\subsection{Missing Locus Tags}\label{missing-locus-tags}
+
+{{gene_model_correction_good}} out of {{gene_model_correction_good + gene_model_correction_bad}} features have valid locus tags (\texttt{/locus\_tag="..."}).
+{% if gene_model_correction_bad > 0 %}
+The following features have issues with their locus tags:
+\begin{longtable}{lllll}
+\hline
+ID & Location & Gene Locus Tag & CDS Locus Tag & Issue \\
+\hline
+\endhead
+{% for row in gene_model_correction %}
+{{ row[0].id | texify }} & \texttt{{'{'}}{{row[1].location}}{{'}'}} & {{ row[0].qualifiers['locus_tag'][0] | texify }} & {{ row[1].qualifiers['locus_tag'][0] | texify }} & {{ row[2] | texify }}\tabularnewline
+{% endfor %}
+\end{longtable}
+{% endif %}
+
+
+\section{Suggested Changes}\label{suggested-changes}
+
+These changes are not required, but are strongly encouraged in order to
+provide a uniform genome annotation within the phage community.
+
+\subsection{Start Codons}\label{start-codons}
+Nearly all phage genes use ATG, GTG or TTG as start codons. The start codon distribution is as
+follows:
+
+
+\begin{longtable}{lll}
+\hline
+Start Codon & Count\\
+\hline
+\endhead
+{% for codon_key in weird_starts_overall_sorted_keys %}
+{{ codon_key }} & {{ weird_starts_overall[codon_key] }} \\
+{% endfor %}
+\end{longtable}
+
+{% if weird_starts_bad != 0 %}
+There are {{weird_starts_bad }} unusual start codons in the genome; these
+should be carefully justified. If there is evidence for these starts, the
+GenomeA text should note this.
+
+\begin{longtable}{lll}
+\hline
+Feature ID & Location & Start Codon\\
+\hline
+\endhead
+{% for row in weird_starts %}
+{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}} & {{row.__start}} \\
+{% endfor %}
+\end{longtable}
+
+{% endif %}
+
+\subsection{Unannotated RBSs}\label{unannotated-rbss}
+
+The following CDSs either do not have a detectable ribosome binding site (RBS;
+Shine-Dalgarno sequence), in which case there is a strong possibility that
+this is not the correct start, or there is one but it is not annotated.
+Annotating the RBS as part of the gene feature is the best practice.
+
+\begin{longtable}{lllll}
+\hline
+ID & Location & Error & Upstream (-{{upstream_max}} .. -{{upstream_min}})\\
+\hline
+\endhead
+{% for row in missing_rbs %}
+{% if 'Unannotated' not in row.__message%}
+{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}} & {{row.__message | texify}} & \texttt{{'{'}}{{row.__upstream}}{{'}'}} \\
+{% endif %}
+{% endfor %}
+{% for row in missing_rbs %}
+{% if 'Unannotated' in row.__message%}
+{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}} & {{row.__message | texify}} & \texttt{{'{'}}{{row.__upstream}}{{'}'}} \\
+{% endif %}
+{% endfor %}
+\end{longtable}
+
+\section{Areas for Further Examination}\label{notes}
+
+These areas may be indicative of a problem, or may simply be
+informational. You should examine the areas mentioned in detail to ensure
+that the annotations are valid and that no genes are missed.
+
+
+
+
+
+\subsection{Unusual Gaps}\label{excessive-gaps}
+
+{% if excessive_gap | length == 0 %}
+No gaps over {{ params['excessive_gap_dist'] }} nt (for genes on the same
+strand) or {{ params['excessive_gap_divergent_dist'] }} nt (for genes on
+opposite strands) were found.
+{% else %}
+Gaps over {{ params['excessive_gap_dist'] }} nt (for genes on the same
+strand) or {{ params['excessive_gap_divergent_dist'] }} nt (for genes on
+opposite strands) were found.
+
+\begin{longtable}{llll}
+\hline
+Region & Size & Surroundings & Messages\\
+\hline
+\endhead
+{% for row in excessive_gap %}
+\texttt{{'{'}}{{row[0]}}..{{row[1]}}{{'}'}} & {{row[1] - row[0]}} & {{row[2] | nice_strand_tex}} {{row[3] | nice_strand_tex}} & {% if row[4] != 0 %}{{row[4]}} ORFs found in this region{% endif %} \\
+
+{% endfor %}
+\end{longtable}
+{% endif %}
+
+
+
+
+\subsection{Unusual Overlaps}\label{excessive-overlaps}
+
+{% if excessive_overlap | length == 0 %}
+No overlaps over {{ params['excessive_overlap_dist'] }} nt (for genes on the same
+strand) or {{ params['excessive_overlap_divergent_dist'] }} nt (for genes on
+opposite strands) were found.
+{% else %}
+Overlaps over {{ params['excessive_overlap_dist'] }} nt (for genes on the same
+strand) or {{ params['excessive_overlap_divergent_dist'] }} nt (for genes on
+opposite strands) were found.
+\begin{longtable}{llllll}
+\hline
+\multicolumn{2}{l}{Feature A} & \multicolumn{2}{l}{Feature B} & & \\
+ID & Location & ID & Location & Region & Length\\
+\hline
+\endhead
+{% for row in excessive_overlap %}
+{{row[0].id | texify}} & \texttt{{'{'}}{{row[0].location}}{{'}'}} & {{row[1].id | texify}} & \texttt{{'{'}}{{row[1].location}}{{'}'}} & {{row[2]}}..{{row[3]}} & {{row[3] - row[2]}} \\
+{% endfor %}
+\end{longtable}
+{% endif %}
+
+\subsection{Coding Density}\label{coding-density}
+
+You have a coding density of {{ coding_density_real }}\% which scores
+{{ coding_density }} / 100 on our scale. Most genomes should be in the 90\% to 100\%
+coding density range.
+
+
+
+
+
+
+
+
+\end{document}
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/shinefind.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/shinefind.py Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,420 @@
+#!/usr/bin/env python
+import re
+import sys
+import argparse
+import logging
+from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import FeatureLocation
+from gff3 import (
+ feature_lambda,
+ feature_test_type,
+ feature_test_true,
+ feature_test_quals,
+ get_id,
+ ensure_location_in_bounds,
+)
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger()
+
+
+class NaiveSDCaller(object):
+    """Find Shine-Dalgarno (SD) motifs by literal pattern matching.
+
+    Each motif in ``SD_SEQUENCES`` is compiled into a case-insensitive
+    regular expression and searched for in a window of sequence taken
+    upstream of a CDS.
+    """
+
+    # TODO May make switch for different sequence sets
+    SD_SEQUENCES = (
+        "AGGAGGT",
+        "GGAGGT",
+        "AGGAGG",
+        "GGGGGG",
+        "AGGAG",
+        "GAGGT",
+        "GGAGG",
+        "GGGGG",
+        "AGGT",
+        "GGGT",
+        "GAGG",
+        "GGGG",
+        "AGGA",
+        "GGAG",
+        "GGA",
+        "GAG",
+        "AGG",
+        "GGT",
+        "GGG",
+    )
+
+    def __init__(self):
+        # Compile once so every search reuses the same pattern objects.
+        self.sd_reg = [re.compile(x, re.IGNORECASE) for x in self.SD_SEQUENCES]
+
+    def list_sds(self, sequence, sd_min=3, sd_max=17):
+        """Return every SD motif hit in ``sequence`` within the gap limit.
+
+        ``sequence`` is expected to be the upstream window produced by
+        ``testFeatureUpstream``, whose last base lies ``sd_min`` bases
+        before the CDS; the real gap of a hit is therefore
+        ``spacing + sd_min``.
+
+        :param sequence: upstream sequence to search (string).
+        :param sd_min: distance between the end of the window and the CDS.
+        :param sd_max: largest allowed gap between a motif and the CDS.
+        :return: list of dicts with keys ``spacing``, ``hit``, ``start``,
+            ``end`` and ``len``, sorted longest motif first and then by
+            smallest spacing.
+        """
+        hits = []
+        for regex in self.sd_reg:
+            for match in regex.finditer(sequence):
+                # Bases between the end of the motif and the end of the window.
+                spacing = len(sequence) - match.end()
+                # spacing is never negative, so only the upper bound of
+                # the gap needs checking.
+                if spacing + sd_min > sd_max:
+                    continue
+                hits.append(
+                    {
+                        "spacing": spacing,
+                        "hit": match.group(),
+                        "start": match.start(),
+                        "end": match.end(),
+                        "len": len(match.group()),
+                    }
+                )
+        hits.sort(key=lambda x: (-x["len"], x["spacing"]))
+        return hits
+
+    @classmethod
+    def highlight_sd(cls, sequence, start, end):
+        """Return ``sequence`` lower-cased, with ``[start:end]`` upper-cased
+        and set apart from its flanks by single spaces."""
+        return " ".join(
+            [
+                sequence[0:start].lower(),
+                sequence[start:end].upper(),
+                sequence[end:].lower(),
+            ]
+        )
+
+    @classmethod
+    def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd_min=3, sd_max=17):
+        """Turn hits from ``list_sds`` into ``Shine_Dalgarno_sequence`` features.
+
+        :param hits: list of hit dicts as returned by ``list_sds``.
+        :param strand: strand of the CDS the window was taken for.
+        :param parent_start: start coordinate of the searched window.
+        :param parent_end: end coordinate of the searched window.
+        :param feature_id: prefix for the generated ``ID`` qualifiers
+            (``<feature_id>.rbs-<index>``).
+        :param sd_min: unused; kept for interface compatibility.
+        :param sd_max: unused; kept for interface compatibility.
+        :return: list of features, one per hit, in the order of ``hits``.
+        """
+        results = []
+        for idx, hit in enumerate(hits):
+            qualifiers = {
+                "source": "CPT_ShineFind",
+                "ID": "%s.rbs-%s" % (feature_id, idx),
+            }
+
+            # "spacing" is measured from the CDS-proximal end of the window:
+            # that is the window's end on the plus strand and its start on
+            # the minus strand.
+            if strand > 0:
+                end = parent_end - hit["spacing"]
+                start = end - hit["len"]
+            else:
+                start = parent_start + hit["spacing"]
+                end = start + hit["len"]
+
+            results.append(
+                gffSeqFeature(
+                    FeatureLocation(min(start, end), max(start, end), strand=strand),
+                    type="Shine_Dalgarno_sequence",
+                    qualifiers=qualifiers,
+                )
+            )
+        return results
+
+    def testFeatureUpstream(self, feature, record, sd_min=3, sd_max=17):
+        """Search the region upstream of ``feature`` for SD motifs.
+
+        :param feature: feature (normally a CDS) whose upstream is searched.
+        :param record: record providing ``seq`` and the parent length.
+        :param sd_min: smallest gap between motif and feature.
+        :param sd_max: largest gap between motif and feature.
+        :return: tuple ``(hits, start, end, seq)``: the ``list_sds`` result,
+            the window coordinates after bounds checking, and the
+            extracted window sequence.
+        """
+        # Strand information necessary to getting correct upstream sequence
+        strand = feature.location.strand
+
+        # The window is widened by 7 bases beyond sd_max (7 matches the
+        # longest motif in SD_SEQUENCES) so that sd_min/sd_max describe the
+        # gap rather than the motif position.
+        if strand > 0:
+            start = feature.location.start - sd_max - 7
+            end = feature.location.start - sd_min
+        else:
+            start = feature.location.end + sd_min
+            end = feature.location.end + sd_max + 7
+
+        # NOTE(review): presumably clamps the window to the record; the
+        # helper is defined in the gff3 module - confirm there.
+        (start, end) = ensure_location_in_bounds(
+            start=start, end=end, parent_length=len(record)
+        )
+
+        # Create our temp feature used to obtain correct portion of
+        # genome
+        tmp = gffSeqFeature(
+            FeatureLocation(min(start, end), max(start, end), strand=strand),
+            type="domain",
+        )
+        seq = str(tmp.extract(record.seq))
+        return self.list_sds(seq, sd_min, sd_max), start, end, seq
+
+    def hasSd(self, feature, record, sd_min=3, sd_max=17):
+        """Return True if at least one SD motif is found upstream of ``feature``."""
+        sds = self.testFeatureUpstream(
+            feature, record, sd_min=sd_min, sd_max=sd_max
+        )[0]
+        return len(sds) > 0
+
+
+# Cycle through a feature's subfeatures and find the smallest start and
+# the largest end; fix_gene_boundaries() uses these to reset the location.
+# To be removed once the feature-display bug in Apollo is fixed.
+def fminmax(feature):
+    """Return ``(smallest start, largest end)`` over a feature's tree.
+
+    The locations examined are those of every feature yielded by
+    ``feature_lambda([feature], feature_test_true, {}, subfeatures=True)``.
+    ``(None, None)`` is returned when nothing is yielded.
+    """
+    locations = [
+        sf.location
+        for sf in feature_lambda([feature], feature_test_true, {}, subfeatures=True)
+    ]
+    if not locations:
+        return None, None
+    lowest = min(loc.start for loc in locations)
+    highest = max(loc.end for loc in locations)
+    return lowest, highest
+
+
+def fix_gene_boundaries(feature):
+    """Reset ``feature.location`` to the span reported by ``fminmax``.
+
+    Works around an Apollo bug whereby gene features end up larger than
+    expected without this being visible (only a perfectly sized gene + SD
+    is shown). To be removed once Apollo is upgraded.
+
+    The strand is normalised to 1 for a positive strand and -1 otherwise.
+    The feature is modified in place and also returned.
+    """
+    lower, upper = fminmax(feature)
+    direction = 1 if feature.location.strand > 0 else -1
+    feature.location = FeatureLocation(lower, upper, strand=direction)
+    return feature
+
+def _write_table_row(table_output, fields):
+    """Write ``fields`` as one tab-separated line; do nothing without a handle."""
+    if table_output is not None:
+        table_output.write("\t".join(map(str, fields)) + "\n")
+
+
+def _annotated_rbss(gene):
+    """Return RBS annotations already present among a gene's direct sub-features.
+
+    Three representations are recognised: ``RBS`` features,
+    ``Shine_Dalgarno_sequence`` features, and ``regulatory`` features whose
+    ``regulatory_class`` qualifier is ``ribosome_binding_site``.
+    """
+    found = []
+    for wanted in ("RBS", "Shine_Dalgarno_sequence"):
+        found.extend(
+            feature_lambda(
+                gene.sub_features,
+                feature_test_type,
+                {"type": wanted},
+                subfeatures=False,
+            )
+        )
+    regulatory_elements = list(
+        feature_lambda(
+            gene.sub_features,
+            feature_test_type,
+            {"type": "regulatory"},
+            subfeatures=False,
+        )
+    )
+    found.extend(
+        feature_lambda(
+            regulatory_elements,
+            feature_test_quals,
+            {"regulatory_class": ["ribosome_binding_site"]},
+            subfeatures=False,
+        )
+    )
+    return found
+
+
+def shinefind(
+    fasta,
+    gff3,
+    gff3_output=None,
+    table_output=None,
+    lookahead_min=3,
+    lookahead_max=17,
+    top_only=False,
+    add=False,
+):
+    """Report Shine-Dalgarno sequences upstream of the first CDS of each gene.
+
+    Genes that already carry an RBS annotation, and genes without a CDS,
+    are skipped. For every remaining gene one table row is written per SD
+    hit, or a single row with ``None`` / ``-1`` when nothing was found.
+    Each record is also written as GFF3 to standard output.
+
+    :param fasta: FASTA genome, passed to ``SeqIO.parse``.
+    :param gff3: GFF3 annotations, passed to ``gffParse``.
+    :param gff3_output: writable handle receiving the SD features only;
+        nothing is written when it is ``None``.
+    :param table_output: writable handle receiving the tab-separated
+        report; nothing is written when it is ``None``.
+    :param lookahead_min: smallest gap between SD and CDS.
+    :param lookahead_max: largest gap between SD and CDS.
+    :param top_only: report only the first hit returned for each CDS.
+    :param add: also append each reported SD feature to its gene and widen
+        the gene's location to include it.
+    """
+    _write_table_row(
+        table_output,
+        [
+            "ID",
+            "Name",
+            "Terminus",
+            "Terminus",
+            "Strand",
+            "Upstream Sequence",
+            "SD",
+            "Spacing",
+        ],
+    )
+
+    sd_finder = NaiveSDCaller()
+    # Load up sequence(s) for GFF3 data
+    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
+    # Parse GFF3 records
+    for record in gffParse(gff3, base_dict=seq_dict):
+        # Shinefind's gff3_output.
+        gff3_output_record = SeqRecord(record.seq, record.id)
+
+        # Loop over all gene features
+        for gene in feature_lambda(
+            record.features, feature_test_type, {"type": "gene"}, subfeatures=True
+        ):
+            # Get the CDSs from this gene, ordered by start coordinate.
+            cdss = sorted(
+                feature_lambda(
+                    gene.sub_features,
+                    feature_test_type,
+                    {"type": "CDS"},
+                    subfeatures=True,
+                ),
+                key=lambda x: x.location.start,
+            )
+            # A gene without any CDS has nothing to search upstream of.
+            if not cdss:
+                continue
+            feature = cdss[0]
+
+            # If someone has already annotated an RBS, we move to the next gene
+            rbss = _annotated_rbss(gene)
+            if rbss:
+                log.debug("Has %s RBSs", len(rbss))
+                continue
+
+            sds, start, end, seq = sd_finder.testFeatureUpstream(
+                feature, record, sd_min=lookahead_min, sd_max=lookahead_max
+            )
+
+            feature_id = get_id(feature)
+            sd_features = sd_finder.to_features(
+                sds, feature.location.strand, start, end, feature_id=feature.id
+            )
+
+            human_strand = "+" if feature.location.strand == 1 else "-"
+
+            log.debug("Found %s SDs", len(sds))
+            if not sds:
+                # No hit at all: still report the CDS, with placeholders.
+                _write_table_row(
+                    table_output,
+                    [
+                        feature.id,
+                        feature_id,
+                        feature.location.start,
+                        feature.location.end,
+                        human_strand,
+                        seq,
+                        None,
+                        -1,
+                    ],
+                )
+                continue
+
+            for sd, sd_feature in zip(sds, sd_features):
+                _write_table_row(
+                    table_output,
+                    [
+                        feature.id,
+                        feature_id,
+                        feature.location.start,
+                        feature.location.end,
+                        human_strand,
+                        sd_finder.highlight_sd(seq, sd["start"], sd["end"]),
+                        sd["hit"],
+                        # The search window ends lookahead_min bases before
+                        # the CDS, so add it back to get the real gap.
+                        int(sd["spacing"]) + lookahead_min,
+                    ],
+                )
+
+                if add:
+                    # Append the RBS to the gene feature
+                    gene.sub_features.append(sd_feature)
+                    # Pick out start/end locations for all sub_features
+                    locations = [x.location.start for x in gene.sub_features] + [
+                        x.location.end for x in gene.sub_features
+                    ]
+                    # Update gene's start/end to be inclusive
+                    gene.location._start = min(locations)
+                    gene.location._end = max(locations)
+                # Also register the feature with the separate GFF3 output
+                sd_feature = fix_gene_boundaries(sd_feature)
+                gff3_output_record.features.append(sd_feature)
+
+                # With top_only, only the first (best-ranked) hit is kept.
+                if top_only:
+                    break
+
+        record.annotations = {}
+        gffWrite([record], sys.stdout)
+
+        gff3_output_record.features = sorted(
+            gff3_output_record.features, key=lambda x: x.location.start
+        )
+        gff3_output_record.annotations = {}
+        if gff3_output is not None:
+            gffWrite([gff3_output_record], gff3_output)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: collect the options and hand them to
+    # shinefind() as keyword arguments.
+    cli = argparse.ArgumentParser(description="Identify shine-dalgarno sequences")
+    cli.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome")
+    cli.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
+
+    # Output files, opened for writing by argparse.
+    for flag, fallback, text in (
+        ("--gff3_output", "shinefind.gff3", "GFF3 Output"),
+        ("--table_output", "shinefind.tbl", "Tabular Output"),
+    ):
+        cli.add_argument(
+            flag, type=argparse.FileType("w"), help=text, default=fallback
+        )
+
+    # Bounds of the upstream search window.
+    for flag, fallback, text in (
+        ("--lookahead_min", 3, "Number of bases upstream of CDSs to end search"),
+        ("--lookahead_max", 17, "Number of bases upstream of CDSs to begin search"),
+    ):
+        cli.add_argument(flag, nargs="?", type=int, help=text, default=fallback)
+
+    cli.add_argument("--top_only", action="store_true", help="Only report best hits")
+    cli.add_argument(
+        "--add",
+        action="store_true",
+        help='Function in "addition" mode whereby the RBSs are added directly to the gene model.',
+    )
+
+    shinefind(**vars(cli.parse_args()))
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/test-data/AY216660.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/test-data/AY216660.fasta Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,815 @@
+>AY216660.2 Enterobacteria phage T1, complete genome
+GCCTGCAATATGGTAAAATAGCACTAAATGTTAAACAAAGAGGATGTATTTATGAGTGAA
+CCTAAGAACGCTCCCGTAGTCCAGGGTGGTAATTTCAAAGAGCTATACAAGAAAAAGTTT
+GGCACTGTACTCGCGAAAAACCGGGCTATGACGCCAGAGCAACTATTCGATCTGTCAGTG
+AAGTATTTCGAATGGGCCGAGGACAATGCGATCAAGGCGTCAGAATCAGCCAGCTTTCAG
+GGTGGCGTTTATGAGTCGCTTGTCCATAAGCCGCGCGTCTTCACCTGGACCGGATACCGA
+CTATTCATCGGTGCAAGCGAGGCTGCAATCATTAAGTGGAAGCGAGAGGAAGAATACAGC
+GAGGTTATGGAGTTTGTGGAATCGGTAATCAACGAGCAAAAATTCCAGCTTGCCGCCAAC
+GGTGTTATTAATGCCTCCTTTATCGGTAAGGATCTCGGAATCGATAAGCCAGCCTCAATC
+AATATCGAAAACTCGTCAGCTTCCGCATCGACAGTAGTCGCCACTACTGAGGATGCGATG
+AAAGAGGCGGTAAACAGCATTCTTGATATGCTTTAACTTTAGGGGCGCGAGAGCGCCCAC
+ATGGGAGACTTAATCATGATTCAATGGGAAGACCTTAACGCAACGCAGAAGTTAGCGATC
+AAGAAAATGAGCGAGGCCAATTTCGAAAAAATGATTCGGATCTGGTTCCAACTTATGCAG
+GCGCAGCAGTTCCAGCCTAACTGGCATCACCTTTACCTATGTCACGAAGTGGAGGAAATT
+ATTGCAGGGCGGCGAGGGAATACAATCTTTAACGTCACACCAGGTTCCGGTAAAACTGAA
+GTGTTCTCAATTCACCTTCCGGTGTACGCAATGCTTAAGTGTAAGAAGGTGCGAAACCTT
+AACGTGTCGTTTGCTGACAGCCTGGTTAAGCGTAACAGTAAGCGCGTCCGTGAGATTATC
+AGCAGCAACGAATTTCAAGAGCTATGGCCTTGCAAGTTCGGTACATCGAAAGATGAGGAG
+ATGCAGGTTCTTAACGAAGATGGAAAGGTTTGGTTTGAGTTGATATCCGCAGCGGCTGGC
+GGTCGTATTACTGGTTCGCGTGGTGGCTACATGACGCCGGGATTCTCGGGGATGGTAATG
+CTAGACGATATCGACAAGCCTGATGATATGTTCTCAAAGGTTAAGCGTGAGCGTACGCAC
+ATGCTATTGAAAAACACCATCCGTTCCCGTCGTATGCATAACGAGACGCCTATTATTGCA
+ATTCAGCAGCGACTACATGCTCAGGATTCAACCTGGTTCATGATGAATGGCGGTATGGGT
+ATTGAGTTTGACCAAATCTCAATACCGGCGCTGGTGACGGAAGAATACGGAAAGACACTT
+CCTGATTGGTTGCAGCCTTACTTTGAGCGTGATGTTTTATCGTCTGAGTATGTAGAATTG
+GATGGCGTTAAGCATTACTCTTTTTGGCCAAGCAAGGAAAGCGTTCACGACCTGTTGGCG
+TTACGAGAAGCAGATCAGTATACCTTTGATTCTCAGTATCAGCAGAAACCGATCGCGCTG
+GGTGGCTCCGTGTTTAACTCAGAGTGGTGGACTTATTACGGCAGCAGTCTTGACGCTGAC
+GAGCCAGATCCGGGTAAATATGATTACCGATTCATCACTGCCGATACCGCTCAGAAGACA
+GGCGAGCTAAACGACTACACGGTATTTTGTTTGTGGGGCAAGAAGAATGATAAGGTTTAC
+TTTATCGACGGCATTCGCGGAAAGTGGGAAGCGCCGGATATGGAGAGGCAATTTACAGCT
+TTCGTCAATCAGGCATGGAGGCACAATAAATCAATGGGGGTACTTCGTAAAATTTATGTG
+GAAGATAAGGCGAGCGGTACGGGCTTAATCCAGAACCTCAGGAAAAAGACCCCGATCTCA
+ATCACTCCATTGCAGCGTAACAAAGACAAAGTTACCCGAGCTATGGATGCTCAGCCAGTT
+ATTAAAGCCGGGCGCGTGGTTCTGCCAGAAGAGCACCCTATGCTTGCTGAAATTATCGCT
+GAACACAGTGCCTTCACTTACGATGACACCCATCCGCATGATGATATCGTCGATAACTTC
+ATGGATGCGGCGAACATCGAATTGCTGACCATTGATGATCCTATCGAGAGAATGAAGCGA
+CTCGCCGGGATGGTTAAGCGGTAATAAATGAGATATAATTAGGGCTGTCAATTGACGGCC
+CTTTTTATTGGAGGAAACATGAAAATTGTTAAGCATGATGGATATAACGATATCTTTAAC
+GGCGGCGCGGACGGATCGCCTAAGCCATTCTTTATGTCTGATGCATCATATCACGTCGGT
+TCTTTCTACAACGACAACGCAACCGCGAAGCGAATTGTGGATGTTATCCCGGAAGAGATG
+GTGACGGCTGGTTTTAAAATGTCAGGCGTTAAGGATGAAAAAGAGTTCAAGTCTTTATGG
+GATAGCTACAAACTTGATTCAAGTCTGGTGGATCTTCTTTGTTGGGCACGACTTTACGGT
+GGCGCGGCGATGGTGGCAATCATCAAAGACAACCGGATGTTAACCAGTCAGGCAAAGCCT
+GGAGCTAAACTTGAAGGCGTCCGAGTTTATGATCGATTTGCTATCACTGTTGAAAAGCGA
+GTCACCAATGCAAGATCTCCTCGCTATGGTGAGCCTGAAATTTACAAGGTATCCCCTGGC
+GACAATATGCAGCCGTACCTGATTCATCACTCAAGAGTCTTTATTGCTGATGGTGAACGA
+GTGGCGCAACAGGCAAGAAAGCAGAATCAAGGATGGGGAGCTTCGGTATTGAATAAGTCA
+CTGATTGATGCAATCTGTGACTATGATTACTGTGAATCTCTGGCTACTCAGATCTTGCGA
+CGTAAGCAACAGGCTGTATGGAAGGTCAAAGGTCTTGCCGAAATGTGTGATGATGATGAT
+GCTCAGTATGCCGCGCGCCTGCGACTTGCTCAGGTAGATGATAACTCCGGCGTAGGCCGT
+GCGATCGGTATCGATGCTGAGACTGAGGAATATGACGTTCTCAACTCTGATATCAGCGGA
+GTCCCTGAGTTCTTATCAAGCAAGATGGACCGCATCGTCTCCCTATCCGGGATTCATGAG
+ATTATCATCAAGAATAAGAACGTAGGCGGCGTATCAGCGAGCCAAAACACAGCGCTTGAG
+ACTTTCTATAAGCTAGTCGATCGCAAGCGCGAGGAAGATTACAGGCCGCTTCTTGAGTTC
+TTGTTGCCGTTCATTGTTGATGAGGAAGAGTGGTCGATCGAGTTTGAGCCTTTGTCTGTT
+CCGAGTAAGAAAGAGGAATCAGAGATCACGAAGAATAACGTTGAGTCAGTCACGAAGGCT
+ATCACTGAGCAAATCATCGATCTGGAAGAAGCTCGCGACACGTTGCGATCCATTGCCCCT
+GAGTTCAAACTCAAGGATGGTAATAACATCAACATTCGCGAACCGGAAGAAACAACCGAA
+CCGGAGCCGGGATTAGGGGAGAAGTTAGAAGATGAAAATTAATGGCGTTGCAACACAGTG
+GCGCTATCCTGAAATGAGCGAGCGCGCAATGTCGCGCTCCCTACAGGATGTTGCAGCCAA
+ACTAACTGAAAAAATGCGTGACGAATTAAAGCCGATGAAATTTGACGCTACCGACGAAGA
+GATAGATCAGACAGAGAGGTCATTGCTTGATTACGTCGAATCACTCATCGCTCCGATTAT
+TGGTTCTCTATCATCCGTTGCGCTCACGATCTATAAATTCAACTCTAAGCAGTGGCTGCG
+CATCGCTCGCAATGCTGGAGGTAAGAAGAATCAAGCCGTGATGCTACTTGCCCTGATTGG
+TCCTACCGCTGCCGAAAGCTGGTACTCAGGACAATATAATCTGTGGCGATCGCAGGTTGC
+TACTTCTATCAGGAAATTTGCCGCCAACATGGTTACTGATTTCACTGATAAACTTCGTGC
+GGCATCCGGTCAGGGTAAAAGCAAGGATTTTGTTGTTGAACTTGCTAAGGAGCGATTTGG
+TATTTACCGGAACTGGGCCAAAAATAGAGCGTCGGGAATTGTCGGAACCTGGAACAGTAG
+ACTGATGCGTCAGCGCATAAAAGACGCTGGTGTCTCTTACTATTTCTGGCGCGGGGTGAT
+GGATTTACGCGAACGTGAAAAACATGTAAGATGGGAAGGTAAGCGCATAGCGGTAGATTC
+CGATCATGTATTCCCTGGTGAGGAATACAACTGCCGCTGTTGGGCTGTTCCAGACTTTTC
+TACAGGAGATTAAAAATGAAGGCAAAGCAAAGATTCGATTCAGTAAAAATCAAGGCGCAC
+TTTGATGATAACGGTTTTTTAGTTGACCGCCCAATCGTGGCGCGAATCGGCGCTCAGGTT
+TACAAAACGCCGCACGGCGATCGAGTTGAGTTCCGTCCGGCGTCCGAAGTTTTCAAGCAA
+GACTCCTTGCAAAGTTTTGCGGGTAAGCCAATTACTGTCGGTCACGTAACGGTAACTCCG
+CAGAATGCTAAGGACGTTGTTGTCGGATCGTGTGCTGGCGCTGGTATTGCTTCAGGGGTT
+GGCGTTGAAGTTCCTTTGAGTATTTACAGCGACTACGCGATCAGCAAGGCTAAAGCAAAA
+GAAGCAGGTGAATTATCTGTTGGTTATACTTCGGTTGATATTGATAAGCCTGGTTGGGGT
+TCAAATGAGACTGGAGAATATATCTTCGAAGAGGATATGAAACAGGACGAAGCGCCGCCT
+GAAGGTTGGGTGAGATTCGACGCGGTACAAACTAATATCAAGGTCAACCATATTGCCCTA
+GTTTTTAAAGGTCGTGCGGGAATTGCTAAATTAAATCTTGATGCCGAACAGGAGTTCCCG
+TATGATAATAACGTTCAATTAACTAACGAGGACAAGCAAATGAAAAAAATTAAGATCGAC
+TCAGTTGATGTGGAAGTAACCGAAGACGTTGCGAACCATATCGAAAAATTAACCGCGCAG
+ATTGCCACCATTCAGGGGAAAGCTGATGGCTTCGAAGCTGAGCGCGATGCGCTGAAGGTT
+AAGGTTGACTCTCTGCCGGAACTTGTGAAGGCCGAGGTAGAGAAGCAAAAAGCCGATGCC
+GCCGCACGCGCAGAAGTTACCGCAGTAGCAGAAACCGCAGGCGTCAAACATGATGGTCTT
+GATATCAAAGACGTCAAGATTGCCGTAGTTAAAGCCATGCTTGATAAAGATGTTAGTGAA
+AAATCAGACGCATATATCGACGCTATGTTTGATGTTGCTAAAGATTCTGATATCATGGCT
+ATTCAGCGTAAAGCAGTAAAAGGCGACTCTATCGAAGGCGGTAAGCCGGAAGAGAAAAAC
+GACGCCGCGCCTGTTACGCCAAATTCACGTTTAAGCAAAGTAATGTAAGGGGAAATATCA
+TGGCACAAATTAATGCATCTTATCAGCGAGATATGGCGATTGCGCTTCCGGGTATGGTTG
+CGGATACTTCAAAGTACAATATTGACGGCGCTTGTGTCGTTAATGAAGGTGATGTTCTTG
+TTGGCGCTGCCGTACAAGTTGTTCAAGCTCAGGCGGTTGATGGTCATAAGTTGGTTAAGG
+CTCTTACTACCGGAACCACTCCTTACGGCGTGGCAATCCGATCTCACTGGCAGACTGTTA
+ACGCTCAAAATCAGATGATTTACGAAGATGGCGGCGCTATCAACGTGATGACTTCAGGCC
+GAGTATGGATGCTTTCCAAATCCACCGAAGCGCCAACTTTCGGCTCTGCCGTTAAACTTG
+ATGTTGATGGTCAGGAAAAATCTGATGGCACGATCGAAACAACCTGGACCTACGCTGGCG
+GTTGGACTAAATACAAAGATATTCAGCTTGTTGAGGTTCAGTTGCATCAACTGTAATTAG
+CGTTTAATATGGGGACTATCCTTTTTTGGATAGTCCTTTTTTTATGGAGAAATCATTATG
+GCTTACGAAAATTTAATGTTGCGCCCGGCGTGTCCGGGAAATCTTTCTGATACTTCAACC
+TACAATATTGATGGCGCTTGCGTGGCTCAAGGTGACATTGAGTTCGGCTCAGCGGTTCAG
+GTTGTCGGCATCGTTGATGGTGTAAAAGTTGTTACGGCGCTTTCTGATGGTGGAACTCCT
+TACGGTATCGCTTTCCGTTCCCAATATGAACACCTGAGCGGTAAAATCCTCGACGGTGAA
+GTGTGCAACGTCGTTTCTCACGGTCGCGTGTGGGCGCTTACTTCTCTTGATGAGGCTCCC
+AGCTTGTTCTCAAAGTTGCAGTTTGGATCTGGTGGGGTTGTTACTGGTGGATCTGGTTAC
+GCAGGATGGACCTTTGCTGGCGGCTTTGTTAAGCACGAAGATGGCTACATTATTGAGGTT
+CGGGTGAAACAAAATGCTTTCATCGTTCCACCGCCGCCGCCCCCTGTCGTTCTTGTTGAA
+TCCGCTACAATCACCACTGACAAGGAAAGCCCTCAGCCAAACAACGTTACGATCCAGTGT
+GTAGCTAATGCTCTTCCGGCTAATGCAACTGATAAGACTGGCAAATGGTCAATCGACGCT
+ACCAATATCGCCACTGTCAATCCGGACTCAGGTCTTGTAACTCCTGTTGGTGGAGAGGTA
+GTCGGTGATTTCAATATTACCTGGACGGCTAACGATGCCAGCAAGACGACGGCAACCATT
+GCTTATCGCGTAGAAGCAGTGCCAACGCCAGAGGTTGATGTATAACATAAAAACACTTTG
+ACGCTTTAACAAAAAGTGCTATTATTGAAGCCGTGAACATAATCACGGTTTTTTATTAAC
+TATGGAGAAGTAATCATGACTACTAAAAAATTTGATGAAGCAGATAAAAGCAATGTTGAA
+ATGTATCTGATCCAGGCTGGCGTAAAACAGGATGCGGCCGCAACGATGGGTATCTGGACC
+GCTCAGGAACTACACCGCATCAAAAGCCAGTCCTATGAAGAAGACTACCCGGTCGGCTCA
+GCTTTACGCGTATTCCCGGTTACAACCGAGCTTTCTCCGACCGACAAGACGTTTGAGTAT
+ATGACCTTTGATAAGGTTGGTACGGCTCAGATTATCGCAGACTACACCGACGATCTTCCG
+CTGGTTGATGCCCTGGGTACTTCTGAATTTGGCAAGGTGTTCCGTCTTGGTAACGCGTAC
+CTGATCTCAATCGACGAAATCAAAGCGGGTCAGGCAACTGGTCGCCCACTGTCAACCCGT
+AAGGCGAGTGCGTGCCAGTTGGCGCATGATCAGCTTGTTAACCGCCTGGTGTTCAAAGGT
+TCCGCGCCGCACAAGATTGTGTCCGTGTTCAACCATCCGAATATCACCAAAATTACCTCT
+GGTAAGTGGATTGATGTATCTACTATGAAGCCGGAAACTGCGGAAGCTGAGCTAACTCAA
+GCGATCGAAACCATCGAGACGATTACTCGTGGTCAGCACCGCGCAACCAACATCCTGATC
+CCGCCTTCCATGCGTAAGGTTTTGGCGATTCGTATGCCTGAGACAACCATGTCTTACCTG
+GACTATTTTAAGTCTCAGAACTCCGGTATCGAAATCGACTCTATCGCAGAGCTTGAGGAT
+ATTGACGGCGCAGGCACCAAAGGCGTACTGGTGTACGAAAAGAATCCGATGAACATGTCC
+ATCGAGATCCCGGAAGCATTTAACATGCTGCCAGCACAACCGAAAGACTTGCACTTTAAA
+GTGCCTTGCACCTCTAAGTGTACTGGTCTTACAATTTATCGCCCGATGACTATTGTCTTA
+ATCACTGGCGTGTAATATTATAGGGGCTAACTTAGTTAGTCCCTTTTTTTATTGGAGAAA
+TCAAAATGGCTAAAGAAAAAACTGTTGTTATCGTAAACGTTGGTGTAGCTCTTCAGATGT
+TCCGTCTTGAAGATGGTTCCTTTGCTAAAGTTCTTCCAGATGAAGAGGTCACGCTTCCGG
+CGTCCGTTCTTGATTTACCTGGTCTGCGTTGCTTAATTGCTCGCGAAGAAATCGAAGTTA
+AAGACGACAGTGCAACCAACCGCAAAATCCGCGCTGAAATGGCAAAGATCACGAAGCCAG
+ATCCGTGGGATAAAATGAGCGTAAAAGAGCTTGAAGACGGCGGCGAATATTAATCATCAA
+GGCGCTCATGTAGCGCCTTTTTTTATGGTGGTAAATTATGAATCAAGAAACTTTAATTGC
+AGTTGTTGAGCAAATGCGAAAGCTGGTTCCGGCACTTCGTAAGGTTCCAGACGAAACGCT
+TTATGCGTGGGTAGAAATGGCTGAGCTTTTTGTATGCCAGAAGACCTTTAAAGACGCATA
+CGTCAAAGCGCTCGCTCTTTATGCATTGCACCTTGCTTTCCTTGACGGGGCGCTAAAAGG
+TGAAGATGAGGATCTGGAATCGTACTCACGACGAGTTACGTCATTCTCCCTGAGCGGTGA
+ATTTAGCCAGACTTTCGGAGAGGTTACAAAGAACCAGTCAGGAGACATGATGCTTTCGAC
+GCCGTGGGGTAAGATGTTCGAACAGCTTAAAGCGCGACGCCGTGGTCGATTCGCATTAAT
+GACAGGACTCCGTGGAGGATGCCACTAATGAACTACTCACAGATTGAAAGGATGGCTCGC
+AAAGGTGTGGCTTTCTTCACCGATCCGTCAAGACCTATGAACCTGATAAAGCAAGGTGAA
+TACGGATATGATGAAAACGGATTCGAGATCCCACCGATGGAACAGGTTATTCCAATATCC
+GGCGCGACGCGAAGACCGAACGCGCGTGAGATTGACGGGGAAACCATCCGCGCCTCAGAT
+ATTTTGGGGATCTTCAATAATGATCATGAAATAAACGAAGGTGACTATATAGAGATTGAT
+GGCATTCGTCATGTTGTCGTTGATGCTCGCCCGGTTCAGGCGTCACTGGAACCAGTTGCC
+TATCGTCCAGTATTGCGGAGGGTATCAGTCGGTGGCTAATTATCAGATTCGTAGATTTCA
+AGGCGAGATTGATGCGTGGATTAATGCCGCTGAAAGCACGTTAGAACATGCTATTGAGAT
+ATTCGTAAGGGATGTTCACGACGCTCTTGTTAGCCGCTCCCCTGTTGATACAGGTCGATT
+CAAGGGTAACTGGCAGATAACTTTTAACGAAATCCCTAACCACGCATTAAACCGATACGA
+TAAAACTGGCGGTGTCGTCAGGGGTGAGGAACAGGCAAAAACTTATGGCATGTTCAGCCG
+TGGCGGCGCGATAACATCCGTTCACTTTTCAAACATGTTGATTTATGCAAACGCTCTTGA
+GTACGGTCATTCACAGCAAGCACCGAGCGGCGTTGTCGGTCTTGTGGCGTTAAGGCTTAG
+ATCATATATGGCTGACGCAATCAAGCAGGCAAGGAGACAGCAAAATGCACTATGAGTTAT
+CAGCGGCGGCGCGAGCCGCTTTTCTATCAAAGTACAGAGACTTTCCTCACTACATGGAAA
+ACAGAAATTTCACACCGCCGAAGGATGGCGGGATGTGGCTGAGGTTCAACTACATTGAAG
+GGGATACGCTTTATCTATCCATTGACAGAAAGTGTAAATCTTACATCGCAATCGTTCAGA
+TCGGCGTAGTGTTCCCTCCAGGCTCCGGCGTTGACGAAGCAAGATTGAAAGCAAAAGAGA
+TTGCTGATTTTTTCAAAGATGGTAAAATGCTTAACGTTGGTTATATTTTCGAGGGTGCAA
+TCGTGCATCAAATTGTTAAACATGAAAGCGGGTGGATGATTCCGGTTCGCTTTACAGTAC
+GAGTAGACACAAAGGAGACTTAATTATGCACTTACCAAATGGCGCACAAATTTTCGTGGA
+AACCTCTCGCGGGGTAGAGGTTGAGGCAACCGCTATCACTAACGCAGAAAATCCTGTTGC
+TACAGTTGCATCTAAGGGTGACTTGGCAAAAGGTGATTACGTTATTGTAACTCAGTCAAC
+TTGGGCAAAGATGGTTAGTCGAGTGCTAATTGTTACTGACGCTCAGGAAACAAGTATCAC
+TCTTGCTGGAATTGACACCTCCGATACTCTTGTTTTCCCGGCTGGCGGCACGATGAGCTT
+TGCAAAAATTACTGGCTGGACTGAGATCCCTTGCGTACAGGAGATTGGTCAGGACGGCGG
+CGAGCAGCAGTATTACACTTATCAGTGTTTGTCCGACGATAAAGAGCAGCAGATCCCAAC
+GTTTAAATCTGCGGTCTCGCTAACTTACACCTTCGCGCACGAATTTGATAACCCGATCTA
+CCCGATTCTGCGCAAGCTGGATTCGTCTGGTCAGGTAACAGCGGTTCGAATGTACGTTCC
+GAAAGCGAGCGAAATGCGCATGTGGGCTGGCATCTTGTCGTTTAACGATATCCCATCCAC
+GCAGGTTAACGAAATGGAAACGGTGGAACTCGCCGTATCCCTGAAAGGTGACTTTACTTT
+CATCTCATCCACTCTGGCATCGCCTGGTGCTTAAATACCATCCACAGGGGGCTTGCACCC
+CCTTATTCATTTCTGTAAAATCATCTTATCAACTTTATTCGATTAACTTTTAACAAAAAG
+TGCTATCAACCAATCAGGAGAAACATCATGGCTAAATTCAATTTCGTGTTGGGCCAGCTT
+CCAGACTTCAAACTTCCGGTGACGTTCACCATGCCAAACGGCGAGGATGCGACTATTATT
+TTTACAGTACGCCACCTTTCCAGTAAAGAAGTGCAAGATATGTATGCGAAGCAGGGCGAA
+ATGAATGATAGCGATTTCATCACTAAGATAGCATCAGGATGGAATCTGGAAGAAGAATTT
+AACGAAGAGAATACGCGTAAGCTGGTACAGTATTATCCTTCCGCAGCGTACAATCTGACG
+GCAACTTACATCAAGGCGCTCGCCGGACACCGCGCAAAAAACTAAAAAGGGCGGTTTATC
+TGTTATATCAGAAACCGCCAACAGAAGAGCAATTAAGATCGGTTGGCCTCAGTCTTTCTG
+ACTATGAAGACGAGGAACCGGAAACGATAATCGGCGATGCTGAAATGGTGAAGGCGTGGA
+ATGTTTTTACGTCAATGCTCACTCAGTGGAGAAGTTCAGGCGCTGGAGCTTATGGTCTTG
+ACTATAATGTTTTGCCTATGTTGTTCAAAATCTATAAAATAGAAGATGAAGAACTGGCAT
+TGCAGGACGTTAGGATCATGGAAGCGAAAGCGCTTGAAATGATTGCTAAGCAAAACAACT
+AAGCCGCCGTTTGGCGGTTTTTTCGTATATAGGGGGGGTTATATGGTTGATAAGGTAGCA
+GGTCTATCTCTTGACGTTGACGTGTCAACAGTTCAGCGCGCTGTCAAGTCACTGAAAGAG
+TTTTCAAAGGCCAACGATCAGGCCGCTGATTCTATGGGTTCTTTAATCAATGAGTCAGAG
+GTTGCAAAACAGAAGGCCAAAGAACACGCCGAACAACTCAGGCGCCAGAGAAAAGAGTAT
+GAGGCCGTGGAGAAGGCAATCGATCCTACAGTATCAAAAATGGAAAGGTTGAAGATTGCA
+TCTCAGCAGCTTGATAAACTCTGGCAGCAGGGAGTCGTTCCAGATGAGACTTTTTTCCGT
+TTGGGTGAAATGCTGGATCTGCAAAACGCAAAACTTGCTCGCAGCCGGGCCATGCTGACA
+GAAGAAGGGCAGGCAGCATTGCAAGAGGCGAAAGCAAAAGAGCAGGCGGCAGTGCGTAGC
+AAGGCGTTCATGGATGCCCTGAATGGTCAAGTTAACGCGATCGGTAAGACTCATGCTGAA
+TTGATGGAACTGAAAGCGGCTGAGCTTGGTTTATCGAAAGAAGCAGCACCACTAATCGCA
+AAACTTAAAGATCAAGGCCGGGCTATGAATGCAGCAGGTATTAGCGCCGGGGAATACAGA
+CAGGCAATGCGAATGCTTCCTGCGCAGATCACAGATGTCGTAACATCTCTTGCATCCGGT
+ATGCCAGTATGGATGGTTGCTATCCAACAGGGCGGTCAGATTAAGGATTCGTTCGGCGGG
+ATCGGTAACACGTTTAAAGTGTTGCTGAGTTATATTAATCCGGTCACGGCAGGTGTTGGC
+GTTCTTGTTGGTTCGTTAGGTATTCTAGCGAAAGCTGGTTATGACTCTTACAAATCAATA
+ACTGATATTCAGAATGCGCTTATTGAGACTGGAGGTTATGCAGGTGTTACGGCTGAAGAG
+CTTGATTCAGTGTCTAAAAAGATCGCGCAGACAAGCAACTCAACCATTGGGAGTATTCGC
+GAGATTGTAACGGAGTTGGCGAGTTCTGGTAAGTACACCCGCGAGCAGATCCAGAACATC
+ACTAAGGCTACCGCAGAGTGGTCAGCGTCAACAGGAAAATCAGCAAGTCAAATTATTTCT
+GAGTTCGAAAAAATAGCAAGCGATCCGGTAAAAGGACTGAAGAAGTTAAACGAGCAATAT
+AATTTCCTTGAAAAAGGGCAGCTTACCTATATCGATACATTAAGCCGGACGAAAGGAGAA
+ACTGAGGCTGTATCAGAGGCTACAAAACTATTCGCAGACGTAATGGAAAAGCGAATGAAG
+TCGATCGCGGATAACGCTACTCCTCTGGAAAAGATGTGGAGCGATATTAAACAATGGGCT
+TCGGACGCGTGGGGATGGGTTGGTGATCATACACTCGGGGCACTAAACCTGATTATCGAC
+GTTGTTCAGGGTACAGTGATTCAGGTTAAAATGATTCTTGCGAAGGGTGATGAATACATC
+TCAAACTTTATCGCCTCAGCCATAAAGGCAACTCAGTCACTGCCTGGAATGAGTGACTTC
+GGCGCTGATGTACTGAAGGAGCAGGAGAATATTGTAAAAAGTTCTCGCGACAACTACGAT
+CAGTTAGCTTCAGATCTTGACGCTATTAACGCTCGTGTAGAAAAAGGCGAGATGGGATAC
+ATTGAAGCAATGAGGCAGCGCCGCACCCTTGAAAAGCAGTACAGTGAGGAAACTAAGGAG
+GCAATAAGGAAAGAAGCAGAAGAGATCGAGAAGCGAAACCGAGAACGAAATAAGCAGTCG
+AAAATTGTACGATCACCGACAGAGCAATTCGACAAGGAGTTAATTTCACTCAGGGCTCAA
+CTTAAGGTATTGCAGGAGCATAAGGAGATCGGTCAAAAACTATCAGCACAGAGAAAGGCG
+CTGTTTACAACTGAGGCTACGATCGCTGTTCTTCGCGAAGCTAGTTCTAAGCGCCAGTTG
+TCTGCGGAAGAAAAAGCGTTGCTGGCAAGTCAGGAGAGAGTTATTGAGCTTGCGAAACAG
+AAGGCCGAGATTGGCGATCAGATTGTTAAGCAACAGCAGTTGAATGATCTTACCGATAAA
+TCTCTGAAGTTTGTCAATGAGATGACGGCGGCGACGGAACAGCTTAACGCGTCACGCGGT
+CTTAGTACTCGCGACATGGAACGACAAGCTGAACTAGCTAAAATCACCACTGATTACATC
+AACTCCGGCGGCAGCGAAGGAGACGAGAAACTTCAGAACATGATTAAGGCGCAAAATGAT
+TACTACGCTGCGGAAGATGCCAAGCGAGCTGACTGGTTGGCAGGTGCTGAAAGTGCTTTT
+GCTGATTACGGTGATGCAGCAATGGATATGTACGGCAATGTTAACGAGATCGCGTCAAGT
+GCCCTTAACGGAATGTCAGATATGATGGTTCAATTTCTGACCACAGGAAAAGCGAACTTT
+GAGGACTTTGCGAAAAATATCATCGGCATGATTATAAAGATGATTGCTCAGATGGTAATC
+TTCAATACGATCTCAGGCATGATGGGCGGTAAGACGTGGAGCTTTGCTGGAGGGGCGTCG
+TCTGGTGCTTCTGCGGCATCACAGGCAACCCCTACACCTGCTGCTTCTGTTTTTAGATCT
+GTATCTTCCGGCGGGGCCGCTGTATCACTTGCTGCCGCAGCGGGTAGCGTGGCAACCTCT
+GGATTCAACGCATCAAACTCGGCGCCAAAGGTGGTAAACCATTCAGGAGGTGGAACGGTC
+GTTGACGTTAGCGGAATGGAGGTGAAAGTTGACAACGGTTCAGATCCGAGGGGGATTTCT
+CAGGGCGTGGAAATGATGTTCAAAAAAATGATTCGTGAGTCTTGTTCGCAGGGCGGCGAG
+GTTTATAATTACATTCAGGAAAAAACAGGAGGCTAATAATGGCGACACTTGACACTTTTG
+GTTGGTGTACGCAGGTTCAAGGGGGCGGTGGCTCCCTTACCACTACCAACAGCGACCGCT
+CTATTCAGTTCGGTAACGGGTACATGCAACTTGCATCATCTGGATTTAACACCACGCGGC
+GTGAATATTCAGTCGTCTATGCCGGGGAAGATTTCATGGCTGTTTACGACTTCTGCAACT
+CTCACCGCATTAAGCCGTTCGCATGGACGCCGCCGGACGGTAAGATCGGGATATGGGTAG
+TAAAGCCTAACAGTTTGGGAGCGAAGCCAGTATCGCGCGACGTGATGGAGATTAACGTCA
+CGTTTATGGAGCAATTTACATCTATGGAGTAACGCCATATAACAAAAGCCCGCCTTGCGC
+GGGTTTTTTTGTAGCTGTAGAATGGTTGCAGGTAAACAAGAGGAAAAATCAATGAGCGAA
+AACAAAAAACTTTATGATGAAGAAAGCGGAAAGAGCCTGTTTCACAACTGCCTTCAATCA
+CTATATCCGGGAGAGATAATCACTCTCATCGAAGTTGATGGTAGTAAGTTCGGCGCTCAG
+GTGTACCGATTCCACGGTGAGAATATCCAGTACACTCCAGAAGAAATCATGCAGGCCCAG
+CAAACTGGAACGCTACCGCCGAAGGAAATTACATTCCGTGGCGAGAAATACGGGGCGCGA
+CCGTTCGGTATATCCGGGATCTCGTTTGACAGTTCCGGGAAGGCAACAAAACCACAATTA
+ACGGTGGCAAACATTGATAGTCGCGTATCTGCGATGATTCGTGCATATAACGGACTAATG
+CAAGCTAAGGTGACTATCTGGATAACTCAGCGTGAGCTTATTAACTCCGATGGCTCAATC
+GCTGATGGAGCTTACCGTAAACTGGTATACTATATCGAGCGTCCGAACTATGTTGATAAA
+AGCGTTGCGCGGTTCGATCTCACATCACCTTACGATATGGACGGCATAATGATCCCGTCT
+CGACTCACGCAAAGCGTATGCTATTTTGCACAACGAGGGTGGTATAAAACAGGGAAAGGC
+TGCGGATACAACGGGCAAAATGGTTACTTCGATAAAGACAATAATCCTGTAGACGATCCG
+TCGCTGGATTTTTGCCCGGGAACGGTAACGGCCTGCCGCCTGAGATTCGGCGCAAACAAT
+GAATTGGATTTTGGCGGTTGCGCTGTCGCTTCATTACAGAGGAAAAATCAATGATTAGTG
+CAAAAATTAAACTTGAAATTATGACTCACGCTCAGGAAGAATACCCCCGCGAATGCTGCG
+GGGTAGTCACCCAAAAGGGCCGCGTGCAAAAATACCATCGCATTGATAATGTGCATCGTG
+ATCCCGAGAATCATTTCATGATGGATGCTGTACAATACGCTTGCATTGAGGACGATGCGG
+AATCAACAACAATAGCAATTGTTCACAGCCACACAGGAGACGGGGCTACAACTCTACCAA
+GCGCTCACGATACGTGCATGTGCAACGAGATGGAAGTTACCTGGATTATTGTTAGCGTGC
+CGGAAGGGGATATGCGATTTGTGAAGCCGGAGAAATTGCCTCTGATTGGTCGTCCGTGGT
+CATTAGGATCATTCGACTGTTACGGTCTTGTTATGGCGTGGCACAAAGAGCACGGCGTAG
+AATTGCGCGATCGCCGATTGAATTTTGAATGGTGGAAACCTGAGTACGGAATTAATCTCT
+ATCAGGATTATTACAAGCAGGATGGCTTCGTTGAGATTCCAGATCAGAATAATCCGTCAT
+TCGGTGATATGGTAATCATGCAGATAGGGCAAAACGTTCCGGTATGGAACCATGCAGGGA
+TTTACCTGGGAGATAATCAGATCTTGCATCATGCCTTCGGCAAGCTATCTCGTCGTGATA
+TTTATTCCGGATGGTATCAGGATCATACTGTTTTAATCGTTCGCCATAAGGATCTTAAAT
+TATGAATGATGTAAAAGTAATTAAATTGTCAGGTTCACTTGGGAGACGCTTCGGCGTCTT
+TCACCGTTACGCTGTTGACTCTTACCCGGAAGCCATACGGGCGCTATCCAGTCAGGTTGA
+CGGATTTAAAGAATACATGCAAAGCGAGGTAGGATCTCGTAGCAAGTTTGCAATATTTGT
+GGATGGCGTTAACGTGGGACACCATGAAGAGGAAAAATTCAAGTGCGCGAAAGAGATAAG
+AATCGTACCGATCCCTACTGGCTCTAAGACAGGAGGTCTATTTCAGGTTGTATTGGGCGC
+GGCAATAATGGTTGCAGCATTCTATACTGGCGGCGCGTCTCTGGCTTTAATGGGCACAAT
+GTCCTCGTCTCTGTTTATGATGGGCGGCGCTATGGTGCTGGGCGGCGTGATGCAGATGAT
+TTCACCGCAGCCGGGTGGCGCAAACTTTGAAGTTCAATCAAGCAAGAATAAACCTTCGTA
+CGCGTTCGGCGGTGCTGTCAATACGACGGCGGCGGGATACCCTCTCCCGGTCCCGTATGG
+ATATCGCGCCGGAGGTGGGGCAACTTTCTCAGCAGGTTCTTATGCCGAGGATATGAGTTA
+AAATTAACCCGCCTTGCGCGGGTTTTTTTTCGCCTGTATAATGAGTCCACCGATAAATAG
+CACAAAAAGGTAAACATCATGATTCAAAAAGTGATAAGCGGATCTAAAGGTGGGTCACAG
+AAGCCTCATAACCCAGTTGAGATGGAGGACAATCTAATCTCAATCAACAAAATCAAGATC
+CTGTTAGCTGTATCTGATGGTGAAATTGACGAAACATTCAGCCTGAAGCAGTTGATGTTT
+AACTCAGTCCCGGTGCAAAACGAGGATGGCTCATTTAACTTCGAGGGAGTAAAGGCAGAG
+TTCAGACCGGGGACGCAGACTCAGGAATATATCAAGGGAATGGAAGATAGCTCTAGTGAG
+GTAACTGTAAATCGTGAGGTTACTACCGATAACCCATACACAATCTCAGTAACCAACAAA
+ACGCTGTCGGCAATCCGTATCAAAATGTTCATGCCTCGCGGCGTACGAATTGAAAGTAAC
+GGTGATAAAAATGGCGTAAGAGTTGAGTATGAGGTGCAACAAGCTGTTGATGGCGGCTCG
+TTTGAGACGGTGCTCACCGATGTAATCGAAGGCAAAACAATGTCAGGTTACGATCGAAGC
+AGACGTGTAAACCTACCTAACTTCAACAATCAGGTGATATTCAGAGTGGTTCGGAAAACT
+CCAGACTCTAACGACTCGAACGTTGTTGACGCGATTCAGGTAAAGAGCTATGCCGAGGTG
+ATTGATGCCAAATTCCGTTATCCGCTGACTGGTCTTCTTTTTGTCGAGTTTGATTCGAAG
+ATGTTCCCAAACCAGTTACCTACGATCTCAATTCGTAAGCGCTGGAAGATTGTAAACGTT
+CCGTCAAACTATGATCCAGAATCACGAACTTATAACGGAAATTGGGATGGAACTTTTAAG
+AAGGCATGGACGAATAATCCGGCCTGGGTGCTTTATGACCTGATGATTAATCAGCGTTAT
+GGCTTGGATCAGAAAGAGCTTGGTATCGCTGTAGATAAATGGGCGCTCTACGAGGCTGCG
+CAATATTGCGATCAGATGGTTCCTGATGGGAAGGGCGGGACGGAACCTCGATACCTTTGC
+GACGTGATAATCCAGTCTCAGACTGACGCTTACAAGGTTATCCGAGATATTTGCTCAATC
+TTTCGTGGTATGAGCTTTTGGAATGGTGAGAGCATTTCGGTAATCATCGACAGGCCGCGT
+GAACCTGCGTACATCTTCACTAACGACAACGTTGTTAATGGTGACTTCTCCTACACGTTC
+GCAAGCGAAAAGAGCATGTACACGACGTGTAATGTGATGTTTGATGATGAACAAAACATG
+TATCAGCAGGACGTTGAGCCAGTATTCGATCGTGAGGCTACTCTACGGTTTGGGAACAAC
+GTTACGAGCATTACAGCGATCGGTTGCACACGTCGAAGCGAGGCCAACCGACGCGGGAGA
+TGGATTCTGAAAACTAACCTCCGCAGCACTACGGTAAACTTCGCTACCGGGCTTGAGGGC
+ATGATCCCGACAATCGGAGATGTTGTGGCAATAGCTGATAACTTCTGGTCAAGTAACTTG
+ACAATGAACCTGTCAGGGCGTTTGCTCGAAGTGTCTGGAAGTCAGATTTTCTTGCCGTTC
+CGGGTGGATGCACGCGCTGGTGACTTTATTATCGTAAATAAGCCAGATGGCAAGCCTGTG
+AAGCGCACAATCTCAAGTGTTAGTGCGGATGGTAAGACTATAGAGGTTAACATTGGCTTT
+GGCTTTCCTGTGAAGCCTAACACGGTATTCGCTATCGACCGCACCGACATTGCGTTACAG
+CAGTACGTCGTGACAAAAATCGATAAGGGCGATGATGATGAGGAATTTACCTACAAAATA
+ACGGCGGTGGAGTACGATCCTAACAAGTATGATGAGATTGATTACGGAGTTAACATCGAC
+GACCGACCGACGAGCATCGTTGAACCAGATCAGATCCCTAGACCGAAAAATGTGCAAGTA
+TCCTCAGAGTCGAGAATCGTCCAGGGGATGAGCGTAGAAACGATGATTGTTAGCTGGGAT
+AAAGTTCCGTACGCTGTTTTCTATGACGTCCAGTGGCGAAAGGATAACGGCAACTGGCAA
+AATGTACCGCAGACAGCAAACAAAGAGGTATACGTTGAAGGTATTTACGCTGGCAACTAT
+CAGGTTCGCGTTCGCTCAGTCGCTGGTTCGGGCACGACTTCAGGCTGGTCAAATATCGTC
+GCGGCAACGTTGACGGGTAAACAGGGTGAACCGGGCCGACCGATTAACCTCACAGCTACG
+GATGATGTTGTTTTTGGTATCCGTACAAAATGGGGGTTCTCTGATGGTTCTGGAGATACA
+GCCTATACAGAGTTGCAACAGTCACCGGATGGAACAGTGGATAACGCAAGTTTGCTTTCT
+TTGATTCCGTATCCGCAGCATGAGTATTATCACTCACCAATGCCTGGAGGGAATATTGTT
+TGGTATCGGGTAAGGACGGTTGACAGGATCGGTAACGTGTCTCAGTGGACTGATTTTGTC
+AGAGGTATGGCATCAACAAACGTTGACGATATCATTGGGGAGATTTCTGTTGATATCGAA
+AACTCACCTGGTTACGAGTGGCTTGTTGATAACGCAACAGACAACGCGGCGCAGAACTCA
+GCTAACGCAGAGGCAGCAATAGAAAACGCGCTCGCCAATGACAAAGACGCGATCTACATG
+AAGAAGGAGAACGGAAAACGAAAAGCTGAGTACACGAAATCACTGAAACTTATTGCTGAT
+GAGACGCAGGCACGAGTGACGGCGATCGAGCAATTGAAGGCAAGTTTTGGCGATCAGATT
+AGCGCTAGCAACAGCGAGCTGCGTGAGGTTATCGCAACCGAGACTGAAGCACTATCGCGT
+GAGATTGACCAGCTTAAGGCTCAGATTGGTGACGATATTCAGGCAAGTCTGACTGATATT
+CGGGAGGTTATCGCAACCGAGACTGAAGCACTATCGCGTGAGATTGACCAGCTTAAGGCT
+CAGATTGGTGACGATATTCAGGCAAGTCTGACTGATATTCGGGAGGCTATCGCGAACGAG
+ACTGAGGCTAGAACGCAAGCTGACTTAACACTTAGCGCGCGGCTTGGAAATAACGAGGCG
+GCACTTGCTCAAAAACTAGACTCGTGGAGTAACGCGGATTCTACTGGTGCAATGTACGGT
+GTCAAGTTGGGTCTGAAATACAACGGCCAGGAATACAGTGCAGGCATGGCTATGTCTCTA
+GTTGGTTCCGGAGCTGCGGTTAAGGCGCAGATTTTGTTTGAGGCGTCACGATTTGCCATC
+ATGACTGGAATGAATGGTCAGACTCAGTACCCCTTCGTTGTTGAGAATGGTCAGGTTATT
+TTAAGTAGCGCGATTATCAAGAACGGATTCATCACCAACGCAATGATTGGAAACTTCATC
+CAGTCGAATAACTATGTATTTAACCAGTCCGGATGGAGGCTTGACAAGGGTGGAACATTC
+GAAAATTACGGAAGTGACGGTGAGGGTGCAATGAAGCAAACTAATACCACAATATCTGTT
+AGGGATGCGAGTGGTCGCCTGAGGGTTCAGATTGGCAGGTTGACTGGCTCATGGTAATAT
+CAAGGGCATCGAGAGATGCCCTTTTCTTTTGGAGGATTTATTATGGCGTACGGTATATCA
+ACTTGGGACGCAAATGGCGTTTATAATAACTATGGAATTAAGCCTATTACGGTTGTTGGT
+TGGAACTTTTTGTCAGCAGGCCAGAATTCAGCATCGTTCAGCTATCAAGTGCCTCCTGGT
+ATGCATGTGAACTACGTTATAAGCCTTGACGATGGCGCCATTAGTGGGCCTGGCAGGAAA
+ATTATTGCTAGCGGTAATACGATAACAGTAACGCCAACAAACTCACCTGGGCCAAACGTG
+TACCCATCATCAAACTGTTACTTAATAGCATATCTGGAGAATGATTAATGTCATACGGTG
+CTTTTATAGATGTAAACGGAAACCCATTCATAACCCCGTTATCCACGCCATTCGCTTTAT
+ATGCGAGAGGGGAAATTCAATCAGTAAATGTTAGTGGTTCACAGGTTGCGGAGAGATACG
+TTCGGATACCTACAGGTGTTCCGGTTATAGCTTTTTGCAAAACAACAAACACGCAGCAGG
+GGACCGCGCTTTCAGCCTTTACTTTCAGAAGCGGACCCAATGTTGGAACTGTTTATATAA
+GGGGGACAAATCCAGCAAACCAATCATACACGCTAACATACTACATATTTGCCATATTTG
+AGCAGTCACTACCGAGATGGGGTATGGCAATATGGGATGCGTCAGGAAAGCTAGTGCTGA
+CAAATGAGACAAAAGTCCTTAGTGATTTGGTTACAATCGGCACTCCTGGATACGCTGGCG
+GTGGATTAAACATAGACACAACACTCAGCGGAAGCTACGCGGTTGTTCCAACTATACTTG
+GCAACTATCAAGTTGTTATTGGAAGGTTGCCAACTGGGCAACCAATAATAGGAAACTCAA
+CAGCAGGCAGTTCATGCAGGTACAACGGGAGCACAACGAGAATAAATGCAGCAGCAACCA
+CTGCGGCAGGTCAGATAATGAACACAACGAATAATGGAAATATTATAACAGCAATCAAAA
+CGGCAGCATACGACTAAGCCCCTTGCGGGGCTTTTATTTTATATCGAGCAATCGTGAGAT
+TTGAAGTTTTTCTCTGATACATAGTTGAAGGAGAATGGGTATCCAGCACGCAATACCATC
+TCCTTTCCGCGCATCTTAGATCCAAAAACGTAAACCGAGTACTCCGCGCCGCCTGATTCA
+TAAATTGCCGTGCAAGTGCGCTCAGGCATCGATGAGCAACCAGTCAGGATGAATGCCGCA
+GCGATAATGGTGATTAACTTTTTCATTTGTATGTCCTCGTCGTTAGTGTGATTGCATTGT
+ATGTCGCATTTACTTTTATTGCAATAGAGCGATTACAATTTTTTTCGTGTAACAGGCGTA
+TATTTTTGTAACCGGAATGGGTGTTACAAAATCGCCTCCATCCGACCGCAGGGAGATATG
+ATAAAAACTCTATATAATATATATAGATAGATAATATTTAATTTTAGCTTTATATATATA
+ATTATTGTTGTGTAACAGTTGTATATCGTGTAACAGGTGATTTGATTGATTCGTCAAATT
+TCTATCATGTATGTTCAAAATTTAATCAATCTGGATTCTATTTGTAGGTATCTCTGTATT
+TCTAATAAAAAGCGGTTACAAGTGTTACGCAATAGACAGCGCATAAAAATCTACTTAAGC
+CATTGATTCTGTTGATGCTGGTTGTAACTTGAGCAATATAGACACGCAATTACACACTGA
+TTACATGTATTCGATTGACTAAACGCTGTTAATGGCTATAATGGATTCATCGTAAACGAA
+GGAGATAAACGCAATGTTCCAGGTATTCACATCAAGCCAGCTTTCTAACGACGAGTATCA
+TAGAAACGAAGGTTGGGCGTCAGAGTATGTAAGCGGATCGAGTCTTGCAGAAATTTATCA
+GACCTGCCCTGCTAACTGGAGATTCAAGAAGAACGAGACAACGAAAGCTCTGGAGTTCGG
+TACTCAGTCGCACACCAACTTTGAGAGTCGAGATCTGTTTACTGCAACGTATGCTAGATG
+CCCTGCTCCGTCAGAGTTTAAGGATCTGATTACTTCGCAGGCGGCGCTGGCAGCAAAATT
+AAAATCATTCGGCCTGAAAGGTACATCCGGTAAGCAGTACCCGGACCTCATCAAAATGAT
+GGTTGATTGTGGTGAAGAACTCAACGTTCAATACCTGATTGAACTGATCGCAGAAGCTGA
+GGCCCGTGCTGAAGGAAAACAACTTGTTGACGCGGACAAATACGACGCTTGCATGAAGAT
+GAGAGCCATCCTTGAGCAGAATCCCGATCATGAAGCGTGCATCAACAGTGAAACGGCGCA
+GCGTGAGATTTCAATCTTCGGTGAGATATCCGGCGTAAAAGTTAAGGTTCGACTTGACCA
+TCTGGACTACAAAGAGAATGTTCCAGGTCGTGTCCTGACTGGTTATGATGAGAATGGCGA
+TCCGGTATTTGAAGACGTAATTTTCCCGGAAGCACTGATTATCACAGATTTCAAAACTAC
+GATGAGCGCCAACCCGTTAGAGTTCCCGAGACTGGCATACAATCACGGCTATTACCTGAA
+GATGGCATTGCAGCATGACCTGCTACGACGCGCAATCCAGGCTGGAGCTTTTGAAGGTAA
+CTTCCCGGAAGACATTCCGATCGTGGTTCGATTGCTTGCGCAGGAGAAAAAAGAGCCTTA
+TATCGCACTGGCTTACCGTATGACTATGGAGCAAATCAGGATAGGTCGTAACCAGTACAT
+TAGCGTAGTCCACACTTACAAGGCTTGCTCTGAAATGGATGTTTGGCCTGGGTACGCTGG
+CGACGCAAGCGAGATCGAACTTGAAACGCCATCATGGGTGCGTTACCAAAATAAGTAAAC
+GGCATAAATAGCTAAACAAATAATTAATGAGGTGTTATAATGCACCTCATACACCAATCA
+GGAGAAGTTAAGATGCAATTATCACCAGAAACAAACGAAATCCTTCCCGCACTGTTCAAT
+GCTCGCAATAAATTTGCTAAAGCAAAGAAGGACGCAAAAAACAATCACCTGAAAAATTCA
+TACGCAACTCTTGATGCAATGATGGCTGCGGTTAGTCCGGCGCTAACCGACAACGATATT
+ATGATCCTGCAATCAATGCTGGACACCAGCACTGAAACAACCTTCCATCTTGAAACGATG
+CTGATTCACAAATCCGGGCAGTGGGCCAAATTCTTCATGATGATGCCGATTGCAAAGCGC
+GATCCGCAAGGCGTAGGTTCTGCAATGACGTATGCTCGACGTTACTCATTAGCCGCAGCG
+CTGGGGATTAGCCAGAGTGATGACGATGCTCAGCTTGCAGTGAAATCCGTCAAGGACTGG
+AAAAAAGAACTTGATGCGTGTGAAGACATCGAGTCACTGAAAGATGTATGGGCCAACGCT
+TACCGCCAGACTGACACGGCGAGCAAGTCAATCATTCAGGATCACTACAACGCATTGAAG
+GCTAAATTTGAGATCGGTAAAGCTCGCGGCATTCGCCCGGCGCAACCGGAACAGAAAAAA
+CAGGTTGAAGCAACAAGCGCGAAGCCTGTACAATCCCAATCAATCACCAACTTCGAATAA
+TCATCAGGGCGGCTTAGGTCGCCCATAAAATTTAGGAGAGAAAAACATGCATATTATCAC
+TGGCGAGATCCGCAAAGAACCAAAGATTCTTGAACGTAACGGCGGCAATACTTATATTAT
+CGAACTGGCAGAAAGCTATAAGCCTCGTGATGGCGATCGCGAATACACCAACTACACGTT
+CTTTTTTAGCGACGGTGGGAAGCCAGGCCTTGCTGACTGGTATCGTGAAGCGTTCCAAGT
+TGGTCGAGTTATCTCAGTATCGTGCGAGACGTTGAAGATCTCATCACGCGAACACAACGG
+AATGATTTACAATTCATTGCAGGCGGCTGACTTCCCTAAACTAGTATTTAGTCAACGAGG
+TCAAAGCAACCAGCAACAACGAGCGCCTCAGCAACAACAGCGTTCTCAGCAGCAATCACA
+ACCACAACCAAATCAACAATCAACATTTGACGACGATATTCCATTCTAAAGAGAAGCCCC
+GCATTGCGGGGCTTTTTATTACTTCATCTCAATTGCCTTCGGAAATGAGTCTACGTATTT
+TTTAAGCTTTCAACAAACTCTCCAGATACCATTCTTTCATGATTTCTTTTGCACTCTTCA
+ACCTTCTTGTTGTAAATCGAGTCATTAGGCATCTCAACTCGAACGCTAACAAAAACATCA
+CTCGGAATGTCAATAGGATCTCCGTCGCTATACCCATCCTTATGATTTCTTGCGAATGAA
+GGCGCGTTATCATGAGTTCTATGATAAGTCTTGATAACCAAAGACCCATCAGGGTTAACA
+TCATAATCTACCCACAATATAGGCTGTCCGTTAATGTCTTTTGGTATCTCTATACCACCA
+TTAACACCACCCCACGCTAGATCAGAATTTAGTGATAAACACCCCCTTATCAAATACTCG
+CCAGTAGAAATCCTCTCAACTGTACAACCTTCGCTCTCGTCGTTTGTTCTATGGCTCCCG
+TTGCCAAAAATATCAACGATGGGCGAGGCTCTTTTTATAAATCCGTTAGAGTCAACGGTT
+GTATTGCCTGTATTCCATATAACCTGGGTAGATTGCCATTGCCCCGCTCCTGTCTGTTTT
+CTGAGCCTCACATCCTTATAGTGTGGAGAGGTCATTGATGCGTCTTCAATAACAAAACCG
+AAAGCACTACCATCTCCTTGACCGTTTCTATGACGAACAGATATAGCTGTGACCCATCCA
+ACAGAATTAACATTGTAACTTGCAAGGCTGCTAGGCTGAGATGATAAGGCGTTAACATAA
+TCTACCCAGCTTGATTTCTCGTTAGATGCCGGACTTGATACAACGTTACCGGAAGAACTT
+GCCTGAAACGGTATCCACGGAGAAAACTTTGCATTATTGCCGTTTTTCATCCTTATGTAA
+AAAGTGCCGCGACCCGCTCTTGCGTATTCGTTTTGAGAATTTCCAGATTCTGGGGCAGTA
+CCCAAATAAGTAAAAGGCGTAAATCTCTGAATGCAAGCGCTTGCACCGTTAGCTCCATTA
+GGAAGAACCTCAAGCATACCTGCCGTAGCCTCAGGATAGCCCCTACTAGCGGTAGCTAAA
+GCACTTGATGATTGCATATATATTCCAGGGCCATCACCTTCACCAGTTATTGAATCAAGA
+TGTTTTTCCCCTAAGTTAGTTCTATCAAGTTTGTAAACCTTTAGGCTTGTCCTTGCCTGC
+TCCTCGGTAGTTCCACCAGTGCCACCAAAATTAACCCCTAGAGCCTTGTTTTCACTTCCA
+TCATGAGTCATGGCGCCCCAGTCGCCGCTATCCTTGACGATCAGCTTAAATCCGGTCCTA
+TCGTCATTTGAATAGATTTCCGTTAAATCTCCGCGAGGTTGATTTAATCTATTTACTGCT
+AGATTTTTCTTTGATTGCTCCTTGTCAGCAAGGCCAGAAAGATTTCCGTCCTTAGTTAAA
+AGATTGTCAGCATTAACGCTATTTGCTGCGTCCTGAGCTTCCTGAGCACTTACCGCTGCC
+GCGTCTTTAGCGCTAACCGCCGCATCGCGTGCTGCCTGGGCATCGTTTTTTGCTGAGGTT
+GCAGTTTGCGCCGCAGTCTGAGCATCATTAACATAGCCAGATAGATCGCCTTTAGCGTCA
+TTGATTGCCTGAATGGCCGCCGCCTCTTCAGAGTTAATATGCGTAACGGCTGAATTTTCT
+TTTTGCTGCACATTAGTGATAGCCTTGTCTTTTGCTGCATTAATGCTACCAATAGCGCTA
+TCTGAGATTTGCTGCGTCTGGTTTTTTATTGAGATAGACTCGTCACGCGCTGAATTAGCT
+GAGTCGCGAGCAAGCTCCGCTTGATTCTGCGCAGCTTCTGCGGCTGATTTGTTTGACTTA
+ATTTCTTGAACGATATTGTTCAGGTTGTCCATATCAAGATCGGCAATAATATCCAATGCC
+GATGCGATCTCAGTCTCTTTGCTCTGGTAGTACCGAAGCGTCTCAGCGACGTTTTGCGCA
+AGACCGTTAACGGTCAATGAATCGTTAAGCAAGATCACATACTTACCGTCAGCGGCAGTC
+TGACCATCCGTAGAGATAGCCTTTAGCTCTGTGTCGCTCACAATGTCGCTGATTACCGCC
+AGTTTAATTGGTTGCTCCAGGAATACGATAGTTGCACCGACTCGAATCAGAGCAAGCTGA
+TCTTTCCATTTTGTATCGGTTCCGTGAACCGTACCGTCTGCATCCATTGATGCAGTACCG
+CGTCTATATAAAGCCATAGTATAAACTCCTTAAGTAAATAGCACGAATTGCTAAGCAATG
+ATTTTATCATTGTGATACCTGCCTTGCAATGGGCAATAAAAAACCGCCCGAAGGCGGTTA
+GAGTAGGTCACTTGCTGAGAATAATTTGGCTTGCATTCCTGAGCGGTATCCTCCGGGATT
+CGGTATGATTATTTTTAATTGTCTGTCGTCTGCCATGTACAAGGTATCCTTGTCTGAAGG
+TTCACACGCCACCCTTACCTCTCGATCACCTTTATACCTGTATGCAACCATTTCGAGGTT
+TTCAGGGGTGAAGCAGGCCCATACCTCTTGCTTTGTGTGAAATGCAATATGCTGAGCATC
+AATCCAGTTAAGGCATAAGTAGATCGCCTTTTCAGTTTTACCAGTCACGGCAACCGAGCA
+ACTCGTGTATTTCTTCGCATAAAATGACTCTCTTCCTTCTTCATCAATAATCAAAATATT
+GCAAAACTCATCATCAAGCCCATCCTCGTGCACAAGTTGACACGGTATAGTGTGAAATAC
+GCTCTCTCTTCCATCCTCGCTACGCTTTATACCGACATCGAATGATTCGGTGGGTAGTGA
+CTCAAACACGCTCAACGGCGTATTTACACGTTTCTCTGTGCGCTCCATAACCTTCATCAC
+CGCCTCATGTTCAGCCATCATGACATTGACACCTGACACCGGAGTCCGGCGCGCTTTCTT
+GTTTGCCTTGACGATGTATTCCTGCGGAACCTTTCCGAGAAATCTTCCCAGGATGTTTAC
+GCATTCGCTGTACGGCATACCAGTCAACTTCATCAGCCAGCCAATACCAGAGTCGTTACC
+GCATGAGTTGCAGATCGCACCGCCGTCGCCCGGGGTGTTCAGGTTATCAGTCCAGCGAAA
+TCGGTCTTTACCGCCGCAGTTGGGGCAGGGTTGGTGCTTCTTATTAAAAACATTATTCGG
+CAATCCGCAGATTGATTGGAAAGCCTCGCGCCATAACCCCTTCATGTACGGTAAAACGTC
+CTCTTTCTGAAACATCATAAATTCTTCGTTCACTTCCAGATCTCCAAAATAAAAAACGCG
+TAGAAGGATGTTAACCCGCTACGCGTCGTTTGTTTTAACTAAAAATGCTATTGGTCGTTC
+CGATATCTTCCCTCTTCACGACGCGCATAAATTTCTTGTTCTTGCATCGCTTCTCAAGGC
+ACTTGCCGTTACCGTCAAACTTCAGGTCGAATCGCAACCATGCGGCCCTAAACCCTTTGC
+ACCCCTGTCGGCGGTAAGCACGATGCGCAGCTTCAGCGCCTTGCCATGAAATCATATTGC
+GTTCTTTCCAGCCTTGCACCGTCTGATTGCTAACCTTCAGAGCCTTTGCGCAGGCCGCCG
+GGCCGCCGTAATATTCGATAAGGGCATCAAGTCGCGCTCGCAGTCCGGCGCGCGTTTCTT
+CTTTATGAATATAGAACCCGCAACGCTGGCGAGGCTTTTTATCTTTACCGCGCCGTGTTC
+CGTTGTTACCATTGATGTGACGTTTATCGATTTCACCAGTTGACTCTGCGATACGTTGAA
+TACTCATCTTGATTCCCCTATAGCACTTTTTGCTAAAAACGTTTACTTTATGCCGTGTAT
+TATAGCGTAAACGTTACAACGATTCAAAGGATTAATAGCCGTGACAATGAACATTAAAAA
+ACAGATTGCATTACTTGGCGATGACTATATAAAGAGAACTCAGGAGCGATTTACTGTTGG
+TGAGGTTGTTCCTTATCCGTACCAGGTTGTTGCTTATGCCGAGATCGCGAAACGCCTATC
+AAATTACGAGCATCCATTCTTCGTTAAAGCGTCTGTATCCGCAGGTAAGACAATCATCTT
+CGCTATGGTGGCAAAGCAGTGTCAGAAAATGGGCTTAAAAATGCTTGTCCTGGCTCGTCA
+GGGTGAGATTGTCGATCAGGATAGCGAAGAGATCGACAACTTCGGGGTAACGAACTCCAT
+CTTCTCAGCGTCACTTGGAATCAAGTCCTGCTACTTCCCGATCGTGGTTGGCTCAGAGGG
+TACTGTTGCAAATGGCCTCGACAATGAGTTAGCTGATTTCGTCCCGCATGTAATTGGGAT
+CGACGAATGTCACCAGGTGGATTGGGAAGACCTTGCGCAAGCCATCGAGGGTAAGGAAAC
+AATGGAACAAATGAGGGGCGAGAAAGGGAAAATTATCATGGACGGAGATATTCCCCTGAT
+TGGTAATGATGGAAAGCCTTTGCTTGGAACTAAGCGCAGTCAGTACACGATCGTAATCAT
+GGAAATGATGCGGCGCTGTAAAAAGGTTCACGGTCACGATCTCAGAATATTTGGTATGAC
+TGGATCTGAATTTCGTGGCGTAGTTCCTATTCTGGTAGAGAATCCGAAAGCATTGGGATT
+CTGGCGTGAGCGAGTAACTGATATCGACACAAACTATCTGATTGAGTTCGGCTCTGTCGT
+TCCGACTATATTCGGATCAACAGACGGAGTTCATTACGATCTTGATAAGTTCAAGGCGTC
+TAGCGAGGACGGAGTGCAGGACTTTACAGAGAAAGACATGAAGGCTATGGAAGATGAGAT
+CCTTCATGATAAATCTCTGACTCAGCGAATCATGCAAATGGTCGCCAAAAAGGCAGAAGA
+ACGCAATGCGGTCCTGATTACATGTGCTGGTGTGCGCCACTGCAAAGAGGCAGCGGCAGC
+ACTTCCTCCGGGAAGCACCTATGCAATTATTACTGGCGACACAGACAACAAAGCGCGCAA
+GAAGATTCTTGACGATGTAAGGGCCGGAAAAATTAAATACACCTTTCAGGTAATGGCGCT
+CACTACTGGCGTTAACGTTCCAAATTGGGATTTCAGTGTCATACTCCGCAAGATAGGATC
+GCTCACTCTGTTGATTCAACTTTTGGGTAGGGGTATGCGACTGCTTAAATCCTGGCAGGT
+TGCTGAAGGAATGGTTAAGCAGGACCATCTGGTATGGGATTTTGCAGGTACGATGGATGA
+GCTGGGTCAGCTTTATTTCGATCCGATACTTGAGCAGGCGCAATTCCAGAAGCGTTTTGA
+AAACGGCAAAGATCCGAAAACATGTCCGAAATGCGGTTGCGTAAATAGCTTCTATGCTCG
+ACGATGCGTTAATGTCATTGATGGTGAGCGTTGCGATCATTTCTGGACTTCTCAGATTTG
+TGAGGACCAGGTTGACGAGCGAACCGGGAAAATCCTTGTTAAAGGATGCGGTGCAGAGAA
+TGACGTTGTTGCGCGAGTCTGTCGTTGTTGTGATGCTTCTCTTGTCGATCCTAACCTGAA
+GTTATCCGGTAAGGCGTACACCAAGAATGACTGGTATGAAGTAAAGAATTTTGAGGTTAC
+GCTAACCAAAAACCAGAAAGGCATAATATACAAATACACTCTGATTAACGACGATGGTGA
+TGAGTTCAAGGCGTATGAAAAATTCTTCCCCGAGTCTGACTCTAAGATTTGCGGTACGCT
+ATGGAAAACTAAAGGTGTACTTCCTCATGTGTCAGATCCTAAAATGCGCCGCTACTTTAT
+CGGAATGAAGAACGCCATCAAGATTTTGCAATACTCACATCATATTGCTCACCCGGTGCG
+CGTAACTCATCGTCGCAACCAGAAGAAAGAAGATATCATCTCACGCAAAGACTTCGGTAT
+GGAGGATATCCCGGAATGATTACAGACAAAGGTGATTATTTAGAATTTTACGAGAGAGAC
+ACAAGCGACACTCGAAAGGAGGATGCTCATCAGGTGGATTGTGTATCTTGGCTGAAATAC
+AATTTTCCTCACCTTCTATTTTGGCACACTGTCAATGAAGGTGAAAAAACAATCACATCG
+GCGCTCAGGGATGAGCAGGCAGGATTACTTAAAGGCGTGTCAGACTTCGTTATCCTGATT
+GGTGTTAACTCACGATACCCGTTTGCAGCAATCGAACTTAAGCGGGTTAATAAGTCAGGC
+AAAGGAAAGGCGTCACCAGTCAGCGACAAGCAAAGGGAATTTCTCCAAAAGGTCCGGGAG
+CGTGGCGGCTTCTCTGCCGTCGCATACGGATTCGGGCAATTCAAGATCGCAATTTACGAA
+ATGATGAAATAGCACTTTTTGTTAAAACTGCCGGGATGGAATCTGGCATTATTATCTCAC
+CAAAACGAGAGGAATAAAAATGAAAGACTTTAATGATATCGAAACTATCGACTTTGCAGA
+AACTGGTTGCTCATTCACTCGCGAAGCAATAGCATCAGGCGGTTATTATCAGGCATTGAA
+AACGCCAACCTGTAAAGAGATTTCAGGGCGTCGATACAAGGGGACAAATACCCCTGACGC
+TGTTCGTGATTTATGGTCAACTCCGCGAGAGGTTATTGCATACCTTGAGGGTCGTTATGG
+GAAATATGATCTCGACGCTGCGGCAAGCGAAGAAAATAAAGTTTGCGAGAAGTTTTACTC
+TCAGGAAACAAACTGCTTAAAACGTTGGTGGGGAAAGAATAAGCACGTTTGGTTAAATCC
+TCCTTATAGCCGACCTGATATATTTGTCAAGAAGGCCATTGAGCAAATGGAGCACAACAA
+TCAGATCGATATGCTTTTACCTGCAGATAACTCTACTGCGTGGTTTACTGAAGCGCGGCA
+GAACGCAGCTGAAATAATCTGGATTGAAGCGGACTTGACTGAGGATATTGACGGCAATGA
+ATACGCACGATCCGGTCGCCTGGCTTTCATATCCGGTGAAACTGGAAAGGCCGTAGACGG
+TAATAACAAAGGTTCGGTAATTTTTATTATGCGCGAACTTAAAGAAGGTGAGGTGCAACA
+GACTCACTACATCCCAATCACAAGCATTTGCCCTTCGGTGAAAAACAAACGAGCAAAGGT
+GAGGAAAGTATGATGAGCGAAAAAATGGTTCCTGTTAAATTAACTGAGCAAGGTTTATGG
+CTACTTTATCGAGCTACGTGCTGCGAAATTATGGAGCGAAACGGATTGACTCAGGATGTT
+ATTGGTTGCGATCTGTGGGAGTTCACTAGTTCTCTTGATATGTCTTTCGATGAGATAAAA
+AATGAATACATAGAGAACTGGCCTTCAATCATACAGAAAGACGTGGAAGAACTTAAAGCT
+GATACAATCGTACAGCACTAATTGCTAAAACTACCCGGCGAAAGTCGGGTATAGTTATTT
+CATAGAAACGAAATGAGGAATCAGAAGATGGCACGCATTAACGCAAACTTTTTCAATATC
+GCTCAGCAGTCCGCAAAAATGGCTGTTCATATTACGAACAAGCAAGGCGGCAACTTCGAT
+TGGGATATTGCTATGAACTTCCTTAAAATGTCTTATTACCGTTGCTCAGTTGAAGAAGTC
+GAAGGCTTCATCTCTGACGTGGAGAAATTAACTAATGCTGATAAAAAAGCAAGGTAAGCG
+CGAAGTGTGGGAGCACGCAAAGGAATGCGGCATCTCAGACGATATAGCATTAATTGCTAA
+ATACTTTGATATAAAGGATGTTAGCATTATATCAAACGGCAAGATTTCATTTATGGAAGG
+TATGCCGAGAAAAATGCAAAGAGTTCCAGCCACTCCATCACTTGAGTTTTACCGCGAAGA
+GGGAAAGAGAATTGAGCGAGAAAGAAAATCCACAAAAAACGGCAAGTCTTCCCGGCTTAA
+ATATTAATGCGGACGAATACCAGGCAATATGGATCGGGAAAAAGCAGGTTAAGCAAATCC
+CTTTCTCTGACTGGTTGCCACCTGACTTTGTTAATGTGCTTTGCACTATCGGTATTGAGC
+AGGAGTTGCATATAGGTTACTACTCACCTGGCCGAAACAGTATGATGCTTGAGGTTGACG
+GAAAGCTCGTTGAGTTTAAATCTTCAGATCTAGGATTCTGGTTAAAGGCTGTGGCATGAA
+ACTTTATTTTGCTGTAGTATTAACACCGCTAATTTCATTTTCAGTAATGTATTTCATTAT
+CATGTAAGGATTAAAATATGTCACAAGCTAAAATCACTACCGAGCAACTTATCGAAGAGC
+GCATGAGCGGCCTGACACTTCGCGAGATCGCGGAAAAGTACGGTATGCACATTCGCACTG
+TCGAGGCGCGTCACGCAAAACTTGCAAAAGAAGGCCACTTCCACGGCAACGAGCATGTTG
+CTAAGATGGTTCCGGAAGGCTTCATGGTAAAAGGCACGTCAACCATGATTGACGCGGAAG
+GTAACGAGAAGATTCGTTGGGTTAAGACATCAGTTGATAATGAGCGCCTTGAGGTTCTAA
+TGGAAAAAGCGCGTGAAGCATTCTGTTCAGAGTTGCCTAAGGCTATTCCATCTGAATCAC
+CTGACGTTAGTTTTGATGAAGACACGCTTGCGATGTATCCAGTTTTTGATTTGCACATTG
+GTGCTCTTGCTCACAAACATGAGTGCGGCGAAAACTACGACACAGCGACAGCAGAGAAGG
+TTATGAATGGGTTCTTTGACTACGCTGTAGATAAGGCGCCAAACTCAAAGAATGCCGTAT
+TGGTATTGGGTGGCGATTTCCTACATTACGACTCTTTGGAGTCTAAGACTCCAGCGTCAG
+GCCATTACTTAGATTCTGACAGTCGTTACGCTAAGCTTGTTTATGTCGCAATCCGATCAG
+TACGACGCGCAGTCTCTCGAATGCTGGAGAAGCACCAAGTTATTGATATTAAAGCAATAA
+GTGGGAATCACGACGAATCAGGGATGGTTTGGTTGCGCGCTGCGCTTGCTGCATTTTATG
+AAGATGAGCCGCGCGTAAATGTTGATGTTAGCCCTGCCGCAATGATGATGACCAGCTTTG
+GTAAGACCCTTATTGGATACACTCACGGGCATCAAATGCGAAAAGCAGATACTCGACTAA
+GTGTTATGGCAACTGATTTTCGTAAGTTGTTTGGTCAAAGTGATTACGTTTACACGCATA
+GCGGTCACTGGCACAGTCAAAAGATTACAGAAACAAACTTGGGTATTGATGAGGTTCATG
+GTCAGCTTGGAAGTCCTGACGCATACTCTGCCAATGGCGGTTGGAGGTCTCAGCGTCAAG
+CTGCTGTGATTGTCTATCACAAGGAATTTGGTGAGGTTGGACGATTCATTTGTCGACCTG
+AAATGTTCTAAATAGCACCTTTTGTTAAAACAGTACCCGCGAAAGCGGGTATTATTGTTT
+TATAGAAACAAGAGGAGATTGCAATGAACTGGCACGAGCATTACGAATATAGGGATGGTG
+TTCTATATCACAAGGTAAAGCCATGCAGAAGGCATGATGTAAATATTGGGGATGTTGCTG
+GAAGGGTTGCCAAAAACGGCTATCACTATGTTGTTCACAAGAACAGGCCGTATAAGAGAT
+CTCGAGTTATATGGGAGATGTTTAATGGTGAGATACCAGATGGTTTTGTTATAGATCATC
+TGAATCACAATGCCACCGATGATAGGATCGATAACCTTGAGTGTAAGCCAAGAAGAGAGA
+ATATGGTTAATGTTAAGTTAAGGATTGATAGCACGACCGGAGTAACTGGCGTATCAAGAA
+AGAGGGATAACAAGTGGAGGGCGTACATAACAATTATGGGTAAGCAGAAGTGCAAGAGCT
+TTGACACGTTTGAGGAAGCTTGCGCGCAGAGGATTGAATGGTCAGTAACTCATGATTTTC
+ACCCAAATCACGGTGGAACATACTAATAGCACCTTTTACCTAACCCGCGCCACAGAAGTG
+CGGCATAGTAACCACATCGAAAACAGAGATGCTATATCATGAAGATAGTCAAGTGCATCC
+GAAATGACTCCAAAACACTTCCATTCCGTGTAAATCAGATCTATAGTGTTGGTTATGATT
+TCGGTGGGGGATTATTTGAGATTTACGACGGGCGAGGTTCAGCAATCCAGACTCCTCTGA
+ACGGTCACTACCTGGAATTTATTGAGATAGATTAACAATAGCATTCATCACCTTACAGGC
+TGGCATGATTTACATGCTGGCCTTTTTGCGTTGTGTCAAATAAATTTGAAGGTTAAAATC
+GACTCACTTGTTCAAAAAATATATGGTGAGATTATGAAAGAGTTTTTAACGGCTGCTACG
+TCAAGCACTGGCGGTGCTTCGTTGGTAGGGGCGGCGACAGGGCAACTTTATATTGCTGGC
+GCTACATTCATTTGCTTTCTGCTTTTTGGTGCCTGGGGAGCGTACTGGAAGTATCGTGAT
+AGCAAGGCAATTCAGGAAGCGTTAAACGATGGCGATCTAAATAAGGCGCTTAAGATCAGG
+GGGAGATAATGAGTTTAAAAAATAACGTTATAGGCGCATCAATCGGGGCCGCTTTGACGT
+TGACACCTACCCTACTGGAACGGATCGAAGGTATAGAATACGAGGTTTATTACGATATCG
+CCGGAGTCCCTACCGTATGCAGCGGAATAACCGGGCCTGACGTCATACCTGGTAAGAAAT
+ACACTAAGCGAGAATGCGATGCATTGCTGATAAAACATATCGGCGTCGCTCAGCGATACG
+TTGACAAGAAGGTTAAGGTTGACATTCCGGTAACTATGCGCGCATCACTGTATAGCTTCA
+CTTTCAACGTTGGGACTGGCGCTTTCGGATCGTCTACAATGCTTAAGCTAATCAATCAGC
+GCAAGCACAAAGAAGCGTGTAATCAGTTATGGCGATGGGTATACTACTACAACCCAAAAA
+CCAAAAAGCGCGAAGTGTCGAGAGGGATCAAGAATCGGCGCGCTGAAGAATACGCATATT
+GCGTTAAGGAACTATAATGAAACTTAAGAAAACGTGCATTGCAATTACGGTTGCTGTTGG
+TGTGATTTCTCTATCCGGTTGTTCGACGGCATCTGCTCTGAGTGGTTTACTTTCTGACTC
+CCCGGATGTTACGGCGCAGGTTGGCGCTGAGAACACAAAACAACTAGCAGGAGTAACAGC
+AAAGGCGGATGATAAGCGAGAAGTGAAGGTGAGTGATTCAAATATTGGCAAGATTGACTC
+ATCCGTCAAGAAGTCCGTGGAGGTGTCAACCATTCAGGCCAACACGGTTAACGCTGAAAG
+CATCACAGTAACCAAATCTGGAAGCTGGTACGATCCTGTGGTTTGCTGGATTCTCGTTTT
+TATTGTCCTGTTGCTGTTTTATTTTTTAATTCGTAAGCACGAAAAAAAGGAGGCGTAAGC
+CTCCTTTCTTATTTGTACCTTTTGACGTGAAGTAGCAACTCCCCATCCTGATCGCAAAGA
+TTGTGCTCACCGTCGTTATTGGCCCTCATCGAAGTGAGCAGTAGGCTAAGTAATCCTTGC
+TCAAATTCTTCTTTCGTAAGCTGGAGCCTTGCGCACAATTCGACGTTCCGATCTATCAAT
+GTTTCTACGTTCGCCAAACAAGTCTTCATCACTCATTTCTCCAATGTGCATCATTTCCCA
+CGTATATCGATTGTTGTAGCCATCAATACACATCAGCTTCATCATTACCGGGCGCTTGAT
+CTTTCCCTTGCACCAGTAAAATCCACCCTTGCAATCAAGATAACCCTCAGTCACGCAGCG
+TGCGCAAAACTCCTTAGACAGCGCGCTTGTAAATTCACGGCGAGTCATTCCGGCGGCCTT
+AGCAAATCGCTCACTTTCCTTGTGGGCGTATATAAATTTCGCTATGTGCTGTCGAGTGTA
+TTTGTCGTAACCTTCGCAGAACCTGAACAGATCTAAAAGAAGAAGCATATTATTAACCCA
+TCAGTCGAGGATTGATGAAAACAATATCATCAATCCGGCAAGTGTAGTTCATCTCTTCGA
+GCGTGATCAGCAGGCTGTCGATTCGCTCAGCTACTTTTTGTTGACCGTTGAACGGCGTAA
+CGTTACGGCACTTTGCAACAATGCTATGAATAGGTGCGCGACCTTTGTTCTTCTTTGCGA
+TCTCAGTGATAACATCAATCAGCTTACGAGATTCAGCCTCATCACCAGCATACCCGGCGG
+CGCTGGCAGACGACAGATAGGTTCTGGAAAGCTCATTGAAAATCATGATCGCTTCCTGCA
+TTGTTTCAAGGTCAATCTCACGGTTGGAACGGTTCGGTGATTCACCCTCCCAGTTCTTGA
+TTGTGTGAAGAACTGAAGCAATGCGCAAAGCGTGCTTATCGAACTTGCCGAGATGACCGC
+GTAGCATTGAGTGAGAGTATTTGCCCCCGGCTGCGAAATCCGGCTCCATAGCCTGGCGAG
+CAAGGTTTAATTCACGCATAGCATTACGGCTTACAGAGAGAACAACGTTGTCCTCCTTCA
+TAATGTTGTGCACAAGTCGATAATATTTACTCACCAATCCTCGATCGACTTCCTTATACA
+GTGCATCACCATTTTCATCGCAAAGAATACGAGTGCCTAAAAGAGGTTCCTCGCGAACCA
+ATAGGAAACGCTCAGATACACCGATACCGCGCTGGCCTGCGTCCATGATACCCTTGATTG
+TTTCATCCTGCGCAATGACGCAGATCGAACCGACCGGGCAAAGAGATAAGTTATTGTCCT
+GATTTGAACGCGCAACCTCCATATGGTTTTTATCCCACGCCTTGAGGATAAGCTCGCTGT
+TTGATTTCTTATCAGAACCGCCATAAGTCAGGCCAAGCAATGTATTTATTGCCGTTGCCT
+CATCAGAGATTACGGAAAAGTGGCCTTGAACAGCAGCTACTTTCGCAAGACCTTCCGGTG
+TAGGATCTGATACCGCGAAAACAATATCAGCCATCTTCTTGATCTTCTCTTCCAGTTTTT
+CCTTGTCCTCGTACAGCGCCGCCGTCGTGTTACCCTTCGGATCGTTTTTGATTTCCTTCT
+CGATCTGACGTAGCTGACTGGTTAAACGGATACGTTCCTTTTTGCGCTCTTCATTCAGTC
+TCTGAATCTCTGCGCGCATAGGTGTAATCGCCGCTGAGTTAATCGCGGATTTACCTGTTG
+ATGGTGGCTGGCTAATCACCATATAAAGAGCGGTCGGTTGTTCTTCTCCGTGATATTGCA
+CCCAAAACTTCCCTAGCATCGCGGCTGAGATGCACCCAATGAAATGAGCGTACGCAGACG
+AAACAGGAAACTGTACAGACTCAGCTTTTGCTTTTGCATATTCGAATACCAGGTTATCGC
+CACCTAACGAAATCAGCGGGAACTTATCGTTTCCACTGTTGATATCGATCGGGTCTTGCC
+AAAACGAAACTGAATCCCCGTAGCTGTTTTCACGAATTGCGATCGCCACCGGATTAACTC
+CAGTGCTATTTGCGATCTCAATAATCTGTTGGTAATTCAGTTTTGGCTTAATATTAAACA
+TCACAATAAACTCCTTAGTTGACGGCGTGAATGATACACCGTCAATGGTACACGCGTTTT
+GCAAAAAGTGCTATTCGATCCGTGATTCTCATGAAAACCGTATAGCCTTTCCGCTTCCTT
+CCTTGCTTTAGCCGCTGCATCCAGCGTCATGAAGGTCCCTAAGTGTTTTGTCTTTTTATT
+TATGGTTATGTTTGCCGTGTATCTATTTGTTTTTTTATTCCAATACACTCCCATAACTCC
+TGTGTTTGATGACACTACTCCCTTGTTTCTTAGGTTGTCTTGCCTAGTTACTAGCCTGAG
+ATTCTCAATTCTGTTATCGTCTCTTTCATGGTTTATGTGGTCTACATCCATTCCTTCTGG
+TATGTTTCCGTAGTGTATTTTCCATACTATTCGATGGGCGTATTCAAATACACCTCCTGG
+AAAACATATTGACCGATAACCTTTTTTATTAACAGTTCCGGCAAGTAGATTTCGTCTTCT
+TCCGATTCTCCACTCTTTCCAGTACAGTCTCCCGCTAACGTATTTCAGTTCATCTTTCAT
+AAGTATTTACTCTCAAAGGTTGTGCCGTCTGCGATACTAAAACCAACCTCTTCGCGGAAT
+AAGGTCCAGCGGCAACCGTCCTCATCAAAGATGTAACCAGCTACGCCTCCAAGAGCGCGA
+CCGCTTTCTACCTGGTAACGCTTTCCAACCTTGAATGATTTTTTCATTGGGTTGCGATGG
+TCAAGTCCGGTGCACTTGAGCGTCTTTGTTTTCAACTCGGTGAACTTCGCGAAGAAGATT
+TCATCAGATCCGTGAATATGTAATTCATCATGCTTTTCAAGTTTTAAATACTTCCCGCAT
+TTTAACTTTACTTCACGAGTATCATCATAGCGCTCACGACCTTTGTACAGATTGTTTACT
+TCGAATCCTGTGATTTTGTCTGCGGAAGTGCATTTAAGTTTGATTGATTTCATTGTGTTC
+GCTCCTGATTGGTTATCTTGAATAAGGCCACTTTATCAAATGACCTTACGGCAATATTAA
+CAAATCGTGCTATTTACCAGGGAATATAATCGTCAACGTCCTCATTAATATCATCACCGG
+GTAACTCAAAGTTAGGATCGTATTCGCTTTCAGCATCCATATCCATATCACCAAGAGCCT
+CGTCAAGTGTGATTTCTTTATATGCTACCTTGATGCACCATTCGGAACTAAGTCCGGCAT
+CAAGCGCTGCAAAATAACGAGTCCAGAAATTATCTTGTTCCATTAATTAGCCCCACGAAT
+TAATGTAAATCGAGTTAGCCTCAAGCGTTGCACGAACATCCGCATCTGTTGCATTTATCA
+GTCGAGAGCCTGGCACGCTTCCGATAACACTATTTCCGTTACGCGTCTTAGTTACCGTCA
+TTGAAATAAAGCCTGAAGACTTATCCATCTTGATAAATACACGGCCTTTTGCGTTCAGGT
+GCTTGATGATATTCTCAACTTTAATGCTCATTTTTGATTCCTTTGTTTTGTTTGGTATGG
+GAGTAATATACCTTACTCCCCGATGTGTGTCTTTAGCAATTTGTGCTATCAGCACTCAGA
+AATTACTTTACACTCTTTGATCGTGCCGCCGAGAACTACCTTTTGCATCATTGCTTTTTG
+GCGACTATCGTAAACGCGAACATTTTTTACGCTGTTAAACTTTCCTGTATAGAAAGTGTG
+AACATAAATCATTTTTTACCGTCCCATGTTTGATATGCCGTAACCAGGTTAATGCTATGG
+TCGAACTCGTCAACGCTAAGCGATCCGATTCTAACAAGCTCTTTGTTGTGCTTAATGTCA
+ACCATCCGGCTATATGGATGACCTGGCGCGTATTTAAATAGACGCTTGCTGTCGGCAATG
+TCTGCGTTAATTCCGTAGAATCGGCCTACTGCCGCCTTAACGCGCTCCATGATGTTTTTT
+TCGTGGATTGCCGCCGCGTTATTTAGTGCCGCAACCGTACTTCCGTTAATCTTCGGTGAC
+TTCGTTGTGATTTTGTAAGTGTGGTTCATTATTCCCCCTAACCAGTCAGGATAGCTTTCT
+AGCTTCATCAATGATAACAGGTTCGAGACGCTCAAGTCGAACGCGGAACGTGTTCATAAA
+TCCCTCGTTATTCTTCAATAACTCGAAGGCGTCCGTAGCTTCAATGGCGCTACTACCCAT
+GTAAGCGATTACGCGCTTCTGTGTTTTTCGTGAGATCGCCACTACGCGGTAAATATTATC
+ACTCATACAATGCCTCCTTAACACGATCCGCTACGAATGAGTTAACGGTGATTGAATAGT
+CAGCCACGAAGCAAGGAGCAACACTGCCTGGCTCAGTGCGCATGAATGGTTTATTGAACT
+GAGTGATTACCTCGCCAGTATCGTTATCAATTAAGCGAGACGCGCCTAAGCGGTCCACCA
+CGCGCGTTACGCGGTCCACGCACTCCATTACATCACTTACGTCATAATAGGCGGTTAGAG
+CGGCTATCATTGCCTTCTCTTGCTTGTTGTGAATCTTACGCGCAATGTTCTCTATCGCAA
+CGTGCACTTTCTCGTCAGTCTGGATTACCGAGCTGGCTGAGAAGTCCAGATCTGTGAACT
+GCTTAAACATAATACTTTCCTCGTTTACTGTTGATGTGATGAATCATACCCAGTAGACAT
+GGATAAGTCGTTAGCAAAAAATGCTATTCCAATAATTGCTCATTATTCCATCAATACTGG
+AATATCTGGAATAATCACGATCATGATTGATCTCTAATGATGAGATGTGATTGTTGCATG
+GTGTGCAACTGTTGATGTGATTGTTGCTTAGAATGCAATGATTGTGAGAGGGGGGATCTA
+GTGTTACCAGGTTCGCCTGGTAGTCATCTCCATTTTTAGCAAAAAGTGCTATCGATTACG
+ATTACGCTTGATTGCGTGTTAATCATTGTATATGATTTGCCTAAATCGCAATCGTATACA
+AAATTACGCCAAATTACGCCTTATAATATACATATAGATATATGTATATATATAATATAT
+AAGTGTTTTTTTTATTTATATATATAGATACTATTATTTGTAATATGGTTGTATAGCGTG
+TCTTTTATGTTAATCGTTGGATGCTCTGATTATGATGCTCATGTTTATATATACAGTAGT
+TTTATCGGCGCATATTTATTTAGGGAAATCGATTAACACGATTAACACGATTAACCACAC
+TTAAAATCGCCCCGCAGACCTTGACACATGCGGGTCGTCGTGTAACACTACGATTAGAGA
+CACGGTGTTACATGTTAATCATGGTGTATATTGAAACTAAGGAGAAAGCTATGAGTTCTT
+ACCAGTCAGACGCAGTACAGGCAGCAATCAAGGCAGCTTACGAGAAGGCCGGAGTAACGG
+TTGAGCAGCGACCGGAGGCAAAAGTGACCGATGTTATCCGGGCCGCCTGCGATCAGCTTT
+ATGGTGATGGCGAGAATACCGAGTTCACATTCGACGCGAATAAGATGGCTGAGGCCGCAG
+CAAGAAAGTCGATGCCAGACGCTGACGAACATGATGTTGCCAAAGGCGCCGAGTCCTGGT
+TGCTCGGGAAGACGGATGAGATTAACGAGAAGTTTAAATCCTCATTCATCACCCCGATCG
+TTTCTCGACACTTCTCCAAGATCGGCAAGTCGGTCAAGGTGAGCGTGACCATGAACGATG
+AGAAGTTGCGAGTCGTTACTATCTCAGTGAGTGACGAAGAAGTTCCGGTGAAGAAGCGCC
+GCAGCCGGAAAAAAGTCAGCCTGGCTGATTGTCTGGATTCGTTTGTTCCTGATGTTGATG
+ATCTTGAGAAAGGCGACGTTACTGTAAGCACCGTGCGCGACCTGGTTCGCCAGATGAAAG
+CGCATATCGAAAAATGTGGACTGTAAGGAGAAGTAATTATGTTTAATATCAAACCATTAA
+CAGAAGCAGAGAAACAGGCTCAGGCCAAGCAAACCGAAAACATCCAAGTGATCGCTGATG
+CGCTGATTGGTAAGAGGTCAATCAAAATAAACCTCGACACTGTTGGTCAGTCATTTTTTA
+CTAAAGGTTTGGATAAGTACGTTATAAATGTGAAGGCGAGAGACCTGGTGGCGAGAATTC
+AAAAGCTAAACAATCAAAAGCTAAAGCTCATCAAGGTCGAAGGCAACATGTGCGAAATTG
+AGAACCTCAGCGCACCAGACCCGAATAAGTGGGAAATCACCGATGTCGAGTTTATCGTAG
+AATAGCACTTTTTGTTAAAACCGGATCGGGGTATCTTGCTATAGTTACCCCATCAAAACG
+AGATACCAATCAGAGGAATCACCATGTCAATCGTCAAGAACCAGCAAGCCATCGATTCAA
+CCAATAACAACCGCTTTGCTATTTTCATCACTCGCGACAACAAGCGCTTTGCAGTAAAGG
+CCGTACCAGGTGGATACAAAACCTACATGGAAGATAACGGAAAGTGGGTGCGGTGCGACA
+ACCTCGCAAACTTCTTGGTCTGGAACGCAGACCTGCAGGGATTCGATGACATCAGCACTT
+TAATTGAGGAGTAATAATCATGCCACGTTACAGCAACCTAACTCAACTAACCCGCGTCAA
+CGGGCACATGATCCCGGCAAAATCCACTCACTACGCAATGGGAGCAAAGCACGGATTGTA
+TTTCAAATGGCGCGGTCAATGGAACTTCACGGTTATTCGTAATTTCTACATTAGAGTTAC
+AGGTGATGACCCGCAATCGGTCGTAGAGAACTCAATCGGCGACAACAAGATCGAGGTGCT
+GAAATGAACTTCAACATAATTGCTTTCTGGTCTGCCGTATGGTTCTTTTGCGTAGGTCAT
+GTTGTGGTTGGAATCGTAATCATGTTGCTACTGTGTGCGGGAGCGTTCGAATGATGCGGA
+TTCTGATTTGCATGATGGCGGCGGTCGCCATGGCTATCCTGGTAGTGTCCGGCTGCGGCG
+AGGCCAGGGATAGCTGTCATGAAACCGGGAGCCAGGTTACTACTTTCGTGATGGTTGGCA
+ACGTATTGCTACCAATAACATCAAATGAAATCACTTGCGAATAGCACTTTTTGTTAAAAC
+TCAACTCCGGGGTTGCGATATAGTAACCCCATCGACAACGAACGAGGACGCAAACATGAA
+AATTAAATTACTTAGCAATGGCGGTTACAAGGGATTCACCCGCGACCTGGAAGCTGACCC
+TATCGTGGTTGACGCGGTTAAGTGCGACTCAAGTACTGGAGGCTACCGCGTTAAGGTTGA
+TGACCTTGTAAAAGCTGGCGTGTACGATCTTGATTATGGCCTGTCGGTTAGCCCGGTATT
+TGGCCCAGCTGACTTCAACGAGAAAGACGGAACGATGTTCTTTTTTGATTGGGAAGTGAA
+GGCAAACATCAAGCCGCGCAAGGTTCGTCTTCTCAGCAATGGCGGCTACCCGATGCGACC
+AGGTTATGAGAATCGCACGTTCCCGGTTATCGTTGACTTCATTGGGACAACTGACAACTT
+GGTATACGTTAGCCATGAGCAACTTAAGGCAGTTGGATTCGTTGGCGGTATGAATAAAGA
+AGCGCTTTGCTTCTTCCATCGATGTCCAGAGCCGATCGGTATTGAGTGCGAGTTAGTATA
+CTAAGCACGAATTGTTAAAAGGGGATTTGGCCTGACTGGTATAATCCCCACATCAACCAC
+TAATAGGAAAGCATCATGTTAAAATTAAAAGATATTCAGTTCCCTGTAGTATTTAACACT
+ATTAGCTGCGGTAAAATAACCTGCCACAGCAAAGATCGCGCAACAGATTCATCATTCAAT
+GAGTGCCACCCGTCTATTGTTGGTAATCTTATTGAGCTTCATAACAATCACAATCCTGAT
+AACATCCCATCTCTTCCATATTATGTTGAGGGAGTCGGGCCTGGTTGGAAGGTTGGTCGC
+TCCATCTTCCATGCAGCAAAGCCAGAAATCAAGCCAGCGCTACAATGCACTCAGATCGAG
+AACATGCCATTGAGCGCGACACTAAAAGGTGTCCAACTTGATAGCGAATCCTGGATCGAG
+ATTACCGCCACGCCTAAAACTATTGAGGTACATGATGATGTGGTGATTCTCCTGTTGCAT
+TACGGCAGCTTTAAGCACAAGACGGTATCAGGTGAAATCAGCATTAAGCGCGGAACTCTT
+GTCCGCTACGAGGTGAAATAATGACTGCATGGGTCTTGATTATCTTGATGAGCAAAGGTC
+CGGATCACGTATACATGGAAAGTCAACAATCATGCAACAAGGCACGGGAAGTTATCGCAG
+AGAACAAGCCGTTCGGATATGAAGTAAAAACTATGTGCGTTAAACGATAGCACGAATTGC
+TAAACCTTCCGCAAGGCCATTTGATATAGTGGCCTTATTGAAGCACGACAACCAACTGGA
+GGTAAAATTTATGAAATTCGAATGTATCAGCGATAACACCAAAAAATTTACTGTTGGCAA
+AATTTACGATGTTCCGACTGAGCACGCAGAGCAAACCGTAGCTCTGACCGACGACACGGG
+CCGCAACCGAATTGCAACCGTAACTCACAACGGTGAAGGTCTTCGCTGGAATAGCGGCGG
+CACTAAGTTCGCAACGTTCGGCAAGAAGCGCAAGCGCACCTTCCGCGTCAACGGCAATGT
+TGCAGCTAACAAGATCCATAACGTCAAGCCGTCGGAAGTTGACCGCAAGCCAGCGCTGAA
+GTTTAAAGAGAAGGTGGATTTATTCAATCTTGCCGCCTCGCTTGTTCTCCTGGTCGCTGC
+TATTTCGCTGCTTTCCATCATGTAATGCTAATGGGGAATCGACTCTGAACGGTTCCCCTT
+TCTTTTGGAGAAAACACTATGCCAGACTTTTCTAACTGGAATAACGAGCCGCCATCATTT
+CAGGAGTTGCTATTCTGCCTCCTGGTCCTGACATTATCTCTTAAGGGTGTTTTATGGCTA
+CTATCATGACAGTAGAAGATGCAGCACGCGATGCAGTGGAAGGAATGCGCCCAAATACCT
+CCAGAATAGCACACTACTACAAATCTGAGGTGTCGGCAGTGCAATTGGTCCACGAAATTT
+TAAGGCTCCCACAAGTCGATTCAGCGCGCGTTGTGACGTGCTTAAAAAATTATTTTTGCA
+TCACTATTAAAACGAATAGCACGAATTGCTAAAACCTATCAAGGGGAATACGCTATGATT
+CCCCTACACCAACAAACGAGGAAGCGATCATGAAACACTTAATCTGCATTGAAGCGCCTA
+ACGATCAATACACCCTGCATGGACTTGGTGTGTTCAAAGGTCACTACATTACCGCAGGAA
+CTTACGATGCTCGTCGCGGCGATGGCGACCTAATGATTACGTCAAAAGAAGTAAATCCGT
+ACATCATGCAGAATCTTGGCAATAACGAATATATGGCCTATGGCTGCAACGCGGTGTACA
+AGCACGTTAAGATCCGCAAGCGTGTTGTGCGTGCATTCAAGAAGATTGCAATTAAATACT
+GGAAAATGAGCAAGAAAGATGCCGGACGTTGGGCGCGCAACGTTGCAGATTCATACTTCT
+ATCGTAACGGCGAATCCTGCTACTTCCTGATCGATGAACTTATGGAAAACTACGGTGGCG
+ACTTCAGCCAGGGTAGCTTTGATGACTGGGCCAACTATGAGATCAGTTGCTGGTAATAGC
+ACGAATTGCTAAAACTTGCTCAAGGGCATTTGTTAGAATGCCCTTCGTTGAGTTAAGCAA
+CCAATCAGAGGAATAAATCATGGATAAAATCACAATTTGGGGCCAGACAATCAACCTGTT
+TCTCGGCACGCGCCGAGTGGCAATCTTTGACTTTGACGGGACACTTAGCGATGGATCTGG
+TCGACTTCACCTGCTGCCAACAAAGGATTTGCACTTGACTGAAAGCTGGTCTGAGTTTAA
+CCGAGCGGCAATATTTGATAACCCAATCCAGAGCACGATCGATGTGATGAACTCTATGTT
+TGCCGCTGGGTATCATGTGATCATTTTAACCGGGCGAAGTGATGAGGTGCGTTACGCATC
+TGAGTTATGGCTTAAGCATCACGGCGCTCGATATGATTACTTAGTCATGCGACCGCATAC
+CGACAACCGCAAAGACACGGTAATGAAAGAAGAGGCAGTGCGCGCTATCGGCATTGATAA
+CATTCTTGCGGCTTGGGATGACTCAGTGAATATAATAAAAAAATTCAGAGATCTAGGGAT
+AACCACATATCAGGTTTGTGAATATGCCTGTGATAGTCGAGAGGATTTAAATAGTCATGG
+TGTCGATTGATAACAAATCAATGGTAAGAGAGTTATTTACTTATTCTGACGGCGTTCTGT
+ATTGGAAGGCCAAATCATCTAAATACAGTAGAGCTAAAATAGGAGGCGCGGCAGGAAGCA
+AGGATAAAGACGGATACATAATAATCAGAGTAAGAAACGAAACTAGAGGCGCTCACAGGC
+TTGTATGGATATACCATAATGGCAAGATACCTGATGGAATGGAGGTAGACCATATGGATG
+GAGACATAACAAACAATAGAATAGAAAACCTAAGATTGGTAACGAGAACCATAAATAACA
+GGAATCAAAAAAAGAGATCTGATAACACAACCGGAGTATCCGGTGTAACTTTCATGAAAG
+ATAGAGGAAAGTATAGGGCGCAAGTTAGAAACAAGAGACTCGGGCAGTTCGACACAATAG
+AAGAGGCCGCCAAAGCAGTAAAGGATGAGCGGGATAGATTAGGTTTATTCACAAAAAGAC
+ACGGGGTGTAAACATGAAAACAGCTATCATTTTAAACGGCGCACCTGGTGCGGGAAAGGA
+CACTATCGGATGCATCCTGGCTGACACTTACGATCATGTAGCGCTACGCAGCTTCAAAGC
+GCCAATGTTTGAGATTGCCCGAGCAATCCTGGGTGAGACTAATTTCGAGTATTTCATGTT
+CTTGTATGAGGACCGTCGCTATAAAGAAGAGCCAGCGTCAATCCTGAACGGTAAAAGCCC
+GCGCCAGTTTATGATCTGGATTAGCGAGGAGGTCATCAAGCCGCAGTTCGGAAATCGCTT
+CTTCGGTATGCGAGCGGAAAGTAAGGTGAAAGAGTCGCATTCACTTTCGGTATTTACTGA
+CGGTGGATTCAAAGACGAGATCTTGCAGATGATTGAAGGTGACATCCAGGTCAAGCTGTG
+TCGAATCCATCGCAACGGTTGCAACTTTGACAACGACAGTCGCGACTATATCTATCTTGA
+CGATATGATCGGGGTCAACGGTTATCAGGAGTGTGACTTCTTTTCTGTCGAAGGCCATCC
+AGAAATTACCGCTCAGCACATAGCCGCCACGTTCATCAATAAATAGCACGAATTGCTAAA
+ACGTCGGTGTGGTGATTTGATATAGTTACCTCATCGACAACGAAGAGAGAAAATCGAAAT
+GATGGTATCAACTGATAAGTTTTTCACTTGCACCAAAACTTCTGAGGTATTCGAGCTGGT
+TCATACTGACAACGGTGATTTCATGCATGACGGTTGCGACGCTTTCATTGAAGTGAAAGA
+AAGCGACTATGACGACGGAGTTTATTACAACCCTGCGGTTAACACGCAGTTTTTTACCCC
+GATCGAAGAGGAAGGAGAAGAGGCATGATCACGATTAACCTGTCAGATAAACAAGCGCGT
+GAAATACTAGACACTATCGGAGAACAGCTTCACGTAAAAGGCGATACCGCTGAGATTCTT
+AACCAGATCGAAAGACAGCTAACCCCTGTGTCGACGAATCAAGCTGAGTTCGCAGCATGG
+AAAAGCGAACGCATCCTGCCAAATATCATCAAGGCATGGAAGCGCAAGCATAAAAAAGAA
+ATCAACGTTGAGGATTTATTTACCGATGAATTAAGTCCTTCAAATGTTGCTCAATACCAG
+TTGCGATACATGGAGTCGGTTTGCAATCAGGTTTTAGGTGTAAGTTTTTCATTCAAAGGT
+GATAAATAATGTTCGGTTTAAGCGAAGCGGAGTGGAATGTTGTAAAGCGTGCCGCGAAAG
+AATTAAACAAATTCGTCAGCGGAATGAAGAAAGAAGATCGGAAAAACGACAAGATTATGA
+TTGACGTAATTTCGACTCACCATAAAAAGGTCGAACTACTCATTGACCGCTACAAATTTG
+TCTGGACTGCCGGGTATATTGCAGGGCGCGTAGGTAACAAAGAGGGGGATTATGAATAAT
+GGCCAATTTACCAAAGAAAGGCGATCAGGTTCGATGTGTCACTTCACGCAATGGTAATGC
+TTTATCGGCGGGGTGCTTGTACGACGTAGAAAAAGTCAGTAAGTCAAAGAGGCTTGTATT
+CGTGTACGGCGACGATGGAAATCTGCATGAGATTGATTACCCGCAGGATGTAACTAATGG
+TCAATTCGAAATTAATGATTGACCTAAATCCCTGAGCGGTGATAGTATTAATCCCGTAGA
+CAGACGAGGCGCAACTAAGCGCAACGCGTGAGACGATTCTCACACTTCCAGCTAACAAGC
+TCGGTTGCATAGTGGTTAAGCAACGCCGCAGACCCGTAAGCGGCAACAATTCAAGAGGAT
+TGCATAATGCAAAAAACTAAAGACGAATCAGTCAAAATTGAAATTAAAGTAACTCGCAAC
+GGTGAAACCACTCGTTATAAAAAACGATTAAATCCTGGCGAGGCTGTTATTGGTCGCATT
+GCTGGCGTTATGATTAAGGCGCAGGAAGATGAAGCGATTCAAAGTTAAATTAATTATTCG
+AAAGATGGGAATGTTTTGCCAGTCGTGCAAGCAATCTTTCGAAGCTGAATTATCAGCAAC
+CAGTCAGGATGAAGCCATCACGAAAGCAAAAAAACTTTCCGGCGCTAACCTTGACACTCA
+CAAAATAAATATTGAATTAATCAAGGAGATTTAACATGACAATTTTTTTATTAATTATCG
+CTGGTGTCATTATTTTTGGTGCTGGTTTGTTTGCTGGCTTCGCACTTGTGGCGGCAGCAA
+TTGCGATGGACGCGAAGGATAAAACTGGTGTATGGCTGACCTACTCACCTAAGAAGGACC
+AATGGGAAATGACTGGCGACCTTGCTCACTGCTATTCTAAAGCTAAGACCCACCCTAAAG
+GCATTAAACGACGATTGTCGTGATGAACACTAACCCGCTCCGGCGGGTTTTTTTAT
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/test-data/AY216660.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/test-data/AY216660.gff3 Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,315 @@
+##gff-version 3
+##sequence-region AY216660.2 1 48836
+AY216660.2 GbkToGff gene 40 576 . + . locus_tag=CPT-T1_001;ID=CPT-T1_001.gene;
+AY216660.2 GbkToGff mRNA 40 576 . + . locus_tag=CPT-T1_001;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_001.mRNA;Parent=CPT-T1_001.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 40 43 . + . locus_tag=CPT-T1_001;regulatory_class=ribosome_binding_site;ID=CPT-T1_001.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_001.mRNA;
+AY216660.2 GbkToGff CDS 52 576 . + 0 locus_tag=CPT-T1_001;codon_start=1;transl_table=11;product=terminase small subunit;translation=MSEPKNAPVVQGGNFKELYKKKFGTVLAKNRAMTPEQLFDLSVKYFEWAEDNAIKASESASFQGGVYESLVHKPRVFTWTGYRLFIGASEAAIIKWKREEEYSEVMEFVESVINEQKFQLAANGVINASFIGKDLGIDKPASINIENSSASASTVVATTEDAMKEAVNSILDML;note=Orf no. 54 see PMID: 14972552;ID=CPT-T1_001.CDS.1;Parent=CPT-T1_001.mRNA;
+###
+AY216660.2 GbkToGff gene 589 2184 . + . locus_tag=CPT-T1_002;ID=CPT-T1_002.gene;
+AY216660.2 GbkToGff mRNA 589 2184 . + . locus_tag=CPT-T1_002;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_002.mRNA;Parent=CPT-T1_002.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 589 592 . + . locus_tag=CPT-T1_002;regulatory_class=ribosome_binding_site;ID=CPT-T1_002.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_002.mRNA;
+AY216660.2 GbkToGff CDS 601 2184 . + 0 locus_tag=CPT-T1_002;codon_start=1;transl_table=11;product=terminase large subunit;translation=MGDLIMIQWEDLNATQKLAIKKMSEANFEKMIRIWFQLMQAQQFQPNWHHLYLCHEVEEIIAGRRGNTIFNVTPGSGKTEVFSIHLPVYAMLKCKKVRNLNVSFADSLVKRNSKRVREIISSNEFQELWPCKFGTSKDEEMQVLNEDGKVWFELISAAAGGRITGSRGGYMTPGFSGMVMLDDIDKPDDMFSKVKRERTHMLLKNTIRSRRMHNETPIIAIQQRLHAQDSTWFMMNGGMGIEFDQISIPALVTEEYGKTLPDWLQPYFERDVLSSEYVELDGVKHYSFWPSKESVHDLLALREADQYTFDSQYQQKPIALGGSVFNSEWWTYYGSSLDADEPDPGKYDYRFITADTAQKTGELNDYTVFCLWGKKNDKVYFIDGIRGKWEAPDMERQFTAFVNQAWRHNKSMGVLRKIYVEDKASGTGLIQNLRKKTPISITPLQRNKDKVTRAMDAQPVIKAGRVVLPEEHPMLAEIIAEHSAFTYDDTHPHDDIVDNFMDAANIELLTIDDPIERMKRLAGMVKR;note=Orf no. 53 see PMID: 14972552;ID=CPT-T1_002.CDS.1;Parent=CPT-T1_002.mRNA;
+###
+AY216660.2 GbkToGff gene 2230 3522 . + . locus_tag=CPT-T1_003;ID=CPT-T1_003.gene;
+AY216660.2 GbkToGff mRNA 2230 23794 . + . locus_tag=CPT-T1_003;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_003.mRNA;Parent=CPT-T1_003.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 2230 2233 . + . locus_tag=CPT-T1_003;regulatory_class=ribosome_binding_site;ID=CPT-T1_003.Shine_Dalgarno_seqeunce.2;Parent=CPT-T1_003.mRNA;
+AY216660.2 GbkToGff CDS 2239 3522 . + 0 locus_tag=CPT-T1_003;note=HHPred predicted structural similarity at 99%25 probability to phage T4 portal protein gp20 Protein Data Bank entry 3JA7 over most of protein%3B Orf no. 52 see PMID: 14972552;codon_start=1;transl_table=11;product=portal protein;translation=MKIVKHDGYNDIFNGGADGSPKPFFMSDASYHVGSFYNDNATAKRIVDVIPEEMVTAGFKMSGVKDEKEFKSLWDSYKLDSSLVDLLCWARLYGGAAMVAIIKDNRMLTSQAKPGAKLEGVRVYDRFAITVEKRVTNARSPRYGEPEIYKVSPGDNMQPYLIHHSRVFIADGERVAQQARKQNQGWGASVLNKSLIDAICDYDYCESLATQILRRKQQAVWKVKGLAEMCDDDDAQYAARLRLAQVDDNSGVGRAIGIDAETEEYDVLNSDISGVPEFLSSKMDRIVSLSGIHEIIIKNKNVGGVSASQNTALETFYKLVDRKREEDYRPLLEFLLPFIVDEEEWSIEFEPLSVPSKKEESEITKNNVESVTKAITEQIIDLEEARDTLRSIAPEFKLKDGNNINIREPEETTEPEPGLGEKLEDEN;ID=CPT-T1_003.CDS.1;Parent=CPT-T1_003.mRNA;
+AY216660.2 GbkToGff gene 3496 4273 . + . locus_tag=CPT-T1_004;ID=CPT-T1_004.gene;
+AY216660.2 GbkToGff mRNA 3496 4273 . + . locus_tag=CPT-T1_004;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_004.mRNA;Parent=CPT-T1_004.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 3496 3499 . + . locus_tag=CPT-T1_004;regulatory_class=ribosome_binding_site;ID=CPT-T1_004.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_004.mRNA;
+AY216660.2 GbkToGff CDS 3512 4273 . + 0 locus_tag=CPT-T1_004;note=InterPro domain IPR006528,Orf no. 51 see PMID: 14972552;codon_start=1;transl_table=11;product=capsid morphogenesis protein;translation=MKINGVATQWRYPEMSERAMSRSLQDVAAKLTEKMRDELKPMKFDATDEEIDQTERSLLDYVESLIAPIIGSLSSVALTIYKFNSKQWLRIARNAGGKKNQAVMLLALIGPTAAESWYSGQYNLWRSQVATSIRKFAANMVTDFTDKLRAASGQGKSKDFVVELAKERFGIYRNWAKNRASGIVGTWNSRLMRQRIKDAGVSYYFWRGVMDLREREKHVRWEGKRIAVDSDHVFPGEEYNCRCWAVPDFSTGD;ID=CPT-T1_004.CDS.1;Parent=CPT-T1_004.mRNA;
+AY216660.2 GbkToGff gene 4264 5388 . + . locus_tag=CPT-T1_005;ID=CPT-T1_005.gene;
+AY216660.2 GbkToGff mRNA 4264 40736 . + . locus_tag=CPT-T1_005;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_005.mRNA;Parent=CPT-T1_005.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 4264 4268 . + . locus_tag=CPT-T1_005;regulatory_class=ribosome_binding_site;ID=CPT-T1_005.Shine_Dalgarno_seqeunce.2;Parent=CPT-T1_005.mRNA;
+AY216660.2 GbkToGff CDS 4276 5388 . + 0 locus_tag=CPT-T1_005;note=HHPred predicted structural similarity at 72%25 probability to phage T4 prohead core protein protease gp21 Protein Data Bank entry 5JBL over predicted catalytic third of protein%3B Orf no. 50 see PMID: 14972552;codon_start=1;transl_table=11;product=capsid maturation protease;translation=MKAKQRFDSVKIKAHFDDNGFLVDRPIVARIGAQVYKTPHGDRVEFRPASEVFKQDSLQSFAGKPITVGHVTVTPQNAKDVVVGSCAGAGIASGVGVEVPLSIYSDYAISKAKAKEAGELSVGYTSVDIDKPGWGSNETGEYIFEEDMKQDEAPPEGWVRFDAVQTNIKVNHIALVFKGRAGIAKLNLDAEQEFPYDNNVQLTNEDKQMKKIKIDSVDVEVTEDVANHIEKLTAQIATIQGKADGFEAERDALKVKVDSLPELVKAEVEKQKADAAARAEVTAVAETAGVKHDGLDIKDVKIAVVKAMLDKDVSEKSDAYIDAMFDVAKDSDIMAIQRKAVKGDSIEGGKPEEKNDAAPVTPNSRLSKVM;ID=CPT-T1_005.CDS.1;Parent=CPT-T1_005.mRNA;
+AY216660.2 GbkToGff gene 5389 5876 . + . locus_tag=CPT-T1_006;ID=CPT-T1_006.gene;
+AY216660.2 GbkToGff mRNA 5389 5876 . + . locus_tag=CPT-T1_006;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_006.mRNA;Parent=CPT-T1_006.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 5389 5392 . + . locus_tag=CPT-T1_006;regulatory_class=ribosome_binding_site;ID=CPT-T1_006.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_006.mRNA;
+AY216660.2 GbkToGff CDS 5400 5876 . + 0 locus_tag=CPT-T1_006;note=HHPred predicted structural similarity at 92%25 probability to phage TW1 Protein Data Bank Entry 5WK1 capsid stabilizing protein%2C equivalent to phage lambda gpD dec protein%2C over most of protein%3B Orf no. 49 see PMID: 14972552;codon_start=1;transl_table=11;product=capsid decoration protein;translation=MAQINASYQRDMAIALPGMVADTSKYNIDGACVVNEGDVLVGAAVQVVQAQAVDGHKLVKALTTGTTPYGVAIRSHWQTVNAQNQMIYEDGGAINVMTSGRVWMLSKSTEAPTFGSAVKLDVDGQEKSDGTIETTWTYAGGWTKYKDIQLVEVQLHQL;ID=CPT-T1_006.CDS.1;Parent=CPT-T1_006.mRNA;
+AY216660.2 GbkToGff gene 5926 6705 . + . locus_tag=CPT-T1_007;ID=CPT-T1_007.gene;
+AY216660.2 GbkToGff mRNA 5926 6705 . + . locus_tag=CPT-T1_007;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_007.mRNA;Parent=CPT-T1_007.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 5926 5929 . + . locus_tag=CPT-T1_007;regulatory_class=ribosome_binding_site;ID=CPT-T1_007.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_007.mRNA;
+AY216660.2 GbkToGff CDS 5938 6705 . + 0 locus_tag=CPT-T1_007;note=InterPro domains IPR008964 and IPR003343%2C invasin/intimin cell-adhesion fragments superfamily%3B bacterial Ig-like domain-containing protein%3B Orf no. 48 see PMID: 14972552;codon_start=1;transl_table=11;product=hypothetical protein;translation=MAYENLMLRPACPGNLSDTSTYNIDGACVAQGDIEFGSAVQVVGIVDGVKVVTALSDGGTPYGIAFRSQYEHLSGKILDGEVCNVVSHGRVWALTSLDEAPSLFSKLQFGSGGVVTGGSGYAGWTFAGGFVKHEDGYIIEVRVKQNAFIVPPPPPPVVLVESATITTDKESPQPNNVTIQCVANALPANATDKTGKWSIDATNIATVNPDSGLVTPVGGEVVGDFNITWTANDASKTTATIAYRVEAVPTPEVDV;ID=CPT-T1_007.CDS.1;Parent=CPT-T1_007.mRNA;
+AY216660.2 GbkToGff gene 6784 7755 . + . locus_tag=CPT-T1_008;ID=CPT-T1_008.gene;
+AY216660.2 GbkToGff mRNA 6784 7755 . + . locus_tag=CPT-T1_008;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_008.mRNA;Parent=CPT-T1_008.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 6784 6787 . + . locus_tag=CPT-T1_008;regulatory_class=ribosome_binding_site;ID=CPT-T1_008.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_008.mRNA;
+AY216660.2 GbkToGff CDS 6796 7755 . + 0 locus_tag=CPT-T1_008;codon_start=1;transl_table=11;product=major capsid protein;translation=MTTKKFDEADKSNVEMYLIQAGVKQDAAATMGIWTAQELHRIKSQSYEEDYPVGSALRVFPVTTELSPTDKTFEYMTFDKVGTAQIIADYTDDLPLVDALGTSEFGKVFRLGNAYLISIDEIKAGQATGRPLSTRKASACQLAHDQLVNRLVFKGSAPHKIVSVFNHPNITKITSGKWIDVSTMKPETAEAELTQAIETIETITRGQHRATNILIPPSMRKVLAIRMPETTMSYLDYFKSQNSGIEIDSIAELEDIDGAGTKGVLVYEKNPMNMSIEIPEAFNMLPAQPKDLHFKVPCTSKCTGLTIYRPMTIVLITGV;note=Orf no. 47 see PMID: 14972552;ID=CPT-T1_008.CDS.1;Parent=CPT-T1_008.mRNA;
+AY216660.2 GbkToGff gene 7794 8093 . + . locus_tag=CPT-T1_009;ID=CPT-T1_009.gene;
+AY216660.2 GbkToGff mRNA 7794 8093 . + . locus_tag=CPT-T1_009;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_009.mRNA;Parent=CPT-T1_009.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 7794 7797 . + . locus_tag=CPT-T1_009;regulatory_class=ribosome_binding_site;ID=CPT-T1_009.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_009.mRNA;
+AY216660.2 GbkToGff CDS 7806 8093 . + 0 locus_tag=CPT-T1_009;codon_start=1;transl_table=11;product=hypothetical protein;translation=MAKEKTVVIVNVGVALQMFRLEDGSFAKVLPDEEVTLPASVLDLPGLRCLIAREEIEVKDDSATNRKIRAEMAKITKPDPWDKMSVKELEDGGEY;note=Orf no. 46 see PMID: 14972552;ID=CPT-T1_009.CDS.1;Parent=CPT-T1_009.mRNA;
+AY216660.2 GbkToGff gene 8127 8548 . + . locus_tag=CPT-T1_010;ID=CPT-T1_010.gene;
+AY216660.2 GbkToGff mRNA 8127 8548 . + . locus_tag=CPT-T1_010;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_010.mRNA;Parent=CPT-T1_010.gene;
+AY216660.2 GbkToGff CDS 8138 8548 . + 0 locus_tag=CPT-T1_010;note=HHPred predicted structural similarity at 96%25 probability to Bsubtilis yqbG (myophage protein%2C see PMID 29279385) Protein Data Bank entry 1ZTS over most of protein%3B Orf no. 45 see PMID: 14972552;codon_start=1;transl_table=11;product=head-to-tail connector complex protein;translation=MNQETLIAVVEQMRKLVPALRKVPDETLYAWVEMAELFVCQKTFKDAYVKALALYALHLAFLDGALKGEDEDLESYSRRVTSFSLSGEFSQTFGEVTKNQSGDMMLSTPWGKMFEQLKARRRGRFALMTGLRGGCH;ID=CPT-T1_010.CDS.1;Parent=CPT-T1_010.mRNA;
+AY216660.2 GbkToGff gene 8534 8919 . + . locus_tag=CPT-T1_011;ID=CPT-T1_011.gene;
+AY216660.2 GbkToGff mRNA 8534 8919 . + . locus_tag=CPT-T1_011;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_011.mRNA;Parent=CPT-T1_011.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 8534 8538 . + . locus_tag=CPT-T1_011;regulatory_class=ribosome_binding_site;ID=CPT-T1_011.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_011.mRNA;
+AY216660.2 GbkToGff CDS 8548 8919 . + 0 locus_tag=CPT-T1_011;note=HHPred predicted structural similarity at 96%25 probability to phage SPP1 gp15 Protein Data Bank entry 5A21 over most of protein%3B Orf no. 44 see PMID: 14972552;codon_start=1;transl_table=11;product=head-to-tail connector complex protein;translation=MNYSQIERMARKGVAFFTDPSRPMNLIKQGEYGYDENGFEIPPMEQVIPISGATRRPNAREIDGETIRASDILGIFNNDHEINEGDYIEIDGIRHVVVDARPVQASLEPVAYRPVLRRVSVGG;ID=CPT-T1_011.CDS.1;Parent=CPT-T1_011.mRNA;
+AY216660.2 GbkToGff gene 8897 9355 . + . locus_tag=CPT-T1_012;ID=CPT-T1_012.gene;
+AY216660.2 GbkToGff mRNA 8897 9355 . + . locus_tag=CPT-T1_012;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_012.mRNA;Parent=CPT-T1_012.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 8897 8901 . + . locus_tag=CPT-T1_012;regulatory_class=ribosome_binding_site;ID=CPT-T1_012.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_012.mRNA;
+AY216660.2 GbkToGff CDS 8912 9355 . + 0 locus_tag=CPT-T1_012;codon_start=1;transl_table=11;product=hypothetical protein;translation=MANYQIRRFQGEIDAWINAAESTLEHAIEIFVRDVHDALVSRSPVDTGRFKGNWQITFNEIPNHALNRYDKTGGVVRGEEQAKTYGMFSRGGAITSVHFSNMLIYANALEYGHSQQAPSGVVGLVALRLRSYMADAIKQARRQQNAL;note=Orf no. 43 see PMID: 14972552;ID=CPT-T1_012.CDS.1;Parent=CPT-T1_012.mRNA;
+AY216660.2 GbkToGff gene 9332 9743 . + . locus_tag=CPT-T1_013;ID=CPT-T1_013.gene;
+AY216660.2 GbkToGff mRNA 9332 9743 . + . locus_tag=CPT-T1_013;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_013.mRNA;Parent=CPT-T1_013.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 9332 9336 . + . locus_tag=CPT-T1_013;regulatory_class=ribosome_binding_site;ID=CPT-T1_013.Shine_Dalgarno_seqeunce.2;Parent=CPT-T1_013.mRNA;
+AY216660.2 GbkToGff CDS 9345 9743 . + 0 locus_tag=CPT-T1_013;note=HHPred predicted structural similarity at 96%25 probability to phage lambda minor tail protein U Protein Data Bank entry 3FZ2 over most of protein%3B Orf no. 42 see PMID: 14972552;codon_start=1;transl_table=11;product=minor tail protein;translation=MHYELSAAARAAFLSKYRDFPHYMENRNFTPPKDGGMWLRFNYIEGDTLYLSIDRKCKSYIAIVQIGVVFPPGSGVDEARLKAKEIADFFKDGKMLNVGYIFEGAIVHQIVKHESGWMIPVRFTVRVDTKET;ID=CPT-T1_013.CDS.1;Parent=CPT-T1_013.mRNA;
+AY216660.2 GbkToGff gene 9733 10414 . + . locus_tag=CPT-T1_014;ID=CPT-T1_014.gene;
+AY216660.2 GbkToGff mRNA 9733 10414 . + . locus_tag=CPT-T1_014;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_014.mRNA;Parent=CPT-T1_014.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 9733 9737 . + . locus_tag=CPT-T1_014;regulatory_class=ribosome_binding_site;ID=CPT-T1_014.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_014.mRNA;
+AY216660.2 GbkToGff CDS 9746 10414 . + 0 locus_tag=CPT-T1_014;note=HHPred predicted structural similarity at 99%25 probability to phage lambda major tail protein V Protein Data Bank entry 2K4 over half of protein%3B Orf no. 41 see PMID: 14972552;codon_start=1;transl_table=11;product=major tail protein;translation=MHLPNGAQIFVETSRGVEVEATAITNAENPVATVASKGDLAKGDYVIVTQSTWAKMVSRVLIVTDAQETSITLAGIDTSDTLVFPAGGTMSFAKITGWTEIPCVQEIGQDGGEQQYYTYQCLSDDKEQQIPTFKSAVSLTYTFAHEFDNPIYPILRKLDSSGQVTAVRMYVPKASEMRMWAGILSFNDIPSTQVNEMETVELAVSLKGDFTFISSTLASPGA;ID=CPT-T1_014.CDS.1;Parent=CPT-T1_014.mRNA;
+AY216660.2 GbkToGff gene 10516 10845 . + . locus_tag=CPT-T1_015;ID=CPT-T1_015.gene;
+AY216660.2 GbkToGff mRNA 10516 10845 . + . locus_tag=CPT-T1_015;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_015.mRNA;Parent=CPT-T1_015.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 10516 10520 . + . locus_tag=CPT-T1_015;regulatory_class=ribosome_binding_site;ID=CPT-T1_015.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_015.mRNA;
+AY216660.2 GbkToGff CDS 10528 10845 . + 0 locus_tag=CPT-T1_015;codon_start=1;transl_table=11;product=tape measure chaperone frameshift product;translation=MAKFNFVLGQLPDFKLPVTFTMPNGEDATIIFTVRHLSSKEVQDMYAKQGEMNDSDFITKIASGWNLEEEFNEENTRKLVQYYPSAAYNLTATYIKALAGHRAKN;ID=CPT-T1_015.CDS.1;Parent=CPT-T1_015.mRNA;
+AY216660.2 GbkToGff gene 10516 11162 . + . locus_tag=CPT-T1_016;ID=CPT-T1_016.gene;
+AY216660.2 GbkToGff mRNA 10516 11162 . + . locus_tag=CPT-T1_016;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_016.mRNA;Parent=CPT-T1_016.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 10516 10520 . + . locus_tag=CPT-T1_016;regulatory_class=ribosome_binding_site;ID=CPT-T1_016.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_016.mRNA;
+AY216660.2 GbkToGff CDS 10528 10839 . + 0 locus_tag=CPT-T1_016;codon_start=1;transl_table=11;product=tape measure chaperone frameshift product;translation=MAKFNFVLGQLPDFKLPVTFTMPNGEDATIIFTVRHLSSKEVQDMYAKQGEMNDSDFITKIASGWNLEEEFNEENTRKLVQYYPSAAYNLTATYIKALAGHRAKKLKRAVYLLYQKPPTEEQLRSVGLSLSDYEDEEPETIIGDAEMVKAWNVFTSMLTQWRSSGAGAYGLDYNVLPMLFKIYKIEDEELALQDVRIMEAKALEMIAKQNN;note=Orf no. 40 see PMID: 14972552;ID=CPT-T1_016.CDS.1;Parent=CPT-T1_016.mRNA;
+AY216660.2 GbkToGff CDS 10839 11162 . + 0 locus_tag=CPT-T1_016;codon_start=1;transl_table=11;product=tape measure chaperone frameshift product;translation=MAKFNFVLGQLPDFKLPVTFTMPNGEDATIIFTVRHLSSKEVQDMYAKQGEMNDSDFITKIASGWNLEEEFNEENTRKLVQYYPSAAYNLTATYIKALAGHRAKKLKRAVYLLYQKPPTEEQLRSVGLSLSDYEDEEPETIIGDAEMVKAWNVFTSMLTQWRSSGAGAYGLDYNVLPMLFKIYKIEDEELALQDVRIMEAKALEMIAKQNN;note=Orf no. 40 see PMID: 14972552;ID=CPT-T1_016.CDS.1;Parent=CPT-T1_016.mRNA;
+AY216660.2 GbkToGff gene 11192 14076 . + . locus_tag=CPT-T1_017;ID=CPT-T1_017.gene;
+AY216660.2 GbkToGff mRNA 11192 14076 . + . locus_tag=CPT-T1_017;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_017.mRNA;Parent=CPT-T1_017.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 11192 11197 . + . locus_tag=CPT-T1_017;regulatory_class=ribosome_binding_site;ID=CPT-T1_017.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_017.mRNA;
+AY216660.2 GbkToGff CDS 11203 14076 . + 0 locus_tag=CPT-T1_017;codon_start=1;transl_table=11;product=tape measure protein;translation=MVDKVAGLSLDVDVSTVQRAVKSLKEFSKANDQAADSMGSLINESEVAKQKAKEHAEQLRRQRKEYEAVEKAIDPTVSKMERLKIASQQLDKLWQQGVVPDETFFRLGEMLDLQNAKLARSRAMLTEEGQAALQEAKAKEQAAVRSKAFMDALNGQVNAIGKTHAELMELKAAELGLSKEAAPLIAKLKDQGRAMNAAGISAGEYRQAMRMLPAQITDVVTSLASGMPVWMVAIQQGGQIKDSFGGIGNTFKVLLSYINPVTAGVGVLVGSLGILAKAGYDSYKSITDIQNALIETGGYAGVTAEELDSVSKKIAQTSNSTIGSIREIVTELASSGKYTREQIQNITKATAEWSASTGKSASQIISEFEKIASDPVKGLKKLNEQYNFLEKGQLTYIDTLSRTKGETEAVSEATKLFADVMEKRMKSIADNATPLEKMWSDIKQWASDAWGWVGDHTLGALNLIIDVVQGTVIQVKMILAKGDEYISNFIASAIKATQSLPGMSDFGADVLKEQENIVKSSRDNYDQLASDLDAINARVEKGEMGYIEAMRQRRTLEKQYSEETKEAIRKEAEEIEKRNRERNKQSKIVRSPTEQFDKELISLRAQLKVLQEHKEIGQKLSAQRKALFTTEATIAVLREASSKRQLSAEEKALLASQERVIELAKQKAEIGDQIVKQQQLNDLTDKSLKFVNEMTAATEQLNASRGLSTRDMERQAELAKITTDYINSGGSEGDEKLQNMIKAQNDYYAAEDAKRADWLAGAESAFADYGDAAMDMYGNVNEIASSALNGMSDMMVQFLTTGKANFEDFAKNIIGMIIKMIAQMVIFNTISGMMGGKTWSFAGGASSGASAASQATPTPAASVFRSVSSGGAAVSLAAAAGSVATSGFNASNSAPKVVNHSGGGTVVDVSGMEVKVDNGSDPRGISQGVEMMFKKMIRESCSQGGEVYNYIQEKTGG;note=Orf no. 38 see PMID: 14972552;ID=CPT-T1_017.CDS.1;Parent=CPT-T1_017.mRNA;
+AY216660.2 GbkToGff gene 14067 14432 . + . locus_tag=CPT-T1_018;ID=CPT-T1_018.gene;
+AY216660.2 GbkToGff mRNA 14067 14432 . + . locus_tag=CPT-T1_018;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_018.mRNA;Parent=CPT-T1_018.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 14067 14072 . + . locus_tag=CPT-T1_018;regulatory_class=ribosome_binding_site;ID=CPT-T1_018.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_018.mRNA;
+AY216660.2 GbkToGff CDS 14079 14432 . + 0 locus_tag=CPT-T1_018;note=similar to lambda tail tip protein M UniProt ID P03737%3B Orf no. 37 see PMID: 14972552;codon_start=1;transl_table=11;product=tail tip protein;translation=MATLDTFGWCTQVQGGGGSLTTTNSDRSIQFGNGYMQLASSGFNTTRREYSVVYAGEDFMAVYDFCNSHRIKPFAWTPPDGKIGIWVVKPNSLGAKPVSRDVMEINVTFMEQFTSME;ID=CPT-T1_018.CDS.1;Parent=CPT-T1_018.mRNA;
+AY216660.2 GbkToGff gene 14500 15294 . + . locus_tag=CPT-T1_019;ID=CPT-T1_019.gene;
+AY216660.2 GbkToGff mRNA 14500 15294 . + . locus_tag=CPT-T1_019;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_019.mRNA;Parent=CPT-T1_019.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 14500 14503 . + . locus_tag=CPT-T1_019;regulatory_class=ribosome_binding_site;ID=CPT-T1_019.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_019.mRNA;
+AY216660.2 GbkToGff CDS 14512 15294 . + 0 locus_tag=CPT-T1_019;note=similar to lambda tail tip protein L UniProt ID P03738%3B Orf no. 36 see PMID: 14972552;codon_start=1;transl_table=11;product=tail tip protein;translation=MSENKKLYDEESGKSLFHNCLQSLYPGEIITLIEVDGSKFGAQVYRFHGENIQYTPEEIMQAQQTGTLPPKEITFRGEKYGARPFGISGISFDSSGKATKPQLTVANIDSRVSAMIRAYNGLMQAKVTIWITQRELINSDGSIADGAYRKLVYYIERPNYVDKSVARFDLTSPYDMDGIMIPSRLTQSVCYFAQRGWYKTGKGCGYNGQNGYFDKDNNPVDDPSLDFCPGTVTACRLRFGANNELDFGGCAVASLQRKNQ;ID=CPT-T1_019.CDS.1;Parent=CPT-T1_019.mRNA;
+AY216660.2 GbkToGff gene 15279 16025 . + . locus_tag=CPT-T1_020;ID=CPT-T1_020.gene;
+AY216660.2 GbkToGff mRNA 15279 16025 . + . locus_tag=CPT-T1_020;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_020.mRNA;Parent=CPT-T1_020.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 15279 15282 . + . locus_tag=CPT-T1_020;regulatory_class=ribosome_binding_site;ID=CPT-T1_020.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_020.mRNA;
+AY216660.2 GbkToGff CDS 15291 16025 . + 0 locus_tag=CPT-T1_020;note=similar to lambda tail tip protein K UniProt ID P03729%3B Orf no. 35 see PMID: 14972552;codon_start=1;transl_table=11;product=tail tip protein;translation=MISAKIKLEIMTHAQEEYPRECCGVVTQKGRVQKYHRIDNVHRDPENHFMMDAVQYACIEDDAESTTIAIVHSHTGDGATTLPSAHDTCMCNEMEVTWIIVSVPEGDMRFVKPEKLPLIGRPWSLGSFDCYGLVMAWHKEHGVELRDRRLNFEWWKPEYGINLYQDYYKQDGFVEIPDQNNPSFGDMVIMQIGQNVPVWNHAGIYLGDNQILHHAFGKLSRRDIYSGWYQDHTVLIVRHKDLKL;ID=CPT-T1_020.CDS.1;Parent=CPT-T1_020.mRNA;
+AY216660.2 GbkToGff gene 16009 16621 . + . locus_tag=CPT-T1_021;ID=CPT-T1_021.gene;
+AY216660.2 GbkToGff mRNA 16009 16621 . + . locus_tag=CPT-T1_021;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_021.mRNA;Parent=CPT-T1_021.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 16009 16012 . + . locus_tag=CPT-T1_021;regulatory_class=ribosome_binding_site;ID=CPT-T1_021.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_021.mRNA;
+AY216660.2 GbkToGff CDS 16022 16621 . + 0 locus_tag=CPT-T1_021;note=single transmembrane domain predicted N-in and C-out%3B similar to lambda tail tip assembly protein I UniProt ID P03730%3B Orf no. 34 see PMID: 14972552;codon_start=1;transl_table=11;product=tail assembly protein;translation=MNDVKVIKLSGSLGRRFGVFHRYAVDSYPEAIRALSSQVDGFKEYMQSEVGSRSKFAIFVDGVNVGHHEEEKFKCAKEIRIVPIPTGSKTGGLFQVVLGAAIMVAAFYTGGASLALMGTMSSSLFMMGGAMVLGGVMQMISPQPGGANFEVQSSKNKPSYAFGGAVNTTAAGYPLPVPYGYRAGGGATFSAGSYAEDMS;ID=CPT-T1_021.CDS.1;Parent=CPT-T1_021.mRNA;
+AY216660.2 GbkToGff gene 16688 20217 . + . locus_tag=CPT-T1_022;ID=CPT-T1_022.gene;
+AY216660.2 GbkToGff mRNA 16688 20217 . + . locus_tag=CPT-T1_022;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_022.mRNA;Parent=CPT-T1_022.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 16688 16691 . + . locus_tag=CPT-T1_022;regulatory_class=ribosome_binding_site;ID=CPT-T1_022.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_022.mRNA;
+AY216660.2 GbkToGff CDS 16699 20217 . + 0 locus_tag=CPT-T1_022;note=similar to phage lambda tip attachment protein J UniProt ID P03749%3B Orf no. 33 see PMID: 14972552;codon_start=1;transl_table=11;product=tail fiber;translation=MIQKVISGSKGGSQKPHNPVEMEDNLISINKIKILLAVSDGEIDETFSLKQLMFNSVPVQNEDGSFNFEGVKAEFRPGTQTQEYIKGMEDSSSEVTVNREVTTDNPYTISVTNKTLSAIRIKMFMPRGVRIESNGDKNGVRVEYEVQQAVDGGSFETVLTDVIEGKTMSGYDRSRRVNLPNFNNQVIFRVVRKTPDSNDSNVVDAIQVKSYAEVIDAKFRYPLTGLLFVEFDSKMFPNQLPTISIRKRWKIVNVPSNYDPESRTYNGNWDGTFKKAWTNNPAWVLYDLMINQRYGLDQKELGIAVDKWALYEAAQYCDQMVPDGKGGTEPRYLCDVIIQSQTDAYKVIRDICSIFRGMSFWNGESISVIIDRPREPAYIFTNDNVVNGDFSYTFASEKSMYTTCNVMFDDEQNMYQQDVEPVFDREATLRFGNNVTSITAIGCTRRSEANRRGRWILKTNLRSTTVNFATGLEGMIPTIGDVVAIADNFWSSNLTMNLSGRLLEVSGSQIFLPFRVDARAGDFIIVNKPDGKPVKRTISSVSADGKTIEVNIGFGFPVKPNTVFAIDRTDIALQQYVVTKIDKGDDDEEFTYKITAVEYDPNKYDEIDYGVNIDDRPTSIVEPDQIPRPKNVQVSSESRIVQGMSVETMIVSWDKVPYAVFYDVQWRKDNGNWQNVPQTANKEVYVEGIYAGNYQVRVRSVAGSGTTSGWSNIVAATLTGKQGEPGRPINLTATDDVVFGIRTKWGFSDGSGDTAYTELQQSPDGTVDNASLLSLIPYPQHEYYHSPMPGGNIVWYRVRTVDRIGNVSQWTDFVRGMASTNVDDIIGEISVDIENSPGYEWLVDNATDNAAQNSANAEAAIENALANDKDAIYMKKENGKRKAEYTKSLKLIADETQARVTAIEQLKASFGDQISASNSELREVIATETEALSREIDQLKAQIGDDIQASLTDIREVIATETEALSREIDQLKAQIGDDIQASLTDIREAIANETEARTQADLTLSARLGNNEAALAQKLDSWSNADSTGAMYGVKLGLKYNGQEYSAGMAMSLVGSGAAVKAQILFEASRFAIMTGMNGQTQYPFVVENGQVILSSAIIKNGFITNAMIGNFIQSNNYVFNQSGWRLDKGGTFENYGSDGEGAMKQTNTTISVRDASGRLRVQIGRLTGSW;ID=CPT-T1_022.CDS.1;Parent=CPT-T1_022.mRNA;
+AY216660.2 GbkToGff gene 20251 20568 . + . locus_tag=CPT-T1_023;ID=CPT-T1_023.gene;
+AY216660.2 GbkToGff mRNA 20251 20568 . + . locus_tag=CPT-T1_023;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_023.mRNA;Parent=CPT-T1_023.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 20251 20255 . + . locus_tag=CPT-T1_023;regulatory_class=ribosome_binding_site;ID=CPT-T1_023.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_023.mRNA;
+AY216660.2 GbkToGff CDS 20263 20568 . + 0 locus_tag=CPT-T1_023;codon_start=1;transl_table=11;product=hypothetical protein;translation=MAYGISTWDANGVYNNYGIKPITVVGWNFLSAGQNSASFSYQVPPGMHVNYVISLDDGAISGPGRKIIASGNTITVTPTNSPGPNVYPSSNCYLIAYLEND;note=Orf no. 32 see PMID: 14972552;ID=CPT-T1_023.CDS.1;Parent=CPT-T1_023.mRNA;
+AY216660.2 GbkToGff gene 20556 21257 . + . locus_tag=CPT-T1_024;ID=CPT-T1_024.gene;
+AY216660.2 GbkToGff mRNA 20556 21257 . + . locus_tag=CPT-T1_024;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_024.mRNA;Parent=CPT-T1_024.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 20556 20559 . + . locus_tag=CPT-T1_024;regulatory_class=ribosome_binding_site;ID=CPT-T1_024.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_024.mRNA;
+AY216660.2 GbkToGff CDS 20568 21257 . + 0 locus_tag=CPT-T1_024;note=InterPro domain IPR013750%3B GHMP kinase domain- containing protein%3B Orf no. 31 see PMID: 14972552;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSYGAFIDVNGNPFITPLSTPFALYARGEIQSVNVSGSQVAERYVRIPTGVPVIAFCKTTNTQQGTALSAFTFRSGPNVGTVYIRGTNPANQSYTLTYYIFAIFEQSLPRWGMAIWDASGKLVLTNETKVLSDLVTIGTPGYAGGGLNIDTTLSGSYAVVPTILGNYQVVIGRLPTGQPIIGNSTAGSSCRYNGSTTRINAAATTAAGQIMNTTNNGNIITAIKTAAYD;ID=CPT-T1_024.CDS.1;Parent=CPT-T1_024.mRNA;
+AY216660.2 GbkToGff gene 21279 21518 . - . locus_tag=CPT-T1_025;ID=CPT-T1_025.gene;
+AY216660.2 GbkToGff mRNA 21279 21518 . - . locus_tag=CPT-T1_025;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_025.mRNA;Parent=CPT-T1_025.gene;
+AY216660.2 GbkToGff CDS 21279 21506 . - 0 locus_tag=CPT-T1_025;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKKLITIIAAAFILTGCSSMPERTCTAIYESGGAEYSVYVFGSKMRGKEMVLRAGYPFSFNYVSEKNFKSHDCSI;note=Orf no. 30 see PMID: 14972552;ID=CPT-T1_025.CDS.1;Parent=CPT-T1_025.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 21515 21518 . - . locus_tag=CPT-T1_025;regulatory_class=ribosome_binding_site;ID=CPT-T1_025.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_025.mRNA;
+AY216660.2 GbkToGff gene 21755 21906 . + . locus_tag=CPT-T1_026;ID=CPT-T1_026.gene;
+AY216660.2 GbkToGff mRNA 21755 21906 . + . locus_tag=CPT-T1_026;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_026.mRNA;Parent=CPT-T1_026.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 21755 21758 . + . locus_tag=CPT-T1_026;regulatory_class=ribosome_binding_site;ID=CPT-T1_026.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_026.mRNA;
+AY216660.2 GbkToGff CDS 21766 21906 . + 0 locus_tag=CPT-T1_026;codon_start=1;transl_table=11;product=hypothetical protein;translation=MIRQISIMYVQNLINLDSICRYLCISNKKRLQVLRNRQRIKIYLSH;ID=CPT-T1_026.CDS.1;Parent=CPT-T1_026.mRNA;
+AY216660.2 GbkToGff gene 22020 23098 . + . locus_tag=CPT-T1_027;ID=CPT-T1_027.gene;
+AY216660.2 GbkToGff mRNA 22020 23098 . + . locus_tag=CPT-T1_027;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_027.mRNA;Parent=CPT-T1_027.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 22020 22024 . + . locus_tag=CPT-T1_027;regulatory_class=ribosome_binding_site;ID=CPT-T1_027.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_027.mRNA;
+AY216660.2 GbkToGff CDS 22034 23098 . + 0 locus_tag=CPT-T1_027;codon_start=1;transl_table=11;product=exodeoxyribonuclease VIII;translation=MFQVFTSSQLSNDEYHRNEGWASEYVSGSSLAEIYQTCPANWRFKKNETTKALEFGTQSHTNFESRDLFTATYARCPAPSEFKDLITSQAALAAKLKSFGLKGTSGKQYPDLIKMMVDCGEELNVQYLIELIAEAEARAEGKQLVDADKYDACMKMRAILEQNPDHEACINSETAQREISIFGEISGVKVKVRLDHLDYKENVPGRVLTGYDENGDPVFEDVIFPEALIITDFKTTMSANPLEFPRLAYNHGYYLKMALQHDLLRRAIQAGAFEGNFPEDIPIVVRLLAQEKKEPYIALAYRMTMEQIRIGRNQYISVVHTYKACSEMDVWPGYAGDASEIELETPSWVRYQNK;note=Orf no. 29 see PMID: 14972552;ID=CPT-T1_027.CDS.1;Parent=CPT-T1_027.mRNA;
+AY216660.2 GbkToGff gene 23129 23820 . + . locus_tag=CPT-T1_028;ID=CPT-T1_028.gene;
+AY216660.2 GbkToGff mRNA 23129 23820 . + . locus_tag=CPT-T1_028;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_028.mRNA;Parent=CPT-T1_028.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 23129 23133 . + . locus_tag=CPT-T1_028;regulatory_class=ribosome_binding_site;ID=CPT-T1_028.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_028.mRNA;
+AY216660.2 GbkToGff CDS 23140 23820 . + 0 locus_tag=CPT-T1_028;note=InterPro domain IPR007499%3B Orf no. 28 see PMID: 14972552;codon_start=1;transl_table=11;product=recombinase;translation=MHLIHQSGEVKMQLSPETNEILPALFNARNKFAKAKKDAKNNHLKNSYATLDAMMAAVSPALTDNDIMILQSMLDTSTETTFHLETMLIHKSGQWAKFFMMMPIAKRDPQGVGSAMTYARRYSLAAALGISQSDDDAQLAVKSVKDWKKELDACEDIESLKDVWANAYRQTDTASKSIIQDHYNALKAKFEIGKARGIRPAQPEQKKQVEATSAKPVQSQSITNFE;ID=CPT-T1_028.CDS.1;Parent=CPT-T1_028.mRNA;
+AY216660.2 GbkToGff gene 23854 24289 . + . locus_tag=CPT-T1_029;ID=CPT-T1_029.gene;
+AY216660.2 GbkToGff mRNA 23854 24289 . + . locus_tag=CPT-T1_029;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_029.mRNA;Parent=CPT-T1_029.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 23854 23858 . + . locus_tag=CPT-T1_029;regulatory_class=ribosome_binding_site;ID=CPT-T1_029.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_029.mRNA;
+AY216660.2 GbkToGff CDS 23867 24289 . + 0 locus_tag=CPT-T1_029;note=InterPro domain IPR012340,Orf no. 27 see PMID: 14972552;codon_start=1;transl_table=11;product=single-stranded DNA-binding protein;translation=MHIITGEIRKEPKILERNGGNTYIIELAESYKPRDGDREYTNYTFFFSDGGKPGLADWYREAFQVGRVISVSCETLKISSREHNGMIYNSLQAADFPKLVFSQRGQSNQQQRAPQQQQRSQQQSQPQPNQQSTFDDDIPF;ID=CPT-T1_029.CDS.1;Parent=CPT-T1_029.mRNA;
+AY216660.2 GbkToGff gene 24351 26550 . - . locus_tag=CPT-T1_030;ID=CPT-T1_030.gene;
+AY216660.2 GbkToGff mRNA 24351 26550 . - . locus_tag=CPT-T1_030;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_030.mRNA;Parent=CPT-T1_030.gene;
+AY216660.2 GbkToGff CDS 24351 26537 . - 0 locus_tag=CPT-T1_030;codon_start=1;transl_table=11;product=hypothetical protein;translation=MALYRRGTASMDADGTVHGTDTKWKDQLALIRVGATIVFLEQPIKLAVISDIVSDTELKAISTDGQTAADGKYVILLNDSLTVNGLAQNVAETLRYYQSKETEIASALDIIADLDMDNLNNIVQEIKSNKSAAEAAQNQAELARDSANSARDESISIKNQTQQISDSAIGSINAAKDKAITNVQQKENSAVTHINSEEAAAIQAINDAKGDLSGYVNDAQTAAQTATSAKNDAQAARDAAVSAKDAAAVSAQEAQDAANSVNADNLLTKDGNLSGLADKEQSKKNLAVNRLNQPRGDLTEIYSNDDRTGFKLIVKDSGDWGAMTHDGSENKALGVNFGGTGGTTEEQARTSLKVYKLDRTNLGEKHLDSITGEGDGPGIYMQSSSALATASRGYPEATAGMLEVLPNGANGASACIQRFTPFTYLGTAPESGNSQNEYARAGRGTFYIRMKNGNNAKFSPWIPFQASSSGNVVSSPASNEKSSWVDYVNALSSQPSSLASYNVNSVGWVTAISVRHRNGQGDGSAFGFVIEDASMTSPHYKDVRLRKQTGAGQWQSTQVIWNTGNTTVDSNGFIKRASPIVDIFGNGSHRTNDESEGCTVERISTGEYLIRGCLSLNSDLAWGGVNGGIEIPKDINGQPILWVDYDVNPDGSLVIKTYHRTHDNAPSFARNHKDGYSDGDPIDIPSDVFVSVRVEMPNDSIYNKKVEECKRNHERMVSGEFVESLKNT;note=Orf no. 26 see PMID: 14972552;ID=CPT-T1_030.CDS.1;Parent=CPT-T1_030.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 26546 26550 . - . locus_tag=CPT-T1_030;regulatory_class=ribosome_binding_site;ID=CPT-T1_030.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_030.mRNA;
+AY216660.2 GbkToGff gene 26447 26600 . - . locus_tag=CPT-T1_031;ID=CPT-T1_031.gene;
+AY216660.2 GbkToGff mRNA 26447 26600 . - . locus_tag=CPT-T1_031;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_031.mRNA;Parent=CPT-T1_031.gene;
+AY216660.2 GbkToGff CDS 26447 26590 . - 0 locus_tag=CPT-T1_031;codon_start=1;transl_table=11;product=hypothetical protein;translation=MIKSLLSNSCYLLKEFILWLYIDAVLHQWMQTVRFTEPIQNGKISLL;note=Orf no. 25 see PMID: 14972552;ID=CPT-T1_031.CDS.1;Parent=CPT-T1_031.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 26597 26600 . - . locus_tag=CPT-T1_031;regulatory_class=ribosome_binding_site;ID=CPT-T1_031.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_031.mRNA;
+AY216660.2 GbkToGff gene 26638 27585 . - . locus_tag=CPT-T1_032;ID=CPT-T1_032.gene;
+AY216660.2 GbkToGff mRNA 26638 27585 . - . locus_tag=CPT-T1_032;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_032.mRNA;Parent=CPT-T1_032.gene;
+AY216660.2 GbkToGff CDS 26638 27573 . - 0 locus_tag=CPT-T1_032;codon_start=1;transl_table=11;product=DNA primase;translation=MNEEFMMFQKEDVLPYMKGLWREAFQSICGLPNNVFNKKHQPCPNCGGKDRFRWTDNLNTPGDGGAICNSCGNDSGIGWLMKLTGMPYSECVNILGRFLGKVPQEYIVKANKKARRTPVSGVNVMMAEHEAVMKVMERTEKRVNTPLSVFESLPTESFDVGIKRSEDGRESVFHTIPCQLVHEDGLDDEFCNILIIDEEGRESFYAKKYTSCSVAVTGKTEKAIYLCLNWIDAQHIAFHTKQEVWACFTPENLEMVAYRYKGDREVRVACEPSDKDTLYMADDRQLKIIIPNPGGYRSGMQAKLFSASDLL;note=alternative start codon to Orf no. 24 see PMID: 14972552;ID=CPT-T1_032.CDS.1;Parent=CPT-T1_032.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 27582 27585 . - . locus_tag=CPT-T1_032;regulatory_class=ribosome_binding_site;ID=CPT-T1_032.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_032.mRNA;
+AY216660.2 GbkToGff gene 27635 28098 . - . locus_tag=CPT-T1_033;ID=CPT-T1_033.gene;
+AY216660.2 GbkToGff mRNA 27635 28098 . - . locus_tag=CPT-T1_033;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_033.mRNA;Parent=CPT-T1_033.gene;
+AY216660.2 GbkToGff CDS 27635 28087 . - 0 locus_tag=CPT-T1_033;note=HHPred predicted structural similarity at 97%25 probability to phage P22 repression protein C2 Protein Data Bank entry 2R1J over most of protein and phage lambda repressor entry 3BDN%3BOrf no. 23 see PMID: 14972552;codon_start=1;transl_table=11;product=transcriptional regulator;translation=MSIQRIAESTGEIDKRHINGNNGTRRGKDKKPRQRCGFYIHKEETRAGLRARLDALIEYYGGPAACAKALKVSNQTVQGWKERNMISWQGAEAAHRAYRRQGCKGFRAAWLRFDLKFDGNGKCLEKRCKNKKFMRVVKREDIGTTNSIFS;ID=CPT-T1_033.CDS.1;Parent=CPT-T1_033.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 28095 28098 . - . locus_tag=CPT-T1_033;regulatory_class=ribosome_binding_site;ID=CPT-T1_033.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_033.mRNA;
+AY216660.2 GbkToGff gene 28168 30199 . + . locus_tag=CPT-T1_034;ID=CPT-T1_034.gene;
+AY216660.2 GbkToGff mRNA 28168 30199 . + . locus_tag=CPT-T1_034;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_034.mRNA;Parent=CPT-T1_034.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 28168 28171 . + . locus_tag=CPT-T1_034;regulatory_class=ribosome_binding_site;ID=CPT-T1_034.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_034.mRNA;
+AY216660.2 GbkToGff CDS 28181 30199 . + 0 locus_tag=CPT-T1_034;codon_start=1;transl_table=11;product=DNA helicase;translation=MTMNIKKQIALLGDDYIKRTQERFTVGEVVPYPYQVVAYAEIAKRLSNYEHPFFVKASVSAGKTIIFAMVAKQCQKMGLKMLVLARQGEIVDQDSEEIDNFGVTNSIFSASLGIKSCYFPIVVGSEGTVANGLDNELADFVPHVIGIDECHQVDWEDLAQAIEGKETMEQMRGEKGKIIMDGDIPLIGNDGKPLLGTKRSQYTIVIMEMMRRCKKVHGHDLRIFGMTGSEFRGVVPILVENPKALGFWRERVTDIDTNYLIEFGSVVPTIFGSTDGVHYDLDKFKASSEDGVQDFTEKDMKAMEDEILHDKSLTQRIMQMVAKKAEERNAVLITCAGVRHCKEAAAALPPGSTYAIITGDTDNKARKKILDDVRAGKIKYTFQVMALTTGVNVPNWDFSVILRKIGSLTLLIQLLGRGMRLLKSWQVAEGMVKQDHLVWDFAGTMDELGQLYFDPILEQAQFQKRFENGKDPKTCPKCGCVNSFYARRCVNVIDGERCDHFWTSQICEDQVDERTGKILVKGCGAENDVVARVCRCCDASLVDPNLKLSGKAYTKNDWYEVKNFEVTLTKNQKGIIYKYTLINDDGDEFKAYEKFFPESDSKICGTLWKTKGVLPHVSDPKMRRYFIGMKNAIKILQYSHHIAHPVRVTHRRNQKKEDIISRKDFGMEDIPE;note=Orf no. 22 see PMID: 14972552;ID=CPT-T1_034.CDS.1;Parent=CPT-T1_034.mRNA;
+AY216660.2 GbkToGff gene 30181 30612 . + . locus_tag=CPT-T1_035;ID=CPT-T1_035.gene;
+AY216660.2 GbkToGff mRNA 30181 30612 . + . locus_tag=CPT-T1_035;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_035.mRNA;Parent=CPT-T1_035.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 30181 30185 . + . locus_tag=CPT-T1_035;regulatory_class=ribosome_binding_site;ID=CPT-T1_035.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_035.mRNA;
+AY216660.2 GbkToGff CDS 30196 30612 . + 0 locus_tag=CPT-T1_035;note=HHPred predicted structural similarity at 97%25 probability to B. subtilis recombination protein U/resolvase Protein Data Bank entry 1ZP7 over most of protein%3B Orf no. 21 see PMID: 14972552;codon_start=1;transl_table=11;product=Holliday junction resolvase;translation=MITDKGDYLEFYERDTSDTRKEDAHQVDCVSWLKYNFPHLLFWHTVNEGEKTITSALRDEQAGLLKGVSDFVILIGVNSRYPFAAIELKRVNKSGKGKASPVSDKQREFLQKVRERGGFSAVAYGFGQFKIAIYEMMK;ID=CPT-T1_035.CDS.1;Parent=CPT-T1_035.mRNA;
+AY216660.2 GbkToGff gene 30669 31393 . + . locus_tag=CPT-T1_036;ID=CPT-T1_036.gene;
+AY216660.2 GbkToGff mRNA 30669 31393 . + . locus_tag=CPT-T1_036;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_036.mRNA;Parent=CPT-T1_036.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 30669 30672 . + . locus_tag=CPT-T1_036;regulatory_class=ribosome_binding_site;ID=CPT-T1_036.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_036.mRNA;
+AY216660.2 GbkToGff CDS 30680 31393 . + 0 locus_tag=CPT-T1_036;codon_start=1;transl_table=11;product=DNA adenine methyltransferase;translation=MKDFNDIETIDFAETGCSFTREAIASGGYYQALKTPTCKEISGRRYKGTNTPDAVRDLWSTPREVIAYLEGRYGKYDLDAAASEENKVCEKFYSQETNCLKRWWGKNKHVWLNPPYSRPDIFVKKAIEQMEHNNQIDMLLPADNSTAWFTEARQNAAEIIWIEADLTEDIDGNEYARSGRLAFISGETGKAVDGNNKGSVIFIMRELKEGEVQQTHYIPITSICPSVKNKRAKVRKV;note=Orf no. 20 see PMID: 14972552;ID=CPT-T1_036.CDS.1;Parent=CPT-T1_036.mRNA;
+AY216660.2 GbkToGff gene 31377 31641 . + . locus_tag=CPT-T1_037;ID=CPT-T1_037.gene;
+AY216660.2 GbkToGff mRNA 31377 31641 . + . locus_tag=CPT-T1_037;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_037.mRNA;Parent=CPT-T1_037.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 31377 31380 . + . locus_tag=CPT-T1_037;regulatory_class=ribosome_binding_site;ID=CPT-T1_037.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_037.mRNA;
+AY216660.2 GbkToGff CDS 31393 31641 . + 0 locus_tag=CPT-T1_037;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSEKMVPVKLTEQGLWLLYRATCCEIMERNGLTQDVIGCDLWEFTSSLDMSFDEIKNEYIENWPSIIQKDVEELKADTIVQH;note=alternative start codon to Orf no. 19 see PMID: 14972552;ID=CPT-T1_037.CDS.1;Parent=CPT-T1_037.mRNA;
+AY216660.2 GbkToGff gene 31695 31917 . + . locus_tag=CPT-T1_038;ID=CPT-T1_038.gene;
+AY216660.2 GbkToGff mRNA 31695 31917 . + . locus_tag=CPT-T1_038;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_038.mRNA;Parent=CPT-T1_038.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 31695 31698 . + . locus_tag=CPT-T1_038;regulatory_class=ribosome_binding_site;ID=CPT-T1_038.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_038.mRNA;
+AY216660.2 GbkToGff CDS 31708 31917 . + 0 locus_tag=CPT-T1_038;codon_start=1;transl_table=11;product=hypothetical protein;translation=MARINANFFNIAQQSAKMAVHITNKQGGNFDWDIAMNFLKMSYYRCSVEEVEGFISDVEKLTNADKKAR;note=Orf no. 18 see PMID: 14972552;ID=CPT-T1_038.CDS.1;Parent=CPT-T1_038.mRNA;
+AY216660.2 GbkToGff gene 31881 32167 . + . locus_tag=CPT-T1_039;ID=CPT-T1_039.gene;
+AY216660.2 GbkToGff mRNA 31881 32167 . + . locus_tag=CPT-T1_039;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_039.mRNA;Parent=CPT-T1_039.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 31881 31884 . + . locus_tag=CPT-T1_039;regulatory_class=ribosome_binding_site;ID=CPT-T1_039.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_039.mRNA;
+AY216660.2 GbkToGff CDS 31895 32167 . + 0 locus_tag=CPT-T1_039;codon_start=1;transl_table=11;product=hypothetical protein;translation=MLIKKQGKREVWEHAKECGISDDIALIAKYFDIKDVSIISNGKISFMEGMPRKMQRVPATPSLEFYREEGKRIERERKSTKNGKSSRLKY;ID=CPT-T1_039.CDS.1;Parent=CPT-T1_039.mRNA;
+AY216660.2 GbkToGff gene 32099 32399 . + . locus_tag=CPT-T1_040;ID=CPT-T1_040.gene;
+AY216660.2 GbkToGff mRNA 32099 32399 . + . locus_tag=CPT-T1_040;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_040.mRNA;Parent=CPT-T1_040.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 32099 32102 . + . locus_tag=CPT-T1_040;regulatory_class=ribosome_binding_site;ID=CPT-T1_040.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_040.mRNA;
+AY216660.2 GbkToGff CDS 32112 32399 . + 0 locus_tag=CPT-T1_040;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSEKENPQKTASLPGLNINADEYQAIWIGKKQVKQIPFSDWLPPDFVNVLCTIGIEQELHIGYYSPGRNSMMLEVDGKLVEFKSSDLGFWLKAVA;note=Orf no. 17 see PMID: 14972552;ID=CPT-T1_040.CDS.1;Parent=CPT-T1_040.mRNA;
+AY216660.2 GbkToGff gene 32467 33611 . + . locus_tag=CPT-T1_041;ID=CPT-T1_041.gene;
+AY216660.2 GbkToGff mRNA 32467 33611 . + . locus_tag=CPT-T1_041;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_041.mRNA;Parent=CPT-T1_041.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 32467 32470 . + . locus_tag=CPT-T1_041;regulatory_class=ribosome_binding_site;ID=CPT-T1_041.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_041.mRNA;
+AY216660.2 GbkToGff CDS 32478 33611 . + 0 locus_tag=CPT-T1_041;note=HHPred predicted structural similarity at 99%25 probability to E. coli exonuclease SbcD Protein Data Bank entry 4LTY over two thirds of protein%3B Orf no. 16 see PMID: 14972552;codon_start=1;transl_table=11;product=exonuclease;translation=MSQAKITTEQLIEERMSGLTLREIAEKYGMHIRTVEARHAKLAKEGHFHGNEHVAKMVPEGFMVKGTSTMIDAEGNEKIRWVKTSVDNERLEVLMEKAREAFCSELPKAIPSESPDVSFDEDTLAMYPVFDLHIGALAHKHECGENYDTATAEKVMNGFFDYAVDKAPNSKNAVLVLGGDFLHYDSLESKTPASGHYLDSDSRYAKLVYVAIRSVRRAVSRMLEKHQVIDIKAISGNHDESGMVWLRAALAAFYEDEPRVNVDVSPAAMMMTSFGKTLIGYTHGHQMRKADTRLSVMATDFRKLFGQSDYVYTHSGHWHSQKITETNLGIDEVHGQLGSPDAYSANGGWRSQRQAAVIVYHKEFGEVGRFICRPEMF;ID=CPT-T1_041.CDS.1;Parent=CPT-T1_041.mRNA;
+AY216660.2 GbkToGff gene 33673 34166 . + . locus_tag=CPT-T1_042;ID=CPT-T1_042.gene;
+AY216660.2 GbkToGff mRNA 33673 34166 . + . locus_tag=CPT-T1_042;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_042.mRNA;Parent=CPT-T1_042.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 33673 33677 . + . locus_tag=CPT-T1_042;regulatory_class=ribosome_binding_site;ID=CPT-T1_042.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_042.mRNA;
+AY216660.2 GbkToGff CDS 33684 34166 . + 0 locus_tag=CPT-T1_042;note=similar to phage T7 protein 3.8%3B InterPro domain IPR003615%3B Orf no. 15 see PMID: 14972552;codon_start=1;transl_table=11;product=HNH endonuclease;translation=MNWHEHYEYRDGVLYHKVKPCRRHDVNIGDVAGRVAKNGYHYVVHKNRPYKRSRVIWEMFNGEIPDGFVIDHLNHNATDDRIDNLECKPRRENMVNVKLRIDSTTGVTGVSRKRDNKWRAYITIMGKQKCKSFDTFEEACAQRIEWSVTHDFHPNHGGTY;ID=CPT-T1_042.CDS.1;Parent=CPT-T1_042.mRNA;
+AY216660.2 GbkToGff gene 34226 34415 . + . locus_tag=CPT-T1_043;ID=CPT-T1_043.gene;
+AY216660.2 GbkToGff mRNA 34226 34415 . + . locus_tag=CPT-T1_043;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_043.mRNA;Parent=CPT-T1_043.gene;
+AY216660.2 GbkToGff CDS 34239 34415 . + 0 locus_tag=CPT-T1_043;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKIVKCIRNDSKTLPFRVNQIYSVGYDFGGGLFEIYDGRGSAIQTPLNGHYLEFIEID;note=Orf no. 14 see PMID: 14972552;ID=CPT-T1_043.CDS.1;Parent=CPT-T1_043.mRNA;
+AY216660.2 GbkToGff gene 34525 34749 . + . locus_tag=CPT-T1_044;ID=CPT-T1_044.gene;
+AY216660.2 GbkToGff mRNA 34525 34749 . + . locus_tag=CPT-T1_044;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_044.mRNA;Parent=CPT-T1_044.gene;
+AY216660.2 GbkToGff CDS 34534 34749 . + 0 locus_tag=CPT-T1_044;note=1 transmembrane domain%2C predicted N-out and C-in%3B Orf no. 13 see PMID: 14972552;codon_start=1;transl_table=11;product=pinholin class 2;translation=MKEFLTAATSSTGGASLVGAATGQLYIAGATFICFLLFGAWGAYWKYRDSKAIQEALNDGDLNKALKIRGR;ID=CPT-T1_044.CDS.1;Parent=CPT-T1_044.mRNA;
+AY216660.2 GbkToGff gene 34739 35237 . + . locus_tag=CPT-T1_045;ID=CPT-T1_045.gene;
+AY216660.2 GbkToGff mRNA 34739 35237 . + . locus_tag=CPT-T1_045;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_045.mRNA;Parent=CPT-T1_045.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 34739 34743 . + . locus_tag=CPT-T1_045;regulatory_class=ribosome_binding_site;ID=CPT-T1_045.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_045.mRNA;
+AY216660.2 GbkToGff CDS 34749 35237 . + 0 locus_tag=CPT-T1_045;codon_start=1;transl_table=11;product=SAR endolysin;translation=MSLKNNVIGASIGAALTLTPTLLERIEGIEYEVYYDIAGVPTVCSGITGPDVIPGKKYTKRECDALLIKHIGVAQRYVDKKVKVDIPVTMRASLYSFTFNVGTGAFGSSTMLKLINQRKHKEACNQLWRWVYYYNPKTKKREVSRGIKNRRAEEYAYCVKEL;note=Orf no. 12 see PMID: 14972552;ID=CPT-T1_045.CDS.1;Parent=CPT-T1_045.mRNA;
+AY216660.2 GbkToGff gene 35227 35638 . + . locus_tag=CPT-T1_046;ID=CPT-T1_046.gene;
+AY216660.2 GbkToGff mRNA 35227 35638 . + . locus_tag=CPT-T1_046;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_046.mRNA;Parent=CPT-T1_046.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 35227 35230 . + . locus_tag=CPT-T1_046;regulatory_class=ribosome_binding_site;ID=CPT-T1_046.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_046.mRNA;
+AY216660.2 GbkToGff CDS 35237 35638 . + 0 locus_tag=CPT-T1_046;note=PMID 30135120 shows molecular function of unimolecular spanin%3B Orf no. 11 see PMID: 14972552;codon_start=1;transl_table=11;product=u-spanin;translation=MKLKKTCIAITVAVGVISLSGCSTASALSGLLSDSPDVTAQVGAENTKQLAGVTAKADDKREVKVSDSNIGKIDSSVKKSVEVSTIQANTVNAESITVTKSGSWYDPVVCWILVFIVLLLFYFLIRKHEKKEA;ID=CPT-T1_046.CDS.1;Parent=CPT-T1_046.mRNA;
+AY216660.2 GbkToGff gene 35761 36179 . - . locus_tag=CPT-T1_047;ID=CPT-T1_047.gene;
+AY216660.2 GbkToGff mRNA 35761 36179 . - . locus_tag=CPT-T1_047;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_047.mRNA;Parent=CPT-T1_047.gene;
+AY216660.2 GbkToGff CDS 35761 36168 . - 0 locus_tag=CPT-T1_047;note=HHPred predicted structural similarity at 88%25 probability to S. epidermidis RipR transcriptional regulator Protein Data Bank entry 3IWF over most of protein%3B Orf no. 10 see PMID: 14972552;codon_start=1;transl_table=11;product=HTH domain-containing protein;translation=MLLLLDLFRFCEGYDKYTRQHIAKFIYAHKESERFAKAAGMTRREFTSALSKEFCARCVTEGYLDCKGGFYWCKGKIKRPVMMKLMCIDGYNNRYTWEMMHIGEMSDEDLFGERRNIDRSERRIVRKAPAYERRI;ID=CPT-T1_047.CDS.1;Parent=CPT-T1_047.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 36176 36179 . - . locus_tag=CPT-T1_047;regulatory_class=ribosome_binding_site;ID=CPT-T1_047.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_047.mRNA;
+AY216660.2 GbkToGff gene 36173 37755 . - . locus_tag=CPT-T1_048;ID=CPT-T1_048.gene;
+AY216660.2 GbkToGff mRNA 36173 37755 . - . locus_tag=CPT-T1_048;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_048.mRNA;Parent=CPT-T1_048.gene;
+AY216660.2 GbkToGff CDS 36173 37741 . - 0 locus_tag=CPT-T1_048;note=HHPred predicted structural similarity at 95%25 probability to E. coli AAA ATPase ravA Protein Data Bank entry 3NBX over AAA domain of protein%3B Orf no. 9 see PMID: 14972552;codon_start=1;transl_table=11;product=putative ATPase;translation=MFNIKPKLNYQQIIEIANSTGVNPVAIAIRENSYGDSVSFWQDPIDINSGNDKFPLISLGGDNLVFEYAKAKAESVQFPVSSAYAHFIGCISAAMLGKFWVQYHGEEQPTALYMVISQPPSTGKSAINSAAITPMRAEIQRLNEERKKERIRLTSQLRQIEKEIKNDPKGNTTAALYEDKEKLEEKIKKMADIVFAVSDPTPEGLAKVAAVQGHFSVISDEATAINTLLGLTYGGSDKKSNSELILKAWDKNHMEVARSNQDNNLSLCPVGSICVIAQDETIKGIMDAGQRGIGVSERFLLVREEPLLGTRILCDENGDALYKEVDRGLVSKYYRLVHNIMKEDNVVLSVSRNAMRELNLARQAMEPDFAAGGKYSHSMLRGHLGKFDKHALRIASVLHTIKNWEGESPNRSNREIDLETMQEAIMIFNELSRTYLSSASAAGYAGDEAESRKLIDVITEIAKKNKGRAPIHSIVAKCRNVTPFNGQQKVAERIDSLLITLEEMNYTCRIDDIVFINPRLMG;ID=CPT-T1_048.CDS.1;Parent=CPT-T1_048.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 37751 37755 . - . locus_tag=CPT-T1_048;regulatory_class=ribosome_binding_site;ID=CPT-T1_048.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_048.mRNA;
+AY216660.2 GbkToGff gene 37741 38294 . - . locus_tag=CPT-T1_049;ID=CPT-T1_049.gene;
+AY216660.2 GbkToGff mRNA 37741 38294 . - . locus_tag=CPT-T1_049;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_049.mRNA;Parent=CPT-T1_049.gene;
+AY216660.2 GbkToGff CDS 37741 38280 . - 0 locus_tag=CPT-T1_049;note=InterPro domain IPR003615%3B similar to phage T7 protein 3.8%3B Orf no. 8 see PMID: 14972552;codon_start=1;transl_table=11;product=HNH endonuclease;translation=MKDELKYVSGRLYWKEWRIGRRRNLLAGTVNKKGYRSICFPGGVFEYAHRIVWKIHYGNIPEGMDVDHINHERDDNRIENLRLVTRQDNLRNKGVVSSNTGVMGVYWNKKTNRYTANITINKKTKHLGTFMTLDAAAKARKEAERLYGFHENHGSNSTFCKTRVPLTVYHSRRQLRSLL;ID=CPT-T1_049.CDS.1;Parent=CPT-T1_049.mRNA;
+AY216660.2 GbkToGff gene 38277 38706 . - . locus_tag=CPT-T1_050;ID=CPT-T1_050.gene;
+AY216660.2 GbkToGff mRNA 38277 38706 . - . locus_tag=CPT-T1_050;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_050.mRNA;Parent=CPT-T1_050.gene;
+AY216660.2 GbkToGff CDS 38277 38693 . - 0 locus_tag=CPT-T1_050;note=HHPred predicted structural similarity at 88%25 probability to S. epidermidis RipR transcriptional regulator Protein Data Bank entry 3IWF over most of protein%3B Orf no. 7 see PMID: 14972552;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKSIKLKCTSADKITGFEVNNLYKGRERYDDTREVKLKCGKYLKLEKHDELHIHGSDEIFFAKFTELKTKTLKCTGLDHRNPMKKSFKVGKRYQVESGRALGGVAGYIFDEDGCRWTLFREEVGFSIADGTTFESKYL;ID=CPT-T1_050.CDS.1;Parent=CPT-T1_050.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 38702 38706 . - . locus_tag=CPT-T1_050;regulatory_class=ribosome_binding_site;ID=CPT-T1_050.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_050.mRNA;
+AY216660.2 GbkToGff gene 38774 38994 . - . locus_tag=CPT-T1_051;ID=CPT-T1_051.gene;
+AY216660.2 GbkToGff mRNA 38774 38994 . - . locus_tag=CPT-T1_051;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_051.mRNA;Parent=CPT-T1_051.gene;
+AY216660.2 GbkToGff CDS 38774 38983 . - 0 locus_tag=CPT-T1_051;codon_start=1;transl_table=11;product=hypothetical protein;translation=MEQDNFWTRYFAALDAGLSSEWCIKVAYKEITLDEALGDMDMDAESEYDPNFELPGDDINEDVDDYIPW;note=Orf no. 6 see PMID: 14972552;ID=CPT-T1_051.CDS.1;Parent=CPT-T1_051.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 38991 38994 . - . locus_tag=CPT-T1_051;regulatory_class=ribosome_binding_site;ID=CPT-T1_051.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_051.mRNA;
+AY216660.2 GbkToGff gene 38987 39222 . - . locus_tag=CPT-T1_052;ID=CPT-T1_052.gene;
+AY216660.2 GbkToGff mRNA 38987 39222 . - . locus_tag=CPT-T1_052;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_052.mRNA;Parent=CPT-T1_052.gene;
+AY216660.2 GbkToGff CDS 38987 39211 . - 0 locus_tag=CPT-T1_052;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSIKVENIIKHLNAKGRVFIKMDKSSGFISMTVTKTRNGNSVIGSVPGSRLINATDADVRATLEANSIYINSWG;note=Orf no. 5 see PMID: 14972552;ID=CPT-T1_052.CDS.1;Parent=CPT-T1_052.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 39219 39222 . - . locus_tag=CPT-T1_052;regulatory_class=ribosome_binding_site;ID=CPT-T1_052.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_052.mRNA;
+AY216660.2 GbkToGff gene 39289 39444 . - . locus_tag=CPT-T1_053;ID=CPT-T1_053.gene;
+AY216660.2 GbkToGff mRNA 39289 39444 . - . locus_tag=CPT-T1_053;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_053.mRNA;Parent=CPT-T1_053.gene;
+AY216660.2 GbkToGff CDS 39289 39432 . - 0 locus_tag=CPT-T1_053;codon_start=1;transl_table=11;product=hypothetical protein;translation=MIYVHTFYTGKFNSVKNVRVYDSRQKAMMQKVVLGGTIKECKVISEC;note=Orf no. 4 see PMID: 14972552;ID=CPT-T1_053.CDS.1;Parent=CPT-T1_053.mRNA;
+AY216660.2 GbkToGff gene 39429 39758 . - . locus_tag=CPT-T1_054;ID=CPT-T1_054.gene;
+AY216660.2 GbkToGff mRNA 39429 39758 . - . locus_tag=CPT-T1_054;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_054.mRNA;Parent=CPT-T1_054.gene;
+AY216660.2 GbkToGff CDS 39429 39749 . - 0 locus_tag=CPT-T1_054;codon_start=1;transl_table=11;product=hypothetical protein;translation=MNHTYKITTKSPKINGSTVAALNNAAAIHEKNIMERVKAAVGRFYGINADIADSKRLFKYAPGHPYSRMVDIKHNKELVRIGSLSVDEFDHSINLVTAYQTWDGKK;note=Orf no. 3 see PMID: 14972552;ID=CPT-T1_054.CDS.1;Parent=CPT-T1_054.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 39755 39758 . - . locus_tag=CPT-T1_054;regulatory_class=ribosome_binding_site;ID=CPT-T1_054.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_054.mRNA;
+AY216660.2 GbkToGff gene 39766 39978 . - . locus_tag=CPT-T1_055;ID=CPT-T1_055.gene;
+AY216660.2 GbkToGff mRNA 39766 39978 . - . locus_tag=CPT-T1_055;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_055.mRNA;Parent=CPT-T1_055.gene;
+AY216660.2 GbkToGff CDS 39766 39966 . - 0 locus_tag=CPT-T1_055;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSDNIYRVVAISRKTQKRVIAYMGSSAIEATDAFELLKNNEGFMNTFRVRLERLEPVIIDEARKLS;note=Orf no. 2 see PMID: 14972552;ID=CPT-T1_055.CDS.1;Parent=CPT-T1_055.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 39973 39978 . - . locus_tag=CPT-T1_055;regulatory_class=ribosome_binding_site;ID=CPT-T1_055.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_055.mRNA;
+AY216660.2 GbkToGff gene 39959 40342 . - . locus_tag=CPT-T1_056;ID=CPT-T1_056.gene;
+AY216660.2 GbkToGff mRNA 39959 40342 . - . locus_tag=CPT-T1_056;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_056.mRNA;Parent=CPT-T1_056.gene;
+AY216660.2 GbkToGff CDS 39959 40330 . - 0 locus_tag=CPT-T1_056;codon_start=1;transl_table=11;product=hypothetical protein;translation=MFKQFTDLDFSASSVIQTDEKVHVAIENIARKIHNKQEKAMIAALTAYYDVSDVMECVDRVTRVVDRLGASRLIDNDTGEVITQFNKPFMRTEPGSVAPCFVADYSITVNSFVADRVKEALYE;note=Orf no. 1 see PMID: 14972552;ID=CPT-T1_056.CDS.1;Parent=CPT-T1_056.mRNA;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 40339 40342 . - . locus_tag=CPT-T1_056;regulatory_class=ribosome_binding_site;ID=CPT-T1_056.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_056.mRNA;
+AY216660.2 GbkToGff gene 41020 41606 . + . locus_tag=CPT-T1_057;ID=CPT-T1_057.gene;
+AY216660.2 GbkToGff mRNA 41020 41606 . + . locus_tag=CPT-T1_057;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_057.mRNA;Parent=CPT-T1_057.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 41020 41024 . + . locus_tag=CPT-T1_057;regulatory_class=ribosome_binding_site;ID=CPT-T1_057.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_057.mRNA;
+AY216660.2 GbkToGff CDS 41031 41606 . + 0 locus_tag=CPT-T1_057;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSSYQSDAVQAAIKAAYEKAGVTVEQRPEAKVTDVIRAACDQLYGDGENTEFTFDANKMAEAAARKSMPDADEHDVAKGAESWLLGKTDEINEKFKSSFITPIVSRHFSKIGKSVKVSVTMNDEKLRVVTISVSDEEVPVKKRRSRKKVSLADCLDSFVPDVDDLEKGDVTVSTVRDLVRQMKAHIEKCGL;note=Orf no. 77 see PMID: 14972552;ID=CPT-T1_057.CDS.1;Parent=CPT-T1_057.mRNA;
+AY216660.2 GbkToGff gene 41606 41945 . + . locus_tag=CPT-T1_058;ID=CPT-T1_058.gene;
+AY216660.2 GbkToGff mRNA 41606 41945 . + . locus_tag=CPT-T1_058;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_058.mRNA;Parent=CPT-T1_058.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 41606 41610 . + . locus_tag=CPT-T1_058;regulatory_class=ribosome_binding_site;ID=CPT-T1_058.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_058.mRNA;
+AY216660.2 GbkToGff CDS 41619 41945 . + 0 locus_tag=CPT-T1_058;codon_start=1;transl_table=11;product=hypothetical protein;translation=MFNIKPLTEAEKQAQAKQTENIQVIADALIGKRSIKINLDTVGQSFFTKGLDKYVINVKARDLVARIQKLNNQKLKLIKVEGNMCEIENLSAPDPNKWEITDVEFIVE;note=Orf no. 76 see PMID: 14972552;ID=CPT-T1_058.CDS.1;Parent=CPT-T1_058.mRNA;
+AY216660.2 GbkToGff gene 42013 42254 . + . locus_tag=CPT-T1_059;ID=CPT-T1_059.gene;
+AY216660.2 GbkToGff mRNA 42013 42254 . + . locus_tag=CPT-T1_059;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_059.mRNA;Parent=CPT-T1_059.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42013 42016 . + . locus_tag=CPT-T1_059;regulatory_class=ribosome_binding_site;ID=CPT-T1_059.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_059.mRNA;
+AY216660.2 GbkToGff CDS 42024 42254 . + 0 locus_tag=CPT-T1_059;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSIVKNQQAIDSTNNNRFAIFITRDNKRFAVKAVPGGYKTYMEDNGKWVRCDNLANFLVWNADLQGFDDISTLIEE;note=Orf no. 75 see PMID: 14972552;ID=CPT-T1_059.CDS.1;Parent=CPT-T1_059.mRNA;
+AY216660.2 GbkToGff gene 42247 42487 . + . locus_tag=CPT-T1_060;ID=CPT-T1_060.gene;
+AY216660.2 GbkToGff mRNA 42247 42487 . + . locus_tag=CPT-T1_060;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_060.mRNA;Parent=CPT-T1_060.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42247 42251 . + . locus_tag=CPT-T1_060;regulatory_class=ribosome_binding_site;ID=CPT-T1_060.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_060.mRNA;
+AY216660.2 GbkToGff CDS 42260 42487 . + 0 locus_tag=CPT-T1_060;codon_start=1;transl_table=11;product=hypothetical protein;translation=MPRYSNLTQLTRVNGHMIPAKSTHYAMGAKHGLYFKWRGQWNFTVIRNFYIRVTGDDPQSVVENSIGDNKIEVLK;note=Orf no. 74 see PMID: 14972552;ID=CPT-T1_060.CDS.1;Parent=CPT-T1_060.mRNA;
+AY216660.2 GbkToGff gene 42473 42594 . + . locus_tag=CPT-T1_061;ID=CPT-T1_061.gene;
+AY216660.2 GbkToGff mRNA 42473 42594 . + . locus_tag=CPT-T1_061;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_061.mRNA;Parent=CPT-T1_061.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42473 42477 . + . locus_tag=CPT-T1_061;regulatory_class=ribosome_binding_site;ID=CPT-T1_061.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_061.mRNA;
+AY216660.2 GbkToGff CDS 42484 42594 . + 0 locus_tag=CPT-T1_061;codon_start=1;transl_table=11;product=hypothetical protein;translation=MNFNIIAFWSAVWFFCVGHVVVGIVIMLLLCAGAFE;note=single transmembrane domain predicted N-in and C-out%3BOrf no. 73 see PMID: 14972552;ID=CPT-T1_061.CDS.1;Parent=CPT-T1_061.mRNA;
+AY216660.2 GbkToGff gene 42580 42764 . + . locus_tag=CPT-T1_062;ID=CPT-T1_062.gene;
+AY216660.2 GbkToGff mRNA 42580 42764 . + . locus_tag=CPT-T1_062;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_062.mRNA;Parent=CPT-T1_062.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42580 42583 . + . locus_tag=CPT-T1_062;regulatory_class=ribosome_binding_site;ID=CPT-T1_062.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_062.mRNA;
+AY216660.2 GbkToGff CDS 42591 42764 . + 0 locus_tag=CPT-T1_062;codon_start=1;transl_table=11;product=hypothetical protein;translation=MMRILICMMAAVAMAILVVSGCGEARDSCHETGSQVTTFVMVGNVLLPITSNEITCE;note=Orf no. 72 see PMID: 14972552;ID=CPT-T1_062.CDS.1;Parent=CPT-T1_062.mRNA;
+AY216660.2 GbkToGff gene 42824 43324 . + . locus_tag=CPT-T1_063;ID=CPT-T1_063.gene;
+AY216660.2 GbkToGff mRNA 42824 43324 . + . locus_tag=CPT-T1_063;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_063.mRNA;Parent=CPT-T1_063.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42824 42827 . + . locus_tag=CPT-T1_063;regulatory_class=ribosome_binding_site;ID=CPT-T1_063.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_063.mRNA;
+AY216660.2 GbkToGff CDS 42836 43324 . + 0 locus_tag=CPT-T1_063;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKIKLLSNGGYKGFTRDLEADPIVVDAVKCDSSTGGYRVKVDDLVKAGVYDLDYGLSVSPVFGPADFNEKDGTMFFFDWEVKANIKPRKVRLLSNGGYPMRPGYENRTFPVIVDFIGTTDNLVYVSHEQLKAVGFVGGMNKEALCFFHRCPEPIGIECELVY;note=Orf no. 71 see PMID: 14972552;ID=CPT-T1_063.CDS.1;Parent=CPT-T1_063.mRNA;
+AY216660.2 GbkToGff gene 43385 43881 . + . locus_tag=CPT-T1_064;ID=CPT-T1_064.gene;
+AY216660.2 GbkToGff mRNA 43385 43881 . + . locus_tag=CPT-T1_064;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_064.mRNA;Parent=CPT-T1_064.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 43385 43388 . + . locus_tag=CPT-T1_064;regulatory_class=ribosome_binding_site;ID=CPT-T1_064.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_064.mRNA;
+AY216660.2 GbkToGff CDS 43396 43881 . + 0 locus_tag=CPT-T1_064;codon_start=1;transl_table=11;product=hypothetical protein;translation=MLKLKDIQFPVVFNTISCGKITCHSKDRATDSSFNECHPSIVGNLIELHNNHNPDNIPSLPYYVEGVGPGWKVGRSIFHAAKPEIKPALQCTQIENMPLSATLKGVQLDSESWIEITATPKTIEVHDDVVILLLHYGSFKHKTVSGEISIKRGTLVRYEVK;note=Orf no. 70 see PMID: 14972552;ID=CPT-T1_064.CDS.1;Parent=CPT-T1_064.mRNA;
+AY216660.2 GbkToGff gene 43870 44030 . + . locus_tag=CPT-T1_065;ID=CPT-T1_065.gene;
+AY216660.2 GbkToGff mRNA 43870 44030 . + . locus_tag=CPT-T1_065;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_065.mRNA;Parent=CPT-T1_065.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 43870 43874 . + . locus_tag=CPT-T1_065;regulatory_class=ribosome_binding_site;ID=CPT-T1_065.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_065.mRNA;
+AY216660.2 GbkToGff CDS 43881 44030 . + 0 locus_tag=CPT-T1_065;codon_start=1;transl_table=11;product=hypothetical protein;translation=MTAWVLIILMSKGPDHVYMESQQSCNKAREVIAENKPFGYEVKTMCVKR;note=Orf no. 69 see PMID: 14972552;ID=CPT-T1_065.CDS.1;Parent=CPT-T1_065.mRNA;
+AY216660.2 GbkToGff gene 44098 44485 . + . locus_tag=CPT-T1_066;ID=CPT-T1_066.gene;
+AY216660.2 GbkToGff mRNA 44098 44485 . + . locus_tag=CPT-T1_066;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_066.mRNA;Parent=CPT-T1_066.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 44098 44103 . + . locus_tag=CPT-T1_066;regulatory_class=ribosome_binding_site;ID=CPT-T1_066.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_066.mRNA;
+AY216660.2 GbkToGff CDS 44111 44485 . + 0 locus_tag=CPT-T1_066;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKFECISDNTKKFTVGKIYDVPTEHAEQTVALTDDTGRNRIATVTHNGEGLRWNSGGTKFATFGKKRKRTFRVNGNVAANKIHNVKPSEVDRKPALKFKEKVDLFNLAASLVLLVAAISLLSIM;note=single transmembrane domain predicted N-in and C-out%3B Orf no. 68 see PMID: 14972552;ID=CPT-T1_066.CDS.1;Parent=CPT-T1_066.mRNA;
+AY216660.2 GbkToGff regulatory 44493 44526 . + . regulatory_class=terminator%2C rho-independent;ID=AY216660.2.regulatory.16;
+AY216660.2 GbkToGff gene 44527 44649 . + . locus_tag=CPT-T1_067;ID=CPT-T1_067.gene;
+AY216660.2 GbkToGff mRNA 44527 44649 . + . locus_tag=CPT-T1_067;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_067.mRNA;Parent=CPT-T1_067.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 44527 44530 . + . locus_tag=CPT-T1_067;regulatory_class=ribosome_binding_site;ID=CPT-T1_067.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_067.mRNA;
+AY216660.2 GbkToGff CDS 44539 44649 . + 0 locus_tag=CPT-T1_067;codon_start=1;transl_table=11;product=hypothetical protein;translation=MPDFSNWNNEPPSFQELLFCLLVLTLSLKGVLWLLS;note=single transmembrane domain predicted N-out and C-in%3B Orf no. 67 see PMID: 14972552;ID=CPT-T1_067.CDS.1;Parent=CPT-T1_067.mRNA;
+AY216660.2 GbkToGff gene 44625 44852 . + . locus_tag=CPT-T1_068;ID=CPT-T1_068.gene;
+AY216660.2 GbkToGff mRNA 44625 44852 . + . locus_tag=CPT-T1_068;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_068.mRNA;Parent=CPT-T1_068.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 44625 44628 . + . locus_tag=CPT-T1_068;regulatory_class=ribosome_binding_site;ID=CPT-T1_068.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_068.mRNA;
+AY216660.2 GbkToGff CDS 44634 44852 . + 0 locus_tag=CPT-T1_068;codon_start=1;transl_table=11;product=hypothetical protein;translation=MATIMTVEDAARDAVEGMRPNTSRIAHYYKSEVSAVQLVHEILRLPQVDSARVVTCLKNYFCITIKTNSTNC;note=Orf no. 66 see PMID: 14972552;ID=CPT-T1_068.CDS.1;Parent=CPT-T1_068.mRNA;
+AY216660.2 GbkToGff gene 44898 45356 . + . locus_tag=CPT-T1_069;ID=CPT-T1_069.gene;
+AY216660.2 GbkToGff mRNA 44898 45356 . + . locus_tag=CPT-T1_069;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_069.mRNA;Parent=CPT-T1_069.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 44898 44901 . + . locus_tag=CPT-T1_069;regulatory_class=ribosome_binding_site;ID=CPT-T1_069.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_069.mRNA;
+AY216660.2 GbkToGff CDS 44910 45356 . + 0 locus_tag=CPT-T1_069;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKHLICIEAPNDQYTLHGLGVFKGHYITAGTYDARRGDGDLMITSKEVNPYIMQNLGNNEYMAYGCNAVYKHVKIRKRVVRAFKKIAIKYWKMSKKDAGRWARNVADSYFYRNGESCYFLIDELMENYGGDFSQGSFDDWANYEISCW;note=Orf no. 65 see PMID: 14972552;ID=CPT-T1_069.CDS.1;Parent=CPT-T1_069.mRNA;
+AY216660.2 GbkToGff gene 45428 45970 . + . locus_tag=CPT-T1_070;ID=CPT-T1_070.gene;
+AY216660.2 GbkToGff mRNA 45428 45970 . + . locus_tag=CPT-T1_070;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_070.mRNA;Parent=CPT-T1_070.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 45428 45431 . + . locus_tag=CPT-T1_070;regulatory_class=ribosome_binding_site;ID=CPT-T1_070.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_070.mRNA;
+AY216660.2 GbkToGff CDS 45440 45970 . + 0 locus_tag=CPT-T1_070;note=HHPred 99 probability structural alignment to phage T4 polynucleotide kinase Protein Data bank entry 5UJ0 over most of protein%3B Orf no. 64 see PMID: 14972552%3B InterPro domain IPR023214;codon_start=1;transl_table=11;product=polynucleotide kinase PnkP;translation=MDKITIWGQTINLFLGTRRVAIFDFDGTLSDGSGRLHLLPTKDLHLTESWSEFNRAAIFDNPIQSTIDVMNSMFAAGYHVIILTGRSDEVRYASELWLKHHGARYDYLVMRPHTDNRKDTVMKEEAVRAIGIDNILAAWDDSVNIIKKFRDLGITTYQVCEYACDSREDLNSHGVD;ID=CPT-T1_070.CDS.1;Parent=CPT-T1_070.mRNA;
+AY216660.2 GbkToGff gene 45941 46451 . + . locus_tag=CPT-T1_071;ID=CPT-T1_071.gene;
+AY216660.2 GbkToGff mRNA 45941 46451 . + . locus_tag=CPT-T1_071;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_071.mRNA;Parent=CPT-T1_071.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 45941 45944 . + . locus_tag=CPT-T1_071;regulatory_class=ribosome_binding_site;ID=CPT-T1_071.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_071.mRNA;
+AY216660.2 GbkToGff CDS 45957 46451 . + 0 locus_tag=CPT-T1_071;note=InterPro domains IPR001471%2C IPR003615%2C and IPR016177%3B similar to phage T7 protein 3.8%3B alternative in-frame start site proposed as T1 p63a%3B Orf no. 63 see PMID: 14972552;codon_start=1;transl_table=11;product=HNH endonuclease;translation=MVSIDNKSMVRELFTYSDGVLYWKAKSSKYSRAKIGGAAGSKDKDGYIIIRVRNETRGAHRLVWIYHNGKIPDGMEVDHMDGDITNNRIENLRLVTRTINNRNQKKRSDNTTGVSGVTFMKDRGKYRAQVRNKRLGQFDTIEEAAKAVKDERDRLGLFTKRHGV;ID=CPT-T1_071.CDS.1;Parent=CPT-T1_071.mRNA;
+AY216660.2 GbkToGff gene 46444 47026 . + . locus_tag=CPT-T1_072;ID=CPT-T1_072.gene;
+AY216660.2 GbkToGff mRNA 46444 47026 . + . locus_tag=CPT-T1_072;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_072.mRNA;Parent=CPT-T1_072.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 46444 46447 . + . locus_tag=CPT-T1_072;regulatory_class=ribosome_binding_site;ID=CPT-T1_072.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_072.mRNA;
+AY216660.2 GbkToGff CDS 46454 47026 . + 0 locus_tag=CPT-T1_072;note=InterPro domain IPR027417%3B HHPred predicted structural similarity at 99%25 probability to phage T4 DNK Protein Data Bank Entry 1DEK%3B Orf no. 62 see PMID: 14972552;codon_start=1;transl_table=11;product=deoxynucleotide kinase;translation=MKTAIILNGAPGAGKDTIGCILADTYDHVALRSFKAPMFEIARAILGETNFEYFMFLYEDRRYKEEPASILNGKSPRQFMIWISEEVIKPQFGNRFFGMRAESKVKESHSLSVFTDGGFKDEILQMIEGDIQVKLCRIHRNGCNFDNDSRDYIYLDDMIGVNGYQECDFFSVEGHPEITAQHIAATFINK;ID=CPT-T1_072.CDS.1;Parent=CPT-T1_072.mRNA;
+AY216660.2 GbkToGff gene 47085 47308 . + . locus_tag=CPT-T1_073;ID=CPT-T1_073.gene;
+AY216660.2 GbkToGff mRNA 47085 47308 . + . locus_tag=CPT-T1_073;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_073.mRNA;Parent=CPT-T1_073.gene;
+AY216660.2 GbkToGff CDS 47099 47308 . + 0 locus_tag=CPT-T1_073;codon_start=1;transl_table=11;product=hypothetical protein;translation=MMVSTDKFFTCTKTSEVFELVHTDNGDFMHDGCDAFIEVKESDYDDGVYYNPAVNTQFFTPIEEEGEEA;note=Orf no. 61 see PMID: 14972552;ID=CPT-T1_073.CDS.1;Parent=CPT-T1_073.mRNA;
+AY216660.2 GbkToGff gene 47293 47649 . + . locus_tag=CPT-T1_074;ID=CPT-T1_074.gene;
+AY216660.2 GbkToGff mRNA 47293 47649 . + . locus_tag=CPT-T1_074;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_074.mRNA;Parent=CPT-T1_074.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 47293 47297 . + . locus_tag=CPT-T1_074;regulatory_class=ribosome_binding_site;ID=CPT-T1_074.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_074.mRNA;
+AY216660.2 GbkToGff CDS 47305 47649 . + 0 locus_tag=CPT-T1_074;codon_start=1;transl_table=11;product=hypothetical protein;translation=MITINLSDKQAREILDTIGEQLHVKGDTAEILNQIERQLTPVSTNQAEFAAWKSERILPNIIKAWKRKHKKEINVEDLFTDELSPSNVAQYQLRYMESVCNQVLGVSFSFKGDK;note=Orf no. 60 see PMID: 14972552;ID=CPT-T1_074.CDS.1;Parent=CPT-T1_074.mRNA;
+AY216660.2 GbkToGff gene 47637 47879 . + . locus_tag=CPT-T1_075;ID=CPT-T1_075.gene;
+AY216660.2 GbkToGff mRNA 47637 47879 . + . locus_tag=CPT-T1_075;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_075.mRNA;Parent=CPT-T1_075.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 47637 47640 . + . locus_tag=CPT-T1_075;regulatory_class=ribosome_binding_site;ID=CPT-T1_075.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_075.mRNA;
+AY216660.2 GbkToGff CDS 47649 47879 . + 0 locus_tag=CPT-T1_075;codon_start=1;transl_table=11;product=hypothetical protein;translation=MFGLSEAEWNVVKRAAKELNKFVSGMKKEDRKNDKIMIDVISTHHKKVELLIDRYKFVWTAGYIAGRVGNKEGDYE;note=Orf no. 59 see PMID: 14972552;ID=CPT-T1_075.CDS.1;Parent=CPT-T1_075.mRNA;
+AY216660.2 GbkToGff gene 47864 48082 . + . locus_tag=CPT-T1_076;ID=CPT-T1_076.gene;
+AY216660.2 GbkToGff mRNA 47864 48082 . + . locus_tag=CPT-T1_076;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_076.mRNA;Parent=CPT-T1_076.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 47864 47868 . + . locus_tag=CPT-T1_076;regulatory_class=ribosome_binding_site;ID=CPT-T1_076.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_076.mRNA;
+AY216660.2 GbkToGff CDS 47879 48082 . + 0 locus_tag=CPT-T1_076;codon_start=1;transl_table=11;product=hypothetical protein;translation=MANLPKKGDQVRCVTSRNGNALSAGCLYDVEKVSKSKRLVFVYGDDGNLHEIDYPQDVTNGQFEIND;note=Orf no. 58 see PMID: 14972552;ID=CPT-T1_076.CDS.1;Parent=CPT-T1_076.mRNA;
+AY216660.2 GbkToGff gene 48235 48408 . + . locus_tag=CPT-T1_077;ID=CPT-T1_077.gene;
+AY216660.2 GbkToGff mRNA 48235 48408 . + . locus_tag=CPT-T1_077;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_077.mRNA;Parent=CPT-T1_077.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 48235 48238 . + . locus_tag=CPT-T1_077;regulatory_class=ribosome_binding_site;ID=CPT-T1_077.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_077.mRNA;
+AY216660.2 GbkToGff CDS 48247 48408 . + 0 locus_tag=CPT-T1_077;codon_start=1;transl_table=11;product=hypothetical protein;translation=MQKTKDESVKIEIKVTRNGETTRYKKRLNPGEAVIGRIAGVMIKAQEDEAIQS;note=Orf no. 57 see PMID: 14972552;ID=CPT-T1_077.CDS.1;Parent=CPT-T1_077.mRNA;
+AY216660.2 GbkToGff gene 48377 48574 . + . locus_tag=CPT-T1_078;ID=CPT-T1_078.gene;
+AY216660.2 GbkToGff mRNA 48377 48574 . + . locus_tag=CPT-T1_078;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_078.mRNA;Parent=CPT-T1_078.gene;
+AY216660.2 GbkToGff CDS 48389 48574 . + 0 locus_tag=CPT-T1_078;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKRFKVKLIIRKMGMFCQSCKQSFEAELSATSQDEAITKAKKLSGANLDTHKINIELIKEI;note=alternative start codon to Orf no. 56 see PMID: 14972552;ID=CPT-T1_078.CDS.1;Parent=CPT-T1_078.mRNA;
+AY216660.2 GbkToGff gene 48564 48803 . + . locus_tag=CPT-T1_079;ID=CPT-T1_079.gene;
+AY216660.2 GbkToGff mRNA 48564 48803 . + . locus_tag=CPT-T1_079;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_079.mRNA;Parent=CPT-T1_079.gene;
+AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 48564 48568 . + . locus_tag=CPT-T1_079;regulatory_class=ribosome_binding_site;ID=CPT-T1_079.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_079.mRNA;
+AY216660.2 GbkToGff CDS 48576 48803 . + 0 locus_tag=CPT-T1_079;codon_start=1;transl_table=11;product=hypothetical protein;translation=MTIFLLIIAGVIIFGAGLFAGFALVAAAIAMDAKDKTGVWLTYSPKKDQWEMTGDLAHCYSKAKTHPKGIKRRLS;note=single transmembrane domain predicted N-out and C-in%3B Orf no. 55 see PMID: 14972552;ID=CPT-T1_079.CDS.1;Parent=CPT-T1_079.mRNA;
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/test-data/PhageQC_Out.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/test-data/PhageQC_Out.gff3 Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,185 @@
+##gff-version 3
+AY216660.2 feature gene 25 34 . + . note=Missing RBS;ID=CPT-T1_001.gene;Name=
+AY216660.2 feature gene 574 583 . + . note=Missing RBS;ID=CPT-T1_002.gene;Name=
+AY216660.2 feature gene 2215 2224 . + . note=Missing RBS;ID=CPT-T1_003.gene;Name=
+AY216660.2 feature gene 3481 3490 . + . note=Missing RBS;ID=CPT-T1_004.gene;Name=
+AY216660.2 feature gene 4249 4258 . + . note=Missing RBS;ID=CPT-T1_005.gene;Name=
+AY216660.2 feature gene 5374 5383 . + . note=Missing RBS;ID=CPT-T1_006.gene;Name=
+AY216660.2 feature gene 5911 5920 . + . note=Missing RBS;ID=CPT-T1_007.gene;Name=
+AY216660.2 feature gene 6769 6778 . + . note=Missing RBS;ID=CPT-T1_008.gene;Name=
+AY216660.2 feature gene 7779 7788 . + . note=Missing RBS;ID=CPT-T1_009.gene;Name=
+AY216660.2 feature gene 8112 8121 . + . note=Missing RBS;ID=CPT-T1_010.gene;Name=
+AY216660.2 feature gene 8519 8528 . + . note=Missing RBS;ID=CPT-T1_011.gene;Name=
+AY216660.2 feature gene 8882 8891 . + . note=Missing RBS;ID=CPT-T1_012.gene;Name=
+AY216660.2 feature gene 9317 9326 . + . note=Missing RBS;ID=CPT-T1_013.gene;Name=
+AY216660.2 feature gene 9718 9727 . + . note=Missing RBS;ID=CPT-T1_014.gene;Name=
+AY216660.2 feature gene 10501 10510 . + . note=Missing RBS;ID=CPT-T1_015.gene;Name=
+AY216660.2 feature gene 10501 10510 . + . note=Missing RBS;ID=CPT-T1_016.gene;Name=
+AY216660.2 feature gene 11177 11186 . + . note=Missing RBS;ID=CPT-T1_017.gene;Name=
+AY216660.2 feature gene 14052 14061 . + . note=Missing RBS;ID=CPT-T1_018.gene;Name=
+AY216660.2 feature gene 14485 14494 . + . note=Missing RBS;ID=CPT-T1_019.gene;Name=
+AY216660.2 feature gene 15264 15273 . + . note=Missing RBS;ID=CPT-T1_020.gene;Name=
+AY216660.2 feature gene 15994 16003 . + . note=Missing RBS;ID=CPT-T1_021.gene;Name=
+AY216660.2 feature gene 16673 16682 . + . note=Missing RBS;ID=CPT-T1_022.gene;Name=
+AY216660.2 feature gene 20236 20245 . + . note=Missing RBS;ID=CPT-T1_023.gene;Name=
+AY216660.2 feature gene 20541 20550 . + . note=Missing RBS;ID=CPT-T1_024.gene;Name=
+AY216660.2 feature gene 21524 21533 . - . note=Missing RBS;ID=CPT-T1_025.gene;Name=
+AY216660.2 feature gene 21740 21749 . + . note=Missing RBS;ID=CPT-T1_026.gene;Name=
+AY216660.2 feature gene 22005 22014 . + . note=Missing RBS;ID=CPT-T1_027.gene;Name=
+AY216660.2 feature gene 23114 23123 . + . note=Missing RBS;ID=CPT-T1_028.gene;Name=
+AY216660.2 feature gene 23839 23848 . + . note=Missing RBS;ID=CPT-T1_029.gene;Name=
+AY216660.2 feature gene 26556 26565 . - . note=Missing RBS;ID=CPT-T1_030.gene;Name=
+AY216660.2 feature gene 26606 26615 . - . note=Missing RBS;ID=CPT-T1_031.gene;Name=
+AY216660.2 feature gene 27591 27600 . - . note=Missing RBS;ID=CPT-T1_032.gene;Name=
+AY216660.2 feature gene 28104 28113 . - . note=Missing RBS;ID=CPT-T1_033.gene;Name=
+AY216660.2 feature gene 28153 28162 . + . note=Missing RBS;ID=CPT-T1_034.gene;Name=
+AY216660.2 feature gene 30166 30175 . + . note=Missing RBS;ID=CPT-T1_035.gene;Name=
+AY216660.2 feature gene 30654 30663 . + . note=Missing RBS;ID=CPT-T1_036.gene;Name=
+AY216660.2 feature gene 31362 31371 . + . note=Missing RBS;ID=CPT-T1_037.gene;Name=
+AY216660.2 feature gene 31680 31689 . + . note=Missing RBS;ID=CPT-T1_038.gene;Name=
+AY216660.2 feature gene 31866 31875 . + . note=Missing RBS;ID=CPT-T1_039.gene;Name=
+AY216660.2 feature gene 32084 32093 . + . note=Missing RBS;ID=CPT-T1_040.gene;Name=
+AY216660.2 feature gene 32452 32461 . + . note=Missing RBS;ID=CPT-T1_041.gene;Name=
+AY216660.2 feature gene 33658 33667 . + . note=Missing RBS;ID=CPT-T1_042.gene;Name=
+AY216660.2 feature gene 34211 34220 . + . note=Missing RBS;ID=CPT-T1_043.gene;Name=
+AY216660.2 feature gene 34510 34519 . + . note=Missing RBS;ID=CPT-T1_044.gene;Name=
+AY216660.2 feature gene 34724 34733 . + . note=Missing RBS;ID=CPT-T1_045.gene;Name=
+AY216660.2 feature gene 35212 35221 . + . note=Missing RBS;ID=CPT-T1_046.gene;Name=
+AY216660.2 feature gene 36185 36194 . - . note=Missing RBS;ID=CPT-T1_047.gene;Name=
+AY216660.2 feature gene 37761 37770 . - . note=Missing RBS;ID=CPT-T1_048.gene;Name=
+AY216660.2 feature gene 38300 38309 . - . note=Missing RBS;ID=CPT-T1_049.gene;Name=
+AY216660.2 feature gene 38712 38721 . - . note=Missing RBS;ID=CPT-T1_050.gene;Name=
+AY216660.2 feature gene 39000 39009 . - . note=Missing RBS;ID=CPT-T1_051.gene;Name=
+AY216660.2 feature gene 39228 39237 . - . note=Missing RBS;ID=CPT-T1_052.gene;Name=
+AY216660.2 feature gene 39450 39459 . - . note=Missing RBS;ID=CPT-T1_053.gene;Name=
+AY216660.2 feature gene 39764 39773 . - . note=Missing RBS;ID=CPT-T1_054.gene;Name=
+AY216660.2 feature gene 39984 39993 . - . note=Missing RBS;ID=CPT-T1_055.gene;Name=
+AY216660.2 feature gene 40348 40357 . - . note=Missing RBS;ID=CPT-T1_056.gene;Name=
+AY216660.2 feature gene 41005 41014 . + . note=Missing RBS;ID=CPT-T1_057.gene;Name=
+AY216660.2 feature gene 41591 41600 . + . note=Missing RBS;ID=CPT-T1_058.gene;Name=
+AY216660.2 feature gene 41998 42007 . + . note=Missing RBS;ID=CPT-T1_059.gene;Name=
+AY216660.2 feature gene 42232 42241 . + . note=Missing RBS;ID=CPT-T1_060.gene;Name=
+AY216660.2 feature gene 42458 42467 . + . note=Missing RBS;ID=CPT-T1_061.gene;Name=
+AY216660.2 feature gene 42565 42574 . + . note=Missing RBS;ID=CPT-T1_062.gene;Name=
+AY216660.2 feature gene 42809 42818 . + . note=Missing RBS;ID=CPT-T1_063.gene;Name=
+AY216660.2 feature gene 43370 43379 . + . note=Missing RBS;ID=CPT-T1_064.gene;Name=
+AY216660.2 feature gene 43855 43864 . + . note=Missing RBS;ID=CPT-T1_065.gene;Name=
+AY216660.2 feature gene 44083 44092 . + . note=Missing RBS;ID=CPT-T1_066.gene;Name=
+AY216660.2 feature gene 44512 44521 . + . note=Missing RBS;ID=CPT-T1_067.gene;Name=
+AY216660.2 feature gene 44610 44619 . + . note=Missing RBS;ID=CPT-T1_068.gene;Name=
+AY216660.2 feature gene 44883 44892 . + . note=Missing RBS;ID=CPT-T1_069.gene;Name=
+AY216660.2 feature gene 45413 45422 . + . note=Missing RBS;ID=CPT-T1_070.gene;Name=
+AY216660.2 feature gene 45926 45935 . + . note=Missing RBS;ID=CPT-T1_071.gene;Name=
+AY216660.2 feature gene 46429 46438 . + . note=Missing RBS;ID=CPT-T1_072.gene;Name=
+AY216660.2 feature gene 47070 47079 . + . note=Missing RBS;ID=CPT-T1_073.gene;Name=
+AY216660.2 feature gene 47278 47287 . + . note=Missing RBS;ID=CPT-T1_074.gene;Name=
+AY216660.2 feature gene 47622 47631 . + . note=Missing RBS;ID=CPT-T1_075.gene;Name=
+AY216660.2 feature gene 47849 47858 . + . note=Missing RBS;ID=CPT-T1_076.gene;Name=
+AY216660.2 feature gene 48220 48229 . + . note=Missing RBS;ID=CPT-T1_077.gene;Name=
+AY216660.2 feature gene 48362 48371 . + . note=Missing RBS;ID=CPT-T1_078.gene;Name=
+AY216660.2 feature gene 48549 48558 . + . note=Missing RBS;ID=CPT-T1_079.gene;Name=
+AY216660.2 feature gene 2185 2238 . . . note=Excessive gap%2C 54 bases;
+AY216660.2 feature gene 5877 5937 . . . note=Excessive gap%2C 61 bases;
+AY216660.2 feature gene 6706 6795 . . . note=Excessive gap%2C 90 bases;
+AY216660.2 feature gene 10415 10527 . . . note=Excessive gap%2C 113 bases;
+AY216660.2 feature gene 14433 14511 . . . note=Excessive gap%2C 79 bases;
+AY216660.2 feature gene 14424 14540 . + . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 14424 14433 . + . Parent=;
+AY216660.2 feature CDS 14439 14540 . + 0 Parent=;
+AY216660.2 feature gene 14447 14451 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 14452 14461 . - . Parent=;
+AY216660.2 feature CDS 14447 14524 . - 0 Parent=;
+AY216660.2 feature gene 16622 16698 . . . note=Excessive gap%2C 77 bases;
+AY216660.2 feature gene 21507 21765 . . . note=Excessive gap%2C 259 bases;
+AY216660.2 feature gene 21907 22033 . . . note=Excessive gap%2C 127 bases;
+AY216660.2 feature gene 24290 24350 . . . note=Excessive gap%2C 61 bases;
+AY216660.2 feature gene 24286 24390 . + . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 24286 24295 . + . Parent=;
+AY216660.2 feature CDS 24301 24390 . + 0 Parent=;
+AY216660.2 feature gene 27574 27634 . . . note=Excessive gap%2C 61 bases;
+AY216660.2 feature gene 28088 28180 . . . note=Excessive gap%2C 93 bases;
+AY216660.2 feature gene 30613 30679 . . . note=Excessive gap%2C 67 bases;
+AY216660.2 feature gene 30629 30718 . + . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 30629 30638 . + . Parent=;
+AY216660.2 feature CDS 30644 30718 . + 0 Parent=;
+AY216660.2 feature gene 31642 31707 . . . note=Excessive gap%2C 66 bases;
+AY216660.2 feature gene 32400 32477 . . . note=Excessive gap%2C 78 bases;
+AY216660.2 feature gene 32385 32492 . + . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 32385 32394 . + . Parent=;
+AY216660.2 feature CDS 32400 32492 . + 0 Parent=;
+AY216660.2 feature gene 33612 33683 . . . note=Excessive gap%2C 72 bases;
+AY216660.2 feature gene 34167 34238 . . . note=Excessive gap%2C 72 bases;
+AY216660.2 feature gene 34416 34533 . . . note=Excessive gap%2C 118 bases;
+AY216660.2 feature gene 35639 35760 . . . note=Excessive gap%2C 122 bases;
+AY216660.2 feature gene 35674 35678 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 35679 35688 . - . Parent=;
+AY216660.2 feature CDS 35674 35754 . - 0 Parent=;
+AY216660.2 feature gene 35674 35678 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 35679 35688 . - . Parent=;
+AY216660.2 feature CDS 35674 35766 . - 0 Parent=;
+AY216660.2 feature gene 38694 38773 . . . note=Excessive gap%2C 80 bases;
+AY216660.2 feature gene 38719 38814 . + . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 38719 38728 . + . Parent=;
+AY216660.2 feature CDS 38734 38814 . + 0 Parent=;
+AY216660.2 feature gene 38725 38814 . + . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 38725 38734 . + . Parent=;
+AY216660.2 feature CDS 38740 38814 . + 0 Parent=;
+AY216660.2 feature gene 39212 39288 . . . note=Excessive gap%2C 77 bases;
+AY216660.2 feature gene 40331 41030 . . . note=Excessive gap%2C 700 bases;
+AY216660.2 feature gene 40416 40420 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=;
+AY216660.2 feature CDS 40416 40496 . - 0 Parent=;
+AY216660.2 feature gene 40416 40420 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=;
+AY216660.2 feature CDS 40416 40511 . - 0 Parent=;
+AY216660.2 feature gene 40416 40420 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=;
+AY216660.2 feature CDS 40416 40541 . - 0 Parent=;
+AY216660.2 feature gene 40416 40420 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=;
+AY216660.2 feature CDS 40416 40559 . - 0 Parent=;
+AY216660.2 feature gene 40416 40420 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=;
+AY216660.2 feature CDS 40416 40595 . - 0 Parent=;
+AY216660.2 feature gene 40416 40420 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=;
+AY216660.2 feature CDS 40416 40637 . - 0 Parent=;
+AY216660.2 feature gene 40817 40921 . + . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 40817 40826 . + . Parent=;
+AY216660.2 feature CDS 40832 40921 . + 0 Parent=;
+AY216660.2 feature gene 41946 42023 . . . note=Excessive gap%2C 78 bases;
+AY216660.2 feature gene 42765 42835 . . . note=Excessive gap%2C 71 bases;
+AY216660.2 feature gene 43325 43395 . . . note=Excessive gap%2C 71 bases;
+AY216660.2 feature gene 44031 44110 . . . note=Excessive gap%2C 80 bases;
+AY216660.2 feature gene 44486 44538 . . . note=Excessive gap%2C 53 bases;
+AY216660.2 feature gene 44853 44909 . . . note=Excessive gap%2C 57 bases;
+AY216660.2 feature gene 45357 45439 . . . note=Excessive gap%2C 83 bases;
+AY216660.2 feature gene 45368 45372 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=;
+AY216660.2 feature CDS 45368 45445 . - 0 Parent=;
+AY216660.2 feature gene 45368 45372 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=;
+AY216660.2 feature CDS 45368 45463 . - 0 Parent=;
+AY216660.2 feature gene 45368 45372 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=;
+AY216660.2 feature CDS 45368 45466 . - 0 Parent=;
+AY216660.2 feature gene 45368 45372 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=;
+AY216660.2 feature CDS 45368 45475 . - 0 Parent=;
+AY216660.2 feature gene 45368 45372 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=;
+AY216660.2 feature CDS 45368 45478 . - 0 Parent=;
+AY216660.2 feature gene 45368 45372 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=;
+AY216660.2 feature CDS 45368 45481 . - 0 Parent=;
+AY216660.2 feature gene 47027 47098 . . . note=Excessive gap%2C 72 bases;
+AY216660.2 feature gene 48083 48246 . . . note=Excessive gap%2C 164 bases;
+AY216660.2 feature gene 48086 48090 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 48091 48100 . - . Parent=;
+AY216660.2 feature CDS 48086 48169 . - 0 Parent=;
+AY216660.2 feature gene 48160 48164 . - . note=Possible gene;
+AY216660.2 feature Shine_Dalgarno_sequence 48165 48174 . - . Parent=;
+AY216660.2 feature CDS 48160 48255 . - 0 Parent=;
+AY216660.2 feature gene 10528 10844 . . . note=Excessive Overlap;ID=CPT-T1_015.gene;Name=
+AY216660.2 feature gene 26447 26536 . . . note=Excessive Overlap;ID=CPT-T1_030.gene;Name=
+AY216660.2 feature gene 32112 32166 . . . note=Excessive Overlap;ID=CPT-T1_039.gene;Name=
diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/test-data/PhageQC_Out.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_phageqc_annotation/test-data/PhageQC_Out.html Fri Jun 17 13:00:50 2022 +0000
@@ -0,0 +1,1142 @@
+
+
+
+
+
+
+
+
+
+ Phage QC on AY216660.2 - 76
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Phage AY216660.2
+
+
+
+
+
+
+
+
Genes missing RBS 0 / 79
+
The following genes have issues with their RBS.
+
+ Since you have not annotated any possible RBSs, this does not count off from your overall score.
+
+
+
+
+
+ Feature Type |
+ ID |
+ Location |
+ Error |
+ Upstream (-15 .. -5) |
+
+
+
+
+ gene |
+ CPT-T1_001.gene |
+ 40..576 [1] |
+ No RBS annotated, None found |
+ taaatgttaa |
+
+
+ gene |
+ CPT-T1_002.gene |
+ 589..2184 [1] |
+ Unannotated but valid RBS |
+ taacttt AGG |
+
+
+ gene |
+ CPT-T1_003.gene |
+ 2230..3522 [1] |
+ No RBS annotated, None found |
+ acggcccttt |
+
+
+ gene |
+ CPT-T1_004.gene |
+ 3496..4273 [1] |
+ Unannotated but valid RBS |
+ cc GGAG ccgg |
+
+
+ gene |
+ CPT-T1_005.gene |
+ 4264..5388 [1] |
+ No RBS annotated, None found |
+ tccagacttt |
+
+
+ gene |
+ CPT-T1_006.gene |
+ 5389..5876 [1] |
+ No RBS annotated, None found |
+ agcaaagtaa |
+
+
+ gene |
+ CPT-T1_007.gene |
+ 5926..6705 [1] |
+ No RBS annotated, None found |
+ tagtcctttt |
+
+
+ gene |
+ CPT-T1_008.gene |
+ 6784..7755 [1] |
+ No RBS annotated, None found |
+ ttttttatta |
+
+
+ gene |
+ CPT-T1_009.gene |
+ 7794..8093 [1] |
+ No RBS annotated, None found |
+ gtcccttttt |
+
+
+ gene |
+ CPT-T1_010.gene |
+ 8127..8548 [1] |
+ No RBS annotated, None found |
+ agcgcctttt |
+
+
+ gene |
+ CPT-T1_011.gene |
+ 8534..8919 [1] |
+ Unannotated but valid RBS |
+ atgac AGGA c |
+
+
+ gene |
+ CPT-T1_012.gene |
+ 8897..9355 [1] |
+ No RBS annotated, None found |
+ atcgtccagt |
+
+
+ gene |
+ CPT-T1_013.gene |
+ 9332..9743 [1] |
+ No RBS annotated, None found |
+ gcaatcaagc |
+
+
+ gene |
+ CPT-T1_014.gene |
+ 9733..10414 [1] |
+ Unannotated but valid RBS |
+ tac GAG taga |
+
+
+ gene |
+ CPT-T1_015.gene |
+ 10516..10845 [1] |
+ No RBS annotated, None found |
+ tgctatcaac |
+
+
+ gene |
+ CPT-T1_016.gene |
+ 10516..11162 [1] |
+ No RBS annotated, None found |
+ tgctatcaac |
+
+
+ gene |
+ CPT-T1_017.gene |
+ 11192..14076 [1] |
+ No RBS annotated, None found |
+ gttttttcgt |
+
+
+ gene |
+ CPT-T1_018.gene |
+ 14067..14432 [1] |
+ Unannotated but valid RBS |
+ cattc AGGA a |
+
+
+ gene |
+ CPT-T1_019.gene |
+ 14500..15294 [1] |
+ Unannotated but valid RBS |
+ gttgc AGGT a |
+
+
+ gene |
+ CPT-T1_020.gene |
+ 15279..16025 [1] |
+ No RBS annotated, None found |
+ tgtcgcttca |
+
+
+ gene |
+ CPT-T1_021.gene |
+ 16009..16621 [1] |
+ No RBS annotated, None found |
+ taatcgttcg |
+
+
+ gene |
+ CPT-T1_022.gene |
+ 16688..20217 [1] |
+ No RBS annotated, None found |
+ ataaatagca |
+
+
+ gene |
+ CPT-T1_023.gene |
+ 20251..20568 [1] |
+ No RBS annotated, None found |
+ atgccctttt |
+
+
+ gene |
+ CPT-T1_024.gene |
+ 20556..21257 [1] |
+ No RBS annotated, None found |
+ cttaatagca |
+
+
+ gene |
+ CPT-T1_025.gene |
+ 21279..21518 [-1] |
+ No RBS annotated, None found |
+ aatcacacta |
+
+
+ gene |
+ CPT-T1_026.gene |
+ 21755..21906 [1] |
+ No RBS annotated, None found |
+ tgtatatcgt |
+
+
+ gene |
+ CPT-T1_027.gene |
+ 22020..23098 [1] |
+ No RBS annotated, None found |
+ attcatcgta |
+
+
+ gene |
+ CPT-T1_028.gene |
+ 23129..23820 [1] |
+ No RBS annotated, None found |
+ aaacaaataa |
+
+
+ gene |
+ CPT-T1_029.gene |
+ 23854..24289 [1] |
+ No RBS annotated, None found |
+ tcgcccataa |
+
+
+ gene |
+ CPT-T1_030.gene |
+ 24351..26550 [-1] |
+ No RBS annotated, None found |
+ cgtgctattt |
+
+
+ gene |
+ CPT-T1_031.gene |
+ 26447..26600 [-1] |
+ No RBS annotated, None found |
+ tgcccattgc |
+
+
+ gene |
+ CPT-T1_032.gene |
+ 26638..27585 [-1] |
+ No RBS annotated, None found |
+ cgcgtttttt |
+
+
+ gene |
+ CPT-T1_033.gene |
+ 27635..28098 [-1] |
+ No RBS annotated, None found |
+ gcaaaaagtg |
+
+
+ gene |
+ CPT-T1_034.gene |
+ 28168..30199 [1] |
+ No RBS annotated, None found |
+ gttacaacga |
+
+
+ gene |
+ CPT-T1_035.gene |
+ 30181..30612 [1] |
+ No RBS annotated, None found |
+ caaagacttc |
+
+
+ gene |
+ CPT-T1_036.gene |
+ 30669..31393 [1] |
+ No RBS annotated, None found |
+ atctcaccaa |
+
+
+ gene |
+ CPT-T1_037.gene |
+ 31377..31641 [1] |
+ No RBS annotated, None found |
+ aaaacaaacg |
+
+
+ gene |
+ CPT-T1_038.gene |
+ 31695..31917 [1] |
+ No RBS annotated, None found |
+ tcatagaaac |
+
+
+ gene |
+ CPT-T1_039.gene |
+ 31881..32167 [1] |
+ No RBS annotated, None found |
+ cttcatctct |
+
+
+ gene |
+ CPT-T1_040.gene |
+ 32099..32399 [1] |
+ Unannotated but valid RBS |
+ GAG ttttacc |
+
+
+ gene |
+ CPT-T1_041.gene |
+ 32467..33611 [1] |
+ No RBS annotated, None found |
+ tttcattatc |
+
+
+ gene |
+ CPT-T1_042.gene |
+ 33673..34166 [1] |
+ No RBS annotated, None found |
+ ttttatagaa |
+
+
+ gene |
+ CPT-T1_043.gene |
+ 34226..34415 [1] |
+ No RBS annotated, None found |
+ accacatcga |
+
+
+ gene |
+ CPT-T1_044.gene |
+ 34525..34749 [1] |
+ No RBS annotated, None found |
+ gttcaaaaaa |
+
+
+ gene |
+ CPT-T1_045.gene |
+ 34739..35237 [1] |
+ Unannotated but valid RBS |
+ AGG cgcttaa |
+
+
+ gene |
+ CPT-T1_046.gene |
+ 35227..35638 [1] |
+ No RBS annotated, None found |
+ acgcatattg |
+
+
+ gene |
+ CPT-T1_047.gene |
+ 35761..36179 [-1] |
+ No RBS annotated, None found |
+ caatcctcga |
+
+
+ gene |
+ CPT-T1_048.gene |
+ 36173..37755 [-1] |
+ No RBS annotated, None found |
+ tcacgccgtc |
+
+
+ gene |
+ CPT-T1_049.gene |
+ 37741..38294 [-1] |
+ No RBS annotated, None found |
+ gacggcacaa |
+
+
+ gene |
+ CPT-T1_050.gene |
+ 38277..38706 [-1] |
+ No RBS annotated, None found |
+ tcaagataac |
+
+
+ gene |
+ CPT-T1_051.gene |
+ 38774..38994 [-1] |
+ No RBS annotated, None found |
+ tttacattaa |
+
+
+ gene |
+ CPT-T1_052.gene |
+ 38987..39222 [-1] |
+ No RBS annotated, None found |
+ taccaaacaa |
+
+
+ gene |
+ CPT-T1_053.gene |
+ 39289..39444 [-1] |
+ No RBS annotated, None found |
+ ggcatatcaa |
+
+
+ gene |
+ CPT-T1_054.gene |
+ 39429..39758 [-1] |
+ No RBS annotated, None found |
+ tatcctgact |
+
+
+ gene |
+ CPT-T1_055.gene |
+ 39766..39978 [-1] |
+ Unannotated but valid RBS |
+ tagc GGA tcg |
+
+
+ gene |
+ CPT-T1_056.gene |
+ 39959..40342 [-1] |
+ No RBS annotated, None found |
+ acatcaacag |
+
+
+ gene |
+ CPT-T1_057.gene |
+ 41020..41606 [1] |
+ No RBS annotated, None found |
+ tgtatattga |
+
+
+ gene |
+ CPT-T1_058.gene |
+ 41606..41945 [1] |
+ Unannotated but valid RBS |
+ aaaatgt GGA |
+
+
+ gene |
+ CPT-T1_059.gene |
+ 42013..42254 [1] |
+ Unannotated but valid RBS |
+ ac GAG atacc |
+
+
+ gene |
+ CPT-T1_060.gene |
+ 42247..42487 [1] |
+ No RBS annotated, None found |
+ tcagcacttt |
+
+
+ gene |
+ CPT-T1_061.gene |
+ 42473..42594 [1] |
+ No RBS annotated, None found |
+ ggcgacaaca |
+
+
+ gene |
+ CPT-T1_062.gene |
+ 42580..42764 [1] |
+ No RBS annotated, None found |
+ ttgctactgt |
+
+
+ gene |
+ CPT-T1_063.gene |
+ 42824..43324 [1] |
+ No RBS annotated, None found |
+ ccatcgacaa |
+
+
+ gene |
+ CPT-T1_064.gene |
+ 43385..43881 [1] |
+ No RBS annotated, None found |
+ acatcaacca |
+
+
+ gene |
+ CPT-T1_065.gene |
+ 43870..44030 [1] |
+ No RBS annotated, None found |
+ actcttgtcc |
+
+
+ gene |
+ CPT-T1_066.gene |
+ 44098..44485 [1] |
+ No RBS annotated, None found |
+ gcacgacaac |
+
+
+ gene |
+ CPT-T1_067.gene |
+ 44527..44649 [1] |
+ No RBS annotated, None found |
+ gttccccttt |
+
+
+ gene |
+ CPT-T1_068.gene |
+ 44625..44852 [1] |
+ No RBS annotated, None found |
+ gacattatct |
+
+
+ gene |
+ CPT-T1_069.gene |
+ 44898..45356 [1] |
+ No RBS annotated, None found |
+ cctacaccaa |
+
+
+ gene |
+ CPT-T1_070.gene |
+ 45428..45970 [1] |
+ No RBS annotated, None found |
+ ttaagcaacc |
+
+
+ gene |
+ CPT-T1_071.gene |
+ 45941..46451 [1] |
+ No RBS annotated, None found |
+ gcctgtgata |
+
+
+ gene |
+ CPT-T1_072.gene |
+ 46444..47026 [1] |
+ No RBS annotated, None found |
+ tcacaaaaag |
+
+
+ gene |
+ CPT-T1_073.gene |
+ 47085..47308 [1] |
+ No RBS annotated, None found |
+ ctcatcgaca |
+
+
+ gene |
+ CPT-T1_074.gene |
+ 47293..47649 [1] |
+ No RBS annotated, None found |
+ cccgatcgaa |
+
+
+ gene |
+ CPT-T1_075.gene |
+ 47637..47879 [1] |
+ No RBS annotated, None found |
+ aagtttttca |
+
+
+ gene |
+ CPT-T1_076.gene |
+ 47864..48082 [1] |
+ Unannotated but valid RBS |
+ cgt AGGT aac |
+
+
+ gene |
+ CPT-T1_077.gene |
+ 48235..48408 [1] |
+ No RBS annotated, None found |
+ gcggcaacaa |
+
+
+ gene |
+ CPT-T1_078.gene |
+ 48377..48574 [1] |
+ No RBS annotated, None found |
+ ctggcgttat |
+
+
+ gene |
+ CPT-T1_079.gene |
+ 48564..48803 [1] |
+ No RBS annotated, None found |
+ atattgaatt |
+
+
+
+
+
+
Start Codon Usage
+
This section covers genes with unusual start codons
+
+
+
+
+ Start Codon |
+ Count |
+
+
+
+ ATG | 74 |
+ GTG | 3 |
+ TTG | 2 |
+
+
+
+
+
+
+
+
+ Feature Type |
+ ID |
+ Location |
+ Error |
+
+
+
+
+
+
+
+
Intergenic Gaps
+
Phage genomes are under pressure to maintain high coding density. Large intergenic gaps may be a sign of incorrect gene starts or missing genes.
+
+
+
+
+ Region |
+ Size |
+ Bounding Gene Transcription Direction |
+ Message |
+
+
+
+
+ 2184 .. 2238 |
+ 54 |
+ → → |
+
+ |
+
+
+ 5876 .. 5937 |
+ 61 |
+ → → |
+
+ |
+
+
+ 6705 .. 6795 |
+ 90 |
+ → → |
+
+ |
+
+
+ 10414 .. 10527 |
+ 113 |
+ → → |
+
+ |
+
+
+ 14432 .. 14511 |
+ 79 |
+ → → |
+
+ 2 ORFs found in this region
+ |
+
+
+ 16621 .. 16698 |
+ 77 |
+ → → |
+
+ |
+
+
+ 21506 .. 21765 |
+ 259 |
+ → → |
+
+ |
+
+
+ 21906 .. 22033 |
+ 127 |
+ → → |
+
+ |
+
+
+ 24289 .. 24350 |
+ 61 |
+ → → |
+
+ 1 ORFs found in this region
+ |
+
+
+ 27573 .. 27634 |
+ 61 |
+ → → |
+
+ |
+
+
+ 28087 .. 28180 |
+ 93 |
+ → → |
+
+ |
+
+
+ 30612 .. 30679 |
+ 67 |
+ → → |
+
+ 1 ORFs found in this region
+ |
+
+
+ 31641 .. 31707 |
+ 66 |
+ → → |
+
+ |
+
+
+ 32399 .. 32477 |
+ 78 |
+ → → |
+
+ 1 ORFs found in this region
+ |
+
+
+ 33611 .. 33683 |
+ 72 |
+ → → |
+
+ |
+
+
+ 34166 .. 34238 |
+ 72 |
+ → → |
+
+ |
+
+
+ 34415 .. 34533 |
+ 118 |
+ → → |
+
+ |
+
+
+ 35638 .. 35760 |
+ 122 |
+ → → |
+
+ 2 ORFs found in this region
+ |
+
+
+ 38693 .. 38773 |
+ 80 |
+ → → |
+
+ 2 ORFs found in this region
+ |
+
+
+ 39211 .. 39288 |
+ 77 |
+ → → |
+
+ |
+
+
+ 40330 .. 41030 |
+ 700 |
+ → → |
+
+ 7 ORFs found in this region
+ |
+
+
+ 41945 .. 42023 |
+ 78 |
+ → → |
+
+ |
+
+
+ 42764 .. 42835 |
+ 71 |
+ → → |
+
+ |
+
+
+ 43324 .. 43395 |
+ 71 |
+ → → |
+
+ |
+
+
+ 44030 .. 44110 |
+ 80 |
+ → → |
+
+ |
+
+
+ 44485 .. 44538 |
+ 53 |
+ → → |
+
+ |
+
+
+ 44852 .. 44909 |
+ 57 |
+ → → |
+
+ |
+
+
+ 45356 .. 45439 |
+ 83 |
+ → → |
+
+ 6 ORFs found in this region
+ |
+
+
+ 47026 .. 47098 |
+ 72 |
+ → → |
+
+ |
+
+
+ 48082 .. 48246 |
+ 164 |
+ → → |
+
+ 2 ORFs found in this region
+ |
+
+
+
+
+
+
Overlapping Genes
+
Large gene overlaps may indicate an incorrect gene start or miscalled gene.
+
+
+
+
+ Feature A |
+ Feature B |
+ Shared Region |
+ Overlap Length |
+
+
+
+
+ CPT-T1_015.gene ([10516:10845](+)) |
+ CPT-T1_016.gene ([10516:11162](+)) |
+ 10527..10844 |
+ 317 bp |
+
+
+ CPT-T1_030.gene ([24351:26550](-)) |
+ CPT-T1_031.gene ([26447:26600](-)) |
+ 26446..26536 |
+ 90 bp |
+
+
+ CPT-T1_039.gene ([31881:32167](+)) |
+ CPT-T1_040.gene ([32099:32399](+)) |
+ 32111..32166 |
+ 55 bp |
+
+
+
+
+
+
+
+
Possible Morons 78 / 79 (Doesn't count towards score)
+
+
+
+
+ Feature |
+ RBS |
+ Surrounding Features |
+
+
+
+
+ CPT-T1_025.gene |
+ No RBS Available |
+
+ →
+ →
+ ←
+ →
+ →
+
+ |
+
+
+
+
+
+
+
+
Missing Product Tags 79 / 79
+
+
+
+
+ Feature |
+ Qualifiers |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+