| Next changeset 1:4f4b413056f6 (2023-06-05) |
|
Commit message:
Uploaded |
|
added:
cpt_gff_rebase/cpt-macros.xml cpt_gff_rebase/gff3.py cpt_gff_rebase/gff3_rebase.py cpt_gff_rebase/gff3_rebase.xml cpt_gff_rebase/macros.xml cpt_gff_rebase/test-data/T7_CLEAN.gff3 cpt_gff_rebase/test-data/T7_TMHMM.gff3 cpt_gff_rebase/test-data/T7_TMHMM_REBASE.gff3 cpt_gff_rebase/test-data/child.gff cpt_gff_rebase/test-data/nonprotein.gff cpt_gff_rebase/test-data/parent.gff cpt_gff_rebase/test-data/proteins.gff |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/cpt-macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/cpt-macros.xml Fri Jun 17 04:00:49 2022 +0000 |
| [ |
| @@ -0,0 +1,115 @@ +<?xml version="1.0"?> +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd $__tool_directory__ && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros> |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/gff3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/gff3.py Fri Jun 17 04:00:49 2022 +0000 |
| [ |
| b'@@ -0,0 +1,346 @@\n+import copy\n+import logging\n+\n+log = logging.getLogger()\n+log.setLevel(logging.WARN)\n+\n+\n+def feature_lambda(\n+ feature_list,\n+ test,\n+ test_kwargs,\n+ subfeatures=True,\n+ parent=None,\n+ invert=False,\n+ recurse=True,\n+):\n+ """Recursively search through features, testing each with a test function, yielding matches.\n+\n+ GFF3 is a hierachical data structure, so we need to be able to recursively\n+ search through features. E.g. if you\'re looking for a feature with\n+ ID=\'bob.42\', you can\'t just do a simple list comprehension with a test\n+ case. You don\'t know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.\n+\n+ :type feature_list: list\n+ :param feature_list: an iterable of features\n+\n+ :type test: function reference\n+ :param test: a closure with the method signature (feature, **kwargs) where\n+ the kwargs are those passed in the next argument. This\n+ function should return True or False, True if the feature is\n+ to be yielded as part of the main feature_lambda function, or\n+ False if it is to be ignored. This function CAN mutate the\n+ features passed to it (think "apply").\n+\n+ :type test_kwargs: dictionary\n+ :param test_kwargs: kwargs to pass to your closure when it is called.\n+\n+ :type subfeatures: boolean\n+ :param subfeatures: when a feature is matched, should just that feature be\n+ yielded to the caller, or should the entire sub_feature\n+ tree for that feature be included? subfeatures=True is\n+ useful in cases such as searching for a gene feature,\n+ and wanting to know what RBS/Shine_Dalgarno_sequences\n+ are in the sub_feature tree (which can be accomplished\n+ with two feature_lambda calls). subfeatures=False is\n+ useful in cases when you want to process (and possibly\n+ return) the entire feature tree, such as applying a\n+ qualifier to every single feature.\n+\n+ :type invert: boolean\n+ :param invert: Negate/invert the result of the filter.\n+\n+ :rtype: yielded list\n+ :return: Yields a list of matching features.\n+ """\n+ # Either the top level set of [features] or the subfeature attribute\n+ for feature in feature_list:\n+ feature._parent = parent\n+ if not parent:\n+ # Set to self so we cannot go above root.\n+ feature._parent = feature\n+ test_result = test(feature, **test_kwargs)\n+ # if (not invert and test_result) or (invert and not test_result):\n+ if invert ^ test_result:\n+ if not subfeatures:\n+ feature_copy = copy.deepcopy(feature)\n+ feature_copy.sub_features = list()\n+ yield feature_copy\n+ else:\n+ yield feature\n+\n+ if recurse and hasattr(feature, "sub_features"):\n+ for x in feature_lambda(\n+ feature.sub_features,\n+ test,\n+ test_kwargs,\n+ subfeatures=subfeatures,\n+ parent=feature,\n+ invert=invert,\n+ recurse=recurse,\n+ ):\n+ yield x\n+\n+\n+def fetchParent(feature):\n+ if not hasattr(feature, "_parent") or feature._parent is None:\n+ return feature\n+ else:\n+ return fetchParent(feature._parent)\n+\n+\n+def feature_test_true(feature, **kwargs):\n+ return True\n+\n+\n+def feature_test_type(feature, **kwargs):\n+ if "type" in kwargs:\n+ return str(feature.type).upper() == str(kwargs["type"]).upper()\n+ elif "types" in kwargs:\n+ for x in kwargs["types"]:\n+ if str(feature.type).upper() == str(x).upper():\n+ return True\n+ return False\n+ raise Exception("Incorrect feature_test_type call, ne'..b'feature.location.start,\n+ # feature.location.end,\n+ # feature.location.strand\n+ # )\n+ return result\n+\n+\n+def get_gff3_id(gene):\n+ return gene.qualifiers.get("Name", [gene.id])[0]\n+\n+\n+def ensure_location_in_bounds(start=0, end=0, parent_length=0):\n+ # This prevents frameshift errors\n+ while start < 0:\n+ start += 3\n+ while end < 0:\n+ end += 3\n+ while start > parent_length:\n+ start -= 3\n+ while end > parent_length:\n+ end -= 3\n+ return (start, end)\n+\n+\n+def coding_genes(feature_list):\n+ for x in genes(feature_list):\n+ if (\n+ len(\n+ list(\n+ feature_lambda(\n+ x.sub_features,\n+ feature_test_type,\n+ {"type": "CDS"},\n+ subfeatures=False,\n+ )\n+ )\n+ )\n+ > 0\n+ ):\n+ yield x\n+\n+\n+def genes(feature_list, feature_type="gene", sort=False):\n+ """\n+ Simple filter to extract gene features from the feature set.\n+ """\n+\n+ if not sort:\n+ for x in feature_lambda(\n+ feature_list, feature_test_type, {"type": feature_type}, subfeatures=True\n+ ):\n+ yield x\n+ else:\n+ data = list(genes(feature_list, feature_type=feature_type, sort=False))\n+ data = sorted(data, key=lambda feature: feature.location.start)\n+ for x in data:\n+ yield x\n+\n+\n+def wa_unified_product_name(feature):\n+ """\n+ Try and figure out a name. We gave conflicting instructions, so\n+ this isn\'t as trivial as it should be. Sometimes it will be in\n+ \'product\' or \'Product\', othertimes in \'Name\'\n+ """\n+ # Manually applied tags.\n+ protein_product = feature.qualifiers.get(\n+ "product", feature.qualifiers.get("Product", [None])\n+ )[0]\n+\n+ # If neither of those are available ...\n+ if protein_product is None:\n+ # And there\'s a name...\n+ if "Name" in feature.qualifiers:\n+ if not is_uuid(feature.qualifiers["Name"][0]):\n+ protein_product = feature.qualifiers["Name"][0]\n+\n+ return protein_product\n+\n+\n+def is_uuid(name):\n+ return name.count("-") == 4 and len(name) == 36\n+\n+\n+def get_rbs_from(gene):\n+ # Normal RBS annotation types\n+ rbs_rbs = list(\n+ feature_lambda(\n+ gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False\n+ )\n+ )\n+ rbs_sds = list(\n+ feature_lambda(\n+ gene.sub_features,\n+ feature_test_type,\n+ {"type": "Shine_Dalgarno_sequence"},\n+ subfeatures=False,\n+ )\n+ )\n+ # Fraking apollo\n+ apollo_exons = list(\n+ feature_lambda(\n+ gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False\n+ )\n+ )\n+ apollo_exons = [x for x in apollo_exons if len(x) < 10]\n+ # These are more NCBI\'s style\n+ regulatory_elements = list(\n+ feature_lambda(\n+ gene.sub_features,\n+ feature_test_type,\n+ {"type": "regulatory"},\n+ subfeatures=False,\n+ )\n+ )\n+ rbs_regulatory = list(\n+ feature_lambda(\n+ regulatory_elements,\n+ feature_test_quals,\n+ {"regulatory_class": ["ribosome_binding_site"]},\n+ subfeatures=False,\n+ )\n+ )\n+ # Here\'s hoping you find just one ;)\n+ return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons\n+\n+\n+def nice_name(record):\n+ """\n+ get the real name rather than NCBI IDs and so on. If fails, will return record.id\n+ """\n+ name = record.id\n+ likely_parental_contig = list(genes(record.features, feature_type="contig"))\n+ if len(likely_parental_contig) == 1:\n+ name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]\n+ return name\n+\n+\n+def fsort(it):\n+ for i in sorted(it, key=lambda x: int(x.location.start)):\n+ yield i\n' |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/gff3_rebase.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/gff3_rebase.py Fri Jun 17 04:00:49 2022 +0000 |
| [ |
| @@ -0,0 +1,131 @@ +#!/usr/bin/env python +import sys +import logging +import argparse +from gff3 import feature_lambda, feature_test_qual_value +from CPT_GFFParser import gffParse, gffWrite +from Bio.SeqFeature import FeatureLocation + +log = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def __get_features(child, interpro=False): + child_features = {} + for rec in gffParse(child): + log.info("Parsing %s", rec.id) + # Only top level + for feature in rec.features: + # Get the record id as parent_feature_id (since this is how it will be during remapping) + parent_feature_id = rec.id + # If it's an interpro specific gff3 file + if interpro: + # Then we ignore polypeptide features as they're useless + if feature.type == "polypeptide": + continue + + try: + child_features[parent_feature_id].append(feature) + except KeyError: + child_features[parent_feature_id] = [feature] + # Keep a list of feature objects keyed by parent record id + return child_features + + +def __update_feature_location(feature, parent, protein2dna): + start = feature.location.start + end = feature.location.end + if protein2dna: + start *= 3 + end *= 3 + + if parent.location.strand >= 0: + ns = parent.location.start + start + ne = parent.location.start + end + st = +1 + else: + ns = parent.location.end - end + ne = parent.location.end - start + st = -1 + + # Don't let start/stops be less than zero. + # + # Instead, we'll replace with %3 to try and keep it in the same reading + # frame that it should be in. + + if ns < 0: + ns %= 3 + if ne < 0: + ne %= 3 + + feature.location = FeatureLocation(ns, ne, strand=st) + + if hasattr(feature, "sub_features"): + for subfeature in feature.sub_features: + __update_feature_location(subfeature, parent, protein2dna) + + +def rebase(parent, child, interpro=False, protein2dna=False, map_by="ID"): + # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID + child_features = __get_features(child, interpro=interpro) + + for rec in gffParse(parent): + replacement_features = [] + # Horrifically slow I believe + for feature in feature_lambda( + rec.features, + # Filter features in the parent genome by those that are + # "interesting", i.e. have results in child_features array. + # Probably an unnecessary optimisation. + feature_test_qual_value, + {"qualifier": map_by, "attribute_list": child_features.keys()}, + subfeatures=False, + ): + + # Features which will be re-mapped + to_remap = child_features[feature.id] + + fixed_features = [] + for x in to_remap: + # Then update the location of the actual feature + __update_feature_location(x, feature, protein2dna) + + if interpro: + for y in ("status", "Target"): + try: + del x.qualifiers[y] + except: + pass + + fixed_features.append(x) + replacement_features.extend(fixed_features) + # We do this so we don't include the original set of features that we + # were rebasing against in our result. + rec.features = replacement_features + rec.annotations = {} + gffWrite([rec], sys.stdout) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="rebase gff3 features against parent locations", epilog="" + ) + parser.add_argument( + "parent", type=argparse.FileType("r"), help="Parent GFF3 annotations" + ) + parser.add_argument( + "child", + type=argparse.FileType("r"), + help="Child GFF3 annotations to rebase against parent", + ) + parser.add_argument( + "--interpro", action="store_true", help="Interpro specific modifications" + ) + parser.add_argument( + "--protein2dna", + action="store_true", + help="Map protein translated results to original DNA data", + ) + parser.add_argument("--map_by", help="Map by key", default="ID") + args = parser.parse_args() + rebase(**vars(args)) |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/gff3_rebase.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/gff3_rebase.xml Fri Jun 17 04:00:49 2022 +0000 |
| [ |
| @@ -0,0 +1,102 @@ +<tool id="gff3.rebase" name="Rebase GFF3 features" version="19.1.0.0"> + <description>against parent features</description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"/> + <command interpreter="python" detect_errors="aggressive"><![CDATA[gff3_rebase.py +$parent +$child + +$interpro +$protein2dna +--map_by "$map_by" +> $default]]></command> + <inputs> + <param label="Parent GFF3 annotations" name="parent" format="gff3" type="data"/> + <param label="Child GFF3 annotations to rebase against parent" name="child" format="gff3" type="data"/> + + <param label="Interpro specific modifications" name="interpro" type="boolean" truevalue="--interpro" falsevalue=""/> + <param label="Map protein translated results to original DNA data" name="protein2dna" type="boolean" truevalue="--protein2dna" falsevalue=""/> + + <param label="Mapping Key" name="map_by" type="text" value="ID" /> + </inputs> + <outputs> + <data format="gff3" name="default"/> + </outputs> + <tests> + <test> + <param name="parent" value="T7_CLEAN.gff3"/> + <param name="child" value="T7_TMHMM.gff3"/> + <param name="interpro" value="" /> + <param name="protein2dna" value="--protein2dna" /> + <param name="map_by" value="ID" /> + <output name="default" file="T7_TMHMM_REBASE.gff3"/> + </test> + <test> + <param name="parent" value="parent.gff"/> + <param name="child" value="child.gff"/> + <param name="interpro" value="" /> + <param name="protein2dna" value="--protein2dna" /> + <param name="map_by" value="ID" /> + <output name="default" file="proteins.gff"/> + </test> + <test> + <param name="parent" value="parent.gff"/> + <param name="child" value="child.gff"/> + <param name="interpro" value="" /> + <param name="protein2dna" value="" /> + <param name="map_by" value="ID" /> + <output name="default" file="nonprotein.gff"/> + </test> + </tests> + <help><![CDATA[ +**What it does** + +The workflow in a genomic data analysis typically follows a process of feature +export, analysis and then mapping the results of the analysis back to the genome. + +For meaningful display in JBrowse, it is necessary to accurately map +analysis results back to their corresponding positions in the context of the entire +genome. + +This tool fills that gap, by *rebasing* (calculating parent genome coordinates) +features from analysis results against the parent features which +were originally used for the analysis. + +**Example Input/Output** + +For a *parent* set of annotations:: + + #gff-version 3 + PhageBob maker cds 300 600 . + . ID=cds42 + +Where the analysis had exported the CDS (child) FASTA sequence:: + + >cds42 + MRTNASC + +Then analyzed that feature, producing the *child* annotation file:: + + #gff-version 3 + cds42 blastp match_part 1 50 1e-40 . . ID=m00001;Notes=RNAse A Protein + +This tool will then localize the results properly against the parent and permit +proper visualization of the results in the correct location:: + + #gff-version 3 + PhageBob blastp match_part 300 449 1e-40 + . ID=m00001;Notes=RNAse A Protein + +**Options** + +The **Interpro specific modifications** option selectively ignores *features* (*i.e.* polypeptide) and +qualifiers (status, Target) not needed in the output. + +The **Map protein translated results to original DNA data** option indicates that the DNA sequences were translated into +protein sequence during the genomic export process. When this option is selected, +the tool will multiply the bases by three to obtain the correct DNA locations. + +]]></help> + <expand macro="citations" /> +</tool> |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/macros.xml Fri Jun 17 04:00:49 2022 +0000 |
| b |
| @@ -0,0 +1,85 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="3.8.13">python</requirement> + <requirement type="package" version="1.79">biopython</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + </xml> + <token name="@BLAST_TSV@"> + "$blast_tsv" + </token> + <xml name="blast_tsv"> + <param label="Blast Results" help="TSV/tabular (25 Column)" + name="blast_tsv" type="data" format="tabular" /> + </xml> + + <token name="@BLAST_XML@"> + "$blast_xml" + </token> + <xml name="blast_xml"> + <param label="Blast Results" help="XML format" + name="blast_xml" type="data" format="blastxml" /> + </xml> + <xml name="gff3_with_fasta"> + <param label="Genome Sequences" name="fasta" type="data" format="fasta" /> + <param label="Genome Annotations" name="gff3" type="data" format="gff3" /> + </xml> + <xml name="genome_selector"> + <conditional name="reference_genome"> + <param name="reference_genome_source" type="select" label="Reference Genome"> + <option value="history" selected="True">From History</option> + <option value="cached">Locally Cached</option> + </param> + <when value="cached"> + <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <options from_data_table="all_fasta"/> + </param> + </when> + <when value="history"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </when> + </conditional> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input" /> + <expand macro="genome_selector" /> + </xml> + <token name="@INPUT_GFF@"> + "$gff3_data" + </token> + <token name="@INPUT_FASTA@"> +#if str($reference_genome.reference_genome_source) == 'cached': + "${reference_genome.fasta_indexes.fields.path}" +#else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa +#end if + </token> + <token name="@GENOME_SELECTOR_PRE@"> +#if $reference_genome.reference_genome_source == 'history': + ln -s $reference_genome.genome_fasta genomeref.fa; +#end if + </token> + <token name="@GENOME_SELECTOR@"> +#if str($reference_genome.reference_genome_source) == 'cached': + "${reference_genome.fasta_indexes.fields.path}" +#else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa +#end if + </token> + <xml name="input/fasta"> + <param label="Fasta file" name="sequences" type="data" format="fasta"/> + </xml> + + <token name="@SEQUENCE@"> + "$sequences" + </token> + <xml name="input/fasta/protein"> + <param label="Protein fasta file" name="sequences" type="data" format="fasta"/> + </xml> +</macros> |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/test-data/T7_CLEAN.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/test-data/T7_CLEAN.gff3 Fri Jun 17 04:00:49 2022 +0000 |
| [ |
| b"@@ -0,0 +1,171 @@\n+##gff-version 3\n+NC_001604\tGenBank\tcontig\t1\t39937\t.\t+\t1\tID=NC_001604;Dbxref=BioProject:PRJNA485481,taxon:10760;Name=NC_001604;Note=Enterobacteria phage T7%2C complete genome.,VALIDATED REFSEQ: This record has undergone validation or preliminary review. The reference sequence was derived from V01146. The sequence was submitted by the authors [1] on magnetic tape and revised according to [3],[4],and [5]. [3] made changes at 8 positions in gene 1 without affecting the size of the total sequence but changing gene 1 amino acids 443,474,and 388 to 424. [4] inserted a T at nucleotide 17511,increasing the total sequence to 39937 bp. This change,originally found in T3 DNA [8],revealed gene 5.9 and shortened gene 6. [5] changed the nucleotides at 11061 and 11062 from GT to TG,changing amino acid 119 of T7 lysozyme (gene 3.5) from glycine to valine. Features have been extracted from [1] unless otherwise noted. The sequence shown is that of the l strand,which corresponds to the sequence of all mRNAs of known functional significance. Early mRNAs are produced by three major promoters for E. coli RNA polymerase A1,A2,and A3,located near the left end of the DNA. A fourth major E. coli promoter,A0 (also called D),that would direct transcription leftward,and several minor E. coli promoters (see Table 6 in [1]) function in vitro but have no known in vivo function. Late mRNAs are produced by 15 promoters for T7 RNA polymerase distributed across the right-most 85%25 of the DNA,and named e.g. phi10,for the first gene downstream of the promoter. There are also two T7 promoters,phiOL and phiOR,associated with possible origins of replication at the left and right ends of T7 DNA. The 23 base-pair consensus sequence for T7 promoters stretches from -17 to +6,where the initiating nucleotide is at +1. T7 DNA also contains a 160 base-pair terminal repetition. The beginning and end of RNAs are determined by the promoters,by a terminator for E. coli RNA polymerase,TE,located at the end of the early region,a terminator for T7 RNA polymerase,Tphi,located just downstream of gene 10,and a series of RNase III cleavage sites. Early mRNAs made by E. coli RNA polymerase are listed in Features. The many RNAs predicted to be made by T7 RNA polymerase are not listed but can be deduced from the position of the transcription signals (see Tables 8 and 9 in [1]). Promoters are listed in Features by the known or predicted first nucleotide of the RNA,terminators by the last nucleotide of the RNA,and RNase III sites by the nucleotide 5' of the position of cleavage. Genes are numbered 0.3 to 19.5 in order of their left-to-right position on the genome. Proteins are named by the gene number,e.g.,the gene 1 protein,or by a functional name,e.g.,T7 RNA polymerase. There is now genetic or biochemical evidence that proteins are produced from at least 52 of the 56 T7 genes. Gene 4 produces two proteins,4A and 4B,by initiating translation at two different sites in the same reading frame. Gene 10 produces two proteins,10A and 10B,by frameshifting during translation. Genes 0.6 and 5.5 probably also make two proteins by translational frameshifting,the gene 5.5 frameshift producing a gene 5.5-5.7 fusion protein. COMPLETENESS: full length. ;comment1=VALIDATED REFSEQ: This record has undergone validation or preliminary review. The reference sequence was derived from V01146. The sequence was submitted by the authors [1] on magnetic tape and revised according to [3]%2C [4]%2C and [5]. [3] made changes at 8 positions in gene 1 without affecting the size of the total sequence but changing gene 1 amino acids 443%2C 474%2C and 388 to 424. [4] inserted a T at nucleotide 17511%2C increasing the total sequence to 39937 bp. This change%2C originally found in T3 DNA [8]%2C revealed gene 5.9 and shortened gene 6. [5] changed the nucleotides at 11061 and 11062 from GT to TG%2C changing amino acid 119 of T7 lysozyme (gene 3.5) from glycine to valine. Features have been extracted from [1"..b' However%2C in phage T7 the holin protein gp17.5 does not appear to be essential and gp17.5 mutants only show a minor delay in lysis. Other names: gp17.5%3B lysis protein;codon_start=1;product=type II holin;protein_id=NP_042006.1;transl_table=11;translation=length.67;\n+NC_001604\tGenBank\tgene\t36344\t36547\t.\t+\t1\tID=T7p53.gene;Alias=T7p53;Dbxref=GeneID:1261022;Name=T7p53;Note=gene 17.5;\n+NC_001604\tGenBank\tCDS\t36553\t36822\t.\t+\t1\tID=T7p54;Dbxref=GOA:P03693,UniProtKB/Swiss-Prot:P03693,GeneID:1261042;Name=T7p54;Note=involved in the packaging of genome monomers into a procapsid using head-to-tail concatemers of genomes. other names: DNA packaging protein A%3B DNA maturation protein A%3B terminase%2C small subunit;codon_start=1;product=DNA packaging protein%2C small subunit;protein_id=NP_042007.1;transl_table=11;translation=length.89;\n+NC_001604\tGenBank\tregulatory\t36836\t36836\t.\t+\t1\tID=GenBank:regulatory:NC_001604:36836:36836;Note=E. coli promoter E[6];regulatory_class=promoter;\n+NC_001604\tGenBank\tsequence_secondary_structure\t36856\t36856\t.\t+\t1\tID=GenBank:sequence_secondary_structure:NC_001604:36856:36856;Note=RNase III site R18.5;\n+NC_001604\tGenBank\tgene\t36553\t36822\t.\t+\t1\tID=T7p54.gene;Alias=T7p54;Dbxref=GeneID:1261042;Name=T7p54;Note=gene 18;\n+NC_001604\tGenBank\tCDS\t36917\t37348\t.\t+\t1\tID=T7p55;Dbxref=GOA:P03803,UniProtKB/Swiss-Prot:P03803,GeneID:1261067;Name=T7p55;Note=analog of phage lambda protein Rz%2C a cell lysis protein. Rz and gp18.5 share distant sequence similarity%2C similar function%2C and a similar genome neighborhood. In T7%2C gp18.5 interacts with gp18.7%2C a lambda RZ1-like lysis protein. Other names: gp18.5;codon_start=1;product=phage lambda Rz-like lysis protein;protein_id=NP_042008.1;transl_table=11;translation=length.143;\n+NC_001604\tGenBank\tgene\t36917\t37348\t.\t+\t1\tID=T7p55.gene;Alias=T7p55;Dbxref=GeneID:1261067;Name=T7p55;Note=gene 18.5;\n+NC_001604\tGenBank\tCDS\t37032\t37283\t.\t+\t1\tID=T7p56;Dbxref=UniProtKB/Swiss-Prot:P03788,GeneID:1261057;Name=T7p56;Note=in Enterobacteria phage T7%2C this protein interacts with gp18.5 and is expressed from the -1 frame of a gene completely overlapping gene 18.5. This suggests that it may be an analog of lambda lysis protein Rz1. Other names: gp18.7.;codon_start=1;product=phage lambda Rz1-like protein;protein_id=NP_042009.1;transl_table=11;translation=length.83;\n+NC_001604\tGenBank\tgene\t37032\t37283\t.\t+\t1\tID=T7p56.gene;Alias=T7p56;Dbxref=GeneID:1261057;Name=T7p56;Note=gene 18.7;\n+NC_001604\tGenBank\tCDS\t37370\t39130\t.\t+\t1\tID=T7p57;Dbxref=GOA:P03694,UniProtKB/Swiss-Prot:P03694,GeneID:1261062;Name=T7p57;Note=gene 19;codon_start=1;product=DNA maturation protein;protein_id=NP_042010.1;transl_table=11;translation=length.586;\n+NC_001604\tGenBank\tgene\t37370\t39130\t.\t+\t1\tID=T7p57.gene;Alias=T7p57;Dbxref=GeneID:1261062;Name=T7p57;Note=gene 19;\n+NC_001604\tGenBank\tCDS\t38016\t38273\t.\t+\t1\tID=T7p58;Dbxref=UniProtKB/Swiss-Prot:P03789,GeneID:1261064;Name=T7p58;Note=gene 19.2;codon_start=1;product=hypothetical protein;protein_id=NP_042011.1;transl_table=11;translation=length.85;\n+NC_001604\tGenBank\tgene\t38016\t38273\t.\t+\t1\tID=T7p58.gene;Alias=T7p58;Dbxref=GeneID:1261064;Name=T7p58;Note=gene 19.2;\n+NC_001604\tGenBank\tCDS\t38553\t38726\t.\t+\t1\tID=T7p59;Dbxref=UniProtKB/Swiss-Prot:P03790,GeneID:1261066;Name=T7p59;Note=gene 19.3;codon_start=1;product=hypothetical protein;protein_id=NP_042012.1;transl_table=11;translation=length.57;\n+NC_001604\tGenBank\tregulatory\t39229\t39229\t.\t+\t1\tID=GenBank:regulatory:NC_001604:39229:39229;Note=T7 promoter phiOR;regulatory_class=promoter;\n+NC_001604\tGenBank\tgene\t38553\t38726\t.\t+\t1\tID=T7p59.gene;Alias=T7p59;Dbxref=GeneID:1261066;Name=T7p59;Note=gene 19.3;\n+NC_001604\tGenBank\tCDS\t39389\t39538\t.\t+\t1\tID=T7p60;Dbxref=UniProtKB/Swiss-Prot:P03804,GeneID:1261068;Name=T7p60;Note=gene 19.5;codon_start=1;product=hypothetical protein;protein_id=NP_042013.1;transl_table=11;translation=length.49;\n+NC_001604\tGenBank\tgene\t39389\t39538\t.\t+\t1\tID=T7p60.gene;Alias=T7p60;Dbxref=GeneID:1261068;Name=T7p60;Note=gene 19.5;\n' |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/test-data/T7_TMHMM.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/test-data/T7_TMHMM.gff3 Fri Jun 17 04:00:49 2022 +0000 |
| b |
| @@ -0,0 +1,39 @@ +##gff-version 3 +T7p04 feature Chain 2 47 . + . Description=Transmembrane protein;ID=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d;Note=Transmembrane protein - N out C in;Target=T7p04 +T7p04 TMHMM Topological domain 1 22 . + . Note=Extracellular;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d +T7p04 TMHMM Transmembrane 23 45 . + . Note=Helical;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d +T7p04 TMHMM Topological domain 46 47 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d +##gff-version 3 +T7p11 feature Chain 2 51 . + . Description=Transmembrane protein;ID=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc;Note=Transmembrane protein - N in C in;Target=T7p11 +T7p11 TMHMM Topological domain 1 4 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +T7p11 TMHMM Transmembrane 5 24 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +T7p11 TMHMM Topological domain 25 27 . + . Note=Extracellular;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +T7p11 TMHMM Transmembrane 28 50 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +T7p11 TMHMM Topological domain 51 51 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +##gff-version 3 +T7p25 feature Chain 2 112 . + . Description=Transmembrane protein;ID=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9;Note=Transmembrane protein - N in C in;Target=T7p25 +T7p25 TMHMM Topological domain 1 6 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +T7p25 TMHMM Transmembrane 7 29 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +T7p25 TMHMM Topological domain 30 33 . + . Note=Extracellular;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +T7p25 TMHMM Transmembrane 34 56 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +T7p25 TMHMM Topological domain 57 112 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +##gff-version 3 +T7p36 feature Chain 2 37 . + . Description=Transmembrane protein;ID=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8;Note=Transmembrane protein - N out C in;Target=T7p36 +T7p36 TMHMM Topological domain 1 4 . + . Note=Extracellular;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8 +T7p36 TMHMM Transmembrane 5 24 . + . Note=Helical;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8 +T7p36 TMHMM Topological domain 25 37 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8 +##gff-version 3 +T7p53 feature Chain 2 67 . + . Description=Transmembrane protein;ID=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb;Note=Transmembrane protein - N out C in;Target=T7p53 +T7p53 TMHMM Topological domain 1 36 . + . Note=Extracellular;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb +T7p53 TMHMM Transmembrane 37 55 . + . Note=Helical;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb +T7p53 TMHMM Topological domain 56 67 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb +##gff-version 3 +T7p56 feature Chain 2 83 . + . Description=Transmembrane protein;ID=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e;Note=Transmembrane protein - N in C out;Target=T7p56 +T7p56 TMHMM Topological domain 1 27 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e +T7p56 TMHMM Transmembrane 28 50 . + . Note=Helical;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e +T7p56 TMHMM Topological domain 51 83 . + . Note=Extracellular;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e +##gff-version 3 +T7p60 feature Chain 2 49 . + . Description=Transmembrane protein;ID=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599;Note=Transmembrane protein - N in C out;Target=T7p60 +T7p60 TMHMM Topological domain 1 12 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599 +T7p60 TMHMM Transmembrane 13 30 . + . Note=Helical;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599 +T7p60 TMHMM Topological domain 31 49 . + . Note=Extracellular;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599 |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/test-data/T7_TMHMM_REBASE.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/test-data/T7_TMHMM_REBASE.gff3 Fri Jun 17 04:00:49 2022 +0000 |
| b |
| @@ -0,0 +1,33 @@ +##gff-version 3 +NC_001604 feature Chain 1499 1636 . + . Description=Transmembrane protein;ID=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d;Note=Transmembrane protein - N out C in;Target=T7p04; +NC_001604 TMHMM Topological domain 1496 1561 . + . Note=Extracellular;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d; +NC_001604 TMHMM Transmembrane 1562 1630 . + . Note=Helical;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d; +NC_001604 TMHMM Topological domain 1631 1636 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d; +NC_001604 feature Chain 7611 7760 . + . Description=Transmembrane protein;ID=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc;Note=Transmembrane protein - N in C in;Target=T7p11; +NC_001604 TMHMM Topological domain 7608 7619 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 TMHMM Transmembrane 7620 7679 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 TMHMM Topological domain 7680 7688 . + . Note=Extracellular;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 TMHMM Transmembrane 7689 7757 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 TMHMM Topological domain 7758 7760 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 feature Chain 12991 13323 . + . Description=Transmembrane protein;ID=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9;Note=Transmembrane protein - N in C in;Target=T7p25; +NC_001604 TMHMM Topological domain 12988 13005 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 TMHMM Transmembrane 13006 13074 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 TMHMM Topological domain 13075 13086 . + . Note=Extracellular;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 TMHMM Transmembrane 13087 13155 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 TMHMM Topological domain 13156 13323 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 feature Chain 18397 18504 . + . Description=Transmembrane protein;ID=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8;Note=Transmembrane protein - N out C in;Target=T7p36; +NC_001604 TMHMM Topological domain 18394 18405 . + . Note=Extracellular;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8; +NC_001604 TMHMM Transmembrane 18406 18465 . + . Note=Helical;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8; +NC_001604 TMHMM Topological domain 18466 18504 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8; +NC_001604 feature Chain 36347 36544 . + . Description=Transmembrane protein;ID=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb;Note=Transmembrane protein - N out C in;Target=T7p53; +NC_001604 TMHMM Topological domain 36344 36451 . + . Note=Extracellular;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb; +NC_001604 TMHMM Transmembrane 36452 36508 . + . Note=Helical;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb; +NC_001604 TMHMM Topological domain 36509 36544 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb; +NC_001604 feature Chain 37035 37280 . + . Description=Transmembrane protein;ID=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e;Note=Transmembrane protein - N in C out;Target=T7p56; +NC_001604 TMHMM Topological domain 37032 37112 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e; +NC_001604 TMHMM Transmembrane 37113 37181 . + . Note=Helical;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e; +NC_001604 TMHMM Topological domain 37182 37280 . + . Note=Extracellular;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e; +NC_001604 feature Chain 39392 39535 . + . Description=Transmembrane protein;ID=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599;Note=Transmembrane protein - N in C out;Target=T7p60; +NC_001604 TMHMM Topological domain 39389 39424 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599; +NC_001604 TMHMM Transmembrane 39425 39478 . + . Note=Helical;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599; +NC_001604 TMHMM Topological domain 39479 39535 . + . Note=Extracellular;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599; |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/test-data/child.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/test-data/child.gff Fri Jun 17 04:00:49 2022 +0000 |
| b |
| @@ -0,0 +1,2 @@ +#gff-version 3 +cds42 blastp match_part 1 50 1e-40 . . ID=m00001;Notes=RNAse A Protein |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/test-data/nonprotein.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/test-data/nonprotein.gff Fri Jun 17 04:00:49 2022 +0000 |
| b |
| @@ -0,0 +1,2 @@ +##gff-version 3 +PhageBob blastp match_part 300 349 1e-40 + . ID=m00001;Notes=RNAse A Protein; |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/test-data/parent.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/test-data/parent.gff Fri Jun 17 04:00:49 2022 +0000 |
| b |
| @@ -0,0 +1,3 @@ +#gff-version 3 +PhageBob maker cds 300 500 . + . ID=gene42 +PhageBob maker cds 300 500 . + . Parent=gene42;ID=cds42 |
| b |
| diff -r 000000000000 -r 6e7e20cb1fc7 cpt_gff_rebase/test-data/proteins.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gff_rebase/test-data/proteins.gff Fri Jun 17 04:00:49 2022 +0000 |
| b |
| @@ -0,0 +1,2 @@ +##gff-version 3 +PhageBob blastp match_part 300 449 1e-40 + . ID=m00001;Notes=RNAse A Protein; |