Previous changeset 2:6795d3349462 (2022-06-17) Next changeset 4:35a6c466e270 (2023-07-23) |
Commit message:
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c |
added:
cpt-macros.xml gff2gb.py gff2gb.xml gff3.py macros.xml test-data/miro.fa test-data/miro.gff3 test-data/miro_from_tool.gbk |
removed:
cpt_gff_to_gbk/cpt-macros.xml cpt_gff_to_gbk/gff2gb.py cpt_gff_to_gbk/gff2gb.xml cpt_gff_to_gbk/gff3.py cpt_gff_to_gbk/macros.xml cpt_gff_to_gbk/test-data/miro.fa cpt_gff_to_gbk/test-data/miro.gff3 cpt_gff_to_gbk/test-data/miro_from_tool.gbk |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 cpt-macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt-macros.xml Mon Jun 05 02:44:32 2023 +0000 |
[ |
@@ -0,0 +1,115 @@ +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd '$__tool_directory__' && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros> |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 cpt_gff_to_gbk/cpt-macros.xml --- a/cpt_gff_to_gbk/cpt-macros.xml Fri Jun 17 12:57:03 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,115 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="gff_requirements"> - <requirements> - <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.65">biopython</requirement> - <requirement type="package" version="2.12.1">requests</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> - <xml name="citation/mijalisrasche"> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex">@unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-crr"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020-AJC-solo"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-clm"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="sl-citations-clm"> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </xml> -</macros> |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 cpt_gff_to_gbk/gff2gb.py --- a/cpt_gff_to_gbk/gff2gb.py Fri Jun 17 12:57:03 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,456 +0,0 @@\n-#!/usr/bin/env python\n-"""Convert a GFF and associated FASTA file into GenBank format.\n-\n-Usage:\n-gff_to_genbank.py <GFF annotation file> <FASTA sequence file>\n-"""\n-import argparse\n-import sys\n-import re\n-import copy\n-import itertools\n-import logging\n-from Bio import SeqIO\n-#from Bio.Alphabet import generic_dna\n-from Bio.SeqFeature import CompoundLocation, FeatureLocation\n-from CPT_GFFParser import gffParse, gffWrite\n-from gff3 import (\n- feature_lambda,\n- wa_unified_product_name,\n- is_uuid,\n- feature_test_type,\n- fsort,\n- feature_test_true,\n- feature_test_quals,\n-)\n-\n-default_name = re.compile(r"^gene_(\\d+)$")\n-logging.basicConfig(level=logging.INFO)\n-\n-\n-def rename_key(ds, k_f, k_t):\n- """Rename a key in a dictionary and return it, FP style"""\n- # If they key is not in the dictionary, just return immediately\n- if k_f not in ds:\n- return ds\n-\n- # Otherwise, we check if the target key is in there\n- if k_t in ds:\n- # If it is, we need to append\n- ds[k_t] += ds[k_f]\n- else:\n- # if not, we can just set.\n- ds[k_t] = ds[k_f]\n-\n- # Remove source\n- del ds[k_f]\n- return ds\n-\n-\n-def gff3_to_genbank(gff_file, fasta_file, transltbl):\n- fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))#, generic_dna))\n- gff_iter = gffParse(gff_file, fasta_input)\n-\n- for record in gff_iter:\n- yield handle_record(record, transltbl)\n-\n-\n-def handle_non_gene_features(features):\n- # These are NON-GENE features (maybe terminators? etc?)\n- for feature in feature_lambda(\n- features,\n- feature_test_type,\n- {"type": "gene"},\n- subfeatures=False,\n- invert=True,\n- recurse=True, # used to catch RBS from new apollo runs (used to be False)\n- ):\n- if feature.type in (\n- "terminator", \n- "tRNA", \n- "Shine_Dalgarno_sequence",\n- "sequence_feature",\n- "recombination_feature",\n- "sequence_alteration",\n- "binding_site",\n- ):\n- yield feature\n- elif feature.type in (\n- "CDS",\n- ):\n- pass\n- else:\n- yield feature\n-\n-\n-\n-def fminmax(feature):\n- fmin = None\n- fmax = None\n- for sf in feature_lambda([feature], feature_test_true, {}, subfeatures=True):\n- if fmin is None:\n- fmin = sf.location.start\n- fmax = sf.location.end\n- if sf.location.start < fmin:\n- fmin = sf.location.start\n- if sf.location.end > fmax:\n- fmax = sf.location.end\n- return fmin, fmax\n-\n-\n-def fix_gene_boundaries(feature):\n- # There is a frustrating bug in apollo whereby we have created gene\n- # features which are LARGER than expected, but we cannot see this.\n- # We only see a perfect sized gene + great SD together.\n- #\n- # So, we have this awful hack to clamp the location of the gene\n- # feature to the contained mRNAs. This is good enough for now.\n- fmin, fmax = fminmax(feature)\n- if feature.location.strand > 0:\n- feature.location = FeatureLocation(fmin, fmax, strand=1)\n- else:\n- feature.location = FeatureLocation(fmin, fmax, strand=-1)\n- return feature\n-\n-\n-def fix_gene_qualifiers(name, feature, fid):\n- for mRNA in feature.sub_features:\n- mRNA.qualifiers["locus_tag"] = "CPT_%s_%03d" % (name, fid)\n- # And some exons below that\n- sf_replacement = []\n- for sf in mRNA.sub_features:\n- # We set a locus_tag on ALL sub features\n- sf.qualifiers["locus_tag"] = "CPT_%s_%03d" % (name, fid)\n- # Remove Names which are UUIDs\n- # NOT GOOD PRACTICE\n- try:\n- if is_uuid(sf.qualifiers["Name"][0]):\n- del sf.qualifiers["Name"]\n- except KeyError:\n- continue # might should go back to pass, I have not put thought into this still\n-\n- # '..b'placement_feats = []\n- replacement_feats += list(handle_non_gene_features(record.features))\n-\n- # Renumbering requires sorting\n- fid = 0\n- for feature in fsort(\n- feature_lambda(\n- record.features, feature_test_type, {"type": "gene"}, subfeatures=True\n- )\n- ):\n- # Our modifications only involve genes\n- fid += 1\n-\n- feature = fix_gene_boundaries(feature)\n- # Which have mRNAs we\'ll drop later\n- feature = fix_gene_qualifiers(record.id, feature, fid)\n-\n- # Wipe out the parent gene\'s data, leaving only a locus_tag\n- feature.qualifiers = {"locus_tag": "CPT_%s_%03d" % (record.id, fid)}\n- \n- # Patch our features back in (even if they\'re non-gene features)\n- replacement_feats.append(feature)\n- \n- replacement_feats = fix_frameshifts(replacement_feats)\n- #exit(0)\n- flat_features = feature_lambda(\n- replacement_feats, lambda x: True, {}, subfeatures=True\n- )\n- \n- flat_features = remove_useless_features(flat_features)\n- \n- # Meat of our modifications\n- for flat_feat in flat_features:\n- # Try and figure out a name. We gave conflicting instructions, so\n- # this isn\'t as trivial as it should be.\n- protein_product = wa_unified_product_name(flat_feat)\n-\n- for x in (\n- "source",\n- "phase",\n- "Parent",\n- "ID",\n- "owner",\n- "date_creation",\n- "date_last_modified",\n- "datasetSource",\n- ):\n- if x in flat_feat.qualifiers:\n- if x == "ID":\n- flat_feat._ID = flat_feat.qualifiers["ID"]\n- del flat_feat.qualifiers[x]\n-\n- # Add product tag\n- if flat_feat.type == "CDS":\n- flat_feat.qualifiers["product"] = [protein_product]\n- flat_feat.qualifiers["transl_table"] = [transltbl]\n- if "Product" in flat_feat.qualifiers:\n- del flat_feat.qualifiers["Product"]\n- elif flat_feat.type == "RBS":\n- if "locus_tag" not in flat_feat.qualifiers.keys():\n- continue\n-\n- elif flat_feat.type == "terminator":\n- flat_feat.type = "regulatory"\n- flat_feat.qualifiers = {"regulatory_class": "terminator"}\n-\n- # In genbank format, note is lower case.\n- flat_feat.qualifiers = rename_key(flat_feat.qualifiers, "Note", "note")\n- flat_feat.qualifiers = rename_key(flat_feat.qualifiers, "description", "note")\n- flat_feat.qualifiers = rename_key(flat_feat.qualifiers, "protein", "note")\n- flat_feat.qualifiers = rename_key(flat_feat.qualifiers, "Dbxref", "db_xref")\n- if "Name" in flat_feat.qualifiers:\n- del flat_feat.qualifiers["Name"]\n-\n- # more apollo nonsense\n- if "Manually set translation start" in flat_feat.qualifiers.get("note", []):\n- flat_feat.qualifiers["note"].remove("Manually set translation start")\n-\n- # Append the feature\n- full_feats.append(flat_feat)\n-\n- # Update our features\n- record.features = fsort(full_feats)\n- # Strip off record names that would cause crashes.\n- record.name = record.name[0:16]\n- return record\n-\n-\n-if __name__ == "__main__":\n- # Grab all of the filters from our plugin loader\n- parser = argparse.ArgumentParser(description="Convert gff3 to gbk")\n- parser.add_argument("gff_file", type=argparse.FileType("r"), help="GFF3 file")\n- parser.add_argument("fasta_file", type=argparse.FileType("r"), help="Fasta Input")\n- parser.add_argument(\n- "--transltbl",\n- type=int,\n- default=11,\n- help="Translation Table choice for CDS tag, default 11",\n- )\n- args = parser.parse_args()\n-\n- for record in gff3_to_genbank(**vars(args)):\n- record.annotations["molecule_type"] = "DNA"\n- #record.seq.alphabet = generic_dna\n- SeqIO.write([record], sys.stdout, "genbank")\n' |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 cpt_gff_to_gbk/gff2gb.xml --- a/cpt_gff_to_gbk/gff2gb.xml Fri Jun 17 12:57:03 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,77 +0,0 @@ -<tool id="edu.tamu.cpt.gff.gff2gb" name="GFF3 to GenBank" version="4.0"> - <description>convert gff3 to GenBank</description> - <macros> - <import>macros.xml</import> - <import>cpt-macros.xml</import> - </macros> - <expand macro="requirements"/> - <command detect_errors="aggressive"><![CDATA[ -@GENOME_SELECTOR_PRE@ - -python $__tool_directory__/gff2gb.py -@INPUT_GFF@ -@INPUT_FASTA@ ---transltbl $transltbl -> $output]]></command> - <inputs> - <expand macro="input/gff3+fasta" /> - <param label="Translation Table" name="transltbl" type="select"> - <option value="1">1. The Standard Code</option> - <option value="2">2. The Vertebrate Mitochondrial Code</option> - <option value="3">3. The Yeast Mitochondrial Code</option> - <option value="4">4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> - <option value="5">5. The Invertebrate Mitochondrial Code</option> - <option value="6">6. The Ciliate, Dasycladacean and Hexamita Nuclear Code</option> - <option value="9">9. The Echinoderm and Flatworm Mitochondrial Code</option> - <option value="10">10. The Euplotid Nuclear Code</option> - <option value="11" selected="true">11. The Bacterial, Archaeal and Plant Plastid Code</option> - <option value="12">12. The Alternative Yeast Nuclear Code</option> - <option value="13">13. The Ascidian Mitochondrial Code</option> - <option value="14">14. The Alternative Flatworm Mitochondrial Code</option> - <option value="16">16. Chlorophycean Mitochondrial Code</option> - <option value="21">21. Trematode Mitochondrial Code</option> - <option value="22">22. Scenedesmus obliquus Mitochondrial Code</option> - <option value="23">23. Thraustochytrium Mitochondrial Code</option> - <option value="24">24. Pterobranchia Mitochondrial Code</option> - <option value="25">25. Candidate Division SR1 and Gracilibacteria Code</option> - </param> - </inputs> - <outputs> - <data format="genbank" hidden="false" name="output" label="${gff3_data.name} as GenBank"/> - </outputs> - <tests> - <!-- There have been issues running the diffs for the files --> - <!-- Going to use asserts and run ONE (or no) diffs --> - <test> <!-- ORIGINAL TEST FILE, regenerated 10.12.2020 --> - <param name="reference_genome_source" value="history" /> - <param name="genome_fasta" value="miro.fa" /> - <param name="gff3_data" value="miro.gff3" /> - <output name="output" file="miro_from_tool.gbk" compare="sim_size" delta_frac="0.05"> - <assert_contents> - <has_text text="RBS" /> - <has_text text="gene" /> - <has_text text="CDS" /> - </assert_contents> - </output> - </test> - </tests> - <help><![CDATA[ -.. class:: warningmark - -This is a LOSSY conversion. This tool **TRUNCATES** genbank file identifiers if -they are too long. Your data may not "match up" after processing through this -tool. - -**What it does**: - -Convert gff3 data to genbank. There are many WebApollo specific conventions. A re-numbering is also done. - -**Supported / Expected Data** - -- gene / mRNA / (CDS, Exon) -- gene / tRNA -- terminator - -]]></help> - <expand macro="citations" /> -</tool> |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 cpt_gff_to_gbk/gff3.py --- a/cpt_gff_to_gbk/gff3.py Fri Jun 17 12:57:03 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,346 +0,0 @@\n-import copy\n-import logging\n-\n-log = logging.getLogger()\n-log.setLevel(logging.WARN)\n-\n-\n-def feature_lambda(\n- feature_list,\n- test,\n- test_kwargs,\n- subfeatures=True,\n- parent=None,\n- invert=False,\n- recurse=True,\n-):\n- """Recursively search through features, testing each with a test function, yielding matches.\n-\n- GFF3 is a hierachical data structure, so we need to be able to recursively\n- search through features. E.g. if you\'re looking for a feature with\n- ID=\'bob.42\', you can\'t just do a simple list comprehension with a test\n- case. You don\'t know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.\n-\n- :type feature_list: list\n- :param feature_list: an iterable of features\n-\n- :type test: function reference\n- :param test: a closure with the method signature (feature, **kwargs) where\n- the kwargs are those passed in the next argument. This\n- function should return True or False, True if the feature is\n- to be yielded as part of the main feature_lambda function, or\n- False if it is to be ignored. This function CAN mutate the\n- features passed to it (think "apply").\n-\n- :type test_kwargs: dictionary\n- :param test_kwargs: kwargs to pass to your closure when it is called.\n-\n- :type subfeatures: boolean\n- :param subfeatures: when a feature is matched, should just that feature be\n- yielded to the caller, or should the entire sub_feature\n- tree for that feature be included? subfeatures=True is\n- useful in cases such as searching for a gene feature,\n- and wanting to know what RBS/Shine_Dalgarno_sequences\n- are in the sub_feature tree (which can be accomplished\n- with two feature_lambda calls). subfeatures=False is\n- useful in cases when you want to process (and possibly\n- return) the entire feature tree, such as applying a\n- qualifier to every single feature.\n-\n- :type invert: boolean\n- :param invert: Negate/invert the result of the filter.\n-\n- :rtype: yielded list\n- :return: Yields a list of matching features.\n- """\n- # Either the top level set of [features] or the subfeature attribute\n- for feature in feature_list:\n- feature._parent = parent\n- if not parent:\n- # Set to self so we cannot go above root.\n- feature._parent = feature\n- test_result = test(feature, **test_kwargs)\n- # if (not invert and test_result) or (invert and not test_result):\n- if invert ^ test_result:\n- if not subfeatures:\n- feature_copy = copy.deepcopy(feature)\n- feature_copy.sub_features = list()\n- yield feature_copy\n- else:\n- yield feature\n-\n- if recurse and hasattr(feature, "sub_features"):\n- for x in feature_lambda(\n- feature.sub_features,\n- test,\n- test_kwargs,\n- subfeatures=subfeatures,\n- parent=feature,\n- invert=invert,\n- recurse=recurse,\n- ):\n- yield x\n-\n-\n-def fetchParent(feature):\n- if not hasattr(feature, "_parent") or feature._parent is None:\n- return feature\n- else:\n- return fetchParent(feature._parent)\n-\n-\n-def feature_test_true(feature, **kwargs):\n- return True\n-\n-\n-def feature_test_type(feature, **kwargs):\n- if "type" in kwargs:\n- return str(feature.type).upper() == str(kwargs["type"]).upper()\n- elif "types" in kwargs:\n- for x in kwargs["types"]:\n- if str(feature.type).upper() == str(x).upper():\n- return True\n- return False\n- raise Exception("Incorrect feature_test_type call, ne'..b'feature.location.start,\n- # feature.location.end,\n- # feature.location.strand\n- # )\n- return result\n-\n-\n-def get_gff3_id(gene):\n- return gene.qualifiers.get("Name", [gene.id])[0]\n-\n-\n-def ensure_location_in_bounds(start=0, end=0, parent_length=0):\n- # This prevents frameshift errors\n- while start < 0:\n- start += 3\n- while end < 0:\n- end += 3\n- while start > parent_length:\n- start -= 3\n- while end > parent_length:\n- end -= 3\n- return (start, end)\n-\n-\n-def coding_genes(feature_list):\n- for x in genes(feature_list):\n- if (\n- len(\n- list(\n- feature_lambda(\n- x.sub_features,\n- feature_test_type,\n- {"type": "CDS"},\n- subfeatures=False,\n- )\n- )\n- )\n- > 0\n- ):\n- yield x\n-\n-\n-def genes(feature_list, feature_type="gene", sort=False):\n- """\n- Simple filter to extract gene features from the feature set.\n- """\n-\n- if not sort:\n- for x in feature_lambda(\n- feature_list, feature_test_type, {"type": feature_type}, subfeatures=True\n- ):\n- yield x\n- else:\n- data = list(genes(feature_list, feature_type=feature_type, sort=False))\n- data = sorted(data, key=lambda feature: feature.location.start)\n- for x in data:\n- yield x\n-\n-\n-def wa_unified_product_name(feature):\n- """\n- Try and figure out a name. We gave conflicting instructions, so\n- this isn\'t as trivial as it should be. Sometimes it will be in\n- \'product\' or \'Product\', othertimes in \'Name\'\n- """\n- # Manually applied tags.\n- protein_product = feature.qualifiers.get(\n- "product", feature.qualifiers.get("Product", [None])\n- )[0]\n-\n- # If neither of those are available ...\n- if protein_product is None:\n- # And there\'s a name...\n- if "Name" in feature.qualifiers:\n- if not is_uuid(feature.qualifiers["Name"][0]):\n- protein_product = feature.qualifiers["Name"][0]\n-\n- return protein_product\n-\n-\n-def is_uuid(name):\n- return name.count("-") == 4 and len(name) == 36\n-\n-\n-def get_rbs_from(gene):\n- # Normal RBS annotation types\n- rbs_rbs = list(\n- feature_lambda(\n- gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False\n- )\n- )\n- rbs_sds = list(\n- feature_lambda(\n- gene.sub_features,\n- feature_test_type,\n- {"type": "Shine_Dalgarno_sequence"},\n- subfeatures=False,\n- )\n- )\n- # Fraking apollo\n- apollo_exons = list(\n- feature_lambda(\n- gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False\n- )\n- )\n- apollo_exons = [x for x in apollo_exons if len(x) < 10]\n- # These are more NCBI\'s style\n- regulatory_elements = list(\n- feature_lambda(\n- gene.sub_features,\n- feature_test_type,\n- {"type": "regulatory"},\n- subfeatures=False,\n- )\n- )\n- rbs_regulatory = list(\n- feature_lambda(\n- regulatory_elements,\n- feature_test_quals,\n- {"regulatory_class": ["ribosome_binding_site"]},\n- subfeatures=False,\n- )\n- )\n- # Here\'s hoping you find just one ;)\n- return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons\n-\n-\n-def nice_name(record):\n- """\n- get the real name rather than NCBI IDs and so on. If fails, will return record.id\n- """\n- name = record.id\n- likely_parental_contig = list(genes(record.features, feature_type="contig"))\n- if len(likely_parental_contig) == 1:\n- name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]\n- return name\n-\n-\n-def fsort(it):\n- for i in sorted(it, key=lambda x: int(x.location.start)):\n- yield i\n' |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 cpt_gff_to_gbk/macros.xml --- a/cpt_gff_to_gbk/macros.xml Fri Jun 17 12:57:03 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,62 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="3.8.13">python</requirement> - <requirement type="package" version="1.79">biopython</requirement> - <requirement type="package" version="1.2.2">cpt_gffparser</requirement> - <yield/> - </requirements> - </xml> - <token name="@BLAST_TSV@"> - "$blast_tsv" - </token> - <xml name="blast_tsv"> - <param label="Blast Results" help="TSV/tabular (25 Column)" - name="blast_tsv" type="data" format="tabular" /> - </xml> - - <token name="@BLAST_XML@"> - "$blast_xml" - </token> - <xml name="blast_xml"> - <param label="Blast Results" help="XML format" - name="blast_xml" type="data" format="blastxml" /> - </xml> - <xml name="gff3_with_fasta"> - <param label="Genome Sequences" name="fasta" type="data" format="fasta" /> - <param label="Genome Annotations" name="gff3" type="data" format="gff3" /> - </xml> - <xml name="genome_selector"> - <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> - </xml> - <xml name="gff3_input"> - <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> - </xml> - <xml name="input/gff3+fasta"> - <expand macro="gff3_input" /> - <expand macro="genome_selector" /> - </xml> - <token name="@INPUT_GFF@"> - "$gff3_data" - </token> - <token name="@INPUT_FASTA@"> - genomeref.fa - </token> - <token name="@GENOME_SELECTOR_PRE@"> - ln -s $genome_fasta genomeref.fa; - </token> - <token name="@GENOME_SELECTOR@"> - genomeref.fa - </token> - <xml name="input/fasta"> - <param label="Fasta file" name="sequences" type="data" format="fasta"/> - </xml> - - <token name="@SEQUENCE@"> - "$sequences" - </token> - <xml name="input/fasta/protein"> - <param label="Protein fasta file" name="sequences" type="data" format="fasta"/> - </xml> -</macros> |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 cpt_gff_to_gbk/test-data/miro.fa --- a/cpt_gff_to_gbk/test-data/miro.fa Fri Jun 17 12:57:03 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b'@@ -1,2936 +0,0 @@\n->Miro\n-TTAGTAATGGCTAAAACCATATGTAACATCAATCATGACTTTATAACGGCATACACGCAT\n-TTTTGCGTTATTGTAATCCACTGGGATCGCTACCACGTCAGCAGGATCCACCTTTACCTG\n-AATGACTCTACCAACACCGCCCCCGTAGTGTGGAAGGTATGATTTAGCCGCAACGTGTAG\n-GCCAGTAGAACAGGTGCGCGTTTTATCTTCGTCTACCATGTTTCGAGGCATGGAGACAGT\n-CACACCAGGACTATTATCAAATTTGCCAGTAGCGAGATCTTTATAGTTATCGCGCACACG\n-TTTCCAGGCAAGGAAACAACCATCATCGGTCAGTTCAATGTCATTATGTACAAGGAACCC\n-ATAAAGCTGGTATACAGCATCGCGTGAAGGGTTTCGCATCAACCGTTCAAAGAAGTTCAC\n-CAGATGTTCATACGGACGATCGTTATACATTTCGCGAATGATTCGTTGAGTGATATCAGA\n-ATCAAACACTACATCTTTATATAGAAGCTGATGACCAATGATTTTAATGTTGCCTTTGCT\n-ATAGGTTCGGATCGCTTCTTGAGTATCCAAACAAGTTACAGCACCTTTGACATCACCAGC\n-TTTCAGCATTTCATGCGCTTTCTTAAAGTTCGGATGTGTTTCACCCGCCATGAAAACGCG\n-CCCTTCGTATACAACCGTAATGAACGATTCAGATCCGATCATACGCGGTACAGTGTCTGA\n-TACTTTAGGCTTTTGTTTTACATCCCTTTTCTTAGGGAATTCCGCTAACTTGCGGTTGAT\n-AACGCGCCCAATGGTTCGGGCGCTTACGTTGAACTGTTGGGCCAGTGCTGTTTTACTTGC\n-CCCCGTCAACCATCCATTATAGATAGCTTTCTGTTGTACTTCGTCGAGAATTTTGACCAT\n-TATTTCCACCGTATTAATTTCTTAAACTCACTGAGATTCTTTTCGTTGTTGTAAATCGGA\n-CGAATTGAATAAGAATCGCTATGTTCAACAAGTGAAGCCAGTAACGGGTTTAGTGATTTA\n-AAGATTTCCCATGCCTGATCAACACGTTTTTTCATATTGCTGCGTTTAACGCGCATTGAT\n-GCTACGGACTCACGCAGAATCGGACAACGCACCCTGGAAAGATTTTTACCATCTTTCTCA\n-TACCCTTCCAGACACACTATACGTTCGAGGGTATCAACAATCTTATACAGTTTTTCATTA\n-TATCGGTTTTTCACAATCCGATCAAGCGATACACCGAAACGGCTATGCAATGCGTCTGTT\n-TCTGTTGAGTGATCCTTCCCAATCCATCCAGGCAAGCAATTATCTTTCAATGCCTTTTCA\n-GATTTAACATACTGCTTGCACAGCATATCATCAAAGCATACCAGATTTGAATCCGGGATC\n-CACTTCCAGAGGCTGTTACGTATAGCAAACACAACAGGGATCCCAGTATGGCGCATGATA\n-CGCGATAAAGTAGATTCTTTCATTGCTGAATCCATAGACAACCCGGAATTTTCACCATCT\n-AAGCGGCTATATTCATCAATACCATACAACCGAACACCTGGGGCTTTATCCAGTGATAAA\n-AACTCTGATTTTGTCATAAACAGAGAAGTTTTTGCCAGATTGCCGTTACTATCCAACTCA\n-TAACGATATACGGTCGGGGTTTTTGGGCGCGGTTCTGAATTTTTCGGTGCATATAGCGCT\n-TTTGATTTTTCGCGATCTGCATCATAGATCTCTTTTTCTTTTGTCATTTCACTGGTACGG\n-AGATACACAATTTCCGATTCATCAAAATGACCTTTCCGAACGATATCATTAACAATCTCA\n-CGCTTTGAATCACTATCGTAATATGCAACAAAGCTAACACGGCTTAGGTTATGCATTTTA\n-GCATACCCGACGATATACGGTTTAACCGTGTTGGTATCCACTTTTAACAGAATGAGTTTC\n-TTCTGTTTCCACGGATAATAAATGCGTGTGATGTCCTGGCGTTTGGTTGTTTCTGGCTTA\n-TACTTGCTCCAGCGGCCTCCGCTACCAGTTACCTGATACCATGCATCTTTACCGTCGTAT\n-TCGTTCGCCCAATACCCAGCAACATAATCATCATTATATTTGTTTGGTTTGACTAGTTCG\n-CTATGGATCCAGCCAATAGAATCACCATTGATGCGAAAATTAGCATCTTTACCAACAAAG\n-TTTTGTACCATTGAAGGCAGAGAATGGAACCACGTCAGTTTATCACGCACGGTTTGTAAC\n-TTATCGAATTCTGATTTAACTCGATTGAAATATACCCGGCTGATTTGTTTCAGACGTTCT\n-TTAACAATCCCTACTGTCATTTTATCCATACTCAACTCTTCGCGAGAAGGCATGAAATCA\n-AGTTCACCGATCGGGAAGTCAATAATATACGTATACTGGCTTTCTGTATAGCAATAGAAC\n-ATCGAGGTATCATACAAATCTTTATCCAGAGGATAAATGATGTTACCCATGCGAGCATAT\n-ACACCGCTAGTGTATGCTGATTTATGACGGATCACCCCGCTATCGTTGGTTGCTTCTTTC\n-GGCTGATAGTTGATTTTGAGAATAGAAGCACCAACAAAGTTAGGACGAATATCAGTAAAT\n-GATTCGTATACCCTTGCTGCTTCGTTTTCCCATTCTTTGATATCTTCAACCTTAACCGGA\n-ACAGTGATAGTAACCCCGTTAGGTTCATCGCTTTCAATCTCATACAGAGGATCGCAGAAA\n-GGTTCCCCATCATCCATATAGATTGTGTAACCGCATTTGATACCGTCTTTTACGGATTCC\n-ACCGTGAAAGCATCGGAATAGCAAAGCGGAGATTTGCAACCCAGACCCATAGAACCGATC\n-AGGTCGTTTGAATCATTTTTAGTTGATTCGAAGTAAACGGTAAACGCATCACTAACGAAA\n-TCAGGAGACATACCGATCCCGTAGTCACGAATAACAAAACGAGGATCAACAGCAGTTGGC\n-AACTGGACATCAAACGGGTTCTGATTTCCCGCTTCTTTGTGTCCATCAATCGCATTACAA\n-GACAGTTCGCGAATGATTGCGCGGATCTTGTATTTGTATACTGTCGAAGAAAGGATCTTA\n-TACGCTTTCTTGTTTGCGCGTAGAGATAGTTTGTTTCGTCCCTTGCTGGTATCTGTACCA\n-ACACGGTAGATGGTTTGCGGTGTATCTTCGCGTAATTTCATTGTTTATTTCTCACTTAAC\n-ATTAAAAATAACTTGGTCACAAGAGTACTTCGTTGGCTTTTTGTTCAGACCATATTCTAC\n-TACTTCACAATAGGTGTCAAGGAATTTTACCAATTTTTCTTCCTCGACCTGCTGTTTCTT\n-CATATCAAGGATACCCCACACGATAGCCCCGATAATGACAGAAAAGAACGCACAAAATCC\n-GAATATGGTCAGATATTTTCCCAACTTAGGCGCATTATAACGTGTCATACCTTACCCCTC\n-TTTGCGAATGTATGCAAGTTCTTCATGGGTTACTGGACGGATATACAGACGGCCTTTTGT\n-ATATGCCTTGCGCCCGCTGATCCAAATGTTTTTCATATCCTTAACACCGTTCATCACATC\n-GTTGTAAAACTTCTTATCAGCTTTAGCCTGATAGACTTCACGGCCTTGATAATCTTTCAT\n-GAACAAACAATAAAGGATCTCATTCTTATCAACTAGATTAGCATCCTTTGTAGTTGTTTT\n-ACTTGGTGAAGGTTTCGCACCCAGGCGCAAGGCCATAGCTTGCCACACTTTACCATGTTC\n-ATAACCGCGCCCGACAAGAGCATGAGCGATTTCGTGTAAAAGAGTGTCTAAAATATCCTC\n-GTAGATATCTTCCGCAACATGACGACCAGACAGTTCGATCAGTTTTTTGGTATAACTGCA\n-ACGG'..b'TCGGGTAATATCG\n-TTTGTGATGGGTGTGAAAACATGGAAACAATTGCTGATAAAAATAATTCTAATAATGGTT\n-ATGTTTCTTATGGTAGTAACTTGGTACAAATGGACTGATATATTCCCGATGATAAAAGGT\n-GCCCTTGTAGTCGATACAAGGGCTATCGAAATGGAAAGAACAGAAAAGTTTAATCAATCC\n-GCGTTGGAACAGTTGAGCATAGTTCATCTTACTTCCAACGCGGATTTTTCGGCGGTACTG\n-GCATTCAGACCAAAGAACATAAACTATTTTGTTGACATTGTAGAATATCAGGGAAAATTA\n-CCATCCCAAATCGATCCTAAAAACCTCGGTGGTTATCCGATCGATAAAACATCCGAAGAA\n-TACACGAATCATATAAATGGCTTGTACTATTCATCAACTACAGCAAGTTCCTACCTACCG\n-ACACGTGATTTTGTGCCAGTAGCTTATACTTTTAGTTGCCCTTATTTCAATCTTGATAAC\n-TACTATTCTGGATCGGTTTTGATGGAATGGTATGCAAAGCGGCCTGATATACCAGATATG\n-AAGATAAACATCATATGTGGACAGGCCGCGCGCATTTTAGGTCGAGCGAGGTGATTAACG\n-TAATGCTGGTGTTAAATTGTGTGATCTTCCAATAGCCCGTTTGATTGCTTTAAAGAAGTT\n-CATCACCGGGCTATTTTTCTCGTAAATATCCCAAACTTTCAATTTGTCCCACGGATCCGG\n-AACATAATCTTCATTCCTTGCCGAAACCCCCAACGTAACCCTCCTGTATCCAGCGCTGTC\n-GTGATAATACACGAAGAATGGCCTACCTATTGGCGCAACCTTACAACCTCTCTTAGCGGC\n-TCTCATAGCCTCATCGAACGTCATGGATTCCCCCAAAAATTTCTATGCATGAATGGTCGA\n-ATTCCGATAGTTTCACTCAAAATGAATATCGGATGATCTAGCTCATCGTGGTTTTCTTCG\n-TCAATGACGATATCCCAATCAGTAGCCTTTTGTTCTTCTACCGTGGCAATGAATACCTGG\n-TTTACTTCACACTTGCGGTGTGTACGTCTGATGATTGTATCCCCCTCACGAAACACAATC\n-ATATCAGGATTGGTAGTGCGGTACGCGGTTTTACCCGCGCACACTTCATTAAGCATATCT\n-TCGTATGTCATTATAAAACCTTTACACGTTGAACGATGGTTTGTTTAACGTCTTTGTATT\n-CTCCGTGCTCTTTAACGGTTGCTTTGAAAGTGATTTCATCACCTTCGTTTGCAATGTTAT\n-TACCGAAGTAAACAACAACATTACCATCAACATTAATTTTGGTCATGAATCTTTCTACAG\n-AAGTGTAGTAAGAAACTTGAGTATATCCCAGTGAAATCACTTTCTCAACGGTTCCGGTCA\n-TTTCCAGACGTTGTTTGATTTCACCGATGTGGTTAGCTTTAGAAATGCGTTCCTGGCGCT\n-CTTGTTCCCACTGTTCGCGGATTTCTTCGCGTTTGGCGATATAATCCTTTTCCAGTGCAA\n-CACCCATGCAGTAAGCGCACACAGCATCGAATACAGGGCTGTTTTTGTTGTCTTCTTTAG\n-ACTGGTCAGCCCACCAAAGAACAGTAAACATTGGCATTTCTGCAATTACTTCGCCTTTAC\n-GCTTGCCAATCGGCATGATCCCTTTTTCCAGCAGTTCCAGTTTTTCAGTATCGAACACTG\n-ACAGTTTACCGCGACGTTCGAACAGATCGAAATCTGCAAAGCCCTGGAATATCATTTTGA\n-AAGTATCAGTTTCAGTTAAACGGGCTGAAACACGGTCGAAATACTCACGCGCTTTCGCTT\n-CCGCTTTCTCCGGATCGGTAGATAAGTTGCAGATATAGTTATCAGAAGTATAACCGCCGC\n-CTCTACGCTCAACACGCAAGGTATACATTGCATTTTTACGACCAGAAGAAATGAAGTAAG\n-TGGTAGTAACTACGGTTGCGTTAGTCATGGTATTTCTCCTTAAAGGGTATCTCGTTTCGA\n-TATGGCTAATATAGCAAAAGCCCCTGACCGAAGTCAAGGGCTTTTTCATCATTCATTCGA\n-ATCTTTCATTGTTTTATGAAGATGAATATCAAAAATTTTCCAGTACGCCTTTCCGCGAGG\n-ATAAATTTTTGCTTTGTCAATATCGTTGTTGCTTCCCCATGTGTTGTTTGGGCCACGACA\n-TCGATTTTTTATATAATCTGTATGCCAGAATAAGCGCTGAACCGATGATTCCGTACCTAA\n-TGGATCTTCTTTACTGAACAGAATTTGTATACTCATAAGAAGAACCCAGTGCGAACAATC\n-AGATCGATTTTCTTTTCTGGTTCAAACGGTGATTTGCTATCGATGTTACACTGATAGAAC\n-ATACCAACATACTTTTCAGGAATGTTGGATTCACGCGCCCATTTCAGGTTATCATCGGTA\n-TTCGGCCCCAGCATGAGGTTAACAACATCAACAGCATAATCCTGTTCTTTGGTGTTGCCG\n-GAACCGTTTACATAATGCCCGTGGGCGGTTTTGAGGATCTCAATTACTTCGCTATCGTCA\n-GTTTCGACTTTGTAAAGCGTTGTATTTTCAGGAATTTCTTCAAGAATGATCAAAGCGGTT\n-TTCATCACACTTACCTTTGTGTTTCTGTTTACGTTTTGCTTCTTTAAATGCTCGCTTGCG\n-ATCGCGGTGAGTAGAAGCGCGGTTGAAATCATGTTTCGCTACCAAATTATTCATATAAGC\n-CCCTTAAAGAAAAATATTTAGGGGCTTTCGCCCCTGTATTAATCCAGCAATTTGCGGATC\n-TTGTCTGCGATACGTCCGGCGCGGGTTGCACTTGCAGTATGATCGCTTTCTTTTGAAGCC\n-AGTTCCGCCAGCTTACGCTGATGTTCTTCTTCTGCTGCTTGACGATCTGCTGCAACCTGT\n-GCAACCTGCTCATTATCGTGAGCAATACGCGCTTCCAGTTCAGACAGGGTTTTGTCGAAA\n-GTTGCTACGATTTCATCTACAGAACGAATTTTATTAAACAGTTTCATAATTTATCTCAAT\n-TGGTTAGTTTTAATCAGTATACATCAATATGGTTGAAATTCAAAATCATAAATGTCATTC\n-AGTGCGCGGTTCCACTCGGTGTAGTTTTCACCAGCACCATAACGCATTTGAATAGCACTT\n-TCGAACGTTGATCCGTTGAGGTTCGGGAAACCGAACAGGTTTTTGATTTTGTCATGTGCT\n-ACATAATACAGAGAAGCACTTTCCAGCATCGCAACCATCGCAGACGGTTCGTGTTCGCGG\n-CGCTTGATACGTAACAGAGTACGACTAGCACCAGTTTTACGGCGTTGATTTGGTGCTACG\n-TAGAAACGGAATACTACGCGCCCTGTTTTATCATCAACTACCAGGTAAAACCCGTTTTCT\n-TTCAGATCCACGCCTTCGAATTTCTTGAAGGTTCCGCGTTTCATGTCACCAATTTTAATT\n-GCATATTTGTGAATGTCAAGTCTTGTCAGAATTCTTTTCATATTTTTTAGATACCAGTTT\n-GCCTAATTTTGTAATTTCGCCTGTTTTTACGTTAACAAACAAGGCGATGCTCAGAAATGG\n-GATGCTAATCACTACGCTGATCAATGTAAACAGAAAACGTATCACAAAAAGAACAGCACG\n-TTCAAGATATCGTTGCATCCACGCGATTCCTAAACAACTATACCCTACTATAAAGGTGGT\n-TGCAACATAAAATGCACCAAATCCTTTACGAAATACGTAACCTTTCCCGGATTCTATCCG\n-GTCGTCGGCCCACATTTCACGGGCAGTTTTCAGAATAGATTCACCACTAGCGCGAGTTTC\n-GTTAGCCGAAGGCATGTTTTTAAATTTCATGATAGTCTCCTATGCGCCCAGAACTCTCCA\n-GGCGCGGTTGTTTAG\n' |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 cpt_gff_to_gbk/test-data/miro.gff3 --- a/cpt_gff_to_gbk/test-data/miro.gff3 Fri Jun 17 12:57:03 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b'@@ -1,827 +0,0 @@\n-##gff-version 3\n-##sequence-region Miro 1 176055\n-Miro\tfeature\tgene\t7454\t7906\t.\t-\t.\tID=Miro_8\n-Miro\tGenBank\tCDS\t7454\t7894\t.\t-\t1\tID=Miro_8.CDS;Name=Miro_8;Parent=Miro_8;obsolete_name=Miro_156;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t7903\t7906\t.\t-\t1\tAlias=Miro_8;ID=Miro_8.RBS;Name=Miro_8;Parent=Miro_8\n-Miro\tfeature\tgene\t7917\t8512\t.\t-\t.\tID=Miro_9\n-Miro\tGenBank\tCDS\t7917\t8501\t.\t-\t1\tID=Miro_9.CDS;Name=Miro_9;Parent=Miro_9;obsolete_name=Miro_155;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t8509\t8512\t.\t-\t1\tAlias=Miro_9;ID=Miro_9.RBS;Name=Miro_9;Parent=Miro_9\n-Miro\tfeature\tgene\t123276\t124212\t.\t+\t.\tID=Miro_206\n-Miro\tGenBank\tCDS\t123286\t124212\t.\t+\t1\tAlias=Miro_206;ID=Miro_206.CDS;Name=Miro_206;Parent=Miro_206;obsolete_name=Miro_234;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t123276\t123279\t.\t+\t1\tID=Miro_206.rbs;Name=Miro_206;Parent=Miro_206\n-Miro\tfeature\tgene\t68490\t70715\t.\t-\t.\tID=Miro_117\n-Miro\tGenBank\tCDS\t68490\t70706\t.\t-\t1\tID=Miro_117.CDS;Name=Miro_117;Note=contains von Willebrand factor%2C type A;Parent=Miro_117;obsolete_name=Miro_047;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t70713\t70715\t.\t-\t1\tAlias=Miro_117;ID=Miro_117.RBS;Name=Miro_117;Parent=Miro_117\n-Miro\tfeature\tgene\t115729\t116735\t.\t+\t.\tID=Miro_200\n-Miro\tGenBank\tCDS\t115743\t116735\t.\t+\t1\tAlias=Miro_200;ID=Miro_200.CDS;Name=Miro_200;Note=T4 gp6-like;Parent=Miro_200;obsolete_name=Miro_240;product=baseplate structural protein\n-Miro\tGenBank\tShine_Dalgarno_sequence\t115729\t115732\t.\t+\t1\tID=Miro_200.RBS;Name=Miro_200;Parent=Miro_200\n-Miro\tfeature\tgene\t116735\t117608\t.\t+\t.\tID=Miro_201\n-Miro\tGenBank\tCDS\t116745\t117608\t.\t+\t1\tAlias=Miro_201;ID=Miro_201.CDS;Name=Miro_201;Note=T4 gp9/gp10-like;Parent=Miro_201;obsolete_name=Miro_239;product=baseplate structural protein\n-Miro\tGenBank\tShine_Dalgarno_sequence\t116735\t116738\t.\t+\t1\tID=Miro_201.RBS;Name=Miro_201;Parent=Miro_201\n-Miro\tfeature\tgene\t117595\t119422\t.\t+\t.\tID=Miro_202\n-Miro\tGenBank\tCDS\t117605\t119422\t.\t+\t1\tAlias=Miro_202;ID=Miro_202.CDS;Name=Miro_202;Note=T4 gp9/gp10-like;Parent=Miro_202;obsolete_name=Miro_238;product=baseplate structural protein\n-Miro\tGenBank\tShine_Dalgarno_sequence\t117595\t117597\t.\t+\t1\tID=Miro_202.RBS;Name=Miro_202;Parent=Miro_202\n-Miro\tfeature\tgene\t119412\t120090\t.\t+\t.\tID=Miro_203\n-Miro\tGenBank\tCDS\t119422\t120090\t.\t+\t1\tAlias=Miro_203;ID=Miro_203.CDS;Name=Miro_203;Note=T4 gp11-like;Parent=Miro_203;obsolete_name=Miro_237;product=baseplate to short tail fiber connector protein\n-Miro\tGenBank\tShine_Dalgarno_sequence\t119412\t119415\t.\t+\t1\tID=Miro_203.RBS;Name=Miro_203;Parent=Miro_203\n-Miro\tfeature\tgene\t81829\t81940\t.\t-\t.\tID=Miro_142\n-Miro\tGenBank\tCDS\t81829\t81927\t.\t-\t1\tID=Miro_142.CDS;Name=Miro_142;Parent=Miro_142;obsolete_name=Miro_022;product=hypothetical conserved;tmhelix=1 TMD %284-26%29 N out%2C C in\n-Miro\tGenBank\tShine_Dalgarno_sequence\t81938\t81940\t.\t-\t1\tAlias=Miro_142;ID=Miro_142.RBS;Name=Miro_142;Parent=Miro_142\n-Miro\tfeature\tgene\t1\t910\t.\t-\t.\tID=Miro_1\n-Miro\tGenBank\tCDS\t1\t900\t.\t-\t1\tID=Miro_1.CDS;Name=Miro_1;Parent=Miro_1;obsolete_name=Miro_163;product=rIIb\n-Miro\tGenBank\tShine_Dalgarno_sequence\t908\t910\t.\t-\t1\tAlias=Miro_1;ID=Miro_1.RBS;Name=Miro_1;Parent=Miro_1\n-Miro\tfeature\tgene\t900\t3173\t.\t-\t.\tID=Miro_2\n-Miro\tGenBank\tCDS\t900\t3161\t.\t-\t1\tID=Miro_2.CDS;Name=Miro_2;Parent=Miro_2;obsolete_name=Miro_162;product=rIIa\n-Miro\tGenBank\tShine_Dalgarno_sequence\t3171\t3173\t.\t-\t1\tAlias=Miro_2;ID=Miro_2.RBS;Name=Miro_2;Parent=Miro_2\n-Miro\tfeature\tgene\t3172\t3417\t.\t-\t.\tID=Miro_3\n-Miro\tGenBank\tCDS\t3172\t3408\t.\t-\t1\tID=Miro_3.CDS;Name=Miro_3;Parent=Miro_3;obsolete_name=Miro_161;product=hypothetical conserved;tmhelix=1 TMD %2812-34%29 N in%2C C out\n-Miro\tGenBank\tShine_Dalgarno_sequence\t3414\t3417\t.\t-\t1\tAlias=Miro_3;ID=Miro_3.RBS;Name=Miro_3;Parent=Miro_3\n-Miro\tfeature\tgene\t3412\t3979\t.\t-\t.\tID=Miro_4\n-Miro\tGenBank\tCDS\t3412\t3966\t.\t-\t1\tID=Miro_4.CDS;Name=Miro_4;Note=contains SprT domain;Parent=Miro_4;obsolete_name=Miro_160;product=hypothetical'..b't=Miro_98;obsolete_name=Miro_066;product=hypothetical conserved;tmhelix=2TMDs %287-26%2C 31-53%29 N in%2C C in\n-Miro\tGenBank\tShine_Dalgarno_sequence\t57368\t57372\t.\t+\t1\tID=Miro_98.RBS;Name=Miro_98;Parent=Miro_98\n-Miro\tfeature\tgene\t57613\t57914\t.\t+\t.\tID=Miro_99\n-Miro\tGenBank\tCDS\t57624\t57914\t.\t+\t1\tAlias=Miro_99;ID=Miro_99.CDS;Name=Miro_99;Parent=Miro_99;obsolete_name=Miro_065;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t57613\t57616\t.\t+\t1\tID=Miro_99.RBS;Name=Miro_99;Parent=Miro_99\n-Miro\tfeature\tgene\t81924\t82086\t.\t-\t.\tID=Miro_143\n-Miro\tGenBank\tCDS\t81924\t82079\t.\t-\t1\tID=Miro_143.CDS;Name=Miro_143;Parent=Miro_143;obsolete_name=Miro_021;product=hypothetical conserved;tmhelix=1 TMD %2815-37%29 N out%2C C in\n-Miro\tGenBank\tShine_Dalgarno_sequence\t82084\t82086\t.\t-\t1\tAlias=Miro_143;ID=Miro_143.RBS;Name=Miro_143;Parent=Miro_143\n-Miro\tfeature\tgene\t67179\t67658\t.\t-\t.\tID=Miro_114\n-Miro\tGenBank\tCDS\t67179\t67649\t.\t-\t1\tID=Miro_114.CDS;Name=Miro_114;Parent=Miro_114;obsolete_name=Miro_050;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t67656\t67658\t.\t-\t1\tAlias=Miro_114;ID=Miro_114.RBS;Name=Miro_114;Parent=Miro_114\n-Miro\tfeature\tgene\t81366\t81851\t.\t-\t.\tID=Miro_141\n-Miro\tGenBank\tCDS\t81366\t81839\t.\t-\t1\tID=Miro_141.CDS;Name=Miro_141;Note=contains macro domain;Parent=Miro_141;obsolete_name=Miro_023;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t81849\t81851\t.\t-\t1\tAlias=Miro_141;ID=Miro_141.RBS;Name=Miro_141;Parent=Miro_141\n-Miro\tfeature\tgene\t81076\t81376\t.\t-\t.\tID=Miro_140\n-Miro\tGenBank\tCDS\t81076\t81363\t.\t-\t1\tID=Miro_140.CDS;Name=Miro_140;Parent=Miro_140;obsolete_name=Miro_024;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t81374\t81376\t.\t-\t1\tAlias=Miro_140;ID=Miro_140.RBS;Name=Miro_140;Parent=Miro_140\n-Miro\tfeature\tgene\t83223\t83630\t.\t-\t.\tID=Miro_147\n-Miro\tGenBank\tCDS\t83223\t83618\t.\t-\t1\tID=Miro_147.CDS;Name=Miro_147;Parent=Miro_147;obsolete_name=Miro_017;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t83627\t83630\t.\t-\t1\tAlias=Miro_147;ID=Miro_147.RBS;Name=Miro_147;Parent=Miro_147\n-Miro\tfeature\tgene\t83066\t83224\t.\t-\t.\tID=Miro_146\n-Miro\tGenBank\tCDS\t83066\t83212\t.\t-\t1\tID=Miro_146.CDS;Name=Miro_146;Parent=Miro_146;obsolete_name=Miro_018;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t83221\t83224\t.\t-\t1\tAlias=Miro_146;ID=Miro_146.RBS;Name=Miro_146;Parent=Miro_146\n-Miro\tfeature\tgene\t82479\t83083\t.\t-\t.\tID=Miro_145\n-Miro\tGenBank\tCDS\t82479\t83069\t.\t-\t1\tID=Miro_145.CDS;Name=Miro_145;Parent=Miro_145;obsolete_name=Miro_019;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t83080\t83083\t.\t-\t1\tAlias=Miro_145;ID=Miro_145.RBS;Name=Miro_145;Parent=Miro_145\n-Miro\tfeature\tgene\t67646\t67816\t.\t-\t.\tID=Miro_115\n-Miro\tGenBank\tCDS\t67646\t67804\t.\t-\t1\tID=Miro_115.CDS;Name=Miro_115;Parent=Miro_115;obsolete_name=Miro_049;product=hypothetical conserved;tmhelix=2TMDs %282-21%2C 31-50%29 N in%2C C in\n-Miro\tGenBank\tShine_Dalgarno_sequence\t67814\t67816\t.\t-\t1\tAlias=Miro_115;ID=Miro_115.RBS;Name=Miro_115;Parent=Miro_115\n-Miro\tfeature\tgene\t84392\t84959\t.\t-\t.\tID=Miro_149\n-Miro\tGenBank\tCDS\t84392\t84946\t.\t-\t1\tID=Miro_149.CDS;Name=Miro_149;Parent=Miro_149;obsolete_name=Miro_015;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t84955\t84959\t.\t-\t1\tAlias=Miro_149;ID=Miro_149.RBS;Name=Miro_149;Parent=Miro_149\n-Miro\tfeature\tgene\t83686\t84337\t.\t-\t.\tID=Miro_148\n-Miro\tGenBank\tCDS\t83686\t84327\t.\t-\t1\tID=Miro_148.CDS;Name=Miro_148;Note=T4 RegB-like;Parent=Miro_148;obsolete_name=Miro_016;product=endoribonuclease;signal=signal peptidase II cleavage site 12-13\n-Miro\tGenBank\tShine_Dalgarno_sequence\t84334\t84337\t.\t-\t1\tAlias=Miro_148;ID=Miro_148.RBS;Name=Miro_148;Parent=Miro_148\n-Miro\tfeature\tgene\t67801\t68461\t.\t-\t.\tID=Miro_116\n-Miro\tGenBank\tCDS\t67801\t68451\t.\t-\t1\tID=Miro_116.CDS;Name=Miro_116;Parent=Miro_116;obsolete_name=Miro_048;product=hypothetical conserved\n-Miro\tGenBank\tShine_Dalgarno_sequence\t68457\t68461\t.\t-\t1\tAlias=Miro_116;ID=Miro_116.RBS;Name=Miro_116;Parent=Miro_116\n' |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 cpt_gff_to_gbk/test-data/miro_from_tool.gbk --- a/cpt_gff_to_gbk/test-data/miro_from_tool.gbk Fri Jun 17 12:57:03 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b'@@ -1,5817 +0,0 @@\n-LOCUS Miro 176055 bp DNA UNK 01-JAN-1980\n-DEFINITION .\n-ACCESSION Miro\n-VERSION Miro\n-KEYWORDS .\n-SOURCE .\n- ORGANISM .\n- .\n-FEATURES Location/Qualifiers\n- DNA 1..176055\n- /Alias="Miro"\n- /date="01-JAN-1980"\n- gene complement(1..910)\n- /locus_tag="CPT_Miro_001"\n- CDS complement(1..900)\n- /obsolete_name="Miro_163"\n- /product="rIIb"\n- /locus_tag="CPT_Miro_001"\n- /transl_table=11\n- gene complement(900..3173)\n- /locus_tag="CPT_Miro_002"\n- CDS complement(900..3161)\n- /obsolete_name="Miro_162"\n- /product="rIIa"\n- /locus_tag="CPT_Miro_002"\n- /transl_table=11\n- RBS complement(908..910)\n- /Alias="Miro_1"\n- /locus_tag="CPT_Miro_001"\n- RBS complement(3171..3173)\n- /Alias="Miro_2"\n- /locus_tag="CPT_Miro_002"\n- gene complement(3172..3417)\n- /locus_tag="CPT_Miro_003"\n- CDS complement(3172..3408)\n- /obsolete_name="Miro_161"\n- /product="hypothetical conserved"\n- /tmhelix="1 TMD (12-34) N in, C out"\n- /locus_tag="CPT_Miro_003"\n- /transl_table=11\n- gene complement(3412..3979)\n- /locus_tag="CPT_Miro_004"\n- CDS complement(3412..3966)\n- /obsolete_name="Miro_160"\n- /product="hypothetical conserved"\n- /locus_tag="CPT_Miro_004"\n- /transl_table=11\n- /note="contains SprT domain"\n- RBS complement(3414..3417)\n- /Alias="Miro_3"\n- /locus_tag="CPT_Miro_003"\n- RBS complement(3975..3979)\n- /Alias="Miro_4"\n- /locus_tag="CPT_Miro_004"\n- gene complement(4038..5334)\n- /locus_tag="CPT_Miro_005"\n- CDS complement(4038..5324)\n- /obsolete_name="Miro_159"\n- /product="DNA topoisomerase II medium subunit"\n- /locus_tag="CPT_Miro_005"\n- /transl_table=11\n- /note="T4 gp52-like"\n- gene complement(5324..7231)\n- /locus_tag="CPT_Miro_006"\n- CDS complement(5324..7222)\n- /obsolete_name="Miro_158"\n- /product="DNA topoisomerase II, large subunit"\n- /locus_tag="CPT_Miro_006"\n- /transl_table=11\n- /note="T4 gp39-like"\n- RBS complement(5331..5334)\n- /Alias="Miro_5"\n- /locus_tag="CPT_Miro_005"\n- RBS complement(7228..7231)\n- /Alias="Miro_6"\n- /locus_tag="CPT_Miro_006"\n- gene complement(7290..7465)\n- /locus_tag="CPT_Miro_007"\n- CDS complement(7290..7454)\n- /obsolete_name="Miro_157"\n- /product="hypothetical conserved"\n- /locus_tag="CPT_Miro_007"\n- /transl_table=11\n- /note="contains zinc ribbon domain"\n- gene complement(7454..7906)\n- /locus_tag="CPT_Miro_008"\n- CDS complement(7454..7894)\n- /obsolete_name="Miro_156"\n- /product="hypothetical conserved"\n- /locus_tag="CPT_Miro_008"\n- /tr'..b'aacccc caacgtaacc ctcctgtatc cagcgctgtc\n- 172981 gtgataatac acgaagaatg gcctacctat tggcgcaacc ttacaacctc tcttagcggc\n- 173041 tctcatagcc tcatcgaacg tcatggattc ccccaaaaat ttctatgcat gaatggtcga\n- 173101 attccgatag tttcactcaa aatgaatatc ggatgatcta gctcatcgtg gttttcttcg\n- 173161 tcaatgacga tatcccaatc agtagccttt tgttcttcta ccgtggcaat gaatacctgg\n- 173221 tttacttcac acttgcggtg tgtacgtctg atgattgtat ccccctcacg aaacacaatc\n- 173281 atatcaggat tggtagtgcg gtacgcggtt ttacccgcgc acacttcatt aagcatatct\n- 173341 tcgtatgtca ttataaaacc tttacacgtt gaacgatggt ttgtttaacg tctttgtatt\n- 173401 ctccgtgctc tttaacggtt gctttgaaag tgatttcatc accttcgttt gcaatgttat\n- 173461 taccgaagta aacaacaaca ttaccatcaa cattaatttt ggtcatgaat ctttctacag\n- 173521 aagtgtagta agaaacttga gtatatccca gtgaaatcac tttctcaacg gttccggtca\n- 173581 tttccagacg ttgtttgatt tcaccgatgt ggttagcttt agaaatgcgt tcctggcgct\n- 173641 cttgttccca ctgttcgcgg atttcttcgc gtttggcgat ataatccttt tccagtgcaa\n- 173701 cacccatgca gtaagcgcac acagcatcga atacagggct gtttttgttg tcttctttag\n- 173761 actggtcagc ccaccaaaga acagtaaaca ttggcatttc tgcaattact tcgcctttac\n- 173821 gcttgccaat cggcatgatc cctttttcca gcagttccag tttttcagta tcgaacactg\n- 173881 acagtttacc gcgacgttcg aacagatcga aatctgcaaa gccctggaat atcattttga\n- 173941 aagtatcagt ttcagttaaa cgggctgaaa cacggtcgaa atactcacgc gctttcgctt\n- 174001 ccgctttctc cggatcggta gataagttgc agatatagtt atcagaagta taaccgccgc\n- 174061 ctctacgctc aacacgcaag gtatacattg catttttacg accagaagaa atgaagtaag\n- 174121 tggtagtaac tacggttgcg ttagtcatgg tatttctcct taaagggtat ctcgtttcga\n- 174181 tatggctaat atagcaaaag cccctgaccg aagtcaaggg ctttttcatc attcattcga\n- 174241 atctttcatt gttttatgaa gatgaatatc aaaaattttc cagtacgcct ttccgcgagg\n- 174301 ataaattttt gctttgtcaa tatcgttgtt gcttccccat gtgttgtttg ggccacgaca\n- 174361 tcgatttttt atataatctg tatgccagaa taagcgctga accgatgatt ccgtacctaa\n- 174421 tggatcttct ttactgaaca gaatttgtat actcataaga agaacccagt gcgaacaatc\n- 174481 agatcgattt tcttttctgg ttcaaacggt gatttgctat cgatgttaca ctgatagaac\n- 174541 ataccaacat acttttcagg aatgttggat tcacgcgccc atttcaggtt atcatcggta\n- 174601 ttcggcccca gcatgaggtt aacaacatca acagcataat cctgttcttt ggtgttgccg\n- 174661 gaaccgttta cataatgccc gtgggcggtt ttgaggatct caattacttc gctatcgtca\n- 174721 gtttcgactt tgtaaagcgt tgtattttca ggaatttctt caagaatgat caaagcggtt\n- 174781 ttcatcacac ttacctttgt gtttctgttt acgttttgct tctttaaatg ctcgcttgcg\n- 174841 atcgcggtga gtagaagcgc ggttgaaatc atgtttcgct accaaattat tcatataagc\n- 174901 cccttaaaga aaaatattta ggggctttcg cccctgtatt aatccagcaa tttgcggatc\n- 174961 ttgtctgcga tacgtccggc gcgggttgca cttgcagtat gatcgctttc ttttgaagcc\n- 175021 agttccgcca gcttacgctg atgttcttct tctgctgctt gacgatctgc tgcaacctgt\n- 175081 gcaacctgct cattatcgtg agcaatacgc gcttccagtt cagacagggt tttgtcgaaa\n- 175141 gttgctacga tttcatctac agaacgaatt ttattaaaca gtttcataat ttatctcaat\n- 175201 tggttagttt taatcagtat acatcaatat ggttgaaatt caaaatcata aatgtcattc\n- 175261 agtgcgcggt tccactcggt gtagttttca ccagcaccat aacgcatttg aatagcactt\n- 175321 tcgaacgttg atccgttgag gttcgggaaa ccgaacaggt ttttgatttt gtcatgtgct\n- 175381 acataataca gagaagcact ttccagcatc gcaaccatcg cagacggttc gtgttcgcgg\n- 175441 cgcttgatac gtaacagagt acgactagca ccagttttac ggcgttgatt tggtgctacg\n- 175501 tagaaacgga atactacgcg ccctgtttta tcatcaacta ccaggtaaaa cccgttttct\n- 175561 ttcagatcca cgccttcgaa tttcttgaag gttccgcgtt tcatgtcacc aattttaatt\n- 175621 gcatatttgt gaatgtcaag tcttgtcaga attcttttca tattttttag ataccagttt\n- 175681 gcctaatttt gtaatttcgc ctgtttttac gttaacaaac aaggcgatgc tcagaaatgg\n- 175741 gatgctaatc actacgctga tcaatgtaaa cagaaaacgt atcacaaaaa gaacagcacg\n- 175801 ttcaagatat cgttgcatcc acgcgattcc taaacaacta taccctacta taaaggtggt\n- 175861 tgcaacataa aatgcaccaa atcctttacg aaatacgtaa cctttcccgg attctatccg\n- 175921 gtcgtcggcc cacatttcac gggcagtttt cagaatagat tcaccactag cgcgagtttc\n- 175981 gttagccgaa ggcatgtttt taaatttcat gatagtctcc tatgcgccca gaactctcca\n- 176041 ggcgcggttg tttag\n-//\n' |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 gff2gb.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff2gb.py Mon Jun 05 02:44:32 2023 +0000 |
[ |
b'@@ -0,0 +1,453 @@\n+#!/usr/bin/env python\n+"""Convert a GFF and associated FASTA file into GenBank format.\n+\n+Usage:\n+gff_to_genbank.py <GFF annotation file> <FASTA sequence file>\n+"""\n+import argparse\n+import sys\n+import re\n+import copy\n+import itertools\n+import logging\n+from Bio import SeqIO\n+\n+# from Bio.Alphabet import generic_dna\n+from Bio.SeqFeature import CompoundLocation, FeatureLocation\n+from CPT_GFFParser import gffParse, gffWrite\n+from gff3 import (\n+ feature_lambda,\n+ wa_unified_product_name,\n+ is_uuid,\n+ feature_test_type,\n+ fsort,\n+ feature_test_true,\n+ feature_test_quals,\n+)\n+\n+default_name = re.compile(r"^gene_(\\d+)$")\n+logging.basicConfig(level=logging.INFO)\n+\n+\n+def rename_key(ds, k_f, k_t):\n+ """Rename a key in a dictionary and return it, FP style"""\n+ # If they key is not in the dictionary, just return immediately\n+ if k_f not in ds:\n+ return ds\n+\n+ # Otherwise, we check if the target key is in there\n+ if k_t in ds:\n+ # If it is, we need to append\n+ ds[k_t] += ds[k_f]\n+ else:\n+ # if not, we can just set.\n+ ds[k_t] = ds[k_f]\n+\n+ # Remove source\n+ del ds[k_f]\n+ return ds\n+\n+\n+def gff3_to_genbank(gff_file, fasta_file, transltbl):\n+ fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta")) # , generic_dna))\n+ gff_iter = gffParse(gff_file, fasta_input)\n+\n+ for record in gff_iter:\n+ yield handle_record(record, transltbl)\n+\n+\n+def handle_non_gene_features(features):\n+ # These are NON-GENE features (maybe terminators? etc?)\n+ for feature in feature_lambda(\n+ features,\n+ feature_test_type,\n+ {"type": "gene"},\n+ subfeatures=False,\n+ invert=True,\n+ recurse=True, # used to catch RBS from new apollo runs (used to be False)\n+ ):\n+ if feature.type in (\n+ "terminator",\n+ "tRNA",\n+ "Shine_Dalgarno_sequence",\n+ "sequence_feature",\n+ "recombination_feature",\n+ "sequence_alteration",\n+ "binding_site",\n+ ):\n+ yield feature\n+ elif feature.type in ("CDS",):\n+ pass\n+ else:\n+ yield feature\n+\n+\n+def fminmax(feature):\n+ fmin = None\n+ fmax = None\n+ for sf in feature_lambda([feature], feature_test_true, {}, subfeatures=True):\n+ if fmin is None:\n+ fmin = sf.location.start\n+ fmax = sf.location.end\n+ if sf.location.start < fmin:\n+ fmin = sf.location.start\n+ if sf.location.end > fmax:\n+ fmax = sf.location.end\n+ return fmin, fmax\n+\n+\n+def fix_gene_boundaries(feature):\n+ # There is a frustrating bug in apollo whereby we have created gene\n+ # features which are LARGER than expected, but we cannot see this.\n+ # We only see a perfect sized gene + great SD together.\n+ #\n+ # So, we have this awful hack to clamp the location of the gene\n+ # feature to the contained mRNAs. This is good enough for now.\n+ fmin, fmax = fminmax(feature)\n+ if feature.location.strand > 0:\n+ feature.location = FeatureLocation(fmin, fmax, strand=1)\n+ else:\n+ feature.location = FeatureLocation(fmin, fmax, strand=-1)\n+ return feature\n+\n+\n+def fix_gene_qualifiers(name, feature, fid):\n+ for mRNA in feature.sub_features:\n+ mRNA.qualifiers["locus_tag"] = "CPT_%s_%03d" % (name, fid)\n+ # And some exons below that\n+ sf_replacement = []\n+ for sf in mRNA.sub_features:\n+ # We set a locus_tag on ALL sub features\n+ sf.qualifiers["locus_tag"] = "CPT_%s_%03d" % (name, fid)\n+ # Remove Names which are UUIDs\n+ # NOT GOOD PRACTICE\n+ try:\n+ if is_uuid(sf.qualifiers["Name"][0]):\n+ del sf.qualifiers["Name"]\n+ except KeyError:\n+ continue # might should go back to pass, I have not put thought into this still\n+\n+ # If it is the RBS exo'..b'terating over\n+ replacement_feats = []\n+ replacement_feats += list(handle_non_gene_features(record.features))\n+\n+ # Renumbering requires sorting\n+ fid = 0\n+ for feature in fsort(\n+ feature_lambda(\n+ record.features, feature_test_type, {"type": "gene"}, subfeatures=True\n+ )\n+ ):\n+ # Our modifications only involve genes\n+ fid += 1\n+\n+ feature = fix_gene_boundaries(feature)\n+ # Which have mRNAs we\'ll drop later\n+ feature = fix_gene_qualifiers(record.id, feature, fid)\n+\n+ # Wipe out the parent gene\'s data, leaving only a locus_tag\n+ feature.qualifiers = {"locus_tag": "CPT_%s_%03d" % (record.id, fid)}\n+\n+ # Patch our features back in (even if they\'re non-gene features)\n+ replacement_feats.append(feature)\n+\n+ replacement_feats = fix_frameshifts(replacement_feats)\n+ # exit(0)\n+ flat_features = feature_lambda(\n+ replacement_feats, lambda x: True, {}, subfeatures=True\n+ )\n+\n+ flat_features = remove_useless_features(flat_features)\n+\n+ # Meat of our modifications\n+ for flat_feat in flat_features:\n+ # Try and figure out a name. We gave conflicting instructions, so\n+ # this isn\'t as trivial as it should be.\n+ protein_product = wa_unified_product_name(flat_feat)\n+\n+ for x in (\n+ "source",\n+ "phase",\n+ "Parent",\n+ "ID",\n+ "owner",\n+ "date_creation",\n+ "date_last_modified",\n+ "datasetSource",\n+ ):\n+ if x in flat_feat.qualifiers:\n+ if x == "ID":\n+ flat_feat._ID = flat_feat.qualifiers["ID"]\n+ del flat_feat.qualifiers[x]\n+\n+ # Add product tag\n+ if flat_feat.type == "CDS":\n+ flat_feat.qualifiers["product"] = [protein_product]\n+ flat_feat.qualifiers["transl_table"] = [transltbl]\n+ if "Product" in flat_feat.qualifiers:\n+ del flat_feat.qualifiers["Product"]\n+ elif flat_feat.type == "RBS":\n+ if "locus_tag" not in flat_feat.qualifiers.keys():\n+ continue\n+\n+ elif flat_feat.type == "terminator":\n+ flat_feat.type = "regulatory"\n+ flat_feat.qualifiers = {"regulatory_class": "terminator"}\n+\n+ # In genbank format, note is lower case.\n+ flat_feat.qualifiers = rename_key(flat_feat.qualifiers, "Note", "note")\n+ flat_feat.qualifiers = rename_key(flat_feat.qualifiers, "description", "note")\n+ flat_feat.qualifiers = rename_key(flat_feat.qualifiers, "protein", "note")\n+ flat_feat.qualifiers = rename_key(flat_feat.qualifiers, "Dbxref", "db_xref")\n+ if "Name" in flat_feat.qualifiers:\n+ del flat_feat.qualifiers["Name"]\n+\n+ # more apollo nonsense\n+ if "Manually set translation start" in flat_feat.qualifiers.get("note", []):\n+ flat_feat.qualifiers["note"].remove("Manually set translation start")\n+\n+ # Append the feature\n+ full_feats.append(flat_feat)\n+\n+ # Update our features\n+ record.features = fsort(full_feats)\n+ # Strip off record names that would cause crashes.\n+ record.name = record.name[0:16]\n+ return record\n+\n+\n+if __name__ == "__main__":\n+ # Grab all of the filters from our plugin loader\n+ parser = argparse.ArgumentParser(description="Convert gff3 to gbk")\n+ parser.add_argument("gff_file", type=argparse.FileType("r"), help="GFF3 file")\n+ parser.add_argument("fasta_file", type=argparse.FileType("r"), help="Fasta Input")\n+ parser.add_argument(\n+ "--transltbl",\n+ type=int,\n+ default=11,\n+ help="Translation Table choice for CDS tag, default 11",\n+ )\n+ args = parser.parse_args()\n+\n+ for record in gff3_to_genbank(**vars(args)):\n+ record.annotations["molecule_type"] = "DNA"\n+ # record.seq.alphabet = generic_dna\n+ SeqIO.write([record], sys.stdout, "genbank")\n' |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 gff2gb.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff2gb.xml Mon Jun 05 02:44:32 2023 +0000 |
[ |
@@ -0,0 +1,78 @@ +<tool id="edu.tamu.cpt.gff.gff2gb" name="GFF3 to GenBank" version="4.0"> + <description>convert gff3 to GenBank</description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="aggressive"><![CDATA[ +@GENOME_SELECTOR_PRE@ + +'python $__tool_directory__/gff2gb.py' +@INPUT_GFF@ +@INPUT_FASTA@ +--transltbl '$transltbl' +> '$output']]></command> + <inputs> + <expand macro="input/gff3+fasta"/> + <param label="Translation Table" name="transltbl" type="select"> + <option value="1">1. The Standard Code</option> + <option value="2">2. The Vertebrate Mitochondrial Code</option> + <option value="3">3. The Yeast Mitochondrial Code</option> + <option value="4">4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> + <option value="5">5. The Invertebrate Mitochondrial Code</option> + <option value="6">6. The Ciliate, Dasycladacean and Hexamita Nuclear Code</option> + <option value="9">9. The Echinoderm and Flatworm Mitochondrial Code</option> + <option value="10">10. The Euplotid Nuclear Code</option> + <option value="11" selected="true">11. The Bacterial, Archaeal and Plant Plastid Code</option> + <option value="12">12. The Alternative Yeast Nuclear Code</option> + <option value="13">13. The Ascidian Mitochondrial Code</option> + <option value="14">14. The Alternative Flatworm Mitochondrial Code</option> + <option value="16">16. Chlorophycean Mitochondrial Code</option> + <option value="21">21. Trematode Mitochondrial Code</option> + <option value="22">22. Scenedesmus obliquus Mitochondrial Code</option> + <option value="23">23. Thraustochytrium Mitochondrial Code</option> + <option value="24">24. Pterobranchia Mitochondrial Code</option> + <option value="25">25. Candidate Division SR1 and Gracilibacteria Code</option> + </param> + </inputs> + <outputs> + <data format="genbank" hidden="false" name="output" label="${gff3_data.name} as GenBank"/> + </outputs> + <tests> + <!-- There have been issues running the diffs for the files --> + <!-- Going to use asserts and run ONE (or no) diffs --> + <test> + <!-- ORIGINAL TEST FILE, regenerated 10.12.2020 --> + <param name="reference_genome_source" value="history"/> + <param name="genome_fasta" value="miro.fa"/> + <param name="gff3_data" value="miro.gff3"/> + <output name="output" file="miro_from_tool.gbk" compare="sim_size" delta_frac="0.05"> + <assert_contents> + <has_text text="RBS"/> + <has_text text="gene"/> + <has_text text="CDS"/> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ +.. class:: warningmark + +This is a LOSSY conversion. This tool **TRUNCATES** genbank file identifiers if +they are too long. Your data may not "match up" after processing through this +tool. + +**What it does**: + +Convert gff3 data to genbank. There are many WebApollo specific conventions. A re-numbering is also done. + +**Supported / Expected Data** + +- gene / mRNA / (CDS, Exon) +- gene / tRNA +- terminator + +]]></help> + <expand macro="citations"/> +</tool> |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 gff3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff3.py Mon Jun 05 02:44:32 2023 +0000 |
[ |
b'@@ -0,0 +1,346 @@\n+import copy\n+import logging\n+\n+log = logging.getLogger()\n+log.setLevel(logging.WARN)\n+\n+\n+def feature_lambda(\n+ feature_list,\n+ test,\n+ test_kwargs,\n+ subfeatures=True,\n+ parent=None,\n+ invert=False,\n+ recurse=True,\n+):\n+ """Recursively search through features, testing each with a test function, yielding matches.\n+\n+ GFF3 is a hierachical data structure, so we need to be able to recursively\n+ search through features. E.g. if you\'re looking for a feature with\n+ ID=\'bob.42\', you can\'t just do a simple list comprehension with a test\n+ case. You don\'t know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.\n+\n+ :type feature_list: list\n+ :param feature_list: an iterable of features\n+\n+ :type test: function reference\n+ :param test: a closure with the method signature (feature, **kwargs) where\n+ the kwargs are those passed in the next argument. This\n+ function should return True or False, True if the feature is\n+ to be yielded as part of the main feature_lambda function, or\n+ False if it is to be ignored. This function CAN mutate the\n+ features passed to it (think "apply").\n+\n+ :type test_kwargs: dictionary\n+ :param test_kwargs: kwargs to pass to your closure when it is called.\n+\n+ :type subfeatures: boolean\n+ :param subfeatures: when a feature is matched, should just that feature be\n+ yielded to the caller, or should the entire sub_feature\n+ tree for that feature be included? subfeatures=True is\n+ useful in cases such as searching for a gene feature,\n+ and wanting to know what RBS/Shine_Dalgarno_sequences\n+ are in the sub_feature tree (which can be accomplished\n+ with two feature_lambda calls). subfeatures=False is\n+ useful in cases when you want to process (and possibly\n+ return) the entire feature tree, such as applying a\n+ qualifier to every single feature.\n+\n+ :type invert: boolean\n+ :param invert: Negate/invert the result of the filter.\n+\n+ :rtype: yielded list\n+ :return: Yields a list of matching features.\n+ """\n+ # Either the top level set of [features] or the subfeature attribute\n+ for feature in feature_list:\n+ feature._parent = parent\n+ if not parent:\n+ # Set to self so we cannot go above root.\n+ feature._parent = feature\n+ test_result = test(feature, **test_kwargs)\n+ # if (not invert and test_result) or (invert and not test_result):\n+ if invert ^ test_result:\n+ if not subfeatures:\n+ feature_copy = copy.deepcopy(feature)\n+ feature_copy.sub_features = list()\n+ yield feature_copy\n+ else:\n+ yield feature\n+\n+ if recurse and hasattr(feature, "sub_features"):\n+ for x in feature_lambda(\n+ feature.sub_features,\n+ test,\n+ test_kwargs,\n+ subfeatures=subfeatures,\n+ parent=feature,\n+ invert=invert,\n+ recurse=recurse,\n+ ):\n+ yield x\n+\n+\n+def fetchParent(feature):\n+ if not hasattr(feature, "_parent") or feature._parent is None:\n+ return feature\n+ else:\n+ return fetchParent(feature._parent)\n+\n+\n+def feature_test_true(feature, **kwargs):\n+ return True\n+\n+\n+def feature_test_type(feature, **kwargs):\n+ if "type" in kwargs:\n+ return str(feature.type).upper() == str(kwargs["type"]).upper()\n+ elif "types" in kwargs:\n+ for x in kwargs["types"]:\n+ if str(feature.type).upper() == str(x).upper():\n+ return True\n+ return False\n+ raise Exception("Incorrect feature_test'..b'feature.location.start,\n+ # feature.location.end,\n+ # feature.location.strand\n+ # )\n+ return result\n+\n+\n+def get_gff3_id(gene):\n+ return gene.qualifiers.get("Name", [gene.id])[0]\n+\n+\n+def ensure_location_in_bounds(start=0, end=0, parent_length=0):\n+ # This prevents frameshift errors\n+ while start < 0:\n+ start += 3\n+ while end < 0:\n+ end += 3\n+ while start > parent_length:\n+ start -= 3\n+ while end > parent_length:\n+ end -= 3\n+ return (start, end)\n+\n+\n+def coding_genes(feature_list):\n+ for x in genes(feature_list):\n+ if (\n+ len(\n+ list(\n+ feature_lambda(\n+ x.sub_features,\n+ feature_test_type,\n+ {"type": "CDS"},\n+ subfeatures=False,\n+ )\n+ )\n+ )\n+ > 0\n+ ):\n+ yield x\n+\n+\n+def genes(feature_list, feature_type="gene", sort=False):\n+ """\n+ Simple filter to extract gene features from the feature set.\n+ """\n+\n+ if not sort:\n+ for x in feature_lambda(\n+ feature_list, feature_test_type, {"type": feature_type}, subfeatures=True\n+ ):\n+ yield x\n+ else:\n+ data = list(genes(feature_list, feature_type=feature_type, sort=False))\n+ data = sorted(data, key=lambda feature: feature.location.start)\n+ for x in data:\n+ yield x\n+\n+\n+def wa_unified_product_name(feature):\n+ """\n+ Try and figure out a name. We gave conflicting instructions, so\n+ this isn\'t as trivial as it should be. Sometimes it will be in\n+ \'product\' or \'Product\', othertimes in \'Name\'\n+ """\n+ # Manually applied tags.\n+ protein_product = feature.qualifiers.get(\n+ "product", feature.qualifiers.get("Product", [None])\n+ )[0]\n+\n+ # If neither of those are available ...\n+ if protein_product is None:\n+ # And there\'s a name...\n+ if "Name" in feature.qualifiers:\n+ if not is_uuid(feature.qualifiers["Name"][0]):\n+ protein_product = feature.qualifiers["Name"][0]\n+\n+ return protein_product\n+\n+\n+def is_uuid(name):\n+ return name.count("-") == 4 and len(name) == 36\n+\n+\n+def get_rbs_from(gene):\n+ # Normal RBS annotation types\n+ rbs_rbs = list(\n+ feature_lambda(\n+ gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False\n+ )\n+ )\n+ rbs_sds = list(\n+ feature_lambda(\n+ gene.sub_features,\n+ feature_test_type,\n+ {"type": "Shine_Dalgarno_sequence"},\n+ subfeatures=False,\n+ )\n+ )\n+ # Fraking apollo\n+ apollo_exons = list(\n+ feature_lambda(\n+ gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False\n+ )\n+ )\n+ apollo_exons = [x for x in apollo_exons if len(x) < 10]\n+ # These are more NCBI\'s style\n+ regulatory_elements = list(\n+ feature_lambda(\n+ gene.sub_features,\n+ feature_test_type,\n+ {"type": "regulatory"},\n+ subfeatures=False,\n+ )\n+ )\n+ rbs_regulatory = list(\n+ feature_lambda(\n+ regulatory_elements,\n+ feature_test_quals,\n+ {"regulatory_class": ["ribosome_binding_site"]},\n+ subfeatures=False,\n+ )\n+ )\n+ # Here\'s hoping you find just one ;)\n+ return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons\n+\n+\n+def nice_name(record):\n+ """\n+ get the real name rather than NCBI IDs and so on. If fails, will return record.id\n+ """\n+ name = record.id\n+ likely_parental_contig = list(genes(record.features, feature_type="contig"))\n+ if len(likely_parental_contig) == 1:\n+ name = likely_parental_contig[0].qualifiers.get("organism", [name])[0]\n+ return name\n+\n+\n+def fsort(it):\n+ for i in sorted(it, key=lambda x: int(x.location.start)):\n+ yield i\n' |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Jun 05 02:44:32 2023 +0000 |
b |
@@ -0,0 +1,74 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package">progressivemauve</requirement> + <!--<requirement type="package" version="2.7">python</requirement>--> + <requirement type="package" version="0.6.4">bcbiogff</requirement> + <yield/> + </requirements> + </xml> + <token name="@WRAPPER_VERSION@">2.4.0</token> + <xml name="citation/progressive_mauve"> + <citation type="doi">10.1371/journal.pone.0011147</citation> + </xml> + <xml name="citation/gepard"> + <citation type="doi">10.1093/bioinformatics/btm039</citation> + </xml> + <token name="@XMFA_INPUT@"> + '$xmfa' + </token> + <xml name="xmfa_input" token_formats="xmfa"> + <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/> + </xml> + <token name="@XMFA_FA_INPUT@"> + '$sequences' + </token> + <xml name="xmfa_fa_input"> + <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/> + </xml> + <xml name="genome_selector"> + <conditional name="reference_genome"> + <param name="reference_genome_source" type="select" label="Reference Genome"> + <option value="history" selected="True">From History</option> + <option value="cached">Locally Cached</option> + </param> + <when value="cached"> + <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <options from_data_table="all_fasta"/> + </param> + </when> + <when value="history"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </when> + </conditional> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input"/> + <expand macro="genome_selector"/> + </xml> + <token name="@INPUT_GFF@"> + '$gff3_data' + </token> + <token name="@INPUT_FASTA@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> + <token name="@GENOME_SELECTOR_PRE@"> + #if $reference_genome.reference_genome_source == 'history': + ln -s '$reference_genome.genome_fasta' genomeref.fa; + #end if + </token> + <token name="@GENOME_SELECTOR@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> +</macros> |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 test-data/miro.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/miro.fa Mon Jun 05 02:44:32 2023 +0000 |
b |
b'@@ -0,0 +1,2936 @@\n+>Miro\n+TTAGTAATGGCTAAAACCATATGTAACATCAATCATGACTTTATAACGGCATACACGCAT\n+TTTTGCGTTATTGTAATCCACTGGGATCGCTACCACGTCAGCAGGATCCACCTTTACCTG\n+AATGACTCTACCAACACCGCCCCCGTAGTGTGGAAGGTATGATTTAGCCGCAACGTGTAG\n+GCCAGTAGAACAGGTGCGCGTTTTATCTTCGTCTACCATGTTTCGAGGCATGGAGACAGT\n+CACACCAGGACTATTATCAAATTTGCCAGTAGCGAGATCTTTATAGTTATCGCGCACACG\n+TTTCCAGGCAAGGAAACAACCATCATCGGTCAGTTCAATGTCATTATGTACAAGGAACCC\n+ATAAAGCTGGTATACAGCATCGCGTGAAGGGTTTCGCATCAACCGTTCAAAGAAGTTCAC\n+CAGATGTTCATACGGACGATCGTTATACATTTCGCGAATGATTCGTTGAGTGATATCAGA\n+ATCAAACACTACATCTTTATATAGAAGCTGATGACCAATGATTTTAATGTTGCCTTTGCT\n+ATAGGTTCGGATCGCTTCTTGAGTATCCAAACAAGTTACAGCACCTTTGACATCACCAGC\n+TTTCAGCATTTCATGCGCTTTCTTAAAGTTCGGATGTGTTTCACCCGCCATGAAAACGCG\n+CCCTTCGTATACAACCGTAATGAACGATTCAGATCCGATCATACGCGGTACAGTGTCTGA\n+TACTTTAGGCTTTTGTTTTACATCCCTTTTCTTAGGGAATTCCGCTAACTTGCGGTTGAT\n+AACGCGCCCAATGGTTCGGGCGCTTACGTTGAACTGTTGGGCCAGTGCTGTTTTACTTGC\n+CCCCGTCAACCATCCATTATAGATAGCTTTCTGTTGTACTTCGTCGAGAATTTTGACCAT\n+TATTTCCACCGTATTAATTTCTTAAACTCACTGAGATTCTTTTCGTTGTTGTAAATCGGA\n+CGAATTGAATAAGAATCGCTATGTTCAACAAGTGAAGCCAGTAACGGGTTTAGTGATTTA\n+AAGATTTCCCATGCCTGATCAACACGTTTTTTCATATTGCTGCGTTTAACGCGCATTGAT\n+GCTACGGACTCACGCAGAATCGGACAACGCACCCTGGAAAGATTTTTACCATCTTTCTCA\n+TACCCTTCCAGACACACTATACGTTCGAGGGTATCAACAATCTTATACAGTTTTTCATTA\n+TATCGGTTTTTCACAATCCGATCAAGCGATACACCGAAACGGCTATGCAATGCGTCTGTT\n+TCTGTTGAGTGATCCTTCCCAATCCATCCAGGCAAGCAATTATCTTTCAATGCCTTTTCA\n+GATTTAACATACTGCTTGCACAGCATATCATCAAAGCATACCAGATTTGAATCCGGGATC\n+CACTTCCAGAGGCTGTTACGTATAGCAAACACAACAGGGATCCCAGTATGGCGCATGATA\n+CGCGATAAAGTAGATTCTTTCATTGCTGAATCCATAGACAACCCGGAATTTTCACCATCT\n+AAGCGGCTATATTCATCAATACCATACAACCGAACACCTGGGGCTTTATCCAGTGATAAA\n+AACTCTGATTTTGTCATAAACAGAGAAGTTTTTGCCAGATTGCCGTTACTATCCAACTCA\n+TAACGATATACGGTCGGGGTTTTTGGGCGCGGTTCTGAATTTTTCGGTGCATATAGCGCT\n+TTTGATTTTTCGCGATCTGCATCATAGATCTCTTTTTCTTTTGTCATTTCACTGGTACGG\n+AGATACACAATTTCCGATTCATCAAAATGACCTTTCCGAACGATATCATTAACAATCTCA\n+CGCTTTGAATCACTATCGTAATATGCAACAAAGCTAACACGGCTTAGGTTATGCATTTTA\n+GCATACCCGACGATATACGGTTTAACCGTGTTGGTATCCACTTTTAACAGAATGAGTTTC\n+TTCTGTTTCCACGGATAATAAATGCGTGTGATGTCCTGGCGTTTGGTTGTTTCTGGCTTA\n+TACTTGCTCCAGCGGCCTCCGCTACCAGTTACCTGATACCATGCATCTTTACCGTCGTAT\n+TCGTTCGCCCAATACCCAGCAACATAATCATCATTATATTTGTTTGGTTTGACTAGTTCG\n+CTATGGATCCAGCCAATAGAATCACCATTGATGCGAAAATTAGCATCTTTACCAACAAAG\n+TTTTGTACCATTGAAGGCAGAGAATGGAACCACGTCAGTTTATCACGCACGGTTTGTAAC\n+TTATCGAATTCTGATTTAACTCGATTGAAATATACCCGGCTGATTTGTTTCAGACGTTCT\n+TTAACAATCCCTACTGTCATTTTATCCATACTCAACTCTTCGCGAGAAGGCATGAAATCA\n+AGTTCACCGATCGGGAAGTCAATAATATACGTATACTGGCTTTCTGTATAGCAATAGAAC\n+ATCGAGGTATCATACAAATCTTTATCCAGAGGATAAATGATGTTACCCATGCGAGCATAT\n+ACACCGCTAGTGTATGCTGATTTATGACGGATCACCCCGCTATCGTTGGTTGCTTCTTTC\n+GGCTGATAGTTGATTTTGAGAATAGAAGCACCAACAAAGTTAGGACGAATATCAGTAAAT\n+GATTCGTATACCCTTGCTGCTTCGTTTTCCCATTCTTTGATATCTTCAACCTTAACCGGA\n+ACAGTGATAGTAACCCCGTTAGGTTCATCGCTTTCAATCTCATACAGAGGATCGCAGAAA\n+GGTTCCCCATCATCCATATAGATTGTGTAACCGCATTTGATACCGTCTTTTACGGATTCC\n+ACCGTGAAAGCATCGGAATAGCAAAGCGGAGATTTGCAACCCAGACCCATAGAACCGATC\n+AGGTCGTTTGAATCATTTTTAGTTGATTCGAAGTAAACGGTAAACGCATCACTAACGAAA\n+TCAGGAGACATACCGATCCCGTAGTCACGAATAACAAAACGAGGATCAACAGCAGTTGGC\n+AACTGGACATCAAACGGGTTCTGATTTCCCGCTTCTTTGTGTCCATCAATCGCATTACAA\n+GACAGTTCGCGAATGATTGCGCGGATCTTGTATTTGTATACTGTCGAAGAAAGGATCTTA\n+TACGCTTTCTTGTTTGCGCGTAGAGATAGTTTGTTTCGTCCCTTGCTGGTATCTGTACCA\n+ACACGGTAGATGGTTTGCGGTGTATCTTCGCGTAATTTCATTGTTTATTTCTCACTTAAC\n+ATTAAAAATAACTTGGTCACAAGAGTACTTCGTTGGCTTTTTGTTCAGACCATATTCTAC\n+TACTTCACAATAGGTGTCAAGGAATTTTACCAATTTTTCTTCCTCGACCTGCTGTTTCTT\n+CATATCAAGGATACCCCACACGATAGCCCCGATAATGACAGAAAAGAACGCACAAAATCC\n+GAATATGGTCAGATATTTTCCCAACTTAGGCGCATTATAACGTGTCATACCTTACCCCTC\n+TTTGCGAATGTATGCAAGTTCTTCATGGGTTACTGGACGGATATACAGACGGCCTTTTGT\n+ATATGCCTTGCGCCCGCTGATCCAAATGTTTTTCATATCCTTAACACCGTTCATCACATC\n+GTTGTAAAACTTCTTATCAGCTTTAGCCTGATAGACTTCACGGCCTTGATAATCTTTCAT\n+GAACAAACAATAAAGGATCTCATTCTTATCAACTAGATTAGCATCCTTTGTAGTTGTTTT\n+ACTTGGTGAAGGTTTCGCACCCAGGCGCAAGGCCATAGCTTGCCACACTTTACCATGTTC\n+ATAACCGCGCCCGACAAGAGCATGAGCGATTTCGTGTAAAAGAGTGTCTAAAATATCCTC\n+GTAGATATCTTCCGCAACATGACGACCAGACAGTTCGATCAGTTTTTTGGTATAACTGCA\n+ACGG'..b'TCGGGTAATATCG\n+TTTGTGATGGGTGTGAAAACATGGAAACAATTGCTGATAAAAATAATTCTAATAATGGTT\n+ATGTTTCTTATGGTAGTAACTTGGTACAAATGGACTGATATATTCCCGATGATAAAAGGT\n+GCCCTTGTAGTCGATACAAGGGCTATCGAAATGGAAAGAACAGAAAAGTTTAATCAATCC\n+GCGTTGGAACAGTTGAGCATAGTTCATCTTACTTCCAACGCGGATTTTTCGGCGGTACTG\n+GCATTCAGACCAAAGAACATAAACTATTTTGTTGACATTGTAGAATATCAGGGAAAATTA\n+CCATCCCAAATCGATCCTAAAAACCTCGGTGGTTATCCGATCGATAAAACATCCGAAGAA\n+TACACGAATCATATAAATGGCTTGTACTATTCATCAACTACAGCAAGTTCCTACCTACCG\n+ACACGTGATTTTGTGCCAGTAGCTTATACTTTTAGTTGCCCTTATTTCAATCTTGATAAC\n+TACTATTCTGGATCGGTTTTGATGGAATGGTATGCAAAGCGGCCTGATATACCAGATATG\n+AAGATAAACATCATATGTGGACAGGCCGCGCGCATTTTAGGTCGAGCGAGGTGATTAACG\n+TAATGCTGGTGTTAAATTGTGTGATCTTCCAATAGCCCGTTTGATTGCTTTAAAGAAGTT\n+CATCACCGGGCTATTTTTCTCGTAAATATCCCAAACTTTCAATTTGTCCCACGGATCCGG\n+AACATAATCTTCATTCCTTGCCGAAACCCCCAACGTAACCCTCCTGTATCCAGCGCTGTC\n+GTGATAATACACGAAGAATGGCCTACCTATTGGCGCAACCTTACAACCTCTCTTAGCGGC\n+TCTCATAGCCTCATCGAACGTCATGGATTCCCCCAAAAATTTCTATGCATGAATGGTCGA\n+ATTCCGATAGTTTCACTCAAAATGAATATCGGATGATCTAGCTCATCGTGGTTTTCTTCG\n+TCAATGACGATATCCCAATCAGTAGCCTTTTGTTCTTCTACCGTGGCAATGAATACCTGG\n+TTTACTTCACACTTGCGGTGTGTACGTCTGATGATTGTATCCCCCTCACGAAACACAATC\n+ATATCAGGATTGGTAGTGCGGTACGCGGTTTTACCCGCGCACACTTCATTAAGCATATCT\n+TCGTATGTCATTATAAAACCTTTACACGTTGAACGATGGTTTGTTTAACGTCTTTGTATT\n+CTCCGTGCTCTTTAACGGTTGCTTTGAAAGTGATTTCATCACCTTCGTTTGCAATGTTAT\n+TACCGAAGTAAACAACAACATTACCATCAACATTAATTTTGGTCATGAATCTTTCTACAG\n+AAGTGTAGTAAGAAACTTGAGTATATCCCAGTGAAATCACTTTCTCAACGGTTCCGGTCA\n+TTTCCAGACGTTGTTTGATTTCACCGATGTGGTTAGCTTTAGAAATGCGTTCCTGGCGCT\n+CTTGTTCCCACTGTTCGCGGATTTCTTCGCGTTTGGCGATATAATCCTTTTCCAGTGCAA\n+CACCCATGCAGTAAGCGCACACAGCATCGAATACAGGGCTGTTTTTGTTGTCTTCTTTAG\n+ACTGGTCAGCCCACCAAAGAACAGTAAACATTGGCATTTCTGCAATTACTTCGCCTTTAC\n+GCTTGCCAATCGGCATGATCCCTTTTTCCAGCAGTTCCAGTTTTTCAGTATCGAACACTG\n+ACAGTTTACCGCGACGTTCGAACAGATCGAAATCTGCAAAGCCCTGGAATATCATTTTGA\n+AAGTATCAGTTTCAGTTAAACGGGCTGAAACACGGTCGAAATACTCACGCGCTTTCGCTT\n+CCGCTTTCTCCGGATCGGTAGATAAGTTGCAGATATAGTTATCAGAAGTATAACCGCCGC\n+CTCTACGCTCAACACGCAAGGTATACATTGCATTTTTACGACCAGAAGAAATGAAGTAAG\n+TGGTAGTAACTACGGTTGCGTTAGTCATGGTATTTCTCCTTAAAGGGTATCTCGTTTCGA\n+TATGGCTAATATAGCAAAAGCCCCTGACCGAAGTCAAGGGCTTTTTCATCATTCATTCGA\n+ATCTTTCATTGTTTTATGAAGATGAATATCAAAAATTTTCCAGTACGCCTTTCCGCGAGG\n+ATAAATTTTTGCTTTGTCAATATCGTTGTTGCTTCCCCATGTGTTGTTTGGGCCACGACA\n+TCGATTTTTTATATAATCTGTATGCCAGAATAAGCGCTGAACCGATGATTCCGTACCTAA\n+TGGATCTTCTTTACTGAACAGAATTTGTATACTCATAAGAAGAACCCAGTGCGAACAATC\n+AGATCGATTTTCTTTTCTGGTTCAAACGGTGATTTGCTATCGATGTTACACTGATAGAAC\n+ATACCAACATACTTTTCAGGAATGTTGGATTCACGCGCCCATTTCAGGTTATCATCGGTA\n+TTCGGCCCCAGCATGAGGTTAACAACATCAACAGCATAATCCTGTTCTTTGGTGTTGCCG\n+GAACCGTTTACATAATGCCCGTGGGCGGTTTTGAGGATCTCAATTACTTCGCTATCGTCA\n+GTTTCGACTTTGTAAAGCGTTGTATTTTCAGGAATTTCTTCAAGAATGATCAAAGCGGTT\n+TTCATCACACTTACCTTTGTGTTTCTGTTTACGTTTTGCTTCTTTAAATGCTCGCTTGCG\n+ATCGCGGTGAGTAGAAGCGCGGTTGAAATCATGTTTCGCTACCAAATTATTCATATAAGC\n+CCCTTAAAGAAAAATATTTAGGGGCTTTCGCCCCTGTATTAATCCAGCAATTTGCGGATC\n+TTGTCTGCGATACGTCCGGCGCGGGTTGCACTTGCAGTATGATCGCTTTCTTTTGAAGCC\n+AGTTCCGCCAGCTTACGCTGATGTTCTTCTTCTGCTGCTTGACGATCTGCTGCAACCTGT\n+GCAACCTGCTCATTATCGTGAGCAATACGCGCTTCCAGTTCAGACAGGGTTTTGTCGAAA\n+GTTGCTACGATTTCATCTACAGAACGAATTTTATTAAACAGTTTCATAATTTATCTCAAT\n+TGGTTAGTTTTAATCAGTATACATCAATATGGTTGAAATTCAAAATCATAAATGTCATTC\n+AGTGCGCGGTTCCACTCGGTGTAGTTTTCACCAGCACCATAACGCATTTGAATAGCACTT\n+TCGAACGTTGATCCGTTGAGGTTCGGGAAACCGAACAGGTTTTTGATTTTGTCATGTGCT\n+ACATAATACAGAGAAGCACTTTCCAGCATCGCAACCATCGCAGACGGTTCGTGTTCGCGG\n+CGCTTGATACGTAACAGAGTACGACTAGCACCAGTTTTACGGCGTTGATTTGGTGCTACG\n+TAGAAACGGAATACTACGCGCCCTGTTTTATCATCAACTACCAGGTAAAACCCGTTTTCT\n+TTCAGATCCACGCCTTCGAATTTCTTGAAGGTTCCGCGTTTCATGTCACCAATTTTAATT\n+GCATATTTGTGAATGTCAAGTCTTGTCAGAATTCTTTTCATATTTTTTAGATACCAGTTT\n+GCCTAATTTTGTAATTTCGCCTGTTTTTACGTTAACAAACAAGGCGATGCTCAGAAATGG\n+GATGCTAATCACTACGCTGATCAATGTAAACAGAAAACGTATCACAAAAAGAACAGCACG\n+TTCAAGATATCGTTGCATCCACGCGATTCCTAAACAACTATACCCTACTATAAAGGTGGT\n+TGCAACATAAAATGCACCAAATCCTTTACGAAATACGTAACCTTTCCCGGATTCTATCCG\n+GTCGTCGGCCCACATTTCACGGGCAGTTTTCAGAATAGATTCACCACTAGCGCGAGTTTC\n+GTTAGCCGAAGGCATGTTTTTAAATTTCATGATAGTCTCCTATGCGCCCAGAACTCTCCA\n+GGCGCGGTTGTTTAG\n' |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 test-data/miro.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/miro.gff3 Mon Jun 05 02:44:32 2023 +0000 |
b |
b'@@ -0,0 +1,827 @@\n+##gff-version 3\n+##sequence-region Miro 1 176055\n+Miro\tfeature\tgene\t7454\t7906\t.\t-\t.\tID=Miro_8\n+Miro\tGenBank\tCDS\t7454\t7894\t.\t-\t1\tID=Miro_8.CDS;Name=Miro_8;Parent=Miro_8;obsolete_name=Miro_156;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t7903\t7906\t.\t-\t1\tAlias=Miro_8;ID=Miro_8.RBS;Name=Miro_8;Parent=Miro_8\n+Miro\tfeature\tgene\t7917\t8512\t.\t-\t.\tID=Miro_9\n+Miro\tGenBank\tCDS\t7917\t8501\t.\t-\t1\tID=Miro_9.CDS;Name=Miro_9;Parent=Miro_9;obsolete_name=Miro_155;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t8509\t8512\t.\t-\t1\tAlias=Miro_9;ID=Miro_9.RBS;Name=Miro_9;Parent=Miro_9\n+Miro\tfeature\tgene\t123276\t124212\t.\t+\t.\tID=Miro_206\n+Miro\tGenBank\tCDS\t123286\t124212\t.\t+\t1\tAlias=Miro_206;ID=Miro_206.CDS;Name=Miro_206;Parent=Miro_206;obsolete_name=Miro_234;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t123276\t123279\t.\t+\t1\tID=Miro_206.rbs;Name=Miro_206;Parent=Miro_206\n+Miro\tfeature\tgene\t68490\t70715\t.\t-\t.\tID=Miro_117\n+Miro\tGenBank\tCDS\t68490\t70706\t.\t-\t1\tID=Miro_117.CDS;Name=Miro_117;Note=contains von Willebrand factor%2C type A;Parent=Miro_117;obsolete_name=Miro_047;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t70713\t70715\t.\t-\t1\tAlias=Miro_117;ID=Miro_117.RBS;Name=Miro_117;Parent=Miro_117\n+Miro\tfeature\tgene\t115729\t116735\t.\t+\t.\tID=Miro_200\n+Miro\tGenBank\tCDS\t115743\t116735\t.\t+\t1\tAlias=Miro_200;ID=Miro_200.CDS;Name=Miro_200;Note=T4 gp6-like;Parent=Miro_200;obsolete_name=Miro_240;product=baseplate structural protein\n+Miro\tGenBank\tShine_Dalgarno_sequence\t115729\t115732\t.\t+\t1\tID=Miro_200.RBS;Name=Miro_200;Parent=Miro_200\n+Miro\tfeature\tgene\t116735\t117608\t.\t+\t.\tID=Miro_201\n+Miro\tGenBank\tCDS\t116745\t117608\t.\t+\t1\tAlias=Miro_201;ID=Miro_201.CDS;Name=Miro_201;Note=T4 gp9/gp10-like;Parent=Miro_201;obsolete_name=Miro_239;product=baseplate structural protein\n+Miro\tGenBank\tShine_Dalgarno_sequence\t116735\t116738\t.\t+\t1\tID=Miro_201.RBS;Name=Miro_201;Parent=Miro_201\n+Miro\tfeature\tgene\t117595\t119422\t.\t+\t.\tID=Miro_202\n+Miro\tGenBank\tCDS\t117605\t119422\t.\t+\t1\tAlias=Miro_202;ID=Miro_202.CDS;Name=Miro_202;Note=T4 gp9/gp10-like;Parent=Miro_202;obsolete_name=Miro_238;product=baseplate structural protein\n+Miro\tGenBank\tShine_Dalgarno_sequence\t117595\t117597\t.\t+\t1\tID=Miro_202.RBS;Name=Miro_202;Parent=Miro_202\n+Miro\tfeature\tgene\t119412\t120090\t.\t+\t.\tID=Miro_203\n+Miro\tGenBank\tCDS\t119422\t120090\t.\t+\t1\tAlias=Miro_203;ID=Miro_203.CDS;Name=Miro_203;Note=T4 gp11-like;Parent=Miro_203;obsolete_name=Miro_237;product=baseplate to short tail fiber connector protein\n+Miro\tGenBank\tShine_Dalgarno_sequence\t119412\t119415\t.\t+\t1\tID=Miro_203.RBS;Name=Miro_203;Parent=Miro_203\n+Miro\tfeature\tgene\t81829\t81940\t.\t-\t.\tID=Miro_142\n+Miro\tGenBank\tCDS\t81829\t81927\t.\t-\t1\tID=Miro_142.CDS;Name=Miro_142;Parent=Miro_142;obsolete_name=Miro_022;product=hypothetical conserved;tmhelix=1 TMD %284-26%29 N out%2C C in\n+Miro\tGenBank\tShine_Dalgarno_sequence\t81938\t81940\t.\t-\t1\tAlias=Miro_142;ID=Miro_142.RBS;Name=Miro_142;Parent=Miro_142\n+Miro\tfeature\tgene\t1\t910\t.\t-\t.\tID=Miro_1\n+Miro\tGenBank\tCDS\t1\t900\t.\t-\t1\tID=Miro_1.CDS;Name=Miro_1;Parent=Miro_1;obsolete_name=Miro_163;product=rIIb\n+Miro\tGenBank\tShine_Dalgarno_sequence\t908\t910\t.\t-\t1\tAlias=Miro_1;ID=Miro_1.RBS;Name=Miro_1;Parent=Miro_1\n+Miro\tfeature\tgene\t900\t3173\t.\t-\t.\tID=Miro_2\n+Miro\tGenBank\tCDS\t900\t3161\t.\t-\t1\tID=Miro_2.CDS;Name=Miro_2;Parent=Miro_2;obsolete_name=Miro_162;product=rIIa\n+Miro\tGenBank\tShine_Dalgarno_sequence\t3171\t3173\t.\t-\t1\tAlias=Miro_2;ID=Miro_2.RBS;Name=Miro_2;Parent=Miro_2\n+Miro\tfeature\tgene\t3172\t3417\t.\t-\t.\tID=Miro_3\n+Miro\tGenBank\tCDS\t3172\t3408\t.\t-\t1\tID=Miro_3.CDS;Name=Miro_3;Parent=Miro_3;obsolete_name=Miro_161;product=hypothetical conserved;tmhelix=1 TMD %2812-34%29 N in%2C C out\n+Miro\tGenBank\tShine_Dalgarno_sequence\t3414\t3417\t.\t-\t1\tAlias=Miro_3;ID=Miro_3.RBS;Name=Miro_3;Parent=Miro_3\n+Miro\tfeature\tgene\t3412\t3979\t.\t-\t.\tID=Miro_4\n+Miro\tGenBank\tCDS\t3412\t3966\t.\t-\t1\tID=Miro_4.CDS;Name=Miro_4;Note=contains SprT domain;Parent=Miro_4;obsolete_name=Miro_160;product=hypothetical'..b't=Miro_98;obsolete_name=Miro_066;product=hypothetical conserved;tmhelix=2TMDs %287-26%2C 31-53%29 N in%2C C in\n+Miro\tGenBank\tShine_Dalgarno_sequence\t57368\t57372\t.\t+\t1\tID=Miro_98.RBS;Name=Miro_98;Parent=Miro_98\n+Miro\tfeature\tgene\t57613\t57914\t.\t+\t.\tID=Miro_99\n+Miro\tGenBank\tCDS\t57624\t57914\t.\t+\t1\tAlias=Miro_99;ID=Miro_99.CDS;Name=Miro_99;Parent=Miro_99;obsolete_name=Miro_065;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t57613\t57616\t.\t+\t1\tID=Miro_99.RBS;Name=Miro_99;Parent=Miro_99\n+Miro\tfeature\tgene\t81924\t82086\t.\t-\t.\tID=Miro_143\n+Miro\tGenBank\tCDS\t81924\t82079\t.\t-\t1\tID=Miro_143.CDS;Name=Miro_143;Parent=Miro_143;obsolete_name=Miro_021;product=hypothetical conserved;tmhelix=1 TMD %2815-37%29 N out%2C C in\n+Miro\tGenBank\tShine_Dalgarno_sequence\t82084\t82086\t.\t-\t1\tAlias=Miro_143;ID=Miro_143.RBS;Name=Miro_143;Parent=Miro_143\n+Miro\tfeature\tgene\t67179\t67658\t.\t-\t.\tID=Miro_114\n+Miro\tGenBank\tCDS\t67179\t67649\t.\t-\t1\tID=Miro_114.CDS;Name=Miro_114;Parent=Miro_114;obsolete_name=Miro_050;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t67656\t67658\t.\t-\t1\tAlias=Miro_114;ID=Miro_114.RBS;Name=Miro_114;Parent=Miro_114\n+Miro\tfeature\tgene\t81366\t81851\t.\t-\t.\tID=Miro_141\n+Miro\tGenBank\tCDS\t81366\t81839\t.\t-\t1\tID=Miro_141.CDS;Name=Miro_141;Note=contains macro domain;Parent=Miro_141;obsolete_name=Miro_023;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t81849\t81851\t.\t-\t1\tAlias=Miro_141;ID=Miro_141.RBS;Name=Miro_141;Parent=Miro_141\n+Miro\tfeature\tgene\t81076\t81376\t.\t-\t.\tID=Miro_140\n+Miro\tGenBank\tCDS\t81076\t81363\t.\t-\t1\tID=Miro_140.CDS;Name=Miro_140;Parent=Miro_140;obsolete_name=Miro_024;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t81374\t81376\t.\t-\t1\tAlias=Miro_140;ID=Miro_140.RBS;Name=Miro_140;Parent=Miro_140\n+Miro\tfeature\tgene\t83223\t83630\t.\t-\t.\tID=Miro_147\n+Miro\tGenBank\tCDS\t83223\t83618\t.\t-\t1\tID=Miro_147.CDS;Name=Miro_147;Parent=Miro_147;obsolete_name=Miro_017;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t83627\t83630\t.\t-\t1\tAlias=Miro_147;ID=Miro_147.RBS;Name=Miro_147;Parent=Miro_147\n+Miro\tfeature\tgene\t83066\t83224\t.\t-\t.\tID=Miro_146\n+Miro\tGenBank\tCDS\t83066\t83212\t.\t-\t1\tID=Miro_146.CDS;Name=Miro_146;Parent=Miro_146;obsolete_name=Miro_018;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t83221\t83224\t.\t-\t1\tAlias=Miro_146;ID=Miro_146.RBS;Name=Miro_146;Parent=Miro_146\n+Miro\tfeature\tgene\t82479\t83083\t.\t-\t.\tID=Miro_145\n+Miro\tGenBank\tCDS\t82479\t83069\t.\t-\t1\tID=Miro_145.CDS;Name=Miro_145;Parent=Miro_145;obsolete_name=Miro_019;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t83080\t83083\t.\t-\t1\tAlias=Miro_145;ID=Miro_145.RBS;Name=Miro_145;Parent=Miro_145\n+Miro\tfeature\tgene\t67646\t67816\t.\t-\t.\tID=Miro_115\n+Miro\tGenBank\tCDS\t67646\t67804\t.\t-\t1\tID=Miro_115.CDS;Name=Miro_115;Parent=Miro_115;obsolete_name=Miro_049;product=hypothetical conserved;tmhelix=2TMDs %282-21%2C 31-50%29 N in%2C C in\n+Miro\tGenBank\tShine_Dalgarno_sequence\t67814\t67816\t.\t-\t1\tAlias=Miro_115;ID=Miro_115.RBS;Name=Miro_115;Parent=Miro_115\n+Miro\tfeature\tgene\t84392\t84959\t.\t-\t.\tID=Miro_149\n+Miro\tGenBank\tCDS\t84392\t84946\t.\t-\t1\tID=Miro_149.CDS;Name=Miro_149;Parent=Miro_149;obsolete_name=Miro_015;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t84955\t84959\t.\t-\t1\tAlias=Miro_149;ID=Miro_149.RBS;Name=Miro_149;Parent=Miro_149\n+Miro\tfeature\tgene\t83686\t84337\t.\t-\t.\tID=Miro_148\n+Miro\tGenBank\tCDS\t83686\t84327\t.\t-\t1\tID=Miro_148.CDS;Name=Miro_148;Note=T4 RegB-like;Parent=Miro_148;obsolete_name=Miro_016;product=endoribonuclease;signal=signal peptidase II cleavage site 12-13\n+Miro\tGenBank\tShine_Dalgarno_sequence\t84334\t84337\t.\t-\t1\tAlias=Miro_148;ID=Miro_148.RBS;Name=Miro_148;Parent=Miro_148\n+Miro\tfeature\tgene\t67801\t68461\t.\t-\t.\tID=Miro_116\n+Miro\tGenBank\tCDS\t67801\t68451\t.\t-\t1\tID=Miro_116.CDS;Name=Miro_116;Parent=Miro_116;obsolete_name=Miro_048;product=hypothetical conserved\n+Miro\tGenBank\tShine_Dalgarno_sequence\t68457\t68461\t.\t-\t1\tAlias=Miro_116;ID=Miro_116.RBS;Name=Miro_116;Parent=Miro_116\n' |
b |
diff -r 6795d3349462 -r c8fcb7246ac3 test-data/miro_from_tool.gbk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/miro_from_tool.gbk Mon Jun 05 02:44:32 2023 +0000 |
b |
b'@@ -0,0 +1,5817 @@\n+LOCUS Miro 176055 bp DNA UNK 01-JAN-1980\n+DEFINITION .\n+ACCESSION Miro\n+VERSION Miro\n+KEYWORDS .\n+SOURCE .\n+ ORGANISM .\n+ .\n+FEATURES Location/Qualifiers\n+ DNA 1..176055\n+ /Alias="Miro"\n+ /date="01-JAN-1980"\n+ gene complement(1..910)\n+ /locus_tag="CPT_Miro_001"\n+ CDS complement(1..900)\n+ /obsolete_name="Miro_163"\n+ /product="rIIb"\n+ /locus_tag="CPT_Miro_001"\n+ /transl_table=11\n+ gene complement(900..3173)\n+ /locus_tag="CPT_Miro_002"\n+ CDS complement(900..3161)\n+ /obsolete_name="Miro_162"\n+ /product="rIIa"\n+ /locus_tag="CPT_Miro_002"\n+ /transl_table=11\n+ RBS complement(908..910)\n+ /Alias="Miro_1"\n+ /locus_tag="CPT_Miro_001"\n+ RBS complement(3171..3173)\n+ /Alias="Miro_2"\n+ /locus_tag="CPT_Miro_002"\n+ gene complement(3172..3417)\n+ /locus_tag="CPT_Miro_003"\n+ CDS complement(3172..3408)\n+ /obsolete_name="Miro_161"\n+ /product="hypothetical conserved"\n+ /tmhelix="1 TMD (12-34) N in, C out"\n+ /locus_tag="CPT_Miro_003"\n+ /transl_table=11\n+ gene complement(3412..3979)\n+ /locus_tag="CPT_Miro_004"\n+ CDS complement(3412..3966)\n+ /obsolete_name="Miro_160"\n+ /product="hypothetical conserved"\n+ /locus_tag="CPT_Miro_004"\n+ /transl_table=11\n+ /note="contains SprT domain"\n+ RBS complement(3414..3417)\n+ /Alias="Miro_3"\n+ /locus_tag="CPT_Miro_003"\n+ RBS complement(3975..3979)\n+ /Alias="Miro_4"\n+ /locus_tag="CPT_Miro_004"\n+ gene complement(4038..5334)\n+ /locus_tag="CPT_Miro_005"\n+ CDS complement(4038..5324)\n+ /obsolete_name="Miro_159"\n+ /product="DNA topoisomerase II medium subunit"\n+ /locus_tag="CPT_Miro_005"\n+ /transl_table=11\n+ /note="T4 gp52-like"\n+ gene complement(5324..7231)\n+ /locus_tag="CPT_Miro_006"\n+ CDS complement(5324..7222)\n+ /obsolete_name="Miro_158"\n+ /product="DNA topoisomerase II, large subunit"\n+ /locus_tag="CPT_Miro_006"\n+ /transl_table=11\n+ /note="T4 gp39-like"\n+ RBS complement(5331..5334)\n+ /Alias="Miro_5"\n+ /locus_tag="CPT_Miro_005"\n+ RBS complement(7228..7231)\n+ /Alias="Miro_6"\n+ /locus_tag="CPT_Miro_006"\n+ gene complement(7290..7465)\n+ /locus_tag="CPT_Miro_007"\n+ CDS complement(7290..7454)\n+ /obsolete_name="Miro_157"\n+ /product="hypothetical conserved"\n+ /locus_tag="CPT_Miro_007"\n+ /transl_table=11\n+ /note="contains zinc ribbon domain"\n+ gene complement(7454..7906)\n+ /locus_tag="CPT_Miro_008"\n+ CDS complement(7454..7894)\n+ /obsolete_name="Miro_156"\n+ /product="hypothetical conserved"\n+ /locus_tag="CPT_Miro_008"\n+ /tr'..b'aacccc caacgtaacc ctcctgtatc cagcgctgtc\n+ 172981 gtgataatac acgaagaatg gcctacctat tggcgcaacc ttacaacctc tcttagcggc\n+ 173041 tctcatagcc tcatcgaacg tcatggattc ccccaaaaat ttctatgcat gaatggtcga\n+ 173101 attccgatag tttcactcaa aatgaatatc ggatgatcta gctcatcgtg gttttcttcg\n+ 173161 tcaatgacga tatcccaatc agtagccttt tgttcttcta ccgtggcaat gaatacctgg\n+ 173221 tttacttcac acttgcggtg tgtacgtctg atgattgtat ccccctcacg aaacacaatc\n+ 173281 atatcaggat tggtagtgcg gtacgcggtt ttacccgcgc acacttcatt aagcatatct\n+ 173341 tcgtatgtca ttataaaacc tttacacgtt gaacgatggt ttgtttaacg tctttgtatt\n+ 173401 ctccgtgctc tttaacggtt gctttgaaag tgatttcatc accttcgttt gcaatgttat\n+ 173461 taccgaagta aacaacaaca ttaccatcaa cattaatttt ggtcatgaat ctttctacag\n+ 173521 aagtgtagta agaaacttga gtatatccca gtgaaatcac tttctcaacg gttccggtca\n+ 173581 tttccagacg ttgtttgatt tcaccgatgt ggttagcttt agaaatgcgt tcctggcgct\n+ 173641 cttgttccca ctgttcgcgg atttcttcgc gtttggcgat ataatccttt tccagtgcaa\n+ 173701 cacccatgca gtaagcgcac acagcatcga atacagggct gtttttgttg tcttctttag\n+ 173761 actggtcagc ccaccaaaga acagtaaaca ttggcatttc tgcaattact tcgcctttac\n+ 173821 gcttgccaat cggcatgatc cctttttcca gcagttccag tttttcagta tcgaacactg\n+ 173881 acagtttacc gcgacgttcg aacagatcga aatctgcaaa gccctggaat atcattttga\n+ 173941 aagtatcagt ttcagttaaa cgggctgaaa cacggtcgaa atactcacgc gctttcgctt\n+ 174001 ccgctttctc cggatcggta gataagttgc agatatagtt atcagaagta taaccgccgc\n+ 174061 ctctacgctc aacacgcaag gtatacattg catttttacg accagaagaa atgaagtaag\n+ 174121 tggtagtaac tacggttgcg ttagtcatgg tatttctcct taaagggtat ctcgtttcga\n+ 174181 tatggctaat atagcaaaag cccctgaccg aagtcaaggg ctttttcatc attcattcga\n+ 174241 atctttcatt gttttatgaa gatgaatatc aaaaattttc cagtacgcct ttccgcgagg\n+ 174301 ataaattttt gctttgtcaa tatcgttgtt gcttccccat gtgttgtttg ggccacgaca\n+ 174361 tcgatttttt atataatctg tatgccagaa taagcgctga accgatgatt ccgtacctaa\n+ 174421 tggatcttct ttactgaaca gaatttgtat actcataaga agaacccagt gcgaacaatc\n+ 174481 agatcgattt tcttttctgg ttcaaacggt gatttgctat cgatgttaca ctgatagaac\n+ 174541 ataccaacat acttttcagg aatgttggat tcacgcgccc atttcaggtt atcatcggta\n+ 174601 ttcggcccca gcatgaggtt aacaacatca acagcataat cctgttcttt ggtgttgccg\n+ 174661 gaaccgttta cataatgccc gtgggcggtt ttgaggatct caattacttc gctatcgtca\n+ 174721 gtttcgactt tgtaaagcgt tgtattttca ggaatttctt caagaatgat caaagcggtt\n+ 174781 ttcatcacac ttacctttgt gtttctgttt acgttttgct tctttaaatg ctcgcttgcg\n+ 174841 atcgcggtga gtagaagcgc ggttgaaatc atgtttcgct accaaattat tcatataagc\n+ 174901 cccttaaaga aaaatattta ggggctttcg cccctgtatt aatccagcaa tttgcggatc\n+ 174961 ttgtctgcga tacgtccggc gcgggttgca cttgcagtat gatcgctttc ttttgaagcc\n+ 175021 agttccgcca gcttacgctg atgttcttct tctgctgctt gacgatctgc tgcaacctgt\n+ 175081 gcaacctgct cattatcgtg agcaatacgc gcttccagtt cagacagggt tttgtcgaaa\n+ 175141 gttgctacga tttcatctac agaacgaatt ttattaaaca gtttcataat ttatctcaat\n+ 175201 tggttagttt taatcagtat acatcaatat ggttgaaatt caaaatcata aatgtcattc\n+ 175261 agtgcgcggt tccactcggt gtagttttca ccagcaccat aacgcatttg aatagcactt\n+ 175321 tcgaacgttg atccgttgag gttcgggaaa ccgaacaggt ttttgatttt gtcatgtgct\n+ 175381 acataataca gagaagcact ttccagcatc gcaaccatcg cagacggttc gtgttcgcgg\n+ 175441 cgcttgatac gtaacagagt acgactagca ccagttttac ggcgttgatt tggtgctacg\n+ 175501 tagaaacgga atactacgcg ccctgtttta tcatcaacta ccaggtaaaa cccgttttct\n+ 175561 ttcagatcca cgccttcgaa tttcttgaag gttccgcgtt tcatgtcacc aattttaatt\n+ 175621 gcatatttgt gaatgtcaag tcttgtcaga attcttttca tattttttag ataccagttt\n+ 175681 gcctaatttt gtaatttcgc ctgtttttac gttaacaaac aaggcgatgc tcagaaatgg\n+ 175741 gatgctaatc actacgctga tcaatgtaaa cagaaaacgt atcacaaaaa gaacagcacg\n+ 175801 ttcaagatat cgttgcatcc acgcgattcc taaacaacta taccctacta taaaggtggt\n+ 175861 tgcaacataa aatgcaccaa atcctttacg aaatacgtaa cctttcccgg attctatccg\n+ 175921 gtcgtcggcc cacatttcac gggcagtttt cagaatagat tcaccactag cgcgagtttc\n+ 175981 gttagccgaa ggcatgtttt taaatttcat gatagtctcc tatgcgccca gaactctcca\n+ 176041 ggcgcggttg tttag\n+//\n' |