Repository 'getorganelle'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/getorganelle

Changeset 2:06bcf65179fb (2023-02-23)
Previous changeset 1:8b330a577046 (2022-09-28) Next changeset 3:611219887a48 (2023-05-16)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/getorganelle commit b7f743ebc67b5ffabac0eddf3b20960a6444365f
modified:
get_organelle_from_reads.xml
added:
get_annotated_regions_from_gb.xml
macros.xml
test-data/NC_047059.gb
test-data/NC_047060.gb
test-data/NC_047400.gb
b
diff -r 8b330a577046 -r 06bcf65179fb get_annotated_regions_from_gb.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_annotated_regions_from_gb.xml Thu Feb 23 17:06:37 2023 +0000
[
@@ -0,0 +1,140 @@
+<tool id="get_annotated_regions_from_gb" name="Get annotated regions from genbank files (getorganelle)" version="0.1.0" python_template_version="3.5">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">getorganelle</requirement>
+        <requirement type="package" version="@BIOPYTHON_VERSION@">biopython</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        #import re
+        ## Symlink every input into the working directory under a sanitised name:
+        ## characters other than word chars, '-' and whitespace become '_', and
+        ## '_gb' is then mapped back to '.gb' (presumably to restore the extension).
+        #set $file_names = []
+        #for $input in $inputs
+            #set $input_name = re.sub('[^\w\-\s]', '_', str($input.name)).replace('_gb', '.gb')
+            ln -s '$input' '$input_name' &&
+            #silent $file_names.append($input_name)
+        #end for
+
+        ## run the python script on all linked files; --mix puts all genes in a single fasta
+
+        get_annotated_regions_from_gb.py
+        #for $file in $file_names
+            '$file'
+        #end for
+        -o results_directory
+        #if str($gene_type_selector) == "CDS":
+            -t CDS
+        #end if
+        #if str($gene_type_selector) == "tRNA":
+            -t tRNA
+        #end if
+        #if str($gene_type_selector) == "rRNA":
+            -t rRNA
+        #end if
+        --mix
+
+    ]]></command>
+    <inputs>
+        <param type="data" multiple="true" name="inputs" format="gb" label="Annotated genbank file(s)" help="Genbank files with annotated regions to extract. Multiple files can be selected." />
+        <param name="gene_type_selector" type="select" label="Gene type">
+            <option value="CDS" selected="true">CDS</option>
+            <option value="tRNA">tRNA</option>
+            <option value="rRNA">rRNA</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output_fasta" format="fasta" from_work_dir="results_directory/gene/gene.fasta" label='${tool.name} on ${on_string}: Annotated genes'/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="inputs" value="NC_047059.gb,NC_047060.gb,NC_047400.gb"/>
+            <param name="gene_type_selector" value="CDS"/>
+            <assert_stdout>
+                <has_text text="Time cost" />
+            </assert_stdout>
+            <output name="output_fasta">
+                <assert_contents>
+                    <has_line line=">matK CDS - NC_047059--Styphnolobium_japonicum_voucher_Yi15212-KUN_plastid__complete_genome" />
+                    <has_line line=">matK CDS - NC_047060--Haematoxylum_brasiletto_voucher_N._Zamora6857-Costa_Rica_plastid__complete_genome" />
+                    <has_line line=">matK CDS - NC_047400--Chamaecrista_mimosoides_voucher_Yi15441-KUN_plastid__complete_genome" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="inputs" value="NC_047059.gb,NC_047060.gb,NC_047400.gb"/>
+            <param name="gene_type_selector" value="tRNA"/>
+            <assert_stdout>
+                <has_text text="Time cost" />
+            </assert_stdout>
+            <output name="output_fasta">
+                <assert_contents>
+                    <has_line line=">trnA-UGC tRNA - NC_047059--Styphnolobium_japonicum_voucher_Yi15212-KUN_plastid__complete_genome" />
+                    <has_line line=">trnA-UGC tRNA - NC_047060--Haematoxylum_brasiletto_voucher_N._Zamora6857-Costa_Rica_plastid__complete_genome" />
+                    <has_line line=">trnA-UGC tRNA - NC_047400--Chamaecrista_mimosoides_voucher_Yi15441-KUN_plastid__complete_genome" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="inputs" value="NC_047059.gb,NC_047060.gb,NC_047400.gb"/>
+            <param name="gene_type_selector" value="rRNA"/>
+            <assert_stdout>
+                <has_text text="Time cost" />
+            </assert_stdout>
+            <output name="output_fasta">
+                <assert_contents>
+                    <has_line line=">rrn16 rRNA - NC_047059--Styphnolobium_japonicum_voucher_Yi15212-KUN_plastid__complete_genome" />
+                    <has_line line=">rrn16 rRNA - NC_047060--Haematoxylum_brasiletto_voucher_N._Zamora6857-Costa_Rica_plastid__complete_genome" />
+                    <has_line line=">rrn16 rRNA - NC_047400--Chamaecrista_mimosoides_voucher_Yi15441-KUN_plastid__complete_genome" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+            Python script to extract annotated genes from genbank files. The annotated genes are in the correct format to be used as seed sequences in GetOrganelle.
+            Help information from the python script is below. Only options for input genbank files and gene type are included in this tool.
+
+            By jinjianjun@mail.kib.ac.cn 2017
+            Usage: get_annotated_regions_from_gb.py gb_files -o out_dir
+
+            Options:
+              -h, --help            show this help message and exit
+              -o OUT_PUT            Output.
+              -t GENE_TYPES         Annotation type taken as gene. Default: CDS,tRNA,rRNA
+              --separate-copy       By default, only keep one copy (see '--copy-mode' for
+                                    more) if there are several regions with the same name.
+                                    Exception: if there are one copy with intron(s) and
+                                    another copy without intron, they would be both kept.
+                                    This exception was specially made for the convenience
+                                    of commonly-incorrectly-annotated rps12 gene of
+                                    plastome.
+              --copy-mode=COPY_MODE
+                                    first|longest|leastN|leastN_longest (default).
+              --separate-exon       By default, combining exons.
+              --keys=GENE_KEYS      The key to the gene name: gene, label, product or
+                                    other keys in the qualifiers region.Default:
+                                    gene,label,product,note.
+              --mix                 Mix different genes into a single fasta file. In this
+                                    mode, the sequence header will be gene_name - gb_info
+              --case-mode=CASE_TREATMENT
+                                    first: Gene name case-non-sensitive. Consistent to the
+                                    first appearance.  lower: Gene name case-non-
+                                    sensitive. All gene name set to lower case.  upper:
+                                    Gene name case-non-sensitive. All gene name set to
+                                    Upper case.  raw: Gene name case-sensitive.
+              --ignore-format-error
+                                    Skip the Error: key "*" not found in annotation. Not
+                                    suggested.
+              --translate-to-product
+                                    Translate the tRNA gene name to the form of their
+                                    product. Default: False
+              --overwrite           Choose to overwrite previous result.
+
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1093/sysbio/syaa047</citation>
+    </citations>
+</tool>
b
diff -r 8b330a577046 -r 06bcf65179fb get_organelle_from_reads.xml
--- a/get_organelle_from_reads.xml Wed Sep 28 22:05:41 2022 +0000
+++ b/get_organelle_from_reads.xml Thu Feb 23 17:06:37 2023 +0000
b
@@ -1,6 +1,6 @@
 <tool id="get_organelle_from_reads" name="Get organelle from reads" version="@TOOL_VERSION@">
     <macros>
-        <token name="@TOOL_VERSION@">1.7.6.1</token>
+        <import>macros.xml</import>
         <xml name="seed_and_genes" tokens="optional">
             <param type="data" argument="-s" format="fasta" optional="@OPTIONAL@" label="Seed sequence(s)" help="Fasta file to use as initial seed. A seed sequence in GetOrganelle is only used for identifying initial organelle reads."/>
             <param type="data" argument="--genes" format="fasta" optional="@OPTIONAL@" label="Gene sequence(s)" help="Fasta file containing protein coding genes and ribosomal RNAs extracted from a reference genome that you want to assemble."/>
b
diff -r 8b330a577046 -r 06bcf65179fb macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Thu Feb 23 17:06:37 2023 +0000
b
@@ -0,0 +1,4 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.7.7.0</token>
+    <token name="@BIOPYTHON_VERSION@">1.79</token>
+</macros>
b
diff -r 8b330a577046 -r 06bcf65179fb test-data/NC_047059.gb
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/NC_047059.gb Thu Feb 23 17:06:37 2023 +0000
b
b'@@ -0,0 +1,4672 @@\n+LOCUS       NC_047059             158656 bp    DNA     circular PLN 17-APR-2020\n+DEFINITION  Styphnolobium japonicum voucher Yi15212-KUN plastid, complete\n+            genome.\n+ACCESSION   NC_047059\n+VERSION     NC_047059.1\n+DBLINK      BioProject: PRJNA626053\n+KEYWORDS    RefSeq.\n+SOURCE      plastid Styphnolobium japonicum (Japanese pagoda tree)\n+  ORGANISM  Styphnolobium japonicum\n+            Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;\n+            Spermatophyta; Magnoliopsida; eudicotyledons; Gunneridae;\n+            Pentapetalae; rosids; fabids; Fabales; Fabaceae; Papilionoideae;\n+            Cladrastis clade; Styphnolobium.\n+REFERENCE   1  (bases 1 to 158656)\n+  AUTHORS   Zhang,R., Wang,Y.H., Jin,J.J., Stull,G.W., Bruneau,A., Cardoso,D.,\n+            de Queiroz,L.P., Moore,M.J., Zhang,S.D., Chen,S.Y., Wang,J., Li,D.Z.\n+            and Yi,T.S.\n+  TITLE     Exploration of Plastid Phylogenomic Conflict Yields New Insights\n+            into the Deep Relationships of Leguminosae\n+  JOURNAL   Syst. Biol. (2020) In press\n+   PUBMED   32065640\n+  REMARK    Publication Status: Available-Online prior to print\n+REFERENCE   2  (bases 1 to 158656)\n+  CONSRTM   NCBI Genome Project\n+  TITLE     Direct Submission\n+  JOURNAL   Submitted (17-APR-2020) National Center for Biotechnology\n+            Information, NIH, Bethesda, MD 20894, USA\n+REFERENCE   3  (bases 1 to 158656)\n+  AUTHORS   Zhang,R., Wang,Y.-H., Jin,J.-J., Stull,G.W., Bruneau,A., Cardoso,D.,\n+            de Queiroz,L.P., Moore,M.J., Zhang,S.-D., Chen,S.-Y., Wang,J.,\n+            Li,D.-Z. and Yi,T.-S.\n+  TITLE     Direct Submission\n+  JOURNAL   Submitted (18-NOV-2019) Germplasm Bank of Wild Species, Kunming\n+            Institute of Botany, 132 Lanhei Rd, Kunming, Yunnan 650201, China\n+COMMENT     ##Assembly-Data-START##\n+            Assembly Method       :: GetOrganelle v. 
August 25, 2016; CLC\n+            Sequencing Technology :: Illumina\n+            ##Assembly-Data-END##\n+            PROVISIONAL REFSEQ: This record has not yet been subject to final\n+            NCBI review. The reference sequence is identical to MN709788.\n+            Genomics Workbench v. v8.5.1\n+            COMPLETENESS: full length.\n+FEATURES             Location/Qualifiers\n+     source          1..158656\n+                     /organism="Styphnolobium japonicum"\n+                     /organelle="plastid"\n+                     /mol_type="genomic DNA"\n+                     /specimen_voucher="Yi15212-KUN"\n+                     /db_xref="taxon:3897"\n+                     /tissue_type="Fresh"\n+     gene            complement(join(102600..103393,73767..73880))\n+                     /gene="rps12"\n+                     /locus_tag="HHI28_pgp041"\n+                     /trans_splicing=""\n+                     /db_xref="GeneID:54371971"\n+     CDS             complement(join(102600..102625,103162..103393,\n+                     73767..73880))\n+                     /gene="rps12"\n+                     /locus_tag="HHI28_pgp041"\n+                     /trans_splicing=""\n+                     /codon_start=1\n+                     /transl_table=11\n+                     /product="ribosomal protein S12"\n+                     /protein_id="YP_009755925.1"\n+                     /db_xref="GeneID:54371971"\n+                     /translation="MPTIKQLIRNTRQPIRNVTKSPALRGCPQRRGTCTRVYTITPKKP\n+                     NSALRKVARVRLTSGFEITAYIPGIGHNLQEHSVVLVRGGRVKDLPGVRYHIVRGTLDA\n+                     VGVKDRQQGRSKYGVKKPK"\n+     gene            complement(45..118)\n+                     /gene="trnH-GUG"\n+                     /locus_tag="HHI28_pgt001"\n+                     /db_xref="GeneID:54371842"\n+     tRNA            complement(45..118)\n+                     /gene="trnH-GUG"\n+                     /locus_tag="HHI28_pgt001"\n+                     
/product="tRNA-His"\n+                     /note="anticodon:GUG"\n+                     /db_xref="GeneID:54371842"\n+     gene            complement(515..1576)\n+'..b'aagag atgggaaaca tgctcaatat catttgattg\n+   155581 aatagttgac ccagcccctt gttgtttgaa gaaaccctcc acttcaattg gtattttttc\n+   155641 gcgaaaagca aacatgagat aataaatcca gtctttcact aagatttcga atagctgtcc\n+   155701 cgaattcaag ttgattatgt ttcgcctctt cctcggagaa agacgatcaa acaattccca\n+   155761 atcatggtcc ttgcggatcg gatcatccat ataatataca aaaagaaact ccagatattt\n+   155821 gatatctttc tctttgaatg agatctcaat tccagcgacg gtttcattag atatcttaca\n+   155881 actagaatcc ctcttttttc cgatccagtt cctccaccac cgcgaacccc agttagattc\n+   155941 aggcatgata cactttttag ttattgggag aacccaagta ctctctttcg gatccaggaa\n+   156001 agagctctca gagatctttt ttccttttgg aagatacagg agcgaaacaa tcaaccaatt\n+   156061 gatattggaa gactcaaaag attcttccaa tgtatcattt ctgggtccaa tggaattcat\n+   156121 aggtatagga agaagccctg tcaaatagag attttttctt tcgaccatct ttcgattgtt\n+   156181 aatacgatat ataaggaccg ctactacaaa gagtactaca cccttgatcg tgaaatatcg\n+   156241 attgcttgtt gaaccctgtg aattgcgtga aagtaggata ctccaaattc gggagtccaa\n+   156301 gagttttata aaacgctctt gatggaaaaa aatgtgaatg aaagatccca ctgaattgaa\n+   156361 ttgggtccat gaatctaaga aatagtgaga attcttgatc tctctcaata tctctctcaa\n+   156421 ttcgaaaatc caggatttga attgatgtcc tttcattgaa tcctcctaaa ttgcattgat\n+   156481 ttatcctaaa gatttcattt caattggaat ttggttattc accatgtacg aggatccccg\n+   156541 ctaagcatcc atggctgaat ggttaaagcg cccaactcat aattggcgaa ttcgtaggtt\n+   156601 caattcctac tggatgcacg ccaatgggac cctccaataa gtctattgga attggctctg\n+   156661 tatcaatgga atctcatcat ccatccataa cgaattggtg tggtatattc atatcataac\n+   156721 atatgaacag taagaactag cattcttatt gagactagaa ctcataggga agaaaataga\n+   156781 tttatggatg gaatcaaata tgcagtattt acagacaaaa gtattcggtt attggggaaa\n+   156841 aatcaatata cttctaatgt cgaatcagga tcaactagga cagaaataaa gcattgggtc\n+   156901 gaactcttct ttggtgtcaa ggtaatagct atgaatagtc atcgactccc 
ggtaaagggt\n+   156961 agaagaatgg gacctattat gggacataca atgcattaca gacgtatgat cattacgctt\n+   157021 caaccgggtt attctattcc acctcttaga aagaaaagaa cttaaatcaa aatacttaat\n+   157081 agcatggcga tacatttata caaaacttct accccgagca cacgcaatgg agccgtagac\n+   157141 agtcaagtga aatccaatcc acgaaataat ttgatctatg gacagcatcg ttgtggtaaa\n+   157201 ggccgtaatg ccagaggaat cattaccgca gggcatagag ggggaggtca taagcgttta\n+   157261 taccgtaaaa tcgattttcg acggaatgaa aaagacatat atggtagaat cataaccata\n+   157321 gaatacgacc ctaatcgaaa tgcatacatt tgtctcatac actatgggga tggtgagaaa\n+   157381 agatatattt tacatcccag aggggctata attggagata ccattgtttc tggtacagaa\n+   157441 gttcctataa aaatgggaaa tgccctacct ttgagtgcgg tttgaactat tgatttacgt\n+   157501 aattggaagt aaccaattag gtttacggcg aaacctagaa atcgatcact gatccaattt\n+   157561 gagtacctct acaggataga cctcaacaga aaactgaaga gtaacggcag caagtgattg\n+   157621 agttcagtag ttcctcatat aaaattattg actctagaga tatagtaata tggagaagac\n+   157681 aaaattgttt caagcaccga cagaaccaga agcgcccctt gtttcaaaga ggggaggacg\n+   157741 ggttattcac atttcatttg atggtcagag gcgaattgaa agctaagcag tggtaattct\n+   157801 aaggattccc cgggggaaaa atagagatgt ctcctacgtt acccgtacta tgtggaagta\n+   157861 gcgacgtaat ttcatagagt cattcggtct gaatgctaca tgaagaacat aagccagatg\n+   157921 aaggaacggg aagacctagg atgtagaaga tcataacatg agtgattcgg cagatttgga\n+   157981 ttcctatata tccactcatg cggtacttca ttgtgcgata tatataagaa ttctacgata\n+   158041 tatataagat ccatctgtat agatatcatc atctacatcc ggaaagccgt atgctttgga\n+   158101 agaagcttgt acagtttggg aaggggtttt gattgatcaa aaagaagaat ctacttcaac\n+   158161 cgatatgccc ttaggcacgg ccatacataa catagaaatc acactcggaa agggtggaca\n+   158221 attagctaga gcagcaggtg ctgtagcgaa actgattgca aaagagggga aatcggccac\n+   158281 attaaaatta ccttctgggg aggtccgttt gatatccaaa aactgctcag caacagtcgg\n+   158341 acaagtgggg aatgttgggg taaaccagaa aagtttgggt agagccggat cgaaatgttg\n+   158401 gctaggtaag cgtcctgtag taagaggagt agttatgaac cctgtagacc atccccatgg\n+   158461 gggtggtgaa gggagggccc 
caattggtag aaaaaaaccc gcaactcctt ggggttatcc\n+   158521 tgcacttgga agaagaagta gaaaaaggaa taaatatagt gataatttga ttcttcgtcg\n+   158581 ccgtagtaaa tagtagagta gagaaaatag aatttgtttc ttcgtcttta caaaaaaaaa\n+   158641 taggagtaat taactg\n+//\n'
b
diff -r 8b330a577046 -r 06bcf65179fb test-data/NC_047060.gb
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/NC_047060.gb Thu Feb 23 17:06:37 2023 +0000
b
b'@@ -0,0 +1,4599 @@\n+LOCUS       NC_047060             157616 bp    DNA     circular PLN 17-APR-2020\n+DEFINITION  Haematoxylum brasiletto voucher N._Zamora6857-Costa_Rica plastid,\n+            complete genome.\n+ACCESSION   NC_047060\n+VERSION     NC_047060.1\n+DBLINK      BioProject: PRJNA626055\n+KEYWORDS    RefSeq.\n+SOURCE      plastid Haematoxylum brasiletto\n+  ORGANISM  Haematoxylum brasiletto\n+            Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;\n+            Spermatophyta; Magnoliopsida; eudicotyledons; Gunneridae;\n+            Pentapetalae; rosids; fabids; Fabales; Fabaceae; Caesalpinioideae;\n+            Cassia clade; Haematoxylum.\n+REFERENCE   1  (bases 1 to 157616)\n+  AUTHORS   Zhang,R., Wang,Y.H., Jin,J.J., Stull,G.W., Bruneau,A., Cardoso,D.,\n+            de Queiroz,L.P., Moore,M.J., Zhang,S.D., Chen,S.Y., Wang,J., Li,D.Z.\n+            and Yi,T.S.\n+  TITLE     Exploration of Plastid Phylogenomic Conflict Yields New Insights\n+            into the Deep Relationships of Leguminosae\n+  JOURNAL   Syst. Biol. (2020) In press\n+   PUBMED   32065640\n+  REMARK    Publication Status: Available-Online prior to print\n+REFERENCE   2  (bases 1 to 157616)\n+  CONSRTM   NCBI Genome Project\n+  TITLE     Direct Submission\n+  JOURNAL   Submitted (17-APR-2020) National Center for Biotechnology\n+            Information, NIH, Bethesda, MD 20894, USA\n+REFERENCE   3  (bases 1 to 157616)\n+  AUTHORS   Zhang,R., Wang,Y.-H., Jin,J.-J., Stull,G.W., Bruneau,A., Cardoso,D.,\n+            de Queiroz,L.P., Moore,M.J., Zhang,S.-D., Chen,S.-Y., Wang,J.,\n+            Li,D.-Z. and Yi,T.-S.\n+  TITLE     Direct Submission\n+  JOURNAL   Submitted (18-NOV-2019) Germplasm Bank of Wild Species, Kunming\n+            Institute of Botany, 132 Lanhei Rd, Kunming, Yunnan 650201, China\n+COMMENT     ##Assembly-Data-START##\n+            Assembly Method       :: GetOrganelle v. 
August 25, 2016; CLC\n+            Sequencing Technology :: Illumina\n+            ##Assembly-Data-END##\n+            PROVISIONAL REFSEQ: This record has not yet been subject to final\n+            NCBI review. The reference sequence is identical to MN709823.\n+            Genomics Workbench v. v8.5.1\n+            COMPLETENESS: full length.\n+FEATURES             Location/Qualifiers\n+     source          1..157616\n+                     /organism="Haematoxylum brasiletto"\n+                     /organelle="plastid"\n+                     /mol_type="genomic DNA"\n+                     /specimen_voucher="N._Zamora6857-Costa_Rica"\n+                     /db_xref="taxon:191923"\n+                     /tissue_type="silica"\n+     gene            complement(join(101153..101946,72424..72537))\n+                     /gene="rps12"\n+                     /locus_tag="HHI30_pgp041"\n+                     /trans_splicing=""\n+                     /db_xref="GeneID:54372102"\n+     CDS             complement(join(101153..101178,101715..101946,\n+                     72424..72537))\n+                     /gene="rps12"\n+                     /locus_tag="HHI30_pgp041"\n+                     /trans_splicing=""\n+                     /codon_start=1\n+                     /transl_table=11\n+                     /product="ribosomal protein S12"\n+                     /protein_id="YP_009756008.1"\n+                     /db_xref="GeneID:54372102"\n+                     /translation="MPTIKQLIRNTRQPIRNVTKSPALRGCPQRRGTCTRVYTITPKKP\n+                     NSALRKVARVRLTSGFEITAYIPGIGHNLQEHSVVLVRGGRVKDLPGVRYHIVRGTLDA\n+                     VGVKDRQQGRSKYGVKKPK"\n+     gene            complement(1..74)\n+                     /gene="trnH-GUG"\n+                     /locus_tag="HHI30_pgt001"\n+                     /db_xref="GeneID:54371973"\n+     tRNA            complement(1..74)\n+                     /gene="trnH-GUG"\n+                     /locus_tag="HHI30_pgt001"\n+                     
/product="tRNA-His"\n+                     /db_xref="GeneID:54371973"\n+     repeat_region   1\n+                     /gene="rps19"\n+                     /locus_t'..b'54501 aataaatcca gtctttcact aagatttcga atagctgtcc cgaattcaag ttgattatgt\n+   154561 ttcgcctctt cctcggagaa agacgatcaa acaattccca atcatggtcc ttgcggatcg\n+   154621 gatcatccat ataatataca aaaagaaact ccagatattt gatatctttc tctttgaatg\n+   154681 agatctcaat tccagcgacg gtttcattag atatcttaca actagaatcc ctcttttttc\n+   154741 cgatccagtt cctccaccac cgcgaacccc agttagattc aggcatgata cactttttag\n+   154801 ttattgggag aacccaagta ctctctttcg gatccaggaa agagctctca gagatctttt\n+   154861 ttccttttgg aagatacagg agcgaaacaa tcaacctatt gatattggaa gactcaaaag\n+   154921 attcttccaa tgtatcattt ctgggtccaa tggaattcat aggtatagga agaagccctg\n+   154981 tcaaatagag attttttctt tcgaccatct ttcgattgtt aatacgatat ataaggaccg\n+   155041 ctactacaaa gagtactaca cccttgatcg tgaaatatcg attgcttgtt gaaccctgtg\n+   155101 aattgcgtga aagtaggata ctccaaattc gggagtccaa gagttttata aaacgctctt\n+   155161 gatggaaaaa aatgtgaatg aaagatccca ctgaattgaa ttgggtccat gaatctaaga\n+   155221 aatagtgaga attcttgatc tctctcaata tctctctcaa ttcgaaaatc caggatttga\n+   155281 attgatgtcc tttcattgag tcctcctaaa ttgcattgat ttatcctaaa gatttcattt\n+   155341 caattggaat ttggttattc accatgtacg aggatccccg ctaagcatcc atggctgaat\n+   155401 ggttaaagcg cccaactcat aattggcgaa ttcgtaggtt caattcctac tggatgcacg\n+   155461 ccaatgggac cctccaataa gtctattgga attggctctg tatcaatgga atctcatcat\n+   155521 ccatacataa cgaattggtg tggtatattc atatcataac atatgaacag taagaactag\n+   155581 cattcttatt gagactagaa ctcataggga agaaaataga tttatggatg gaatcaaata\n+   155641 tgcagtattt acagacaaaa gtattcggtt attggtgaaa aatcaatata cttctaatgt\n+   155701 cgaatcagga tcaactagga cagaaataaa gcattgggtc gaactcttct ttggtgtcaa\n+   155761 ggtaatagct atgaatagtc atcgactccc ggtaaagggt agaagaatgg gacctattat\n+   155821 gggacataca atgcattaca gacgtatgat cattacgctt caaccgggtt attctattcc\n+   155881 acctcttaga aagaaaagaa 
cttaaatcaa aatacttaat agcatggcga tacatttata\n+   155941 caaaacttct accccgagca cacgcaatgg agccgtagac agtcaagtga aatccaatcc\n+   156001 acgaaataat ttgatctatg gacagcatcg ttgtggtaaa ggtcgtaatg ccagaggaat\n+   156061 cattaccgca gggcatagag ggggaggtca taagcgtcta taccgtaaaa tcgattttcg\n+   156121 acggaatgaa aaagacatat atggtagaat cgtaaccata gaatacgacc ctaatcgaaa\n+   156181 tgcatacatt tgtctcatac actatgggga tggtgagaaa agatatattt tacatcccag\n+   156241 aggggctata attggagata ccattgtttc tggtacagaa gttcctataa aaatgggaaa\n+   156301 tgccctacct ttgagtgcgg tttgaactat tgatttacgt aattggaagt aaccaattag\n+   156361 gtttacggcg aaacctagaa atcgatcact gatccaattt gagtacctct acaggataga\n+   156421 cctcaacaga aaactgaaga gtaacggcag caagtgattg agttcagtag ttcctcatat\n+   156481 aaaattattg actctagaga tatagtaata tggagaagac aaaattgttt caagcaccga\n+   156541 cagaaccaga agcgcccctt gtttcaaaga ggggaggacg ggttattcac atttcatttg\n+   156601 atggtcagag gcgaattgaa agctaagcag tggtaattct aaggatttcc cgggggaaaa\n+   156661 atagagatgt ctcctacgtt acccgtaata tgtggaagta tcgacgtaat ttcatagagt\n+   156721 cattcggtct gaatgctaca tgaagaacat aagccagatg aaggaacggg aagacctagg\n+   156781 atgtagaaga tcataacatg agggattcgg cagatttgga ttcctatata tccactcatg\n+   156841 cggtacttca ttgtacgata tatataagaa ttctacgata tatataagat ccatctgtat\n+   156901 agatatcatc atctacatcc agaaagccgt atgctttgga agaagcttgt acagtttggg\n+   156961 aaggggtttt gattgatcaa aaagaagaat ctacttcaac cgatatgccc ttaggcacgg\n+   157021 ccatacataa catagaaatc acactcggaa agggtggaca attagctaga gcagcaggtg\n+   157081 ctgtagcgaa actgattgca aaagagggga aatcggccac attaaaatta ccttctgggg\n+   157141 aggtccgttt gatatccaaa aactgctcag caacagtcgg acaagtgggg aatgttgggg\n+   157201 taaaccagaa aagtttgggt agagccggat ctaaatgttg gctaggtaag cgtcctgtag\n+   157261 taagaggagt agttatgaac cctgtagacc atccccatgg gggtggtgaa gggagggccc\n+   157321 caattggtag aaaaaaaccc gcaactcctt ggggttatcc tgcacttgga agaagaagta\n+   157381 gaaaaaggaa taaatatagt gataatttga ttcttcgtcg ccgtagtaaa tagtagagta\n+   
157441 gagaaaatag aatttgtttc ttcgtcttta caaaaaaata gggggagtaa gtaaaaaaaa\n+   157501 taggagtaat gaactgtgac acgttcacta aaaaaaaatc cttttgtagc gaatcattta\n+   157561 ttaaaaaaaa taaagaagct taacacaaaa gcagaaaaag aaataatagt aacttg\n+//\n'
b
diff -r 8b330a577046 -r 06bcf65179fb test-data/NC_047400.gb
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/NC_047400.gb Thu Feb 23 17:06:37 2023 +0000
b
b'@@ -0,0 +1,4672 @@\n+LOCUS       NC_047400             158739 bp    DNA     circular PLN 26-APR-2020\n+DEFINITION  Chamaecrista mimosoides voucher Yi15441-KUN plastid, complete\n+            genome.\n+ACCESSION   NC_047400\n+VERSION     NC_047400.1\n+DBLINK      BioProject: PRJNA628455\n+KEYWORDS    RefSeq.\n+SOURCE      plastid Chamaecrista mimosoides\n+  ORGANISM  Chamaecrista mimosoides\n+            Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;\n+            Spermatophyta; Magnoliopsida; eudicotyledons; Gunneridae;\n+            Pentapetalae; rosids; fabids; Fabales; Fabaceae; Caesalpinioideae;\n+            Cassia clade; Chamaecrista.\n+REFERENCE   1  (bases 1 to 158739)\n+  AUTHORS   Zhang,R., Wang,Y.H., Jin,J.J., Stull,G.W., Bruneau,A., Cardoso,D.,\n+            de Queiroz,L.P., Moore,M.J., Zhang,S.D., Chen,S.Y., Wang,J., Li,D.Z.\n+            and Yi,T.S.\n+  TITLE     Exploration of Plastid Phylogenomic Conflict Yields New Insights\n+            into the Deep Relationships of Leguminosae\n+  JOURNAL   Syst. Biol. (2020) In press\n+   PUBMED   32065640\n+  REMARK    Publication Status: Available-Online prior to print\n+REFERENCE   2  (bases 1 to 158739)\n+  CONSRTM   NCBI Genome Project\n+  TITLE     Direct Submission\n+  JOURNAL   Submitted (26-APR-2020) National Center for Biotechnology\n+            Information, NIH, Bethesda, MD 20894, USA\n+REFERENCE   3  (bases 1 to 158739)\n+  AUTHORS   Zhang,R., Wang,Y.-H., Jin,J.-J., Stull,G.W., Bruneau,A., Cardoso,D.,\n+            de Queiroz,L.P., Moore,M.J., Zhang,S.-D., Chen,S.-Y., Wang,J.,\n+            Li,D.-Z. and Yi,T.-S.\n+  TITLE     Direct Submission\n+  JOURNAL   Submitted (18-NOV-2019) Germplasm Bank of Wild Species, Kunming\n+            Institute of Botany, 132 Lanhei Rd, Kunming, Yunnan 650201, China\n+COMMENT     ##Assembly-Data-START##\n+            Assembly Method       :: GetOrganelle v. 
August 25, 2016; CLC\n+            Sequencing Technology :: Illumina\n+            ##Assembly-Data-END##\n+            PROVISIONAL REFSEQ: This record has not yet been subject to final\n+            NCBI review. The reference sequence is identical to MN709886.\n+            Genomics Workbench v. v8.5.1\n+            COMPLETENESS: full length.\n+FEATURES             Location/Qualifiers\n+     source          1..158739\n+                     /organism="Chamaecrista mimosoides"\n+                     /organelle="plastid"\n+                     /mol_type="genomic DNA"\n+                     /specimen_voucher="Yi15441-KUN"\n+                     /db_xref="taxon:948715"\n+                     /tissue_type="Fresh"\n+     gene            complement(join(102408..103201,74197..74310))\n+                     /gene="rps12"\n+                     /locus_tag="HJF93_pgp041"\n+                     /trans_splicing=""\n+                     /db_xref="GeneID:54625455"\n+     CDS             complement(join(102408..102433,102970..103201,\n+                     74197..74310))\n+                     /gene="rps12"\n+                     /locus_tag="HJF93_pgp041"\n+                     /trans_splicing=""\n+                     /codon_start=1\n+                     /transl_table=11\n+                     /product="ribosomal protein S12"\n+                     /protein_id="YP_009772065.1"\n+                     /db_xref="GeneID:54625455"\n+                     /translation="MPTIKQLIRNTRQPIRNVTKSPALRGCPQRRGTCTRVYTITPKKP\n+                     NSALRKVARVRLTSGFEITAYIPGIGHNLQEHSVVLVRGGRVKDLPGVRYHIVRGTLDA\n+                     VGVKDRQQGRSKYGVKKPK"\n+     gene            complement(13..86)\n+                     /gene="trnH-GUG"\n+                     /locus_tag="HJF93_pgt001"\n+                     /db_xref="GeneID:54625326"\n+     tRNA            complement(13..86)\n+                     /gene="trnH-GUG"\n+                     /locus_tag="HJF93_pgt001"\n+                     
/product="tRNA-His"\n+                     /db_xref="GeneID:54625326"\n+     gene            complement(368..1429)\n+                     /gene="psbA"\n+                     /locus_tag="HJ'..b'ag ctgtcccgaa\n+   155641 ttcaagttga ttatgtttcg cctcttcctc ggagaaagac gatcaaacaa ttcccaatca\n+   155701 tggtccttgc ggatcggatc atccatataa tatacaaaaa gaaactccag atatttgata\n+   155761 tctttctctt tgaatgagat ctcaattcca gcgacggttt cattagatat cttacaacta\n+   155821 gaatccctct tttttccgat ccagttcctc caccaccgcg aaccccagtt agattcaggc\n+   155881 atgatacact ttttagttat tgggagaacc caagtactct ctttcggatc caggaaagag\n+   155941 ctctcagaga tcttttttcc ttttggaata tacaggagcg aaacaatcaa cctattgata\n+   156001 ttggaagact caaaagattc ttccaatgta tcatttctgg gtccaatgga attcataggt\n+   156061 ataggaagaa gccctgtcaa atagagattt tttctttcga ccatctttcg attgttaata\n+   156121 cgatatataa ggaccgctac taccaagagt actacaccct tgatcgtgaa atatcgattg\n+   156181 cttgttgaac cctgtgaatt gcgtgaaagt aggatactcc aaattcggga gtccaagagt\n+   156241 tttataaaac gctcttgatg gaaaaaaatg tgaatgaaag atcccactga attgaattgg\n+   156301 gtccatgaat ctaagaaata gtgagaattc ttgatctctc tcaatatctc cctcaattcg\n+   156361 aaaatccagg atttgaattg atgtcctttc attgagtcct cctaaattgc attgatttat\n+   156421 cctaaagatt tcatttcaat tggaatttgg ttattcacca tgtacaagga tccccgctaa\n+   156481 gcatccatgg ctgaatggtt aaagcaccca actcataatt ggcgaattcg taggttcaat\n+   156541 tcctactgga tgcacgccaa tgggaccctc caatacaatt ccaataagtc tattggaatt\n+   156601 ggctctgtat caatggaatc tcatcatcca tacccaacga attggtgtgg tatattaata\n+   156661 tcataacata tgaacaataa gaactagcat tcttattgaa actagaactc atagggaaga\n+   156721 aaattgatat ggatgaaata aaatatgcag tagttacaga caaaagtatt cggttattgg\n+   156781 tgaaaaatca atatacttct aatgtcgaat caggatcaac taggacagaa ataaagcatt\n+   156841 gggtcgaact cttctttggt gtcaaggtaa tagctatgaa tagtcatcga ctcccggtaa\n+   156901 agggtagaag aatgggacct attatgggac atacaatgca ttacagacgt atgatcatta\n+   156961 cgcttcaacc gggttattct attccacctc ttagaaagaa aagaacttaa 
atcaaaatac\n+   157021 ttaatagcat ggcgatacat ttatacaaaa cttctacccc gagcacacgc aatggagccg\n+   157081 tagacagtca agtgaaatcc aatccacgaa ataatttgat ctatggacaa catcgttgtg\n+   157141 gtaaaggtcg taatgccaga ggaatcatta ccgcagggca tagaggggga ggtcataagc\n+   157201 gtctataccg taaaatcgat tttcgacgga atgaaaaaga catatatggt agaatcgtaa\n+   157261 ccatagaata cgaccctaat cgaaatgcat acatttgtct catacactat ggggatggtg\n+   157321 agaaaagata tattttacat cccagagggg ctataattgg agataccatt gtttctggta\n+   157381 cagaagttcc tataaaaatg ggaaatgccc tacctttgag tgcggtttga actattgatt\n+   157441 tacgtaattg gaagtaacca attaggttta cggcgaaacc tagaaatcga tcactgatcc\n+   157501 aatttgagta cctctacagg atagacctca tacaggatag acctcaacag aaaactgaag\n+   157561 agtaacggca gcaagtgatt gagttcagta gttcctcata tcaaattatt gactctagag\n+   157621 atatagtaat atggagaaga caaaattgtt tcaagcaccg acagaaccag aagcgcccct\n+   157681 tgtttcaaag aggggaggac gggttattca catttcattt gatggtcaga ggcgaattga\n+   157741 aagctaagca gtggtaattc taaggattcc ccggggaaaa atagagatgt ctcctacgtt\n+   157801 acccgtaata tgtggaagtg tcgacgtaat ttcatagagt cattcggtct gaatgctaca\n+   157861 tgaagaacat aagccagatg aaggaacggg aagacctagg atgtagaaga tcataacatg\n+   157921 agtgattcgg cagatttgga ttcctatata tccactcatg cggtacttca ttgtacgata\n+   157981 tataagaatt ctacgatata tataagatcc atctgtatag atatcatcat ctacatccag\n+   158041 aaagccgtat gctttggaag aagcttgtac agtttgggaa ggggttttga ttgatcaaaa\n+   158101 agaagaatct acttcaaccg atatgccctt aggcacggcc atacataaca tagaaatcac\n+   158161 actcggaaag ggtggacaat tagctagagc agcaggtgct gtagcgaaac tgattgcaaa\n+   158221 agaggggaaa tcggccacat taaaattacc ttctggggag gtccgtttga tatccaaaaa\n+   158281 ctgctcagca acagtcggac aagtggggaa tgttggggta aaccagaaaa gtttgggtag\n+   158341 agccggatct aaatgttggc taggtaagcg tcctgtagta agaggagtag ttatgaaccc\n+   158401 tgtagaccat ccccatgggg gtggtgaagg gagggcccca attggtagaa aaaaacccgc\n+   158461 aactccttgg ggttatcctg cacttggaag aagaagtaga aaaaggaata aatatagtga\n+   158521 taatttgatt cttcgtcgcc 
gtagtaaata gtagagtaga gaaaatcgaa tttgtttctt\n+   158581 cgtctttaca aaaaaaaaat agggggagta agtaaaaaaa ataggagtaa tgaactgtga\n+   158641 cacgttcact aaaaaaaaat ccttttgtag cgaatcattt attaagaaaa ataaataagc\n+   158701 ttaacacaaa agcagaaaaa gaaataatag taacttggt\n+//\n'