Repository 'gffread'
hg clone https://toolshed.g2.bx.psu.edu/repos/devteam/gffread

Changeset 8:154d00cbbf2d (2021-09-25)
Previous changeset 7:4dea02886337 (2019-11-11) Next changeset 9:3e436657dcd0 (2023-12-15)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gffread commit f40643d8b80299ebb84faebe92579321ac459746"
modified:
gffread.xml
test-data/Homo_sapiens.GRCh37_19.71.gff3
test-data/ecoli-k12.processed.gff3
added:
test-data/Homo_sapiens.GRCh37_19.71.bed
test-data/stop_codons.gtf
b
diff -r 4dea02886337 -r 154d00cbbf2d gffread.xml
--- a/gffread.xml Mon Nov 11 18:27:46 2019 -0500
+++ b/gffread.xml Sat Sep 25 15:38:31 2021 +0000
[
b'@@ -1,13 +1,21 @@\n-<tool id="gffread" name="gffread" version="@VERSION@.0">\n+<tool id="gffread" name="gffread" version="@GALAXY_TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05">\n     <description>Filters and/or converts GFF3/GTF2 records</description>\n+    <xrefs>\n+        <xref type="bio.tools">gffread</xref>\n+    </xrefs>\n     <macros>\n-        <token name="@VERSION@">0.11.6</token>\n+        <!-- the version of this tool must not be lowered since in the past 2.x was used\n+            lets use small increments and hope that gffread catches up one day -->\n+        <token name="@GALAXY_TOOL_VERSION@">2.2.1.3</token>\n+        <token name="@TOOL_VERSION@">0.12.7</token>\n+        <token name="@VERSION_SUFFIX@">0</token>\n         <xml name="fasta_output_select">\n             <param name="fa_outputs" type="select" display="checkboxes" multiple="true" label="Select fasta outputs">\n-                <option value="-w exons.fa">fasta file with spliced exons for each GFF transcript (-w exons.fa)</option>\n-                <option value="-x cds.fa">fasta file with spliced CDS for each GFF transcript (-x cds.fa)</option>\n-                <option value="-y pep.fa">protein fasta file with the translation of CDS for each record (-y pep.fa)</option>\n+                <option value="-w exons.fa">fasta file with spliced exons for each GFF transcript (-w)</option>\n+                <option value="-x cds.fa">fasta file with spliced CDS for each GFF transcript (-x)</option>\n+                <option value="-y pep.fa">protein fasta file with the translation of CDS for each record (-y)</option>\n                 <option value="-W">for each fasta: record the exon coordinates projected onto the spliced sequence (-W)</option>\n+                <option value="-S">for protein fasta: use \'*\' instead of \'.\' as stop codon translation (-S)</option>\n             </param>\n         </xml>\n         <xml name="ref_filtering_select">\n@@ -22,14 +30,14 @@\n             </param>\n         </xml>\n         <xml name="trackname">\n-            <param name="tname" type="text" value="" optional="true" label="Trackname to use in the second column of each GFF output line" help="(-t track_name}">\n+            <param argument="-t" name="tname" type="text" value="" optional="true" label="Trackname to use in the second column of each GFF output line" help="">\n                 <validator type="regex">\\w+</validator>\n             </param>\n         </xml>\n         <xml name="merge_opts">\n              <option value="-K">also collapse shorter, fully contained transcripts with fewer introns than the container (-K)</option>\n              <option value="-Q">remove the containment restriction: multi-exon transcripts will be collapsed if just their introns match, while single-exon transcripts can partially overlap 80% (-Q)</option>\n-             <option value="-d dupinfo">output collapsing info (-d dupinfo)</option>\n+             <option value="-d dupinfo">output collapsing info (-d)</option>\n         </xml>\n         <xml name="cluster_opts">\n              <option value="--force-exons"> make sure that the lowest level GFF features are printed as \'exon\' features (--force-exons)</option>\n@@ -48,14 +56,19 @@\n         </xml>\n     </macros>\n     <requirements>\n-        <requirement type="package" version="@VERSION@">gffread</requirement>\n+        <requirement type="package" version="@TOOL_VERSION@">gffread</requirement>\n     </requirements>\n+    <version_command>gffread --version</version_command>\n     <command detect_errors="aggressive">\n <![CDATA[\n     #if $reference_genome.source == \'history\':\n         ln -s \'$reference_genome.genome_fasta\' genomeref.fa &&\n     #end if\n+\n     gffread \'$input\'\n+    #if $input.ext.startswith("bed")\n+        --in-bed\n+    #end if\n     #if $reference_genome.source == \'cached\':\n         -g \'${reference_genome.fasta_indexes.fields.path}\'\n         #if $reference_genome.ref_filtering and str($reference_genome.ref_filtering) != \'\':\n@@ -104,22 +117,68 @@\n         #end '..b'  -M/--merge : cluster the input transcripts into loci, discarding\n-          "duplicated" transcripts (those with the same exact introns\n+          "redundant" transcripts (those with the same exact introns\n           and fully contained or equal boundaries)\n      -d <dupinfo> : for -M option, write duplication info to file <dupinfo>\n      --cluster-only: same as -M/--merge but without discarding any of the\n@@ -452,7 +549,6 @@\n           multi-exon transcripts, and >=80% overlap for single-exon transcripts\n      -Y   for -M option, enforce -Q but also discard overlapping single-exon \n           transcripts, even on the opposite strand (can be combined with -K)\n-          \n     Output options:\n      --force-exons: make sure that the lowest level GFF features are considered\n            "exon" features\n@@ -465,25 +561,26 @@\n      -g   full path to a multi-fasta file with the genomic sequences\n           for all input mappings, OR a directory with single-fasta files\n           (one per genomic sequence, with file names matching sequence names)\n-     -w    write a fasta file with spliced exons for each GFF transcript\n+     -j    output the junctions and the corresponding transcripts\n+     -w    write a fasta file with spliced exons for each transcript\n+     --w-add <N> for the -w option, extract additional <N> bases\n+           both upstream and downstream of the transcript boundaries\n+     --w-nocds for -w, disable the output of CDS info in the FASTA file\n      -x    write a fasta file with spliced CDS for each GFF transcript\n      -y    write a protein fasta file with the translation of CDS for each record\n-     -W    for -w and -x options, write in the FASTA defline the exon\n+     -W    for -w, -x and -y options, write in the FASTA defline all the exon\n            coordinates projected onto the spliced sequence;\n-           for -y option, write transcript attributes in the FASTA defline\n      -S    for -y option, use \'*\' instead of \'.\' as stop codon translation\n-     -L    Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n+     -L    Ensembl GTF to GFF3 conversion, adds version to IDs\n      -m    <chr_replace> is a name mapping table for converting reference \n            sequence names, having this 2-column format:\n            <original_ref_ID> <new_ref_ID>\n-           WARNING: all GFF records on reference sequences whose original IDs\n-           are not found in the 1st column of this table will be discarded!\n      -t    use <trackname> in the 2nd column of each GFF/GTF output line\n-     -o    write the records into <outfile> instead of stdout\n+     -o    write the output records into <outfile> instead of stdout\n      -T    main output will be GTF instead of GFF3\n      --bed output records in BED format instead of default GFF3\n      --tlf output "transcript line format" which is like GFF\n-           but exons, CDS features and related data are stored as GFF \n+           but with exons and CDS related features stored as GFF \n            attributes in the transcript feature line, like this:\n              exoncount=N;exons=<exons>;CDSphase=<N>;CDS=<CDScoords> \n            <exons> is a comma-delimited list of exon_start-exon_end coordinates;\n@@ -491,9 +588,14 @@\n      --table output a simple tab delimited format instead of GFF, with columns\n            having the values of GFF attributes given in <attrlist>; special\n            pseudo-attributes (prefixed by @) are recognized:\n-           @chr, @start, @end, @strand, @numexons, @exons, @cds, @covlen, @cdslen\n+           @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, \n+           @cds, @covlen, @cdslen\n+           If any of -w/-y/-x FASTA output files are enabled, the same fields\n+           (excluding @id) are appended to the definition line of corresponding\n+           FASTA records\n      -v,-E expose (warn about) duplicate transcript IDs and other potential\n            problems with the given GFF/GTF records\n+\n ]]>\n     </help>\n     <citations>\n'
b
diff -r 4dea02886337 -r 154d00cbbf2d test-data/Homo_sapiens.GRCh37_19.71.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Homo_sapiens.GRCh37_19.71.bed Sat Sep 25 15:38:31 2021 +0000
b
@@ -0,0 +1,42 @@
+19 223157 223261 ENST00000410397 100 - 223157 223261 0,0,0 1 104, 0, geneID=ENSG00000222329;gene_name=U6
+19 229639 230165 ENST00000587910 100 - 229639 230165 0,0,0 2 70,82, 0,444, geneID=ENSG00000267600;gene_name=AC098474.1
+19 239144 239247 ENST00000588755 100 - 239144 239247 0,0,0 1 103, 0, geneID=ENSG00000267305;gene_name=CTD-3113P16.7
+19 279494 280170 ENST00000589981 100 + 279494 280170 0,0,0 1 676, 0, geneID=ENSG00000267447;gene_name=VN2R11P
+19 281042 291386 ENST00000269812 100 - 281387 291336 0,0,0 6 495,177,58,278,152,102, 0,1091,1709,6431,6977,10242, CDS=281387:291336;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C
+19 281344 291393 ENST00000434325 100 - 281387 288055 0,0,0 6 193,177,58,278,152,68, 0,789,1407,6129,6675,9981, CDS=281387:288055;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C
+19 281387 291200 ENST00000327790 100 - 281387 291066 0,0,0 6 150,177,58,278,152,249, 0,746,1364,6086,6632,9564, CDS=281387:291066;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C
+19 281990 287636 ENST00000586998 100 - 282121 287636 2,0,0 3 320,58,163, 0,761,5483, CDS=282121:287636;CDSphase=2;geneID=ENSG00000141934;gene_name=PPAP2C
+19 287160 288530 ENST00000589672 100 - 287160 288530 0,0,0 2 591,511, 0,859, geneID=ENSG00000141934;gene_name=PPAP2C
+19 287473 291382 ENST00000591572 100 - 287473 291336 0,0,0 3 278,170,98, 0,546,3811, CDS=287473:291336;CDSphase=0;geneID=ENSG00000141934;gene_name=PPAP2C
+19 305572 306467 ENST00000591533 100 + 305572 306467 0,0,0 2 131,411, 0,484, geneID=ENSG00000267124;gene_name=CTD-3113P16.5
+19 305574 344793 ENST00000264819 100 - 306689 344782 0,0,0 14 1137,418,89,125,95,82,152,70,92,124,126,143,91,20, 0,1544,3002,3226,6270,6616,7917,20060,20932,21558,22289,28825,30508,39199, CDS=306689:344782;CDSphase=0;geneID=ENSG00000105556;gene_name=MIER2
+19 305578 325706 ENST00000589092 100 + 305578 325706 0,0,0 2 356,83, 0,20045, geneID=ENSG00000267124;gene_name=CTD-3113P16.5
+19 326606 336178 ENST00000586994 100 - 326606 336178 0,0,0 4 650,126,143,96, 0,1257,7793,9476, geneID=ENSG00000105556;gene_name=MIER2
+19 327863 340599 ENST00000592722 100 - 327863 340599 0,0,0 5 126,117,143,91,86, 0,2400,6536,8219,12650, geneID=ENSG00000105556;gene_name=MIER2
+19 334114 344798 ENST00000587966 100 - 334114 344798 0,0,0 2 428,25, 0,10659, geneID=ENSG00000105556;gene_name=MIER2
+19 361749 376013 ENST00000342640 100 - 362199 375970 0,0,0 8 677,160,118,70,62,72,123,351, 0,5315,9455,10881,11720,12190,12549,13913, CDS=362199:375970;CDSphase=0;geneID=ENSG00000105549;gene_name=THEG
+19 362057 374620 ENST00000530711 100 - 362057 374620 0,0,0 3 369,160,322, 0,5007,12241, geneID=ENSG00000105549;gene_name=THEG
+19 362199 375970 ENST00000346878 100 - 362199 375970 0,0,0 7 227,160,118,70,62,123,308, 0,4865,9005,10431,11270,12099,13463, CDS=362199:375970;CDSphase=0;geneID=ENSG00000105549;gene_name=THEG
+19 367201 374249 ENST00000528213 100 - 367201 374249 0,0,0 5 23,118,70,62,310, 0,4003,5429,6268,6738, geneID=ENSG00000105549;gene_name=THEG
+19 397588 398941 ENST00000591757 100 + 397588 398941 0,0,0 2 45,252, 0,1101, geneID=ENSG00000267443;gene_name=AC010641.1
+19 405444 409139 ENST00000332235 100 - 407095 408361 0,0,0 2 2957,134, 0,3561, CDS=407095:408361;CDSphase=0;geneID=ENSG00000183186;gene_name=C2CD4C
+19 416582 419879 ENST00000587423 100 - 416582 419879 0,0,0 2 740,957, 0,2340, geneID=ENSG00000129946;gene_name=SHC2
+19 416582 422828 ENST00000588376 100 - 416582 422828 0,0,0 3 740,134,683, 0,2340,5563, geneID=ENSG00000129946;gene_name=SHC2
+19 416592 460996 ENST00000264554 100 - 418927 460996 0,0,0 13 730,134,311,135,64,157,127,52,54,120,61,71,468, 0,2330,5553,8504,14091,18116,19572,19787,20037,22125,22377,24269,43936, CDS=418927:460996;CDSphase=0;geneID=ENSG00000129946;gene_name=SHC2
+19 416608 441384 ENST00000589922 100 - 416608 441384 0,0,0 11 714,134,311,135,64,157,127,304,120,61,523, 0,2314,5537,8488,14075,18100,19556,19771,22109,22361,24253, geneID=ENSG00000129946;gene_name=SHC2
+19 417199 436258 ENST00000590170 100 - 434761 436258 0,0,0 6 123,134,234,64,157,94, 0,1723,4946,13484,17509,18965, CDS=434761:436258;CDSphase=0;geneID=ENSG00000129946;gene_name=SHC2
+19 417199 436258 ENST00000591948 100 - 417199 436258 0,0,0 6 123,134,234,64,157,94, 0,1723,4946,13484,17509,18965, geneID=ENSG00000129946;gene_name=SHC2
+19 434701 460571 ENST00000590222 100 - 439397 460571 1,0,0 9 164,127,52,54,120,61,259,71,43, 0,1463,1678,1928,4016,4268,4531,6160,25827, CDS=439397:460571;CDSphase=1;geneID=ENSG00000129946;gene_name=SHC2
+19 435770 436534 ENST00000591388 100 - 435770 436534 0,0,0 3 191,127,155, 0,394,609, geneID=ENSG00000129946;gene_name=SHC2
+19 435778 439031 ENST00000590113 100 - 435778 439031 0,0,0 6 183,127,52,54,120,62, 0,386,601,851,2939,3191, geneID=ENSG00000129946;gene_name=SHC2
+19 453133 453245 ENST00000516730 100 + 453133 453245 0,0,0 1 112, 0, geneID=ENSG00000252539;gene_name=RNA5SP462
+19 463345 474983 ENST00000315489 100 - 463843 474747 0,0,0 4 1019,114,108,363, 0,4303,9048,11275, CDS=463843:474747;CDSphase=0;geneID=ENSG00000181781;gene_name=ODF3L2
+19 463466 474880 ENST00000382696 100 - 463843 474747 0,0,0 3 898,114,260, 0,4182,11154, CDS=463843:474747;CDSphase=0;geneID=ENSG00000181781;gene_name=ODF3L2
+19 464145 472631 ENST00000591681 100 - 464145 472631 0,0,0 3 219,114,238, 0,3503,8248, geneID=ENSG00000181781;gene_name=ODF3L2
+19 489175 505342 ENST00000587541 100 + 489175 505342 0,0,0 3 864,261,598, 0,12493,15569, geneID=ENSG00000099866;gene_name=MADCAM1
+19 490045 507813 ENST00000592413 100 - 490045 507813 0,0,0 3 308,84,438, 0,11495,17330, geneID=ENSG00000266933;gene_name=AC005775.2
+19 496453 505207 ENST00000346144 100 + 496499 504965 0,0,0 4 98,285,330,463, 0,1379,2042,8291, CDS=496499:504965;CDSphase=0;geneID=ENSG00000099866;gene_name=MADCAM1
+19 496453 505347 ENST00000215637 100 + 496499 504965 0,0,0 5 98,285,330,261,603, 0,1379,2042,5215,8291, CDS=496499:504965;CDSphase=0;geneID=ENSG00000099866;gene_name=MADCAM1
+19 496499 504965 ENST00000382683 100 + 496499 504965 0,0,0 3 52,330,221, 0,1996,8245, CDS=496499:504965;CDSphase=0;geneID=ENSG00000099866;gene_name=MADCAM1
+19 507298 519654 ENST00000359315 100 + 507506 519423 0,0,0 2 546,766, 0,11590, CDS=507506:519423;CDSphase=0;geneID=ENSG00000141933;gene_name=TPGS1
+19 507499 510372 ENST00000588278 100 + 507499 510372 0,0,0 1 2873, 0, geneID=ENSG00000141933;gene_name=TPGS1
b
diff -r 4dea02886337 -r 154d00cbbf2d test-data/Homo_sapiens.GRCh37_19.71.gff3
--- a/test-data/Homo_sapiens.GRCh37_19.71.gff3 Mon Nov 11 18:27:46 2019 -0500
+++ b/test-data/Homo_sapiens.GRCh37_19.71.gff3 Sat Sep 25 15:38:31 2021 +0000
b
@@ -1,6 +1,6 @@
-# gffread /tmp/tmpq6d_yfqc/files/9/2/2/dataset_922cd54b-d77c-48fb-abf7-6fc8d8fdb97c.dat -o output.gff3
-# gffread v0.11.6
 ##gff-version 3
+# gffread v0.12.7
+# gffread /tmp/tmpk_iy6dhb/files/e/1/9/dataset_e191f2e3-7ad2-452e-b21c-edd22b6ba6e2.dat -o output.gff
 19 snRNA transcript 223158 223261 . - . ID=ENST00000410397;geneID=ENSG00000222329;gene_name=U6
 19 snRNA exon 223158 223261 . - . Parent=ENST00000410397
 19 unprocessed_pseudogene transcript 229640 230165 . - . ID=ENST00000587910;geneID=ENSG00000267600;gene_name=AC098474.1
b
diff -r 4dea02886337 -r 154d00cbbf2d test-data/ecoli-k12.processed.gff3
--- a/test-data/ecoli-k12.processed.gff3 Mon Nov 11 18:27:46 2019 -0500
+++ b/test-data/ecoli-k12.processed.gff3 Sat Sep 25 15:38:31 2021 +0000
b
b'@@ -1,33 +1,33 @@\n-# gffread /tmp/tmpq6d_yfqc/files/2/7/7/dataset_277f6e18-b25a-4b59-b712-49b5c202a183.dat -F -o output.gff3\n-# gffread v0.11.6\n ##gff-version 3\n-NC_000913.3\tRefSeq\tgene\t190\t255\t.\t+\t.\tID=gene-b0001;geneID=gene-b0001;gene_name=thrL;Dbxref=ASAP:ABE-0000006,ECOCYC:EG11277,EcoGene:EG11277,GeneID:944742;Name=thrL;gbkey=Gene;gene=thrL;gene_biotype=protein_coding;gene_synonym=ECK0001;locus_tag=b0001\n+# gffread v0.12.7\n+# gffread /tmp/tmpk_iy6dhb/files/7/c/b/dataset_7cbb521e-a7fc-4b92-8335-006b4f916f5c.dat -F -o output.gff\n+NC_000913.3\tRefSeq\tgene\t190\t255\t.\t+\t.\tID=gene-b0001;gene_name=thrL;Dbxref=ASAP:ABE-0000006,ECOCYC:EG11277,EcoGene:EG11277,GeneID:944742;Name=thrL;gbkey=Gene;gene=thrL;gene_biotype=protein_coding;gene_synonym=ECK0001;locus_tag=b0001\n NC_000913.3\tRefSeq\tCDS\t190\t255\t.\t+\t0\tParent=gene-b0001;Dbxref=UniProtKB/Swiss-Prot:P0AD86,Genbank:NP_414542.1,ASAP:ABE-0000006,ECOCYC:EG11277,EcoGene:EG11277,GeneID:944742;Name=NP_414542.1;gbkey=CDS;gene=thrL;locus_tag=b0001;orig_transcript_id=gnl|b0001|mrna.b0001;product=thr operon leader peptide;protein_id=NP_414542.1;transl_table=11\n-NC_000913.3\tRefSeq\tgene\t337\t2799\t.\t+\t.\tID=gene-b0002;geneID=gene-b0002;gene_name=thrA;Dbxref=ASAP:ABE-0000008,ECOCYC:EG10998,EcoGene:EG10998,GeneID:945803;Name=thrA;gbkey=Gene;gene=thrA;gene_biotype=protein_coding;gene_synonym=ECK0002,Hs,thrA1,thrA2,thrD;locus_tag=b0002\n+NC_000913.3\tRefSeq\tgene\t337\t2799\t.\t+\t.\tID=gene-b0002;gene_name=thrA;Dbxref=ASAP:ABE-0000008,ECOCYC:EG10998,EcoGene:EG10998,GeneID:945803;Name=thrA;gbkey=Gene;gene=thrA;gene_biotype=protein_coding;gene_synonym=ECK0002,Hs,thrA1,thrA2,thrD;locus_tag=b0002\n NC_000913.3\tRefSeq\tCDS\t337\t2799\t.\t+\t0\tParent=gene-b0002;Dbxref=UniProtKB/Swiss-Prot:P00561,Genbank:NP_414543.1,ASAP:ABE-0000008,ECOCYC:EG10998,EcoGene:EG10998,GeneID:945803;Name=NP_414543.1;gbkey=CDS;gene=thrA;locus_tag=b0002;orig_transcript_id=gnl|b0002|mrna.b0002;product=fused aspartate kinase/homoserine dehydrogenase 1;protein_id=NP_414543.1;transl_table=11\n-NC_000913.3\tRefSeq\tgene\t2801\t3733\t.\t+\t.\tID=gene-b0003;geneID=gene-b0003;gene_name=thrB;Dbxref=ASAP:ABE-0000010,ECOCYC:EG10999,EcoGene:EG10999,GeneID:947498;Name=thrB;gbkey=Gene;gene=thrB;gene_biotype=protein_coding;gene_synonym=ECK0003;locus_tag=b0003\n+NC_000913.3\tRefSeq\tgene\t2801\t3733\t.\t+\t.\tID=gene-b0003;gene_name=thrB;Dbxref=ASAP:ABE-0000010,ECOCYC:EG10999,EcoGene:EG10999,GeneID:947498;Name=thrB;gbkey=Gene;gene=thrB;gene_biotype=protein_coding;gene_synonym=ECK0003;locus_tag=b0003\n NC_000913.3\tRefSeq\tCDS\t2801\t3733\t.\t+\t0\tParent=gene-b0003;Dbxref=UniProtKB/Swiss-Prot:P00547,Genbank:NP_414544.1,ASAP:ABE-0000010,ECOCYC:EG10999,EcoGene:EG10999,GeneID:947498;Name=NP_414544.1;gbkey=CDS;gene=thrB;locus_tag=b0003;orig_transcript_id=gnl|b0003|mrna.b0003;product=homoserine kinase;protein_id=NP_414544.1;transl_table=11\n-NC_000913.3\tRefSeq\tgene\t3734\t5020\t.\t+\t.\tID=gene-b0004;geneID=gene-b0004;gene_name=thrC;Dbxref=ASAP:ABE-0000012,ECOCYC:EG11000,EcoGene:EG11000,GeneID:945198;Name=thrC;gbkey=Gene;gene=thrC;gene_biotype=protein_coding;gene_synonym=ECK0004;locus_tag=b0004\n+NC_000913.3\tRefSeq\tgene\t3734\t5020\t.\t+\t.\tID=gene-b0004;gene_name=thrC;Dbxref=ASAP:ABE-0000012,ECOCYC:EG11000,EcoGene:EG11000,GeneID:945198;Name=thrC;gbkey=Gene;gene=thrC;gene_biotype=protein_coding;gene_synonym=ECK0004;locus_tag=b0004\n NC_000913.3\tRefSeq\tCDS\t3734\t5020\t.\t+\t0\tParent=gene-b0004;Dbxref=UniProtKB/Swiss-Prot:P00934,Genbank:NP_414545.1,ASAP:ABE-0000012,ECOCYC:EG11000,EcoGene:EG11000,GeneID:945198;Name=NP_414545.1;gbkey=CDS;gene=thrC;locus_tag=b0004;orig_transcript_id=gnl|b0004|mrna.b0004;product=threonine synthase;protein_id=NP_414545.1;transl_table=11\n-NC_000913.3\tRefSeq\tgene\t5234\t5530\t.\t+\t.\tID=gene-b0005;geneID=gene-b0005;gene_name=yaaX;Dbxref=ASAP:ABE-0000015,ECOCYC:G6081,EcoGene:EG14384,GeneID:944747;Name=yaaX;gbkey=Gene;gene=yaaX;gene_biotype=protein_coding;gene_synonym=ECK0005;locus_tag=b0005\n+NC_000913.3\tRefSeq\tgene\t5234\t5530\t.\t+\t.\tID=gene-b0005;gene_name=yaaX;Dbxref=ASAP:ABE-0000015,ECOCYC:G6081'..b'-b0011;gene_name=yaaW;Dbxref=ASAP:ABE-0000037,ECOCYC:G6082,EcoGene:EG14340,GeneID:944771;Name=yaaW;gbkey=Gene;gene=yaaW;gene_biotype=protein_coding;gene_synonym=ECK0011;locus_tag=b0011\n NC_000913.3\tRefSeq\tCDS\t10643\t11356\t.\t-\t0\tParent=gene-b0011;Dbxref=UniProtKB/Swiss-Prot:P75617,Genbank:NP_414552.1,ASAP:ABE-0000037,ECOCYC:G6082,EcoGene:EG14340,GeneID:944771;Name=NP_414552.1;gbkey=CDS;gene=yaaW;locus_tag=b0011;orig_transcript_id=gnl|b0011|mrna.b0011;product=putative enzyme-specific chaperone YaaW;protein_id=NP_414552.1;transl_table=11\n-NC_000913.3\tRefSeq\tgene\t10830\t11315\t.\t+\t.\tID=gene-b0012;geneID=gene-b0012;gene_name=mbiA;Dbxref=ASAP:ABE-0000040,ECOCYC:EG11509,EcoGene:EG11509,GeneID:948295;Name=mbiA;gbkey=Gene;gene=mbiA;gene_biotype=protein_coding;gene_synonym=ECK0012,htgA,htpY;locus_tag=b0012\n+NC_000913.3\tRefSeq\tgene\t10830\t11315\t.\t+\t.\tID=gene-b0012;gene_name=mbiA;Dbxref=ASAP:ABE-0000040,ECOCYC:EG11509,EcoGene:EG11509,GeneID:948295;Name=mbiA;gbkey=Gene;gene=mbiA;gene_biotype=protein_coding;gene_synonym=ECK0012,htgA,htpY;locus_tag=b0012\n NC_000913.3\tRefSeq\tCDS\t10830\t11315\t.\t+\t0\tParent=gene-b0012;Dbxref=UniProtKB/Swiss-Prot:P28697,Genbank:YP_009518733.1,ASAP:ABE-0000040,ECOCYC:EG11509,EcoGene:EG11509,GeneID:948295;Name=YP_009518733.1;gbkey=CDS;gene=mbiA;locus_tag=b0012;orig_transcript_id=gnl|b0012|mrna.CDS13;product=uncharacterized protein MbiA;protein_id=YP_009518733.1;transl_table=11\n-NC_000913.3\tRefSeq\tgene\t11382\t11786\t.\t-\t.\tID=gene-b0013;geneID=gene-b0013;gene_name=yaaI;Dbxref=ASAP:ABE-0000043,ECOCYC:G8202,EcoGene:EG11513,GeneID:944751;Name=yaaI;gbkey=Gene;gene=yaaI;gene_biotype=protein_coding;gene_synonym=ECK0013;locus_tag=b0013\n+NC_000913.3\tRefSeq\tgene\t11382\t11786\t.\t-\t.\tID=gene-b0013;gene_name=yaaI;Dbxref=ASAP:ABE-0000043,ECOCYC:G8202,EcoGene:EG11513,GeneID:944751;Name=yaaI;gbkey=Gene;gene=yaaI;gene_biotype=protein_coding;gene_synonym=ECK0013;locus_tag=b0013\n NC_000913.3\tRefSeq\tCDS\t11382\t11786\t.\t-\t0\tParent=gene-b0013;Dbxref=UniProtKB/Swiss-Prot:P28696,Genbank:NP_414554.1,ASAP:ABE-0000043,ECOCYC:G8202,EcoGene:EG11513,GeneID:944751;Name=NP_414554.1;gbkey=CDS;gene=yaaI;locus_tag=b0013;orig_transcript_id=gnl|b0013|mrna.b0013;product=DUF2541 domain-containing protein YaaI;protein_id=NP_414554.1;transl_table=11\n-NC_000913.3\tRefSeq\tgene\t12163\t14079\t.\t+\t.\tID=gene-b0014;geneID=gene-b0014;gene_name=dnaK;Dbxref=ASAP:ABE-0000052,ECOCYC:EG10241,EcoGene:EG10241,GeneID:944750;Name=dnaK;gbkey=Gene;gene=dnaK;gene_biotype=protein_coding;gene_synonym=ECK0014,groPAB,groPC,groPF,grpC,grpF,seg;locus_tag=b0014\n+NC_000913.3\tRefSeq\tgene\t12163\t14079\t.\t+\t.\tID=gene-b0014;gene_name=dnaK;Dbxref=ASAP:ABE-0000052,ECOCYC:EG10241,EcoGene:EG10241,GeneID:944750;Name=dnaK;gbkey=Gene;gene=dnaK;gene_biotype=protein_coding;gene_synonym=ECK0014,groPAB,groPC,groPF,grpC,grpF,seg;locus_tag=b0014\n NC_000913.3\tRefSeq\tCDS\t12163\t14079\t.\t+\t0\tParent=gene-b0014;Dbxref=UniProtKB/Swiss-Prot:P0A6Y8,Genbank:NP_414555.1,ASAP:ABE-0000052,ECOCYC:EG10241,EcoGene:EG10241,GeneID:944750;Name=NP_414555.1;gbkey=CDS;gene=dnaK;locus_tag=b0014;orig_transcript_id=gnl|b0014|mrna.b0014;product=chaperone protein DnaK;protein_id=NP_414555.1;transl_table=11\n-NC_000913.3\tRefSeq\tgene\t14168\t15298\t.\t+\t.\tID=gene-b0015;geneID=gene-b0015;gene_name=dnaJ;Dbxref=ASAP:ABE-0000054,ECOCYC:EG10240,EcoGene:EG10240,GeneID:944753;Name=dnaJ;gbkey=Gene;gene=dnaJ;gene_biotype=protein_coding;gene_synonym=ECK0015,groP,grpC;locus_tag=b0015\n+NC_000913.3\tRefSeq\tgene\t14168\t15298\t.\t+\t.\tID=gene-b0015;gene_name=dnaJ;Dbxref=ASAP:ABE-0000054,ECOCYC:EG10240,EcoGene:EG10240,GeneID:944753;Name=dnaJ;gbkey=Gene;gene=dnaJ;gene_biotype=protein_coding;gene_synonym=ECK0015,groP,grpC;locus_tag=b0015\n NC_000913.3\tRefSeq\tCDS\t14168\t15298\t.\t+\t0\tParent=gene-b0015;Dbxref=UniProtKB/Swiss-Prot:P08622,Genbank:NP_414556.1,ASAP:ABE-0000054,ECOCYC:EG10240,EcoGene:EG10240,GeneID:944753;Name=NP_414556.1;gbkey=CDS;gene=dnaJ;locus_tag=b0015;orig_transcript_id=gnl|b0015|mrna.b0015;product=chaperone protein DnaJ;protein_id=NP_414556.1;transl_table=11\n'
b
diff -r 4dea02886337 -r 154d00cbbf2d test-data/stop_codons.gtf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/stop_codons.gtf Sat Sep 25 15:38:31 2021 +0000
b
@@ -0,0 +1,14 @@
+19 protein_coding exon 291275 291386 . - .  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "1"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00001234447";
+19 protein_coding CDS 291275 291336 . - 0  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "1"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812";
+19 protein_coding start_codon 291334 291336 . - 0  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "1"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001";
+19 protein_coding exon 288020 288171 . - .  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "2"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00003304149";
+19 protein_coding CDS 288020 288171 . - 2  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "2"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812";
+19 protein_coding exon 287474 287751 . - .  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "3"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00003352024";
+19 protein_coding CDS 287474 287751 . - 0  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "3"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812";
+19 protein_coding exon 282752 282809 . - .  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "4"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00000951309";
+19 protein_coding CDS 282752 282809 . - 1  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "4"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812";
+19 protein_coding exon 282134 282310 . - .  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "5"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00000951310";
+19 protein_coding CDS 282134 282310 . - 0  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "5"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812";
+19 protein_coding exon 281043 281537 . - .  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "6"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; exon_id "ENSE00000951311";
+19 protein_coding CDS 281391 281537 . - 0  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "6"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001"; protein_id "ENSP00000269812";
+19 protein_coding stop_codon 281388 281390 . - 0  gene_id "ENSG00000141934"; transcript_id "ENST00000269812"; exon_number "6"; gene_name "PPAP2C"; gene_biotype "protein_coding"; transcript_name "PPAP2C-001";