Repository 'gstf_preparation'
hg clone https://toolshed.g2.bx.psu.edu/repos/earlhaminst/gstf_preparation

Changeset 11:dbe37a658cd2 (2020-09-27)
Previous changeset 10:e8e75a79de59 (2019-10-31) Next changeset 12:99bae410128c (2020-10-05)
Commit message:
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
modified:
gstf_preparation.py
gstf_preparation.xml
test-data/test1.sqlite
test-data/test4.fasta
test-data/test4.sqlite
test-data/test5.ns.fasta
test-data/test5_filtered.fasta
test-data/test6.sqlite
b
diff -r e8e75a79de59 -r dbe37a658cd2 gstf_preparation.py
--- a/gstf_preparation.py Thu Oct 31 08:16:51 2019 -0400
+++ b/gstf_preparation.py Sun Sep 27 18:54:31 2020 +0000
[
b'@@ -6,7 +6,7 @@\n import sqlite3\n import sys\n \n-version = "0.4.0"\n+version = "0.5.0"\n gene_count = 0\n \n \n@@ -61,25 +61,39 @@\n         seq_region_end INTEGER NOT NULL,\n         seq_region_strand INTEGER NOT NULL,\n         species VARCHAR NOT NULL,\n+        biotype VARCHAR,\n         gene_json VARCHAR NOT NULL)\'\'\')\n     cur.execute(\'CREATE INDEX gene_symbol_index ON gene (gene_symbol)\')\n \n     cur.execute(\'\'\'CREATE TABLE transcript (\n         transcript_id VARCHAR PRIMARY KEY NOT NULL,\n+        transcript_symbol VARCHAR,\n         protein_id VARCHAR UNIQUE,\n         protein_sequence VARCHAR,\n+        biotype VARCHAR,\n+        is_canonical BOOLEAN NOT NULL DEFAULT FALSE,\n         gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))\'\'\')\n \n-    cur.execute(\'\'\'CREATE VIEW transcript_species AS\n-        SELECT transcript_id, species, seq_region_name\n+    # The following temporary view is not used in GAFA, so schema changes to it\n+    # don\'t require a meta version upgrade.\n+    cur.execute(\'\'\'CREATE TEMPORARY VIEW transcript_join_gene AS\n+        SELECT transcript_id, transcript_symbol, COALESCE(transcript.biotype, gene.biotype) AS biotype, is_canonical, gene_id, gene_symbol, seq_region_name, species\n         FROM transcript JOIN gene\n-        ON transcript.gene_id = gene.gene_id\'\'\')\n+        USING (gene_id)\'\'\')\n \n     conn.commit()\n \n \n-def remove_type_from_list_of_ids(l):\n-    return \',\'.join(remove_type_from_id(_) for _ in l.split(\',\'))\n+def fetch_transcript_and_gene(conn, transcript_id):\n+    cur = conn.cursor()\n+\n+    cur.execute(\'SELECT * FROM transcript_join_gene WHERE transcript_id=?\',\n+                (transcript_id, ))\n+    return cur.fetchone()\n+\n+\n+def remove_type_from_list_of_ids(ids):\n+    return \',\'.join(remove_type_from_id(id_) for id_ in ids.split(\',\'))\n \n \n def remove_type_from_id(id_):\n@@ -103,6 +117,8 @@\n                 value = remove_type_from_id(value)\n             elif tag == \'Parent\':\n                 value = remove_type_from_list_of_ids(value)\n+            elif tag == \'representative\':\n+                tag = \'is_canonical\'\n             d[tag] = value\n     if cols[6] == \'+\':\n         d[\'strand\'] = 1\n@@ -122,27 +138,27 @@\n def add_gene_to_dict(cols, species, gene_dict):\n     global gene_count\n     gene = feature_to_dict(cols)\n+    if not gene[\'id\']:\n+        raise Exception("Id not found among column 9 attribute tags: %s" % cols[8])\n     gene.update({\n         \'member_id\': gene_count,\n         \'object_type\': \'Gene\',\n         \'seq_region_name\': cols[0],\n         \'species\': species,\n         \'Transcript\': [],\n-        \'display_name\': gene.get(\'Name\', None)\n+        \'display_name\': gene.get(\'Name\'),\n     })\n-    if gene[\'id\']:\n-        gene_dict[gene[\'id\']] = gene\n-        gene_count = gene_count + 1\n+    gene_dict[gene[\'id\']] = gene\n+    gene_count = gene_count + 1\n \n \n def add_transcript_to_dict(cols, species, transcript_dict):\n     transcript = feature_to_dict(cols)\n-    if \'biotype\' in transcript and transcript[\'biotype\'] != \'protein_coding\':\n-        return\n     transcript.update({\n         \'object_type\': \'Transcript\',\n         \'seq_region_name\': cols[0],\n         \'species\': species,\n+        \'display_name\': transcript.get(\'Name\'),\n     })\n     transcript_dict[transcript[\'id\']] = transcript\n \n@@ -242,45 +258,30 @@\n         if gene is None:\n             # This can happen when loading a JSON file from Ensembl\n             continue\n+        if \'confidence\' in gene and gene[\'confidence\'] != \'high\':\n+            print("Gene %s has confidence %s (not high), discarding" % (gene[\'id\'], gene[\'confidence\']), file=sys.stderr)\n+            continue\n         gene_id = gene[\'id\']\n-        cur.execute(\'INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)\',\n-                    (gene_id, gene.get(\'display_name\', None), gene[\'seq_region_name\'], gene[\'start\'], gene[\'end\'], gene[\'strand\'], gene[\'species\'], json.dumps(gene))'..b'ript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0]\n+            elif len(canonical_transcript_ids) > 1:\n+                raise Exception("Gene %s has more than 1 canonical transcripts" % (gene_id))\n+            else:\n+                selected_transcript_id = canonical_transcript_ids[0]\n+            selected_transcript_ids.append(selected_transcript_id)\n \n     regions = [_.strip().lower() for _ in options.regions.split(",")]\n     with open(options.of, \'w\') as output_fasta_file, open(options.ff, \'w\') as filtered_fasta_file:\n@@ -417,24 +431,37 @@\n             force_remove_id_version = fasta_arg in force_remove_id_version_file_list\n             for entry in FASTAReader_gen(fasta_arg):\n                 transcript_id = remove_id_version(entry.header[1:].lstrip().split(\' \')[0], force_remove_id_version)\n-                if options.longestCDS and transcript_id not in selected_transcript_ids:\n-                    continue\n \n-                if len(entry.sequence) % 3 != 0:\n-                    print("Transcript \'%s\' in FASTA file \'%s\' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)\n-                    continue\n-\n-                species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)\n-                if not species_for_transcript:\n+                transcript = fetch_transcript_and_gene(conn, transcript_id)\n+                if not transcript:\n                     print("Transcript \'%s\' in FASTA file \'%s\' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)\n                     continue\n \n-                if options.headers:\n+                if options.filter == \'canonical\':\n+                    # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict\n+                    if transcript_id not in selected_transcript_ids:\n+                        continue\n+                elif options.filter == \'coding\':\n+                    if len(entry.sequence) % 3 != 0:\n+                        print("Transcript \'%s\' in FASTA file \'%s\' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr)\n+                        continue\n+                    transcript_biotype = transcript[\'biotype\']  # This is the biotype of the transcript or, if that is NULL, the one of the gene\n+                    if transcript_biotype and transcript_biotype != \'protein_coding\':\n+                        print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr)\n+                        continue\n+\n+                if options.headers == "TranscriptId_species":\n                     # Change the FASTA header to \'>TranscriptId_species\', as required by TreeBest\n                     # Remove any underscore in the species\n-                    entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace(\'_\', \'\'))\n+                    entry.header = ">%s_%s" % (transcript_id, transcript[\'species\'].replace(\'_\', \'\'))\n+                elif options.headers == "GeneSymbol-TranscriptID_species":\n+                    # Remove any underscore in the species\n+                    entry.header = ">%s-%s_%s" % (transcript[\'gene_symbol\'], transcript_id, transcript[\'species\'].replace(\'_\', \'\'))\n+                elif options.headers == "TranscriptSymbol-TranscriptID_species":\n+                    # Remove any underscore in the species\n+                    entry.header = ">%s-%s_%s" % (transcript[\'transcript_symbol\'], transcript_id, transcript[\'species\'].replace(\'_\', \'\'))\n \n-                if seq_region_for_transcript.lower() in regions:\n+                if transcript[\'seq_region_name\'].lower() in regions:\n                     entry.print(filtered_fasta_file)\n                 else:\n                     entry.print(output_fasta_file)\n'
b
diff -r e8e75a79de59 -r dbe37a658cd2 gstf_preparation.xml
--- a/gstf_preparation.xml Thu Oct 31 08:16:51 2019 -0400
+++ b/gstf_preparation.xml Sun Sep 27 18:54:31 2020 +0000
[
b'@@ -1,5 +1,8 @@\n-<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.1">\n+<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.2">\n     <description>converts data for the workflow</description>\n+    <requirements>\n+        <requirement type="package" version="3.7">python</requirement>\n+    </requirements>\n     <command detect_errors="exit_code"><![CDATA[\n python \'$__tool_directory__/gstf_preparation.py\'\n #for $q in $queries\n@@ -14,10 +17,10 @@\n     --fasta \'${fasta_input}\'\n #end for\n #if $headers\n-    --headers\n+    --headers $headers\n #end if\n-#if $longestCDS\n-    -l\n+#if $filter\n+    --filter $filter\n #end if\n #if $regions\n     --regions \'$regions\'\n@@ -36,8 +39,18 @@\n         </repeat>\n         <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by \'Get features by Ensembl ID\' tool" />\n         <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" />\n-        <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" />\n-        <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the &gt;TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" />\n+        <param name="filter" type="select" display="radio" label="Which transcripts to keep">\n+            <option value="canonical" selected="true">Only canonical transcripts (or longest CDS per gene)</option>\n+            <option value="coding">Only protein-coding transcripts</option>\n+            <option value="">All transcripts</option>\n+        </param>\n+\n+        <param name="headers" type="select" display="radio" label="Change the header line of the FASTA sequences to the following format" help="As required by TreeBest, part of the GeneSeqToFamily workflow, only TranscriptId_species is acceptable format by Aequatus visualisation">\n+            <option value="TranscriptId_species" selected="true">TranscriptId_species</option>\n+            <option value="GeneSymbol-TranscriptID_species">GeneSymbol-TranscriptID_species</option>\n+            <option value="TranscriptSymbol-TranscriptID_species">TranscriptSymbol-TranscriptID_species</option>\n+            <option value="">Don\'t change</option>\n+        </param>\n         <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" />\n     </inputs>\n \n@@ -51,49 +64,52 @@\n \n     <tests>\n         <test expect_num_outputs="2">\n+            <repeat name="queries">\n+                <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />\n+                <param name="genome" value="caenorhabditis_elegans" />\n+            </repeat>\n             <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />\n-            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />\n-            <param name="genome" value="caenorhabditis_elegans" />\n-            <param name="longestCDS" value="false" />\n-            <param name="headers" value="true" />\n+            <param name="filter" value="coding" />\n+            <param name="headers" value="TranscriptId_species" />\n \n             <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />\n             <output name="output_fasta" file="test1.fasta" />\n         </test>\n         <test expect_num_outputs="2">\n+    '..b'ns" />\n-            <param name="longestCDS" value="false" />\n-            <param name="headers" value="false" />\n+            <param name="json" ftype="gff3" value="gene.json" />\n+            <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />\n+            <param name="filter" value="" />\n+            <param name="headers" value="" />\n \n-            <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />\n-            <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />\n+            <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />\n+            <output name="output_fasta" file="CDS.fasta" />\n         </test>\n         <test expect_num_outputs="2">\n+            <param name="json" ftype="json" value="gene.json" />\n             <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />\n-            <param name="json" ftype="json" value="gene.json" />\n-            <param name="longestCDS" value="false" />\n-            <param name="headers" value="true" />\n+            <param name="filter" value="coding" />\n+            <param name="headers" value="TranscriptId_species" />\n \n             <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />\n             <output name="output_fasta" file="test4.fasta" />\n         </test>\n         <test>\n+            <param name="json" ftype="json" value="gene.json" />\n             <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />\n-            <param name="json" ftype="json" value="gene.json" />\n-            <param name="longestCDS" value="false" />\n-            <param name="headers" value="true" />\n+            <param name="filter" value="coding" />\n+            <param name="headers" value="TranscriptId_species" />\n             <param name="regions" value="X" />\n \n             <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />\n@@ -101,11 +117,13 @@\n             <output name="filtered_fasta" file="test5.ns.fasta" />\n         </test>\n         <test expect_num_outputs="2">\n+            <repeat name="queries">\n+                <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" />\n+                <param name="genome" value="mus_pahari" />\n+            </repeat>\n             <param name="fasta_inputs" ftype="fasta" value="Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa" />\n-            <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" />\n-            <param name="genome" value="mus_pahari" />\n-            <param name="longestCDS" value="true" />\n-            <param name="headers" value="true" />\n+            <param name="filter" value="canonical" />\n+            <param name="headers" value="TranscriptId_species" />\n \n             <output name="output_db" file="test6.sqlite" compare="sim_size" delta="30000" />\n             <output name="output_fasta" file="test6.fasta" />\n@@ -116,12 +134,12 @@\n \n This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.\n \n-It also filters the CDS FASTA datasets to:\n+It also filters the CDS FASTA datasets to keep only the transcripts present in the gene feature information.\n \n-- remove coding sequences whose length is not a multiple of 3\n-- keep only the transcripts present in the gene feature information.\n-\n-Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).\n+Optionally it can also:\n+- keep only canonical transcripts (or the longest CDS per gene, if this attribute is not provided)\n+- remove sequences which are annotated as non protein-coding or whose length is not a multiple of 3\n+- change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).\n \n Example GFF3 file::\n \n'
b
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test1.sqlite
b
Binary file test-data/test1.sqlite has changed
b
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test4.fasta
--- a/test-data/test4.fasta Thu Oct 31 08:16:51 2019 -0400
+++ b/test-data/test4.fasta Sun Sep 27 18:54:31 2020 +0000
b
@@ -299,28 +299,6 @@
 TTCCTGGAGAGACACCTGCCTTCTGTACCAGGCCTGCTAAAGCTGTTTGGATTGACCACC
 ATCTTGTCAGCAACAGCTCTTGGTTTCCTGGCCCACAAAAGGGGCCTGTTTGTACATTTT
 TAA
->ENSMUST00000168613_musmusculus
-ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGGTATGGCGGCAGCC
-AAACTTCTGCATGATTGTGGCCTCAGTGTGGTGGTTCTGGAAGCACGGGACCGTGTAGGA
-GGCAGGACTTACACAATTAGGAATAAAAACGTTAAATATGTGGACCTTGGAGGATCTTAT
-GTTGGGCCAACCCAGAATCGTATCTTACGATTGGCCAAAGAGCTAGGATTGGAGACCTAT
-AAAGTTAATGAAGTTGAGCGGCTGATACACTTTGTAAAGGGAAAATCATATGCCTTCAGG
-GGCCCATTTCCACCAGTATGGAATCCTATCACCTACCTAGATAATAACAACCTCTGGAGG
-ACAATGGATGAGATGGGCCAAGAGATTCCCAGTGATGCTCCATGGAAAGCACCCCTTGCT
-GAAGAGTGGGACTACATGACAATGAAAGAATTGCTAGATAAGATCTGCTGGACCAAATCT
-ACAAAGCAGATTGCCACGCTCTTTGTGAACCTGTGTGTAACTGCAGAGACCCATGAGGTC
-TCTGCACTATGGTTCCTGTGGTATGTGAAGCAGTGTGGAGGTACAACCAGAATCATCTCA
-ACAACCAATGGAGGACAGGGGAAAATGTTATTGTGA
->ENSMUST00000163344_musmusculus
-ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGGTATGGCGGCAGCC
-AAACTTCTGCATGATTGTGGCCTCAGTGTGGTGGTTCTGGAAGCACGGGACCGTGTAGGA
-GGCAGGACTTACACAATTAGGAATAAAAACGTTAAATATGTGGACCTTGGAGGATCTTAT
-GTTGGGCCAACCCAGAATCGTATCTTACGATTGGCCAAAGAGCTAGGATTGGAGACCTAT
-AAAGTTAATGAAGTTGAGCGGCTGATACACTTTGTAAAGATCTACAAAGCAGATTGCCAC
-GCTCTTTGTGAACCTGTGTGTAACTGCAGAGACCCATGA
->ENSMUST00000173143_musmusculus
-ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGAAAAACCTGCTAAT
-TCTAGTCAGTTTACAAGCTCATTGTGGAGGAGAATTGAAAAACTGTGA
 >ENSSSCT00000033745_susscrofa
 ATGGCAGCGGCCAAACTTCTGCATGACTCTGGCCTGAGTGTGATTGTTCTGGAAGCCCGG
 GACCGCGTGGGAGGCAGGACTTACACCGTCAGGAACCAACAAGTTAAATATGTGGACCTT
@@ -489,16 +467,6 @@
 CTGGAGGGCCTGCGGAGCACGCTGGCCGAGAGCAGCGACCACGTGGAAAAGAGTCCCCAG
 TCCCTCCTGCAGGACATGCTGGCCACGGGAGGCTTCCTGCAGGGGGACGAGGCCGACTGC
 TACTGA
->ENST00000421712_homosapiens
-ATGGACCCAGAATGCGCCCAGCTGCTCCCGGCTCTCTGTGCTGTTCTGGTAGATCCCAGG
-CAGCCGGTGGCAGATGACACCTGTTTGGAGAAGCTCCTGGACTGGTTTAAAACGGTCACT
-GAAGGAGAGTCCAGTGTCGTGCTGCTGCAGGAGCACCCCTGCCTGGTGGAGCTGCTGTCC
-CATGTGCTGAAAGTCCAGGACCTGAGTTCTGGGGTCCTCTCCTTCTCACTGCGCCTGGCA
-GGAACCTTCGCAGCCCAGGAAAACTGCTTCCAGTATCTTCAGGTGCGGTCGACACCATCT
-TCTCCCTGCAGGGAGACTCCAGCCTGTTTGTGGCCTCGGCGGCCAGTCAGCTCCTGGTGC
-ACGTCCTGGCTTTGTCCATGCGAGGTGGAGCCGAGGGGCAGCCCTGCCTGCCGGGGGGTG
-ACTGGCCCGCGTGTGCCCAGAAGATCATGGATCACGTTGAAGAGTCCTTGTGCTCCGCGG
-CCACCCCCAAGGTCACTCAGGCCCTGA
 >ENSRNOT00000064726_rattusnorvegicus
 ATGGACGCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCTAGA
 CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTACTGGACTGGTTTAAAACAGTGACA
@@ -584,23 +552,6 @@
 GACCTGGAGGGCCTGCAGGGCAGGCTGGCCAAGAGCAGCGACCATGTGGAGAAGAGCCCA
 CAGTCCCTGCTGCAGGACATGCTGGCCACGGTGGGTGTGTTGGAGGAGAACGAAGCTGAC
 TGCTACTAA
->ENSMUST00000153440_musmusculus
-ATGGACCCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCCAGA
-CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTGCTGGACTGGTTTAAAACAGTGACA
-GAGGCAGAGTCTAGCCTCCAACTACTACAGGACCATCCCTGCTTAATGGAGCTCCTGTCC
-CATGTGCTGAAGCCACAGGACGTGAGCCCTAGGGTCCTCTCCTTTGCTCTGCGCCTTGTT
-GGGGTCTTCGCAGCCCAGGAAGACTGTTTTGAGTACCTTCAGCAGGGAGAGTTGTTGCTG
-GGGCTCTTTGGGGAGTCAGGTGCCCCCGGCTGGGCAGCCTGGAGCATCCCAAGTGTGCGC
-AGCGGCTGGATCCAGGGTCTGTGCTACCTGGCACACCACCCTAGCGCCCTGCACTTCCTG
-GCTGACAGTGGTGCTGTGGACACGCTCTTCTCCTTGCAGGGAGACCCCAGCCTGTTCGTC
-GCCTCAGCAGCCAGCCAGCTCCTAGTACATATCCTGGCTCTGTCCATGCAAGGTGGAGCC
-CCAGGGTCCCCCGTCCCTGAAGCTGCTGCTTGGCCTATGTGTGCCCAGAAGATTGTGAAC
-CATGTGGATGAGTCCCTGCATGCCAAAGCCACCCCCCAGGTCACACAGGCCTTGAATGTC
-CTGACTACGACCTTCGGGCGCTGCCATAACCCCTGGACAGGGGTCCTCTGGGAGCGGCTA
-AGTCCCCCTGTTGCCCGCCTGTTTGAGAGAGACCCCATTCCAGCCGTGCACGCGCTCATG
-GACCTTCTTCTTAGTGTGGCCAGGTCGCCTGTGTTGAATTTTGCAGCCTGTGGCCTGTGG
-GAGATGCTGGCCCAGACTCTGAGCCGCCTGAGCCCCATACAAGCTGGGCCTCTAGCCCTG
-GGGACCCTGAAACTTCAGCACTGGCTTGCTGGATGGGACTGTGGGTAG
 >ENSMUST00000110806_musmusculus
 ATGGACCCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCCAGA
 CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTGCTGGACTGGTTTAAAACAGTGACA
@@ -982,12 +933,6 @@
 TCATCATCCCAGATTATTCTGAAGTGGAAACCACCCTCCGACCCCAATGGCAACATCACC
 CACTACCTGGTTTTCTGGGAGAGGCAGGCGGAAGACAGTGAGCTGTTCGAGCTGGATTAT
 TGCCTCAAAGGGCGAGTCCAGTCATCAGCTCCGCTGTAA
->ENSMUST00000208839_musmusculus
-NAGACAGATTACTATCGGAAAGGGGGCAAGGGACTGCTTCCTGTGAGGTGGATGTCACCT
-GAGTCCCTGAAGGATGGAGTCTTTACTGCTTCTTCTGATATGTGGTCCTTTGGGGTGGTC
-CTTTGGGAAATCACTAGCCTGGCTGAGCAACCTTATCAAGGCCTGTCTAATGAACAGGTG
-TTGAAGTTTGTCATGGATGGAGGCTATCTGGATCCCCCTGATAACTGTCCAGAGAGACTG
-AGATATGAGATAAAGACACACTGGCCACCCTGA
 >ENSMUST00000091291_musmusculus
 ATGGGCTTCGGGAGAGGATGTGAGACGACGGCTGTGCCATTGCTGGTGGCCGTGGCCGCG
 TTGCTGGTGGGCACAGCCGGCCACCTGTACCCTGGAGAGGTGTGCCCTGGTATGGACATC
b
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test4.sqlite
b
Binary file test-data/test4.sqlite has changed
b
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test5.ns.fasta
--- a/test-data/test5.ns.fasta Thu Oct 31 08:16:51 2019 -0400
+++ b/test-data/test5.ns.fasta Sun Sep 27 18:54:31 2020 +0000
b
@@ -299,28 +299,6 @@
 TTCCTGGAGAGACACCTGCCTTCTGTACCAGGCCTGCTAAAGCTGTTTGGATTGACCACC
 ATCTTGTCAGCAACAGCTCTTGGTTTCCTGGCCCACAAAAGGGGCCTGTTTGTACATTTT
 TAA
->ENSMUST00000168613_musmusculus
-ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGGTATGGCGGCAGCC
-AAACTTCTGCATGATTGTGGCCTCAGTGTGGTGGTTCTGGAAGCACGGGACCGTGTAGGA
-GGCAGGACTTACACAATTAGGAATAAAAACGTTAAATATGTGGACCTTGGAGGATCTTAT
-GTTGGGCCAACCCAGAATCGTATCTTACGATTGGCCAAAGAGCTAGGATTGGAGACCTAT
-AAAGTTAATGAAGTTGAGCGGCTGATACACTTTGTAAAGGGAAAATCATATGCCTTCAGG
-GGCCCATTTCCACCAGTATGGAATCCTATCACCTACCTAGATAATAACAACCTCTGGAGG
-ACAATGGATGAGATGGGCCAAGAGATTCCCAGTGATGCTCCATGGAAAGCACCCCTTGCT
-GAAGAGTGGGACTACATGACAATGAAAGAATTGCTAGATAAGATCTGCTGGACCAAATCT
-ACAAAGCAGATTGCCACGCTCTTTGTGAACCTGTGTGTAACTGCAGAGACCCATGAGGTC
-TCTGCACTATGGTTCCTGTGGTATGTGAAGCAGTGTGGAGGTACAACCAGAATCATCTCA
-ACAACCAATGGAGGACAGGGGAAAATGTTATTGTGA
->ENSMUST00000163344_musmusculus
-ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGGTATGGCGGCAGCC
-AAACTTCTGCATGATTGTGGCCTCAGTGTGGTGGTTCTGGAAGCACGGGACCGTGTAGGA
-GGCAGGACTTACACAATTAGGAATAAAAACGTTAAATATGTGGACCTTGGAGGATCTTAT
-GTTGGGCCAACCCAGAATCGTATCTTACGATTGGCCAAAGAGCTAGGATTGGAGACCTAT
-AAAGTTAATGAAGTTGAGCGGCTGATACACTTTGTAAAGATCTACAAAGCAGATTGCCAC
-GCTCTTTGTGAACCTGTGTGTAACTGCAGAGACCCATGA
->ENSMUST00000173143_musmusculus
-ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGAAAAACCTGCTAAT
-TCTAGTCAGTTTACAAGCTCATTGTGGAGGAGAATTGAAAAACTGTGA
 >ENSSSCT00000033745_susscrofa
 ATGGCAGCGGCCAAACTTCTGCATGACTCTGGCCTGAGTGTGATTGTTCTGGAAGCCCGG
 GACCGCGTGGGAGGCAGGACTTACACCGTCAGGAACCAACAAGTTAAATATGTGGACCTT
b
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test5_filtered.fasta
--- a/test-data/test5_filtered.fasta Thu Oct 31 08:16:51 2019 -0400
+++ b/test-data/test5_filtered.fasta Sun Sep 27 18:54:31 2020 +0000
b
@@ -84,16 +84,6 @@
 CTGGAGGGCCTGCGGAGCACGCTGGCCGAGAGCAGCGACCACGTGGAAAAGAGTCCCCAG
 TCCCTCCTGCAGGACATGCTGGCCACGGGAGGCTTCCTGCAGGGGGACGAGGCCGACTGC
 TACTGA
->ENST00000421712_homosapiens
-ATGGACCCAGAATGCGCCCAGCTGCTCCCGGCTCTCTGTGCTGTTCTGGTAGATCCCAGG
-CAGCCGGTGGCAGATGACACCTGTTTGGAGAAGCTCCTGGACTGGTTTAAAACGGTCACT
-GAAGGAGAGTCCAGTGTCGTGCTGCTGCAGGAGCACCCCTGCCTGGTGGAGCTGCTGTCC
-CATGTGCTGAAAGTCCAGGACCTGAGTTCTGGGGTCCTCTCCTTCTCACTGCGCCTGGCA
-GGAACCTTCGCAGCCCAGGAAAACTGCTTCCAGTATCTTCAGGTGCGGTCGACACCATCT
-TCTCCCTGCAGGGAGACTCCAGCCTGTTTGTGGCCTCGGCGGCCAGTCAGCTCCTGGTGC
-ACGTCCTGGCTTTGTCCATGCGAGGTGGAGCCGAGGGGCAGCCCTGCCTGCCGGGGGGTG
-ACTGGCCCGCGTGTGCCCAGAAGATCATGGATCACGTTGAAGAGTCCTTGTGCTCCGCGG
-CCACCCCCAAGGTCACTCAGGCCCTGA
 >ENSRNOT00000064726_rattusnorvegicus
 ATGGACGCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCTAGA
 CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTACTGGACTGGTTTAAAACAGTGACA
@@ -179,23 +169,6 @@
 GACCTGGAGGGCCTGCAGGGCAGGCTGGCCAAGAGCAGCGACCATGTGGAGAAGAGCCCA
 CAGTCCCTGCTGCAGGACATGCTGGCCACGGTGGGTGTGTTGGAGGAGAACGAAGCTGAC
 TGCTACTAA
->ENSMUST00000153440_musmusculus
-ATGGACCCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCCAGA
-CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTGCTGGACTGGTTTAAAACAGTGACA
-GAGGCAGAGTCTAGCCTCCAACTACTACAGGACCATCCCTGCTTAATGGAGCTCCTGTCC
-CATGTGCTGAAGCCACAGGACGTGAGCCCTAGGGTCCTCTCCTTTGCTCTGCGCCTTGTT
-GGGGTCTTCGCAGCCCAGGAAGACTGTTTTGAGTACCTTCAGCAGGGAGAGTTGTTGCTG
-GGGCTCTTTGGGGAGTCAGGTGCCCCCGGCTGGGCAGCCTGGAGCATCCCAAGTGTGCGC
-AGCGGCTGGATCCAGGGTCTGTGCTACCTGGCACACCACCCTAGCGCCCTGCACTTCCTG
-GCTGACAGTGGTGCTGTGGACACGCTCTTCTCCTTGCAGGGAGACCCCAGCCTGTTCGTC
-GCCTCAGCAGCCAGCCAGCTCCTAGTACATATCCTGGCTCTGTCCATGCAAGGTGGAGCC
-CCAGGGTCCCCCGTCCCTGAAGCTGCTGCTTGGCCTATGTGTGCCCAGAAGATTGTGAAC
-CATGTGGATGAGTCCCTGCATGCCAAAGCCACCCCCCAGGTCACACAGGCCTTGAATGTC
-CTGACTACGACCTTCGGGCGCTGCCATAACCCCTGGACAGGGGTCCTCTGGGAGCGGCTA
-AGTCCCCCTGTTGCCCGCCTGTTTGAGAGAGACCCCATTCCAGCCGTGCACGCGCTCATG
-GACCTTCTTCTTAGTGTGGCCAGGTCGCCTGTGTTGAATTTTGCAGCCTGTGGCCTGTGG
-GAGATGCTGGCCCAGACTCTGAGCCGCCTGAGCCCCATACAAGCTGGGCCTCTAGCCCTG
-GGGACCCTGAAACTTCAGCACTGGCTTGCTGGATGGGACTGTGGGTAG
 >ENSMUST00000110806_musmusculus
 ATGGACCCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCCAGA
 CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTGCTGGACTGGTTTAAAACAGTGACA
@@ -577,12 +550,6 @@
 TCATCATCCCAGATTATTCTGAAGTGGAAACCACCCTCCGACCCCAATGGCAACATCACC
 CACTACCTGGTTTTCTGGGAGAGGCAGGCGGAAGACAGTGAGCTGTTCGAGCTGGATTAT
 TGCCTCAAAGGGCGAGTCCAGTCATCAGCTCCGCTGTAA
->ENSMUST00000208839_musmusculus
-NAGACAGATTACTATCGGAAAGGGGGCAAGGGACTGCTTCCTGTGAGGTGGATGTCACCT
-GAGTCCCTGAAGGATGGAGTCTTTACTGCTTCTTCTGATATGTGGTCCTTTGGGGTGGTC
-CTTTGGGAAATCACTAGCCTGGCTGAGCAACCTTATCAAGGCCTGTCTAATGAACAGGTG
-TTGAAGTTTGTCATGGATGGAGGCTATCTGGATCCCCCTGATAACTGTCCAGAGAGACTG
-AGATATGAGATAAAGACACACTGGCCACCCTGA
 >ENSMUST00000091291_musmusculus
 ATGGGCTTCGGGAGAGGATGTGAGACGACGGCTGTGCCATTGCTGGTGGCCGTGGCCGCG
 TTGCTGGTGGGCACAGCCGGCCACCTGTACCCTGGAGAGGTGTGCCCTGGTATGGACATC
b
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test6.sqlite
b
Binary file test-data/test6.sqlite has changed