Previous changeset 10:e8e75a79de59 (2019-10-31) Next changeset 12:99bae410128c (2020-10-05) |
Commit message:
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812" |
modified:
gstf_preparation.py gstf_preparation.xml test-data/test1.sqlite test-data/test4.fasta test-data/test4.sqlite test-data/test5.ns.fasta test-data/test5_filtered.fasta test-data/test6.sqlite |
b |
diff -r e8e75a79de59 -r dbe37a658cd2 gstf_preparation.py --- a/gstf_preparation.py Thu Oct 31 08:16:51 2019 -0400 +++ b/gstf_preparation.py Sun Sep 27 18:54:31 2020 +0000 |
[ |
b'@@ -6,7 +6,7 @@\n import sqlite3\n import sys\n \n-version = "0.4.0"\n+version = "0.5.0"\n gene_count = 0\n \n \n@@ -61,25 +61,39 @@\n seq_region_end INTEGER NOT NULL,\n seq_region_strand INTEGER NOT NULL,\n species VARCHAR NOT NULL,\n+ biotype VARCHAR,\n gene_json VARCHAR NOT NULL)\'\'\')\n cur.execute(\'CREATE INDEX gene_symbol_index ON gene (gene_symbol)\')\n \n cur.execute(\'\'\'CREATE TABLE transcript (\n transcript_id VARCHAR PRIMARY KEY NOT NULL,\n+ transcript_symbol VARCHAR,\n protein_id VARCHAR UNIQUE,\n protein_sequence VARCHAR,\n+ biotype VARCHAR,\n+ is_canonical BOOLEAN NOT NULL DEFAULT FALSE,\n gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))\'\'\')\n \n- cur.execute(\'\'\'CREATE VIEW transcript_species AS\n- SELECT transcript_id, species, seq_region_name\n+ # The following temporary view is not used in GAFA, so schema changes to it\n+ # don\'t require a meta version upgrade.\n+ cur.execute(\'\'\'CREATE TEMPORARY VIEW transcript_join_gene AS\n+ SELECT transcript_id, transcript_symbol, COALESCE(transcript.biotype, gene.biotype) AS biotype, is_canonical, gene_id, gene_symbol, seq_region_name, species\n FROM transcript JOIN gene\n- ON transcript.gene_id = gene.gene_id\'\'\')\n+ USING (gene_id)\'\'\')\n \n conn.commit()\n \n \n-def remove_type_from_list_of_ids(l):\n- return \',\'.join(remove_type_from_id(_) for _ in l.split(\',\'))\n+def fetch_transcript_and_gene(conn, transcript_id):\n+ cur = conn.cursor()\n+\n+ cur.execute(\'SELECT * FROM transcript_join_gene WHERE transcript_id=?\',\n+ (transcript_id, ))\n+ return cur.fetchone()\n+\n+\n+def remove_type_from_list_of_ids(ids):\n+ return \',\'.join(remove_type_from_id(id_) for id_ in ids.split(\',\'))\n \n \n def remove_type_from_id(id_):\n@@ -103,6 +117,8 @@\n value = remove_type_from_id(value)\n elif tag == \'Parent\':\n value = remove_type_from_list_of_ids(value)\n+ elif tag == \'representative\':\n+ tag = \'is_canonical\'\n d[tag] = value\n if cols[6] == \'+\':\n d[\'strand\'] = 1\n@@ -122,27 +138,27 @@\n def add_gene_to_dict(cols, species, gene_dict):\n global gene_count\n gene = feature_to_dict(cols)\n+ if not gene[\'id\']:\n+ raise Exception("Id not found among column 9 attribute tags: %s" % cols[8])\n gene.update({\n \'member_id\': gene_count,\n \'object_type\': \'Gene\',\n \'seq_region_name\': cols[0],\n \'species\': species,\n \'Transcript\': [],\n- \'display_name\': gene.get(\'Name\', None)\n+ \'display_name\': gene.get(\'Name\'),\n })\n- if gene[\'id\']:\n- gene_dict[gene[\'id\']] = gene\n- gene_count = gene_count + 1\n+ gene_dict[gene[\'id\']] = gene\n+ gene_count = gene_count + 1\n \n \n def add_transcript_to_dict(cols, species, transcript_dict):\n transcript = feature_to_dict(cols)\n- if \'biotype\' in transcript and transcript[\'biotype\'] != \'protein_coding\':\n- return\n transcript.update({\n \'object_type\': \'Transcript\',\n \'seq_region_name\': cols[0],\n \'species\': species,\n+ \'display_name\': transcript.get(\'Name\'),\n })\n transcript_dict[transcript[\'id\']] = transcript\n \n@@ -242,45 +258,30 @@\n if gene is None:\n # This can happen when loading a JSON file from Ensembl\n continue\n+ if \'confidence\' in gene and gene[\'confidence\'] != \'high\':\n+ print("Gene %s has confidence %s (not high), discarding" % (gene[\'id\'], gene[\'confidence\']), file=sys.stderr)\n+ continue\n gene_id = gene[\'id\']\n- cur.execute(\'INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)\',\n- (gene_id, gene.get(\'display_name\', None), gene[\'seq_region_name\'], gene[\'start\'], gene[\'end\'], gene[\'strand\'], gene[\'species\'], json.dumps(gene))'..b'ript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0]\n+ elif len(canonical_transcript_ids) > 1:\n+ raise Exception("Gene %s has more than 1 canonical transcripts" % (gene_id))\n+ else:\n+ selected_transcript_id = canonical_transcript_ids[0]\n+ selected_transcript_ids.append(selected_transcript_id)\n \n regions = [_.strip().lower() for _ in options.regions.split(",")]\n with open(options.of, \'w\') as output_fasta_file, open(options.ff, \'w\') as filtered_fasta_file:\n@@ -417,24 +431,37 @@\n force_remove_id_version = fasta_arg in force_remove_id_version_file_list\n for entry in FASTAReader_gen(fasta_arg):\n transcript_id = remove_id_version(entry.header[1:].lstrip().split(\' \')[0], force_remove_id_version)\n- if options.longestCDS and transcript_id not in selected_transcript_ids:\n- continue\n \n- if len(entry.sequence) % 3 != 0:\n- print("Transcript \'%s\' in FASTA file \'%s\' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)\n- continue\n-\n- species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)\n- if not species_for_transcript:\n+ transcript = fetch_transcript_and_gene(conn, transcript_id)\n+ if not transcript:\n print("Transcript \'%s\' in FASTA file \'%s\' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)\n continue\n \n- if options.headers:\n+ if options.filter == \'canonical\':\n+ # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict\n+ if transcript_id not in selected_transcript_ids:\n+ continue\n+ elif options.filter == \'coding\':\n+ if len(entry.sequence) % 3 != 0:\n+ print("Transcript \'%s\' in FASTA file \'%s\' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr)\n+ continue\n+ transcript_biotype = transcript[\'biotype\'] # This is the biotype of the transcript or, if that is NULL, the one of the gene\n+ if transcript_biotype and transcript_biotype != \'protein_coding\':\n+ print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr)\n+ continue\n+\n+ if options.headers == "TranscriptId_species":\n # Change the FASTA header to \'>TranscriptId_species\', as required by TreeBest\n # Remove any underscore in the species\n- entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace(\'_\', \'\'))\n+ entry.header = ">%s_%s" % (transcript_id, transcript[\'species\'].replace(\'_\', \'\'))\n+ elif options.headers == "GeneSymbol-TranscriptID_species":\n+ # Remove any underscore in the species\n+ entry.header = ">%s-%s_%s" % (transcript[\'gene_symbol\'], transcript_id, transcript[\'species\'].replace(\'_\', \'\'))\n+ elif options.headers == "TranscriptSymbol-TranscriptID_species":\n+ # Remove any underscore in the species\n+ entry.header = ">%s-%s_%s" % (transcript[\'transcript_symbol\'], transcript_id, transcript[\'species\'].replace(\'_\', \'\'))\n \n- if seq_region_for_transcript.lower() in regions:\n+ if transcript[\'seq_region_name\'].lower() in regions:\n entry.print(filtered_fasta_file)\n else:\n entry.print(output_fasta_file)\n' |
b |
diff -r e8e75a79de59 -r dbe37a658cd2 gstf_preparation.xml --- a/gstf_preparation.xml Thu Oct 31 08:16:51 2019 -0400 +++ b/gstf_preparation.xml Sun Sep 27 18:54:31 2020 +0000 |
[ |
b'@@ -1,5 +1,8 @@\n-<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.1">\n+<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.2">\n <description>converts data for the workflow</description>\n+ <requirements>\n+ <requirement type="package" version="3.7">python</requirement>\n+ </requirements>\n <command detect_errors="exit_code"><![CDATA[\n python \'$__tool_directory__/gstf_preparation.py\'\n #for $q in $queries\n@@ -14,10 +17,10 @@\n --fasta \'${fasta_input}\'\n #end for\n #if $headers\n- --headers\n+ --headers $headers\n #end if\n-#if $longestCDS\n- -l\n+#if $filter\n+ --filter $filter\n #end if\n #if $regions\n --regions \'$regions\'\n@@ -36,8 +39,18 @@\n </repeat>\n <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by \'Get features by Ensembl ID\' tool" />\n <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" />\n- <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" />\n- <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the >TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" />\n+ <param name="filter" type="select" display="radio" label="Which transcripts to keep">\n+ <option value="canonical" selected="true">Only canonical transcripts (or longest CDS per gene)</option>\n+ <option value="coding">Only protein-coding transcripts</option>\n+ <option value="">All transcripts</option>\n+ </param>\n+\n+ <param name="headers" type="select" display="radio" label="Change the header line of the FASTA sequences to the following format" help="As required by TreeBest, part of the GeneSeqToFamily workflow, only TranscriptId_species is acceptable format by Aequatus visualisation">\n+ <option value="TranscriptId_species" selected="true">TranscriptId_species</option>\n+ <option value="GeneSymbol-TranscriptID_species">GeneSymbol-TranscriptID_species</option>\n+ <option value="TranscriptSymbol-TranscriptID_species">TranscriptSymbol-TranscriptID_species</option>\n+ <option value="">Don\'t change</option>\n+ </param>\n <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" />\n </inputs>\n \n@@ -51,49 +64,52 @@\n \n <tests>\n <test expect_num_outputs="2">\n+ <repeat name="queries">\n+ <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />\n+ <param name="genome" value="caenorhabditis_elegans" />\n+ </repeat>\n <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />\n- <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />\n- <param name="genome" value="caenorhabditis_elegans" />\n- <param name="longestCDS" value="false" />\n- <param name="headers" value="true" />\n+ <param name="filter" value="coding" />\n+ <param name="headers" value="TranscriptId_species" />\n \n <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />\n <output name="output_fasta" file="test1.fasta" />\n </test>\n <test expect_num_outputs="2">\n+ '..b'ns" />\n- <param name="longestCDS" value="false" />\n- <param name="headers" value="false" />\n+ <param name="json" ftype="gff3" value="gene.json" />\n+ <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />\n+ <param name="filter" value="" />\n+ <param name="headers" value="" />\n \n- <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" />\n- <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />\n+ <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />\n+ <output name="output_fasta" file="CDS.fasta" />\n </test>\n <test expect_num_outputs="2">\n+ <param name="json" ftype="json" value="gene.json" />\n <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />\n- <param name="json" ftype="json" value="gene.json" />\n- <param name="longestCDS" value="false" />\n- <param name="headers" value="true" />\n+ <param name="filter" value="coding" />\n+ <param name="headers" value="TranscriptId_species" />\n \n <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />\n <output name="output_fasta" file="test4.fasta" />\n </test>\n <test>\n+ <param name="json" ftype="json" value="gene.json" />\n <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />\n- <param name="json" ftype="json" value="gene.json" />\n- <param name="longestCDS" value="false" />\n- <param name="headers" value="true" />\n+ <param name="filter" value="coding" />\n+ <param name="headers" value="TranscriptId_species" />\n <param name="regions" value="X" />\n \n <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" />\n@@ -101,11 +117,13 @@\n <output name="filtered_fasta" file="test5.ns.fasta" />\n </test>\n <test expect_num_outputs="2">\n+ <repeat name="queries">\n+ <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" />\n+ <param name="genome" value="mus_pahari" />\n+ </repeat>\n <param name="fasta_inputs" ftype="fasta" value="Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa" />\n- <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" />\n- <param name="genome" value="mus_pahari" />\n- <param name="longestCDS" value="true" />\n- <param name="headers" value="true" />\n+ <param name="filter" value="canonical" />\n+ <param name="headers" value="TranscriptId_species" />\n \n <output name="output_db" file="test6.sqlite" compare="sim_size" delta="30000" />\n <output name="output_fasta" file="test6.fasta" />\n@@ -116,12 +134,12 @@\n \n This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.\n \n-It also filters the CDS FASTA datasets to:\n+It also filters the CDS FASTA datasets to keep only the transcripts present in the gene feature information.\n \n-- remove coding sequences whose length is not a multiple of 3\n-- keep only the transcripts present in the gene feature information.\n-\n-Optionally it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).\n+Optionally it can also:\n+- keep only canonical transcripts (or the longest CDS per gene, if this attribute is not provided)\n+- remove sequences which are annotated as non protein-coding or whose length is not a multiple of 3\n+- change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).\n \n Example GFF3 file::\n \n' |
b |
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test1.sqlite |
b |
Binary file test-data/test1.sqlite has changed |
b |
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test4.fasta --- a/test-data/test4.fasta Thu Oct 31 08:16:51 2019 -0400 +++ b/test-data/test4.fasta Sun Sep 27 18:54:31 2020 +0000 |
b |
@@ -299,28 +299,6 @@ TTCCTGGAGAGACACCTGCCTTCTGTACCAGGCCTGCTAAAGCTGTTTGGATTGACCACC ATCTTGTCAGCAACAGCTCTTGGTTTCCTGGCCCACAAAAGGGGCCTGTTTGTACATTTT TAA ->ENSMUST00000168613_musmusculus -ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGGTATGGCGGCAGCC -AAACTTCTGCATGATTGTGGCCTCAGTGTGGTGGTTCTGGAAGCACGGGACCGTGTAGGA -GGCAGGACTTACACAATTAGGAATAAAAACGTTAAATATGTGGACCTTGGAGGATCTTAT -GTTGGGCCAACCCAGAATCGTATCTTACGATTGGCCAAAGAGCTAGGATTGGAGACCTAT -AAAGTTAATGAAGTTGAGCGGCTGATACACTTTGTAAAGGGAAAATCATATGCCTTCAGG -GGCCCATTTCCACCAGTATGGAATCCTATCACCTACCTAGATAATAACAACCTCTGGAGG -ACAATGGATGAGATGGGCCAAGAGATTCCCAGTGATGCTCCATGGAAAGCACCCCTTGCT -GAAGAGTGGGACTACATGACAATGAAAGAATTGCTAGATAAGATCTGCTGGACCAAATCT -ACAAAGCAGATTGCCACGCTCTTTGTGAACCTGTGTGTAACTGCAGAGACCCATGAGGTC -TCTGCACTATGGTTCCTGTGGTATGTGAAGCAGTGTGGAGGTACAACCAGAATCATCTCA -ACAACCAATGGAGGACAGGGGAAAATGTTATTGTGA ->ENSMUST00000163344_musmusculus -ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGGTATGGCGGCAGCC -AAACTTCTGCATGATTGTGGCCTCAGTGTGGTGGTTCTGGAAGCACGGGACCGTGTAGGA -GGCAGGACTTACACAATTAGGAATAAAAACGTTAAATATGTGGACCTTGGAGGATCTTAT -GTTGGGCCAACCCAGAATCGTATCTTACGATTGGCCAAAGAGCTAGGATTGGAGACCTAT -AAAGTTAATGAAGTTGAGCGGCTGATACACTTTGTAAAGATCTACAAAGCAGATTGCCAC -GCTCTTTGTGAACCTGTGTGTAACTGCAGAGACCCATGA ->ENSMUST00000173143_musmusculus -ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGAAAAACCTGCTAAT -TCTAGTCAGTTTACAAGCTCATTGTGGAGGAGAATTGAAAAACTGTGA >ENSSSCT00000033745_susscrofa ATGGCAGCGGCCAAACTTCTGCATGACTCTGGCCTGAGTGTGATTGTTCTGGAAGCCCGG GACCGCGTGGGAGGCAGGACTTACACCGTCAGGAACCAACAAGTTAAATATGTGGACCTT @@ -489,16 +467,6 @@ CTGGAGGGCCTGCGGAGCACGCTGGCCGAGAGCAGCGACCACGTGGAAAAGAGTCCCCAG TCCCTCCTGCAGGACATGCTGGCCACGGGAGGCTTCCTGCAGGGGGACGAGGCCGACTGC TACTGA ->ENST00000421712_homosapiens -ATGGACCCAGAATGCGCCCAGCTGCTCCCGGCTCTCTGTGCTGTTCTGGTAGATCCCAGG -CAGCCGGTGGCAGATGACACCTGTTTGGAGAAGCTCCTGGACTGGTTTAAAACGGTCACT -GAAGGAGAGTCCAGTGTCGTGCTGCTGCAGGAGCACCCCTGCCTGGTGGAGCTGCTGTCC -CATGTGCTGAAAGTCCAGGACCTGAGTTCTGGGGTCCTCTCCTTCTCACTGCGCCTGGCA -GGAACCTTCGCAGCCCAGGAAAACTGCTTCCAGTATCTTCAGGTGCGGTCGACACCATCT -TCTCCCTGCAGGGAGACTCCAGCCTGTTTGTGGCCTCGGCGGCCAGTCAGCTCCTGGTGC -ACGTCCTGGCTTTGTCCATGCGAGGTGGAGCCGAGGGGCAGCCCTGCCTGCCGGGGGGTG -ACTGGCCCGCGTGTGCCCAGAAGATCATGGATCACGTTGAAGAGTCCTTGTGCTCCGCGG -CCACCCCCAAGGTCACTCAGGCCCTGA >ENSRNOT00000064726_rattusnorvegicus ATGGACGCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCTAGA CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTACTGGACTGGTTTAAAACAGTGACA @@ -584,23 +552,6 @@ GACCTGGAGGGCCTGCAGGGCAGGCTGGCCAAGAGCAGCGACCATGTGGAGAAGAGCCCA CAGTCCCTGCTGCAGGACATGCTGGCCACGGTGGGTGTGTTGGAGGAGAACGAAGCTGAC TGCTACTAA ->ENSMUST00000153440_musmusculus -ATGGACCCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCCAGA -CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTGCTGGACTGGTTTAAAACAGTGACA -GAGGCAGAGTCTAGCCTCCAACTACTACAGGACCATCCCTGCTTAATGGAGCTCCTGTCC -CATGTGCTGAAGCCACAGGACGTGAGCCCTAGGGTCCTCTCCTTTGCTCTGCGCCTTGTT -GGGGTCTTCGCAGCCCAGGAAGACTGTTTTGAGTACCTTCAGCAGGGAGAGTTGTTGCTG -GGGCTCTTTGGGGAGTCAGGTGCCCCCGGCTGGGCAGCCTGGAGCATCCCAAGTGTGCGC -AGCGGCTGGATCCAGGGTCTGTGCTACCTGGCACACCACCCTAGCGCCCTGCACTTCCTG -GCTGACAGTGGTGCTGTGGACACGCTCTTCTCCTTGCAGGGAGACCCCAGCCTGTTCGTC -GCCTCAGCAGCCAGCCAGCTCCTAGTACATATCCTGGCTCTGTCCATGCAAGGTGGAGCC -CCAGGGTCCCCCGTCCCTGAAGCTGCTGCTTGGCCTATGTGTGCCCAGAAGATTGTGAAC -CATGTGGATGAGTCCCTGCATGCCAAAGCCACCCCCCAGGTCACACAGGCCTTGAATGTC -CTGACTACGACCTTCGGGCGCTGCCATAACCCCTGGACAGGGGTCCTCTGGGAGCGGCTA -AGTCCCCCTGTTGCCCGCCTGTTTGAGAGAGACCCCATTCCAGCCGTGCACGCGCTCATG -GACCTTCTTCTTAGTGTGGCCAGGTCGCCTGTGTTGAATTTTGCAGCCTGTGGCCTGTGG -GAGATGCTGGCCCAGACTCTGAGCCGCCTGAGCCCCATACAAGCTGGGCCTCTAGCCCTG -GGGACCCTGAAACTTCAGCACTGGCTTGCTGGATGGGACTGTGGGTAG >ENSMUST00000110806_musmusculus ATGGACCCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCCAGA CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTGCTGGACTGGTTTAAAACAGTGACA @@ -982,12 +933,6 @@ TCATCATCCCAGATTATTCTGAAGTGGAAACCACCCTCCGACCCCAATGGCAACATCACC CACTACCTGGTTTTCTGGGAGAGGCAGGCGGAAGACAGTGAGCTGTTCGAGCTGGATTAT TGCCTCAAAGGGCGAGTCCAGTCATCAGCTCCGCTGTAA ->ENSMUST00000208839_musmusculus -NAGACAGATTACTATCGGAAAGGGGGCAAGGGACTGCTTCCTGTGAGGTGGATGTCACCT -GAGTCCCTGAAGGATGGAGTCTTTACTGCTTCTTCTGATATGTGGTCCTTTGGGGTGGTC -CTTTGGGAAATCACTAGCCTGGCTGAGCAACCTTATCAAGGCCTGTCTAATGAACAGGTG -TTGAAGTTTGTCATGGATGGAGGCTATCTGGATCCCCCTGATAACTGTCCAGAGAGACTG -AGATATGAGATAAAGACACACTGGCCACCCTGA >ENSMUST00000091291_musmusculus ATGGGCTTCGGGAGAGGATGTGAGACGACGGCTGTGCCATTGCTGGTGGCCGTGGCCGCG TTGCTGGTGGGCACAGCCGGCCACCTGTACCCTGGAGAGGTGTGCCCTGGTATGGACATC |
b |
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test4.sqlite |
b |
Binary file test-data/test4.sqlite has changed |
b |
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test5.ns.fasta --- a/test-data/test5.ns.fasta Thu Oct 31 08:16:51 2019 -0400 +++ b/test-data/test5.ns.fasta Sun Sep 27 18:54:31 2020 +0000 |
b |
@@ -299,28 +299,6 @@ TTCCTGGAGAGACACCTGCCTTCTGTACCAGGCCTGCTAAAGCTGTTTGGATTGACCACC ATCTTGTCAGCAACAGCTCTTGGTTTCCTGGCCCACAAAAGGGGCCTGTTTGTACATTTT TAA ->ENSMUST00000168613_musmusculus -ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGGTATGGCGGCAGCC -AAACTTCTGCATGATTGTGGCCTCAGTGTGGTGGTTCTGGAAGCACGGGACCGTGTAGGA -GGCAGGACTTACACAATTAGGAATAAAAACGTTAAATATGTGGACCTTGGAGGATCTTAT -GTTGGGCCAACCCAGAATCGTATCTTACGATTGGCCAAAGAGCTAGGATTGGAGACCTAT -AAAGTTAATGAAGTTGAGCGGCTGATACACTTTGTAAAGGGAAAATCATATGCCTTCAGG -GGCCCATTTCCACCAGTATGGAATCCTATCACCTACCTAGATAATAACAACCTCTGGAGG -ACAATGGATGAGATGGGCCAAGAGATTCCCAGTGATGCTCCATGGAAAGCACCCCTTGCT -GAAGAGTGGGACTACATGACAATGAAAGAATTGCTAGATAAGATCTGCTGGACCAAATCT -ACAAAGCAGATTGCCACGCTCTTTGTGAACCTGTGTGTAACTGCAGAGACCCATGAGGTC -TCTGCACTATGGTTCCTGTGGTATGTGAAGCAGTGTGGAGGTACAACCAGAATCATCTCA -ACAACCAATGGAGGACAGGGGAAAATGTTATTGTGA ->ENSMUST00000163344_musmusculus -ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGGTATGGCGGCAGCC -AAACTTCTGCATGATTGTGGCCTCAGTGTGGTGGTTCTGGAAGCACGGGACCGTGTAGGA -GGCAGGACTTACACAATTAGGAATAAAAACGTTAAATATGTGGACCTTGGAGGATCTTAT -GTTGGGCCAACCCAGAATCGTATCTTACGATTGGCCAAAGAGCTAGGATTGGAGACCTAT -AAAGTTAATGAAGTTGAGCGGCTGATACACTTTGTAAAGATCTACAAAGCAGATTGCCAC -GCTCTTTGTGAACCTGTGTGTAACTGCAGAGACCCATGA ->ENSMUST00000173143_musmusculus -ATGAGCAACAAAAGCGATGTGATCGTGGTGGGGGGCGGCATCTCAGAAAAACCTGCTAAT -TCTAGTCAGTTTACAAGCTCATTGTGGAGGAGAATTGAAAAACTGTGA >ENSSSCT00000033745_susscrofa ATGGCAGCGGCCAAACTTCTGCATGACTCTGGCCTGAGTGTGATTGTTCTGGAAGCCCGG GACCGCGTGGGAGGCAGGACTTACACCGTCAGGAACCAACAAGTTAAATATGTGGACCTT |
b |
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test5_filtered.fasta --- a/test-data/test5_filtered.fasta Thu Oct 31 08:16:51 2019 -0400 +++ b/test-data/test5_filtered.fasta Sun Sep 27 18:54:31 2020 +0000 |
b |
@@ -84,16 +84,6 @@ CTGGAGGGCCTGCGGAGCACGCTGGCCGAGAGCAGCGACCACGTGGAAAAGAGTCCCCAG TCCCTCCTGCAGGACATGCTGGCCACGGGAGGCTTCCTGCAGGGGGACGAGGCCGACTGC TACTGA ->ENST00000421712_homosapiens -ATGGACCCAGAATGCGCCCAGCTGCTCCCGGCTCTCTGTGCTGTTCTGGTAGATCCCAGG -CAGCCGGTGGCAGATGACACCTGTTTGGAGAAGCTCCTGGACTGGTTTAAAACGGTCACT -GAAGGAGAGTCCAGTGTCGTGCTGCTGCAGGAGCACCCCTGCCTGGTGGAGCTGCTGTCC -CATGTGCTGAAAGTCCAGGACCTGAGTTCTGGGGTCCTCTCCTTCTCACTGCGCCTGGCA -GGAACCTTCGCAGCCCAGGAAAACTGCTTCCAGTATCTTCAGGTGCGGTCGACACCATCT -TCTCCCTGCAGGGAGACTCCAGCCTGTTTGTGGCCTCGGCGGCCAGTCAGCTCCTGGTGC -ACGTCCTGGCTTTGTCCATGCGAGGTGGAGCCGAGGGGCAGCCCTGCCTGCCGGGGGGTG -ACTGGCCCGCGTGTGCCCAGAAGATCATGGATCACGTTGAAGAGTCCTTGTGCTCCGCGG -CCACCCCCAAGGTCACTCAGGCCCTGA >ENSRNOT00000064726_rattusnorvegicus ATGGACGCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCTAGA CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTACTGGACTGGTTTAAAACAGTGACA @@ -179,23 +169,6 @@ GACCTGGAGGGCCTGCAGGGCAGGCTGGCCAAGAGCAGCGACCATGTGGAGAAGAGCCCA CAGTCCCTGCTGCAGGACATGCTGGCCACGGTGGGTGTGTTGGAGGAGAACGAAGCTGAC TGCTACTAA ->ENSMUST00000153440_musmusculus -ATGGACCCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCCAGA -CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTGCTGGACTGGTTTAAAACAGTGACA -GAGGCAGAGTCTAGCCTCCAACTACTACAGGACCATCCCTGCTTAATGGAGCTCCTGTCC -CATGTGCTGAAGCCACAGGACGTGAGCCCTAGGGTCCTCTCCTTTGCTCTGCGCCTTGTT -GGGGTCTTCGCAGCCCAGGAAGACTGTTTTGAGTACCTTCAGCAGGGAGAGTTGTTGCTG -GGGCTCTTTGGGGAGTCAGGTGCCCCCGGCTGGGCAGCCTGGAGCATCCCAAGTGTGCGC -AGCGGCTGGATCCAGGGTCTGTGCTACCTGGCACACCACCCTAGCGCCCTGCACTTCCTG -GCTGACAGTGGTGCTGTGGACACGCTCTTCTCCTTGCAGGGAGACCCCAGCCTGTTCGTC -GCCTCAGCAGCCAGCCAGCTCCTAGTACATATCCTGGCTCTGTCCATGCAAGGTGGAGCC -CCAGGGTCCCCCGTCCCTGAAGCTGCTGCTTGGCCTATGTGTGCCCAGAAGATTGTGAAC -CATGTGGATGAGTCCCTGCATGCCAAAGCCACCCCCCAGGTCACACAGGCCTTGAATGTC -CTGACTACGACCTTCGGGCGCTGCCATAACCCCTGGACAGGGGTCCTCTGGGAGCGGCTA -AGTCCCCCTGTTGCCCGCCTGTTTGAGAGAGACCCCATTCCAGCCGTGCACGCGCTCATG -GACCTTCTTCTTAGTGTGGCCAGGTCGCCTGTGTTGAATTTTGCAGCCTGTGGCCTGTGG -GAGATGCTGGCCCAGACTCTGAGCCGCCTGAGCCCCATACAAGCTGGGCCTCTAGCCCTG -GGGACCCTGAAACTTCAGCACTGGCTTGCTGGATGGGACTGTGGGTAG >ENSMUST00000110806_musmusculus ATGGACCCAGAATGCTCCAGGCTCCTCCCGGCTCTCTGTGCTGTTTTGGCAGATCCCAGA CAGCTGGTGGCAGATGACACCTGCTTGGAGAAACTGCTGGACTGGTTTAAAACAGTGACA @@ -577,12 +550,6 @@ TCATCATCCCAGATTATTCTGAAGTGGAAACCACCCTCCGACCCCAATGGCAACATCACC CACTACCTGGTTTTCTGGGAGAGGCAGGCGGAAGACAGTGAGCTGTTCGAGCTGGATTAT TGCCTCAAAGGGCGAGTCCAGTCATCAGCTCCGCTGTAA ->ENSMUST00000208839_musmusculus -NAGACAGATTACTATCGGAAAGGGGGCAAGGGACTGCTTCCTGTGAGGTGGATGTCACCT -GAGTCCCTGAAGGATGGAGTCTTTACTGCTTCTTCTGATATGTGGTCCTTTGGGGTGGTC -CTTTGGGAAATCACTAGCCTGGCTGAGCAACCTTATCAAGGCCTGTCTAATGAACAGGTG -TTGAAGTTTGTCATGGATGGAGGCTATCTGGATCCCCCTGATAACTGTCCAGAGAGACTG -AGATATGAGATAAAGACACACTGGCCACCCTGA >ENSMUST00000091291_musmusculus ATGGGCTTCGGGAGAGGATGTGAGACGACGGCTGTGCCATTGCTGGTGGCCGTGGCCGCG TTGCTGGTGGGCACAGCCGGCCACCTGTACCCTGGAGAGGTGTGCCCTGGTATGGACATC |
b |
diff -r e8e75a79de59 -r dbe37a658cd2 test-data/test6.sqlite |
b |
Binary file test-data/test6.sqlite has changed |