Mercurial > repos > earlhaminst > gstf_preparation
changeset 10:e8e75a79de59 draft
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
author | earlhaminst |
---|---|
date | Thu, 31 Oct 2019 08:16:51 -0400 |
parents | f4acbfe8d6fe |
children | dbe37a658cd2 |
files | gstf_preparation.py gstf_preparation.xml test-data/MGP_PahariEiJ_G0008413.1.gff3 test-data/Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa test-data/test1.ns.fasta test-data/test4.ns.fasta test-data/test5.sqlite test-data/test6.fasta test-data/test6.sqlite |
diffstat | 7 files changed, 289 insertions(+), 35 deletions(-) [+] |
line wrap: on
line diff
--- a/gstf_preparation.py Wed Oct 17 07:31:29 2018 -0400 +++ b/gstf_preparation.py Thu Oct 31 08:16:51 2019 -0400 @@ -2,6 +2,7 @@ import json import optparse +import os import sqlite3 import sys @@ -114,10 +115,7 @@ # a 5' UTR can be split among multiple exons # a CDS can be part of multiple transcripts for parent in d['Parent'].split(','): - if parent not in parent_dict: - parent_dict[parent] = [d] - else: - parent_dict[parent].append(d) + parent_dict.setdefault(parent, []).append(d) return d @@ -139,6 +137,8 @@ def add_transcript_to_dict(cols, species, transcript_dict): transcript = feature_to_dict(cols) + if 'biotype' in transcript and transcript['biotype'] != 'protein_coding': + return transcript.update({ 'object_type': 'Transcript', 'seq_region_name': cols[0], @@ -302,7 +302,7 @@ parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') parser.add_option('-o', '--output', help='Path of the output SQLite file') parser.add_option('--of', help='Path of the output FASTA file') - parser.add_option('--ff', help='Path of the filtered sequences output FASTA file') + parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file') options, args = parser.parse_args() if args: @@ -403,10 +403,7 @@ else: break - if gene_id in gene_transcripts_dict: - gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) - else: - gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] + gene_transcripts_dict.setdefault(gene_id, []).append((transcript_id, len(entry.sequence))) if options.longestCDS: # For each gene, select the transcript with the longest sequence.
--- a/gstf_preparation.xml Wed Oct 17 07:31:29 2018 -0400 +++ b/gstf_preparation.xml Thu Oct 31 08:16:51 2019 -0400 @@ -1,7 +1,6 @@ <tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.1"> <description>converts data for the workflow</description> - <command detect_errors="exit_code"> -<![CDATA[ + <command detect_errors="exit_code"><![CDATA[ python '$__tool_directory__/gstf_preparation.py' #for $q in $queries --gff3 '${q.genome}:${q.gff3_input}' @@ -22,12 +21,11 @@ #end if #if $regions --regions '$regions' + --ff '$filtered_fasta' #end if -o '$output_db' --of '$output_fasta' ---ff '$filtered_fasta' -]]> - </command> + ]]></command> <inputs> <repeat name="queries" title="GFF3 dataset"> @@ -40,58 +38,56 @@ <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding CDS datasets in FASTA format" help="Each FASTA header line should start with a transcript id" /> <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" /> <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the >TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" /> - <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" /> + <param name="regions" type="text" optional="true" label="Comma-separated list of region IDs (e.g. chromosomes or scaffolds) for which FASTA sequences should be filtered out" help="Region IDs are in the `seqid` column for GFF3 and in the `seq_region_name` field in JSON. This is typically used to filter out chromosomes with a non-standard genetic code, like mitochondria, to be analysed separately" /> </inputs> <outputs> - <data name="output_db" format="sqlite" label="${tool.name} on ${on_string}: SQLite" /> - <data name="output_fasta" format="fasta" label="${tool.name} on ${on_string}: FASTA" /> - <data name="filtered_fasta" format="fasta" label="${tool.name} on ${on_string}: filtered sequences" /> + <data name="output_db" format="sqlite" label="${tool.name} on ${on_string}: SQLite" /> + <data name="output_fasta" format="fasta" label="${tool.name} on ${on_string}: FASTA" /> + <data name="filtered_fasta" format="fasta" label="${tool.name} on ${on_string}: filtered sequences"> + <filter>regions</filter> + </data> </outputs> <tests> - <test> + <test expect_num_outputs="2"> <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> <param name="genome" value="caenorhabditis_elegans" /> <param name="longestCDS" value="false" /> <param name="headers" value="true" /> - <output name="output_db" file="test1.sqlite" compare="sim_size" /> + <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> <output name="output_fasta" file="test1.fasta" /> - <output name="filtered_fasta" file="test1.ns.fasta" /> </test> - <test> + <test expect_num_outputs="2"> <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> <param name="genome" value="caenorhabditis_elegans" /> <param name="longestCDS" value="true" /> <param name="headers" value="true" /> - <output name="output_db" file="test1.sqlite" compare="sim_size" /> + <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> <output name="output_fasta" file="test1_longest.fasta" /> - <output name="filtered_fasta" file="test1.ns.fasta" /> </test> - <test> + <test expect_num_outputs="2"> <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" /> <param name="genome" value="caenorhabditis_elegans" /> <param name="longestCDS" value="false" /> <param name="headers" value="false" /> - <output name="output_db" file="test1.sqlite" compare="sim_size" /> + <output name="output_db" file="test1.sqlite" compare="sim_size" delta="30000" /> <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" /> - <output name="filtered_fasta" file="test1.ns.fasta" /> </test> - <test> + <test expect_num_outputs="2"> <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> <param name="json" ftype="json" value="gene.json" /> <param name="longestCDS" value="false" /> <param name="headers" value="true" /> - <output name="output_db" file="test4.sqlite" compare="sim_size" /> + <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" /> <output name="output_fasta" file="test4.fasta" /> - <output name="filtered_fasta" file="test4.ns.fasta" /> </test> <test> <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" /> @@ -100,13 +96,22 @@ <param name="headers" value="true" /> <param name="regions" value="X" /> - <output name="output_db" file="test5.sqlite" compare="sim_size" /> + <output name="output_db" file="test4.sqlite" compare="sim_size" delta="30000" /> <output name="output_fasta" file="test5_filtered.fasta" /> <output name="filtered_fasta" file="test5.ns.fasta" /> </test> + <test expect_num_outputs="2"> + <param name="fasta_inputs" ftype="fasta" value="Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa" /> + <param name="gff3_input" ftype="gff3" value="MGP_PahariEiJ_G0008413.1.gff3" /> + <param name="genome" value="mus_pahari" /> + <param name="longestCDS" value="true" /> + <param name="headers" value="true" /> + + <output name="output_db" file="test6.sqlite" compare="sim_size" delta="30000" /> + <output name="output_fasta" file="test6.fasta" /> + </test> </tests> - <help> -<![CDATA[ + <help><![CDATA[ **What it does** This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format. @@ -140,8 +145,7 @@ .. class:: warningmark If a value in the **ID** and **Parent** attribute contains a colon, everything up to the first colon will be discarded. -]]> - </help> + ]]></help> <citations> </citations> </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/MGP_PahariEiJ_G0008413.1.gff3 Thu Oct 31 08:16:51 2019 -0400 @@ -0,0 +1,139 @@ +##gff-version 3 +##sequence-region 13 1 96704406 +13 Ensembl gene 62596741 62686932 . + . ID=MGP_PahariEiJ_G0008413.1;Name=MGP_PahariEiJ_G0008413.1;biotype=polymorphic_pseudogene +13 Ensembl transcript 62596741 62626623 . + . ID=MGP_PahariEiJ_T0009933.1;Name=MGP_PahariEiJ_T0009933.1;Parent=MGP_PahariEiJ_G0008413.1;biotype=protein_coding +13 Ensembl transcript 62596741 62686932 . + . ID=MGP_PahariEiJ_T0009934.1;Name=MGP_PahariEiJ_T0009934.1;Parent=MGP_PahariEiJ_G0008413.1;biotype=polymorphic_pseudogene +13 Ensembl transcript 62596766 62625799 . + . ID=MGP_PahariEiJ_T0009935.1;Name=MGP_PahariEiJ_T0009935.1;Parent=MGP_PahariEiJ_G0008413.1;biotype=retained_intron +13 Ensembl transcript 62660839 62686932 . + . ID=MGP_PahariEiJ_T0009936.1;Name=MGP_PahariEiJ_T0009936.1;Parent=MGP_PahariEiJ_G0008413.1;biotype=processed_transcript +13 Ensembl transcript 62671962 62686919 . + . ID=MGP_PahariEiJ_T0009937.1;Name=MGP_PahariEiJ_T0009937.1;Parent=MGP_PahariEiJ_G0008413.1;biotype=processed_transcript +13 Ensembl transcript 62671962 62686918 . + . ID=MGP_PahariEiJ_T0009938.1;Name=MGP_PahariEiJ_T0009938.1;Parent=MGP_PahariEiJ_G0008413.1;biotype=protein_coding +13 Ensembl intron 62596975 62624027 . + . Name=intron00001;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl intron 62624355 62626424 . + . Name=intron00002;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl intron 62596975 62624027 . + . Name=intron00003;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62624355 62626424 . + . Name=intron00004;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62626620 62637349 . + . Name=intron00005;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62637436 62640660 . + . Name=intron00006;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62640768 62641046 . + . Name=intron00007;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62641179 62641725 . + . Name=intron00008;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62641854 62641961 . + . Name=intron00009;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62642215 62651556 . + . Name=intron00010;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62651793 62657150 . + . Name=intron00011;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62657340 62660197 . + . Name=intron00012;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62660808 62662195 . + . Name=intron00013;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62662303 62663623 . + . Name=intron00014;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62663751 62665451 . + . Name=intron00015;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62665637 62668991 . + . Name=intron00016;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62669299 62671283 . + . Name=intron00017;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62671361 62671958 . + . Name=intron00018;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62672085 62673958 . + . Name=intron00019;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62674160 62678497 . + . Name=intron00020;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62678579 62679702 . + . Name=intron00021;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62679808 62683727 . + . Name=intron00022;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62683916 62685193 . + . Name=intron00023;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl intron 62596975 62624027 . + . Name=intron00024;Parent=MGP_PahariEiJ_T0009935.1 +13 Ensembl intron 62660879 62662195 . + . Name=intron00025;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62662389 62663623 . + . Name=intron00026;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62663751 62665451 . + . Name=intron00027;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62665637 62668991 . + . Name=intron00028;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62669299 62671283 . + . Name=intron00029;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62671361 62671958 . + . Name=intron00030;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62672085 62673958 . + . Name=intron00031;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62674160 62678497 . + . Name=intron00032;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62678579 62679702 . + . Name=intron00033;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62679808 62683727 . + . Name=intron00034;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62683916 62685193 . + . Name=intron00035;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl intron 62672085 62674007 . + . Name=intron00036;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl intron 62674160 62678497 . + . Name=intron00037;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl intron 62678579 62679702 . + . Name=intron00038;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl intron 62679808 62683727 . + . Name=intron00039;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl intron 62683916 62685193 . + . Name=intron00040;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl intron 62672085 62674007 . + . Name=intron00041;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl intron 62674160 62678497 . + . Name=intron00042;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl intron 62678579 62679702 . + . Name=intron00043;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl intron 62679808 62683727 . + . Name=intron00044;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl intron 62683916 62685193 . + . Name=intron00045;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl CDS 62596855 62596942 . + 0 Name=MGP_PahariEiJ_P0009933;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl CDS 62596943 62596974 . + 2 Name=MGP_PahariEiJ_P0009933;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl CDS 62624028 62624354 . + 0 Name=MGP_PahariEiJ_P0009933;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl CDS 62626425 62626620 . + 0 Name=MGP_PahariEiJ_P0009933;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl CDS 62596855 62596942 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62596943 62596974 . + 2 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62624028 62624354 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62626425 62626619 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62637350 62637435 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62640661 62640767 . + 1 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62641047 62641178 . + 2 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62641726 62641853 . + 2 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62641962 62642214 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62651557 62651792 . + 2 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62657151 62657339 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62660198 62660807 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62662196 62662302 . + 2 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62663624 62663750 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62665452 62665636 . + 2 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62668992 62669298 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62671284 62671360 . + 2 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62671959 62672084 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62673959 62674159 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62678498 62678578 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62679703 62679807 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62683728 62683915 . + 0 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62685194 62685509 . + 1 Name=MGP_PahariEiJ_P0009934;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl CDS 62674016 62674159 . + 0 Name=MGP_PahariEiJ_P0009938;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl CDS 62678498 62678578 . + 0 Name=MGP_PahariEiJ_P0009938;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl CDS 62679703 62679807 . + 0 Name=MGP_PahariEiJ_P0009938;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl CDS 62683728 62683915 . + 0 Name=MGP_PahariEiJ_P0009938;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl CDS 62685194 62685509 . + 1 Name=MGP_PahariEiJ_P0009938;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl exon 62596741 62596942 . + . Name=MGP_PahariEiJ_E0009933.1;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl exon 62596943 62596974 . + . Name=MGP_PahariEiJ_E0009933.2;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl exon 62624028 62624354 . + . Name=MGP_PahariEiJ_E0009933.3;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl exon 62626425 62626623 . + . Name=MGP_PahariEiJ_E0009933.4;Parent=MGP_PahariEiJ_T0009933.1 +13 Ensembl exon 62596741 62596942 . + . Name=MGP_PahariEiJ_E0009933.1;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62596943 62596974 . + . Name=MGP_PahariEiJ_E0009933.2;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62624028 62624354 . + . Name=MGP_PahariEiJ_E0009933.3;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62626425 62626619 . + . Name=MGP_PahariEiJ_E0009934.4;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62637350 62637435 . + . Name=MGP_PahariEiJ_E0009934.5;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62640661 62640767 . + . Name=MGP_PahariEiJ_E0009934.6;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62641047 62641178 . + . Name=MGP_PahariEiJ_E0009934.7;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62641726 62641853 . + . Name=MGP_PahariEiJ_E0009934.8;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62641962 62642214 . + . Name=MGP_PahariEiJ_E0009934.9;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62651557 62651792 . + . Name=MGP_PahariEiJ_E0009934.10;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62657151 62657339 . + . Name=MGP_PahariEiJ_E0009934.11;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62660198 62660807 . + . Name=MGP_PahariEiJ_E0009934.12;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62662196 62662302 . + . Name=MGP_PahariEiJ_E0009934.13;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62663624 62663750 . + . Name=MGP_PahariEiJ_E0009934.14;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62665452 62665636 . + . Name=MGP_PahariEiJ_E0009934.15;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62668992 62669298 . + . Name=MGP_PahariEiJ_E0009934.16;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62671284 62671360 . + . Name=MGP_PahariEiJ_E0009934.17;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62671959 62672084 . + . Name=MGP_PahariEiJ_E0009934.18;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62673959 62674159 . + . Name=MGP_PahariEiJ_E0009934.19;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62678498 62678578 . + . Name=MGP_PahariEiJ_E0009934.20;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62679703 62679807 . + . Name=MGP_PahariEiJ_E0009934.21;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62683728 62683915 . + . Name=MGP_PahariEiJ_E0009934.22;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62685194 62686932 . + . Name=MGP_PahariEiJ_E0009934.23;Parent=MGP_PahariEiJ_T0009934.1 +13 Ensembl exon 62596766 62596974 . + . Name=MGP_PahariEiJ_E0009935.1;Parent=MGP_PahariEiJ_T0009935.1 +13 Ensembl exon 62624028 62625799 . + . Name=MGP_PahariEiJ_E0009935.2;Parent=MGP_PahariEiJ_T0009935.1 +13 Ensembl exon 62660839 62660878 . + . Name=MGP_PahariEiJ_E0009936.1;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62662196 62662388 . + . Name=MGP_PahariEiJ_E0009936.2;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62663624 62663750 . + . Name=MGP_PahariEiJ_E0009936.3;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62665452 62665636 . + . Name=MGP_PahariEiJ_E0009936.4;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62668992 62669298 . + . Name=MGP_PahariEiJ_E0009936.5;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62671284 62671360 . + . Name=MGP_PahariEiJ_E0009936.6;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62671959 62672084 . + . Name=MGP_PahariEiJ_E0009936.7;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62673959 62674159 . + . Name=MGP_PahariEiJ_E0009936.8;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62678498 62678578 . + . Name=MGP_PahariEiJ_E0009936.9;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62679703 62679807 . + . Name=MGP_PahariEiJ_E0009936.10;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62683728 62683915 . + . Name=MGP_PahariEiJ_E0009936.11;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62685194 62686932 . + . Name=MGP_PahariEiJ_E0009936.12;Parent=MGP_PahariEiJ_T0009936.1 +13 Ensembl exon 62671962 62672084 . + . Name=MGP_PahariEiJ_E0009937.1;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl exon 62674008 62674159 . + . Name=MGP_PahariEiJ_E0009937.2;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl exon 62678498 62678578 . + . Name=MGP_PahariEiJ_E0009936.9;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl exon 62679703 62679807 . + . Name=MGP_PahariEiJ_E0009936.10;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl exon 62683728 62683915 . + . Name=MGP_PahariEiJ_E0009936.11;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl exon 62685194 62686919 . + . Name=MGP_PahariEiJ_E0009937.6;Parent=MGP_PahariEiJ_T0009937.1 +13 Ensembl exon 62671962 62672084 . + . Name=MGP_PahariEiJ_E0009937.1;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl exon 62674008 62674159 . + . Name=MGP_PahariEiJ_E0009938.2;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl exon 62678498 62678578 . + . Name=MGP_PahariEiJ_E0009934.20;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl exon 62679703 62679807 . + . Name=MGP_PahariEiJ_E0009934.21;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl exon 62683728 62683915 . + . Name=MGP_PahariEiJ_E0009934.22;Parent=MGP_PahariEiJ_T0009938.1 +13 Ensembl exon 62685194 62686918 . + . Name=MGP_PahariEiJ_E0009938.6;Parent=MGP_PahariEiJ_T0009938.1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Mus_pahari.PAHARI_EIJ_v1.1.cds.all.shortened.fa Thu Oct 31 08:16:51 2019 -0400 @@ -0,0 +1,99 @@ +>MGP_PahariEiJ_T0009933.1 cds chromosome:PAHARI_EIJ_v1.1:13:62596741:62626623:1 gene:MGP_PahariEiJ_G0008413.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:Atp10d description:ATPase, class V, type 10D [Source:MGI Symbol;Acc:MGI:2450125] +AAGACAAATGGCTGGCTTGGAAGCGTAACTCTCACCGCCCTTTGGATCCCTGCTCGCTTC +TCTTTTGGCACTTTGGGATCCGAGGTAACCATGCGGTGATGAGCGGCCCGGGAGGGACAG +ATCACCTGAACCAGCCGGGTCTCCCTGCGTCTTGGACATGACTGAGCTTCTGCAGTGGGC +CAGACATCACTGGCGTCGGCTGAGCCATGGGAGAACCCAGGGTGAAGATGAGAGGCCGTA +CAACTACGCCTCCCTGCTGGCCTGTGGGGGCAAGTCCCCCCGGACCCCCAGGCCTGCAGG +AAAGCACCGTGTCGTTATTCCTCACCTTCAGTGCTTCAGGGATGAGTACGAGAGGTTTTC +TGGAACCTACGTGAATAACCGGATACGGACGACCAAGTACACACTCCTGAACTTTGTGCC +AAGGAACTTATTTGAACAGTTTCACAGGGCTGCCAATTTATATTTCCTGTTCCTCGTGGT +CCTGAACTGGGTGCCTTTGGTAGAAGCCTTCCAAAAGGAAATCACCATGCTGCCTCTGGT +GGTGGTCCTCACAATTATTGCAATTAAAGATGGCTTGGAAGACTACCGGAAGTACAAAAT +TGACAAGCAGATCAACAACTTAATAACCAAGGTTTACAGTAGG +>MGP_PahariEiJ_T0009934.1 cds chromosome:PAHARI_EIJ_v1.1:13:62596741:62686932:1 gene:MGP_PahariEiJ_G0008413.1 gene_biotype:polymorphic_pseudogene transcript_biotype:polymorphic_pseudogene gene_symbol:Atp10d description:ATPase, class V, type 10D [Source:MGI Symbol;Acc:MGI:2450125] +AAGACAAATGGCTGGCTTGGAAGCGTAACTCTCACCGCCCTTTGGATCCCTGCTCGCTTC +TCTTTTGGCACTTTGGGATCCGAGGTAACCATGCGGTGATGAGCGGCCCGGGAGGGACAG +ATCACCTGAACCAGCCGGGTCTCCCTGCGTCTTGGACATGACTGAGCTTCTGCAGTGGGC +CAGACATCACTGGCGTCGGCTGAGCCATGGGAGAACCCAGGGTGAAGATGAGAGGCCGTA +CAACTACGCCTCCCTGCTGGCCTGTGGGGGCAAGTCCCCCCGGACCCCCAGGCCTGCAGG +AAAGCACCGTGTCGTTATTCCTCACCTTCAGTGCTTCAGGGATGAGTACGAGAGGTTTTC +TGGAACCTACGTGAATAACCGGATACGGACGACCAAGTACACACTCCTGAACTTTGTGCC +AAGGAACTTATTTGAACAGTTTCACAGGGCTGCCAATTTATATTTCCTGTTCCTCGTGGT +CCTGAACTGGGTGCCTTTGGTAGAAGCCTTCCAAAAGGAAATCACCATGCTGCCTCTGGT +GGTGGTCCTCACAATTATTGCAATTAAAGATGGCTTGGAAGACTACCGGAAGTACAAAAT +TGACAAGCAGATCAACAACTTAATAACCAAGGTTTACAGTAGGACTCTGAAGTTGATCCT +GAGAAGTTCTCCAGTAGGATAGAATGTGAAAGCCCAAACAATGACCTCAGCAGATTCCGA +GGTTTCCTGGAACATGCCAATAAAGACCGTGTGGGCCTCAGCAAAGAGAATTTATTGCTC +CGCGGGTGCACCATCAGAAACACAGAGGCTGTGGTGGGCATTGTGGTCTATGCAGGTCAT +GAAACCAAAGCAATGCTGAACAACAGTGGGCCACGGTATAAGCGCAGTAAGTTAGAGAGA +AGAGCAAATACAGACGTCCTCTGGTGTGTCCTGCTTCTGATCGTCATGTGCTTAACTGGT +GCACTGGGTCACGGCATATGGCTGAGCAGGTATGAGAACATGCTCTTTTTTAACATCCCT +GAGCCGGACGGACGTGTCCTATCACCTGTGCTGACTGGGTTCTATGTGTTCTGGACCATG +ATCATCTTGCTGCAGGTCCTGATCCCCATTTCTCTCTACGTGTCCATTGAGATCGTGAAG +CTGGGACAGATCTATTTCATCCAGAGCGATGTAGATTTCTACAACGAGAAAATGGATTCG +ACCATTCAGTGCCGAGCCCTAAACATCACTGAGGACCTTGGGCAGATTCAATACCTCTTT +TCTGATAAGACAGGAACCCTCACAGAGAATAAGATGGTGTTTCGGAGGTGCAGTGTAGCA +GGGTTTGACTACTGCCATGAAGAAAACGCCAGGAGGCTCGAGTCCTATCAGGAAGCTGTC +TCTGAAGAGGAGGAACGCGCAGACACTCTCGGCGGCTCCCTCAGCAACGTGGCGAGACCC +AGAGCCCAGGGCTGCAGGACAGTTCACAGTGGGCTTCCGGGAAAACCCCCGGCTCACCTC +TCCGGGAGCACCTCTGCTGTAGGAGACGCAGAAGGATCCGGGGAAGTGCCTCATTCCAGA +CAGGCTGCCTTCAGTAGTCCCATGGAAACAGACGTGGTACCAGATACCAGACTTTTAGAC +AAATTTAGCCAGATTACCCCTCAGCTGCTCACTGGACTGGATGGGACCTTGCAGAGCTCA +TCACTGGAGACCTTGTACATCATGGACTTCTTTATTGCACTGGCAATTTGCAACACGGTG +GTGGTTTCTGCCCCAAACCAACCTCGGCAAAAGATTGGGCTCTCCTCACTGGGTGGAATG +CCCATCAAGTCCTTGGACGAGATTAAAAACATCTTCCAGAAATTGTCTGTCCGGAGATCA +AGTTCACCATCCCTTGCCAGCGGGAAGGATTCATCCTCTGGGACTCCCTGTGCCTTTGTG +AGCAGAATCTCTTTCTTTAGTCGACCAAAACTGTCACCTCCTATGGAGGACGAGTCTTCC +CAAATGGATGAAATCCCCCAGGCCAGTAACTCAGCTTGCTGTACAGAAACGGAGGCACAA +AACAGTGCCTTAGGACTCAGCGTCGGCTCCGCGGAAGCCCTAAATGGACCACCGCCCTTG +GCTTCCAACCTGTGTTATGAGGCGGAGAGTCCAGATGAAGCAGCCTTGGTGTATGCCGCC +AGAGCTTATCATTGCACTTTACAGTCTCGGACCCCAGAGCAGGTCATGGTGGAGTTTGCA +GCTTTGGGCTCATTAACATTTCAACTCCTACACATCCTGCCCTTTGACTCAGTAAGGAAA +AGAATGTCGGTGGTGGTCCGGCACCCTCTTTCCAAACAAGTCGTGGTGTATACAAAAGGC +GCTGATTCCGTGATCATGGAGCTGCTGTCTATGGCTTCCTCGGATGGAACAAATCTGGAA +GAACAACAGATGATAATAAGGGAGAGAACGCAGAGGCACCTGGACGAGTATGCCAGACGA +GGGCTGCGCACTCTGTGTGTTGCAAAGAAGGTCATGAGTGACACGGAATATGCAGAGTGG +CTGAGGAATCACTTCCTAGCTGAAACCAGCATTGACAACAGGGAGGAGCTGCTAGTTGAG +TCTGCCATGAGACTAGAAAACAAACTCACGTTACTTGGTGCTACTGGCATTGAAGATCGT +CTGCAGGAGGGGGTCCCTGAGTCTATAGAAGCCCTTCACCAAGCTGGCATCAAGATCTGG +ATGCTGACAGGGGACAAGCAGGAGACAGCTGTCAACATAGCTTATGCATGCAGACTCCTG +GAACCAGATGACAAGCTCTTCATCCTCAATACACAAAGTGAGGATGCCTGTGGGATGCTG +ATGAGTGCAATTTTGGAAGAACTTCAGAAGAGAGCTCAGGTGTCTCCGGAGCTGGCATCA +CCAAGAAAGAACTTTCCTCAGCCCCCTGACCCTCAGGGCCAGGGACGTGCGGGACTTGTT +ATCACTGGGAAGAGCCTGGAGTTTGCCCTGCAGGAGAGTCTACAAAGACAGTTCCTTGAG +CTGACTGCATGGTGCCAAGCTGTGATCTGCTGCCGAGCCACCCCCCTTCAAAAGAGTGAG +GTGGTGAAATTGGTTCGAAACCATCTCCATGTGATGACCCTAGCCATTGGTGACGGTGCC +AATGATGTTAGCATGATACAAGTGGCTGACATTGGGATCGGTGTCTCAGGTCAAGAAGGC +ATGCAGGCTGTGATGGCCAGTGACTTCGCCATCTCTCAGTTCAGACATCTCAGCAAGCTT +CTCCTCGTGCACGGGCACTGGTGTTACACCCGGCTCTCCAACATGATTCTCTATTTTTTC +TACAAGAATGTGGCCTATGTGAATCTCCTTTTCTGGTACCAGTTCTTTTGTGGGTTTTCA +GGAACATCGATGACTGACTACTGGGTGCTGATCTTCTTCAACCTCCTCTTCACATCTGTC +CCCCCCATCATTTATGGCGTTTTGGAGAAAGATGTGTCAGCAGAGACCCTCCTGCAGCTG +CCTGAACTTTACCGGAGTGGTCAGCGATCAGAGGAATACTTGCCCGTCACTTTCTGGATC +ACCTTGTTGGATGCCTTTTATCAAAGCCTGGTCTGCTTCTTTGTGCCTTACTTTACCTAC +CAGGGCTCTGACATTGACATCTTTACCTTTGGGAATCCCCTGAACACGGCGGCTCTGTTC +ATCATTCTCCTCCACCTGGTGATCGAAAGCAAGAGTTTGACTTGGATCCACATGCTGGTC +ATTGTTGGGAGCATCTTGTCCTACTTTTTCTTTGCCTTGGCTTTTGGAGCCTTATGTGTC +ACTTGCAACCCACCCTCCAACCCCTACGGGATCATGCAGAAGCACATGCTAGACCCTGTG +TTCTACTTAGTTTGTGTTCTTACAACCTTCGTAGCACTCCTGCCCAGGTTTGCCTACCGA +GTTCTTCAGGGATCCATGTTTCCATCTCCAGTTCTCAGAGCCAAGTACTTTGACCGACTA +CCTCCAGAGGAGAGAGCTGAAGCTCTCAAGAGGTGGAGAGGGACTGCAAAGATCAATCAC +GTGGCATCTCAGCATGCCAGCCAATCAGCTGCTAAGTCAGGAAGACCCACGCCTGGGTCT +TCTGCTGTCCTTGCAATGAAGACAGCAACAGTGCGTACTGTTGAGCAGAGCACATGTGAA +ACTGCGCTAGACCATGGCTGCTCTGAACCTGGGGCCTCCAGGACGACTGGACCCTCAGCA +AGT +>MGP_PahariEiJ_T0009938.1 cds chromosome:PAHARI_EIJ_v1.1:13:62671962:62686918:1 gene:MGP_PahariEiJ_G0008413.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:Atp10d description:ATPase, class V, type 10D [Source:MGI Symbol;Acc:MGI:2450125] +ATGACTGACTACTGGGTGCTGATCTTCTTCAACCTCCTCTTCACATCTGTCCCCCCCATC +ATTTATGGCGTTTTGGAGAAAGATGTGTCAGCAGAGACCCTCCTGCAGCTGCCTGAACTT +TACCGGAGTGGTCAGCGATCAGAGGAATACTTGCCCGTCACTTTCTGGATCACCTTGTTG +GATGCCTTTTATCAAAGCCTGGTCTGCTTCTTTGTGCCTTACTTTACCTACCAGGGCTCT +GACATTGACATCTTTACCTTTGGGAATCCCCTGAACACGGCGGCTCTGTTCATCATTCTC +CTCCACCTGGTGATCGAAAGCAAGAGTTTGACTTGGATCCACATGCTGGTCATTGTTGGG +AGCATCTTGTCCTACTTTTTCTTTGCCTTGGCTTTTGGAGCCTTATGTGTCACTTGCAAC +CCACCCTCCAACCCCTACGGGATCATGCAGAAGCACATGCTAGACCCTGTGTTCTACTTA +GTTTGTGTTCTTACAACCTTCGTAGCACTCCTGCCCAGGTTTGCCTACCGAGTTCTTCAG +GGATCCATGTTTCCATCTCCAGTTCTCAGAGCCAAGTACTTTGACCGACTACCTCCAGAG +GAGAGAGCTGAAGCTCTCAAGAGGTGGAGAGGGACTGCAAAGATCAATCACGTGGCATCT +CAGCATGCCAGCCAATCAGCTGCTAAGTCAGGAAGACCCACGCCTGGGTCTTCTGCTGTC +CTTGCAATGAAGACAGCAACAGTGCGTACTGTTGAGCAGAGCACATGTGAAACTGCGCTA +GACCATGGCTGCTCTGAACCTGGGGCCTCCAGGACGACTGGACCCTCAGCAAGT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test6.fasta Thu Oct 31 08:16:51 2019 -0400 @@ -0,0 +1,15 @@ +>MGP_PahariEiJ_T0009938.1_muspahari +ATGACTGACTACTGGGTGCTGATCTTCTTCAACCTCCTCTTCACATCTGTCCCCCCCATC +ATTTATGGCGTTTTGGAGAAAGATGTGTCAGCAGAGACCCTCCTGCAGCTGCCTGAACTT +TACCGGAGTGGTCAGCGATCAGAGGAATACTTGCCCGTCACTTTCTGGATCACCTTGTTG +GATGCCTTTTATCAAAGCCTGGTCTGCTTCTTTGTGCCTTACTTTACCTACCAGGGCTCT +GACATTGACATCTTTACCTTTGGGAATCCCCTGAACACGGCGGCTCTGTTCATCATTCTC +CTCCACCTGGTGATCGAAAGCAAGAGTTTGACTTGGATCCACATGCTGGTCATTGTTGGG +AGCATCTTGTCCTACTTTTTCTTTGCCTTGGCTTTTGGAGCCTTATGTGTCACTTGCAAC +CCACCCTCCAACCCCTACGGGATCATGCAGAAGCACATGCTAGACCCTGTGTTCTACTTA +GTTTGTGTTCTTACAACCTTCGTAGCACTCCTGCCCAGGTTTGCCTACCGAGTTCTTCAG +GGATCCATGTTTCCATCTCCAGTTCTCAGAGCCAAGTACTTTGACCGACTACCTCCAGAG +GAGAGAGCTGAAGCTCTCAAGAGGTGGAGAGGGACTGCAAAGATCAATCACGTGGCATCT +CAGCATGCCAGCCAATCAGCTGCTAAGTCAGGAAGACCCACGCCTGGGTCTTCTGCTGTC +CTTGCAATGAAGACAGCAACAGTGCGTACTGTTGAGCAGAGCACATGTGAAACTGCGCTA +GACCATGGCTGCTCTGAACCTGGGGCCTCCAGGACGACTGGACCCTCAGCAAGT