Mercurial > repos > iuc > berokka
changeset 0:9a1626faa05c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/berokka commit 387f04ffbf5205aaaa7b46e9e3d518edb62a538f
author | iuc |
---|---|
date | Mon, 25 Mar 2019 12:55:16 -0400 |
parents | |
children | f91f6054fca7 |
files | README.rst berokka.xml test-data/berokka_test1.fasta test-data/results_1 test-data/results_2 test-data/results_3 test-data/trimmed_1 test-data/trimmed_2 test-data/trimmed_3 |
diffstat | 9 files changed, 440 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Mon Mar 25 12:55:16 2019 -0400 @@ -0,0 +1,14 @@ +Galaxy Wrapper for Berokka +========================== + +Trim, circularise, orient & filter long read bacterial genome assemblies. + +Detailed Description +-------------------- + +View original Berokka documentation here: https://github.com/tseemann/berokka/blob/master/README.md + +License +------- + +`GPLv3 <https://raw.githubusercontent.com/tseemann/berokka/master/LICENSE>`_
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/berokka.xml Mon Mar 25 12:55:16 2019 -0400 @@ -0,0 +1,107 @@ +<tool id="berokka" name="Berokka" version="0.2"> + <description>Trim, circularise, orient and filter long read bacterial genome assemblies</description> + <requirements> + <requirement type="package" version="0.2">berokka</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + berokka + --outdir default '${input_file}' + + #if $filter_fasta: + --filter '${filter_fasta}' + #end if + + --readlen '${read_length}' + + --fuzz '${fuzz}' + + #unless $anno: + --noanno + #end unless + + ]]></command> + <inputs> + <param name="input_file" type="data" format="fasta" label="Input (FASTA)" help="Should be completed long-read assemblies in FASTA format, such as those from CANU or HGAP"/> + <param name="filter_fasta" optional="true" type="data" format="fasta" label="Filter (FASTA)" help="Give a fasta to use as a filter."/> + <param name="read_length" type="integer" value="60000" min="28" label="Read Length" help="Approximate max read length (default '60000')"/> + <param name="fuzz" type="integer" value="5" label="Fuzz" help="Accept local alignment within X bp of global (default '5')"/> + <param name="anno" type="boolean" checked="true" label="Annotation" help="Annotate Trimmed FASTA"/> + </inputs> + <outputs> + <data name="trimmed" format="fasta" from_work_dir="default/02.trimmed.fa" label="${tool.name} on ${on_string}: Trimmed"/> + <data name="results" format="tabular" from_work_dir="default/03.results.tab" label="${tool.name} on ${on_string}: Results"/> + </outputs> + <tests> + <test> + <param name="input_file" value="berokka_test1.fasta"/> + <param name="read_length" value="60000"/> + <param name="fuzz" value="5"/> + <param name="anno" value="true"/> + <output name="trimmed" file="trimmed_1" ftype="fasta"/> + <output name="results" file="results_1" ftype="tabular"/> + </test> + <test> + <param name="input_file" value="berokka_test1.fasta"/> + <param name="filter_select" value="true"/> + <param name="filter_fasta" value="berokka_test1.fasta"/> + <param name="read_length" value="60000"/> + <param name="fuzz" value="5"/> + <param name="anno" value="true"/> + <output name="trimmed" file="trimmed_2" ftype="fasta"/> + <output name="results" file="results_2" ftype="tabular"/> + </test> + <test> + <param name="input_file" value="berokka_test1.fasta"/> + <param name="read_length" value="100"/> + <param name="fuzz" value="50"/> + <param name="anno" value="false"/> + <output name="trimmed" file="trimmed_3" ftype="fasta"/> + <output name="results" file="results_3" ftype="tabular"/> + </test> + </tests> + <help><![CDATA[ +**Summary** + +Trim, circularise, orient & filter long read bacterial genome assemblies +There is already a good piece of software to trim/circularise and orient genome assemblies called Circlator. Please try that first! + +You should only try Berokka if: + +1. You only have the contig files and do not have the corrected reads anymore +2. Your contigs are simple cases with clear overhang and could be done manually with BLAST +3. Circlator fails on your data even after troubleshooting + +NOTE: orientation to dnaA or rep genes is not yet implemented. + +**Input** + +Input should be completed long-read assemblies in FASTA format, such as those from CANU or HGAP. + +**Output** + +1. trimmed: The (possibly) trimmed sequences (FASTA) + +2. results: Summary of results (TSV) + +**Options** + +* `Filter <FASTA>` allows you to remove contigs which match 50% of sequences in this file. Berokka comes with the standard Pacbio control sequence. You can provide your own FASTA file using this option. + +* `Read Length <LENGTH>` can be used for datasets that won't seem to circularise. It affects the length of the match it attempts to make using BLAST. + +* `Fuzz` can be used to accept local alignment within X bp of global (default '5') + +* `Annotation` can be set to "No" to ensure that the FASTA descriptions are not altered between the input and output FASTA files. + + ]]></help> + <citations> + <citation type="bibtex"> +@UNPUBLISHED{Seemann2016, + author = {Seemann, Torsten}, + title = {Berokka: Faster Trim, circularise and orient long read bacterial genome assemblies}, + year = {2016}, + url = {https://github.com/tseemann/berokka}, +} + </citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/berokka_test1.fasta Mon Mar 25 12:55:16 2019 -0400 @@ -0,0 +1,70 @@ +>gi|145231|gb|M33724.1|ECOALPHOA Escherichia coli K-12 truncated PhoA (phoA) gene, partial cds; and transposon Mu dI, partial sequence +CAAAGCTCCGGGCCTCACCCAGGCGCTAAATACCAAAGATGGCGCAGTGATGGTGATGAGTTACGGGAAC +TCCGAAGAGGATTCACAAGAACATACCGGCAGTCAGTTGCGTATTGCGGCGTATGGCCCGCATGCCGCCA +ATGAAGCGGCGCACGAAAAACGCGAAAGCGT + +>gi|145232|gb|M33725.1|ECOALPHOB Escherichia coli K12 phoA pseudogene and transposon Mu dl-R, partial sequence +CTGTCATAAAGTTGTCACGGCCGAGACTTATAGTCGCTTTGTTTTTATTTTTTAATGTATTTGTACATGG +AGAAAATAAAGTGAAACAAAGCACTATTGCACTGGCACTCTTACCGTTACTGTTTACCCCTGTGACAAAA +GCCCGGACACCAGTGAAGCGGCGCACGAAAAACGCGAAAGCGT + +>gi|145234|gb|M33727.1|ECOALPHOE Escherichia coli K12 upstream sequence of psiA5::Mu dI. is identical to psiA30 upstream sequence; putative (phoA) pseudogene and transposon Mu dl-R, partial sequence +TTGTTTTTATTTTTTAATGTATTTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATTGCACTGGTGA +AGCGGCGCACGAAAAACGCGAAAGCGT + +>gi|146195|gb|J01619.1|ECOGLTA Eschericia coli gltA gene, sdhCDAB operon and sucABCD operons, complete sequence +GAATTCGACCGCCATTGCGCAAGGCATCGCCATGACCAGGCAGGATACAAAAGAGAGTCGATAAATATTC +ACGGTGTCCATACCTGATAAATATTTTATGAAAGGCGGCGATGATGCCGCAAAATAATACTTATTTATAA +TCCAGCACGTAGGTTGCGTTAGCGGTTACTTCACCTGCCGTGACATCGACTGCATTATCAATTTGTTCCA +TCCAGGCGAAAAAGTTCAGCGTCTGTTCTGATGAGCTTGCATCCAGGTCAAGATCTGGCGCGGCTGAACC +TAATACGATGTTACCGTCATTTTTGTCCATCAGTCGTACACCGACCCCAGTTGCTTCGCCTGCACTGGTG +TTGCTCAACAAAGGCGTAGCACCAGTTGTCTTAGCCGTGCTATCGAAGGTTACGCCAAACTTTGGATACC +GGCATTCCGCTACCGTTGTCAGAAGCAGGCAGATCACAGTTGATCAAGCGAATGTCGACGGCCACTTTAT +TGCTATGATGCTCCCGGTTTATATGGGTTGTCGTGACTTGTCCAAGATCTATGTTTTTATCAATATCTTC +TGGATGAATTTCACAAGGTGCTTCAATAACCTCCCCCTTAAAGTGAATTTCGCCAGAACCTTCATCAGCA +GCATAAACAGGTGCAGTGAACAGCAGAGATACGGCCAGTGCGGCCAATGTTTTTTGTCCTTTAAACATAA +CAGAGTCCTTTAAGGATATAGAATAGGGGTATAGCTACGCCAGAATATCGTATTTGATTATTGCTAGTTT +TTAGTTTTGCTTAAAAAATATTGTTAGTTTTATTAAATTGGAAAACTAAATTATTGGTATCATGAATTGT +TGTATGATGATAAATATAGGGGGGATATGATAGACGTCATTTTCATAGGGTTATAAAATGCGACTACCAT +GAAGTTTTTAATTCAAAGTATTGGGTTGCTGATAATTTGAGCTGTTCTATTCTTTTTAAATATCTATATA +GGTCTGTTAATGGATTTTATTTTTACAAGTTTTTTGTGTTTAGGCATATAAAAATCAAGCCCGCCATATG +AACGGCGGGTTAAAATATTTACAACTTAGCAATCGAACCATTAACGCTTGATATCGCTTTTAAAGTCGCG +TTTTTCATATCCTGTATACAGCTGACGCGGACGGGCAATCTTCATACCGTCACTGTGCATTTCGCTCCAG +TGGGCGATCCAGCCAACGGTACGTGCCATTGCGAAAATGACGGTGAACATGGAAGACGGAATACCCATCG +CTTTCAGGATGATACCAGAGTAGAAATCGACGTTCGGGTACAGTTTCTTCTCGATAAAGTACGGGTCGTT +CAGCGCGATGTTTTCCAGCTCCATAGCCACTTCCAGCAGGTCATCCTTCGTGCCCAGCTCTTTCAGCACT +TCATGGCAGGTTTCACGCATTACGGTGGCGCGCGGGTCGTAATTTTTGTACACGCGGTGACCGAAGCCCA +TCAGGCGGAAAGAATCATTTTTGTCTTTCGCACGACGAAAAAATTCCGGAATGTGTTTAACGGAGCTGAT +TTCTTCCAGCATTTTCAGCGCCGCTTCGTTAGCACCGCCGTGCGCAGGTCCCCACAGTGAAGCAATACCT +GCTGCGATACAGGCAAACGGGTTCGCACCCGAAGAGCCAGCGGTACGCACGGTGGAGGTAGAGGCGTTCT +GTTCATGGTCAGCGTGCAGGATCAGAATACGGTCCATAGCACGTTCCAGAATCGGATTAACTTCATACGG +TTCGCACGGCGTGGAGAACATCATATTCAGGAAGTTACCGGCGTAGGAGAGATCGTTGCGCGGGTAAACA +AATGGCTGACCAATGGAATACTTGTAACACATCGCGGCCATGGTCGGCATTTTCGACAGCAGGCGGAACG +CGGCAATTTCACGGTGACGAGGATTGTTAACATCCAGCGAGTCGTGATAGAACGCCGCCAGCGCGCCGGT +AATACCACACATGACTGCCATTGGATGCGAGTCGCGACGGAAAGCATGGAACAGACGGGTAATCTGCTCG +TGGATCATGGTATGACGGGTCACCGTAGTTTTAAATTCGTCATACTGTTCCTGAGTCGGTTTTTCACCAT +TCAGCAGGATGTAACAAACTTCCAGGTAGTTAGAATCGGTCGCCAGCTGATCGATCGGGAAACCGCGGTG +CAGCAAAATACCTTCATCACCATCAATAAAAGTAATTTTAGATTCGCAGGATGCGGTTGAAGTGAAGCCT +GGGTCAAAGGTGAACACACCTTTTGAACCGAGAGTACGGATATCAATAACATCTTGACCCAGCGTGCCTT +TCAGCACATCCAGTTCAACAGCTGTATCCCCGTTGAGGGTGAGTTTTGCTTTTGTATCAGCCATTTAAGG +TCTCCTTAGCGCCTTATTGCGTAAGACTGCCGGAACTTAAATTTGCCTTCGCACATCAACCTGGCTTTAC +CCGTTTTTTATTTGGCTCGCCGCTCTGTGAAAGAGGGGAAAACCTGGGTACAGAGCTCTGGGCGCTTGCA +GGTAAAGGATCCATTGATGACGAATAAATGGCGAATCAAGTACTTAGCAATCCGAATTATTAAACTTGTC +TACCACTAATAACTGTCCCGAATGAATTGGTCAATACTCCACACTGTTACATAAGTTAATCTTAGGTGAA +ATACCGACTTCATAACTTTTACGCATTATATGCTTTTCCTGGTAATGTTTGTAACAACTTTGTTGAATGA +TTGTCAAATTAGATGATTAAAAATTAAATAAATGTTGTTATCGTGACCTGGATCACTGTTCAGGATAAAA +CCCGACAAACTATATGTAGGTTAATTGTAATGATTTTGTGAACAGCCTATACTGCCGCCAGTCTCCGGAA +CACCCTGCAATCCCGAGCCACCCAGCGTTGTAACGTGTCGTTTTCGCATCTGGAAGCAGTGTTTTGCATG +ACGCGCAGTTATAGAAAGGACGCTGTCTGACCCGCAAGCAGACCGGAGGAAGGAAATCCCGACGTCTCCA +GGTAACAGAAAGTTAACCTCTGTGCCCGTAGTCCCCAGGGAATAATAAGAACAGCATGTGGGCGTTATTC +ATGATAAGAAATGTGAAAAAACAAAGACCTGTTAATCTGGACCTACAGACCATCCGGTTCCCCATCACGG +CGATAGCGTCCATTCTCCATCGCGTTTCCGGTGTGATCACCTTTGTTGCAGTGGGCATCCTGCTGTGGCT +TCTGGGTACCAGCCTCTCTTCCCCTGAAGGTTTCGAGCAAGCTTCCGCGATTATGGGCAGCTTCTTCGTC +AAATTTATCATGTGGGGCATCCTTACCGCTCTGGCGTATCACGTCGTCGTAGGTATTCGCCACATGATGA +TGGATTTTGGCTATCTGGAAGAAACATTCGAAGCGGGTAAACGCTCCGCCAAAATCTCCTTTGTTATTAC +TGTCGTGCTTTCACTTCTCGCAGGAGTCCTCGTATGGTAAGCAACGCCTCCGCATTAGGACGCAATGGCG +TACATGATTTCATCCTCGTTCGCGCTACCGCTATCGTCCTGACGCTCTACATCATTTATATGGTCGGTTT +TTTCGCTACCAGTGGCGAGCTGACATATGAAGTCTGGATCGGTTTCTTCGCCTCTGCGTTCACCAAAGTG +TTCACCCTGCTGGCGCTGTTTTCTATCTTGATCCATGCCTGGATCGGCATGTGGCAGGTGTTGACCGACT +ACGTTAAACCGCTGGCTTTGCGCCTGATGCTGCAACTGGTGATTGTCGTTGCACTGGTGGTTTACGTGAT +TTATGGATTCGTTGTGGTGTGGGGTGTGTGATGAAATTGCCAGTCAGAGAATTTGATGCAGTTGTGATTG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/results_1 Mon Mar 25 12:55:16 2019 -0400 @@ -0,0 +1,5 @@ +#sequence status old_len new_len trimmed +gi|145231|gb|M33724.1|ECOALPHOA kept 171 171 0 +gi|145232|gb|M33725.1|ECOALPHOB kept 183 183 0 +gi|145234|gb|M33727.1|ECOALPHOE kept 97 97 0 +gi|146195|gb|J01619.1|ECOGLTA kept 3850 3850 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/results_2 Mon Mar 25 12:55:16 2019 -0400 @@ -0,0 +1,5 @@ +#sequence status old_len new_len trimmed +gi|145231|gb|M33724.1|ECOALPHOA kept 171 171 0 +gi|145232|gb|M33725.1|ECOALPHOB kept 183 183 0 +gi|145234|gb|M33727.1|ECOALPHOE kept 97 97 0 +gi|146195|gb|J01619.1|ECOGLTA kept 3850 3850 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/results_3 Mon Mar 25 12:55:16 2019 -0400 @@ -0,0 +1,5 @@ +#sequence status old_len new_len trimmed +gi|145231|gb|M33724.1|ECOALPHOA kept 171 171 0 +gi|145232|gb|M33725.1|ECOALPHOB kept 183 183 0 +gi|145234|gb|M33727.1|ECOALPHOE kept 97 97 0 +gi|146195|gb|J01619.1|ECOGLTA kept 3850 3850 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/trimmed_1 Mon Mar 25 12:55:16 2019 -0400 @@ -0,0 +1,78 @@ +>gi|145231|gb|M33724.1|ECOALPHOA Escherichia coli K-12 truncated PhoA (phoA) gene, partial cds; and transposon Mu dI, partial sequence +CAAAGCTCCGGGCCTCACCCAGGCGCTAAATACCAAAGATGGCGCAGTGATGGTGATGAG +TTACGGGAACTCCGAAGAGGATTCACAAGAACATACCGGCAGTCAGTTGCGTATTGCGGC +GTATGGCCCGCATGCCGCCAATGAAGCGGCGCACGAAAAACGCGAAAGCGT +>gi|145232|gb|M33725.1|ECOALPHOB Escherichia coli K12 phoA pseudogene and transposon Mu dl-R, partial sequence +CTGTCATAAAGTTGTCACGGCCGAGACTTATAGTCGCTTTGTTTTTATTTTTTAATGTAT +TTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATTGCACTGGCACTCTTACCGTTAC +TGTTTACCCCTGTGACAAAAGCCCGGACACCAGTGAAGCGGCGCACGAAAAACGCGAAAG +CGT +>gi|145234|gb|M33727.1|ECOALPHOE Escherichia coli K12 upstream sequence of psiA5::Mu dI. is identical to psiA30 upstream sequence; putative (phoA) pseudogene and transposon Mu dl-R, partial sequence +TTGTTTTTATTTTTTAATGTATTTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATT +GCACTGGTGAAGCGGCGCACGAAAAACGCGAAAGCGT +>gi|146195|gb|J01619.1|ECOGLTA Eschericia coli gltA gene, sdhCDAB operon and sucABCD operons, complete sequence +GAATTCGACCGCCATTGCGCAAGGCATCGCCATGACCAGGCAGGATACAAAAGAGAGTCG +ATAAATATTCACGGTGTCCATACCTGATAAATATTTTATGAAAGGCGGCGATGATGCCGC +AAAATAATACTTATTTATAATCCAGCACGTAGGTTGCGTTAGCGGTTACTTCACCTGCCG +TGACATCGACTGCATTATCAATTTGTTCCATCCAGGCGAAAAAGTTCAGCGTCTGTTCTG +ATGAGCTTGCATCCAGGTCAAGATCTGGCGCGGCTGAACCTAATACGATGTTACCGTCAT +TTTTGTCCATCAGTCGTACACCGACCCCAGTTGCTTCGCCTGCACTGGTGTTGCTCAACA +AAGGCGTAGCACCAGTTGTCTTAGCCGTGCTATCGAAGGTTACGCCAAACTTTGGATACC +GGCATTCCGCTACCGTTGTCAGAAGCAGGCAGATCACAGTTGATCAAGCGAATGTCGACG +GCCACTTTATTGCTATGATGCTCCCGGTTTATATGGGTTGTCGTGACTTGTCCAAGATCT +ATGTTTTTATCAATATCTTCTGGATGAATTTCACAAGGTGCTTCAATAACCTCCCCCTTA +AAGTGAATTTCGCCAGAACCTTCATCAGCAGCATAAACAGGTGCAGTGAACAGCAGAGAT +ACGGCCAGTGCGGCCAATGTTTTTTGTCCTTTAAACATAACAGAGTCCTTTAAGGATATA +GAATAGGGGTATAGCTACGCCAGAATATCGTATTTGATTATTGCTAGTTTTTAGTTTTGC +TTAAAAAATATTGTTAGTTTTATTAAATTGGAAAACTAAATTATTGGTATCATGAATTGT +TGTATGATGATAAATATAGGGGGGATATGATAGACGTCATTTTCATAGGGTTATAAAATG +CGACTACCATGAAGTTTTTAATTCAAAGTATTGGGTTGCTGATAATTTGAGCTGTTCTAT +TCTTTTTAAATATCTATATAGGTCTGTTAATGGATTTTATTTTTACAAGTTTTTTGTGTT +TAGGCATATAAAAATCAAGCCCGCCATATGAACGGCGGGTTAAAATATTTACAACTTAGC +AATCGAACCATTAACGCTTGATATCGCTTTTAAAGTCGCGTTTTTCATATCCTGTATACA +GCTGACGCGGACGGGCAATCTTCATACCGTCACTGTGCATTTCGCTCCAGTGGGCGATCC +AGCCAACGGTACGTGCCATTGCGAAAATGACGGTGAACATGGAAGACGGAATACCCATCG +CTTTCAGGATGATACCAGAGTAGAAATCGACGTTCGGGTACAGTTTCTTCTCGATAAAGT +ACGGGTCGTTCAGCGCGATGTTTTCCAGCTCCATAGCCACTTCCAGCAGGTCATCCTTCG +TGCCCAGCTCTTTCAGCACTTCATGGCAGGTTTCACGCATTACGGTGGCGCGCGGGTCGT +AATTTTTGTACACGCGGTGACCGAAGCCCATCAGGCGGAAAGAATCATTTTTGTCTTTCG +CACGACGAAAAAATTCCGGAATGTGTTTAACGGAGCTGATTTCTTCCAGCATTTTCAGCG +CCGCTTCGTTAGCACCGCCGTGCGCAGGTCCCCACAGTGAAGCAATACCTGCTGCGATAC +AGGCAAACGGGTTCGCACCCGAAGAGCCAGCGGTACGCACGGTGGAGGTAGAGGCGTTCT +GTTCATGGTCAGCGTGCAGGATCAGAATACGGTCCATAGCACGTTCCAGAATCGGATTAA +CTTCATACGGTTCGCACGGCGTGGAGAACATCATATTCAGGAAGTTACCGGCGTAGGAGA +GATCGTTGCGCGGGTAAACAAATGGCTGACCAATGGAATACTTGTAACACATCGCGGCCA +TGGTCGGCATTTTCGACAGCAGGCGGAACGCGGCAATTTCACGGTGACGAGGATTGTTAA +CATCCAGCGAGTCGTGATAGAACGCCGCCAGCGCGCCGGTAATACCACACATGACTGCCA +TTGGATGCGAGTCGCGACGGAAAGCATGGAACAGACGGGTAATCTGCTCGTGGATCATGG +TATGACGGGTCACCGTAGTTTTAAATTCGTCATACTGTTCCTGAGTCGGTTTTTCACCAT +TCAGCAGGATGTAACAAACTTCCAGGTAGTTAGAATCGGTCGCCAGCTGATCGATCGGGA +AACCGCGGTGCAGCAAAATACCTTCATCACCATCAATAAAAGTAATTTTAGATTCGCAGG +ATGCGGTTGAAGTGAAGCCTGGGTCAAAGGTGAACACACCTTTTGAACCGAGAGTACGGA +TATCAATAACATCTTGACCCAGCGTGCCTTTCAGCACATCCAGTTCAACAGCTGTATCCC +CGTTGAGGGTGAGTTTTGCTTTTGTATCAGCCATTTAAGGTCTCCTTAGCGCCTTATTGC +GTAAGACTGCCGGAACTTAAATTTGCCTTCGCACATCAACCTGGCTTTACCCGTTTTTTA +TTTGGCTCGCCGCTCTGTGAAAGAGGGGAAAACCTGGGTACAGAGCTCTGGGCGCTTGCA +GGTAAAGGATCCATTGATGACGAATAAATGGCGAATCAAGTACTTAGCAATCCGAATTAT +TAAACTTGTCTACCACTAATAACTGTCCCGAATGAATTGGTCAATACTCCACACTGTTAC +ATAAGTTAATCTTAGGTGAAATACCGACTTCATAACTTTTACGCATTATATGCTTTTCCT +GGTAATGTTTGTAACAACTTTGTTGAATGATTGTCAAATTAGATGATTAAAAATTAAATA +AATGTTGTTATCGTGACCTGGATCACTGTTCAGGATAAAACCCGACAAACTATATGTAGG +TTAATTGTAATGATTTTGTGAACAGCCTATACTGCCGCCAGTCTCCGGAACACCCTGCAA +TCCCGAGCCACCCAGCGTTGTAACGTGTCGTTTTCGCATCTGGAAGCAGTGTTTTGCATG +ACGCGCAGTTATAGAAAGGACGCTGTCTGACCCGCAAGCAGACCGGAGGAAGGAAATCCC +GACGTCTCCAGGTAACAGAAAGTTAACCTCTGTGCCCGTAGTCCCCAGGGAATAATAAGA +ACAGCATGTGGGCGTTATTCATGATAAGAAATGTGAAAAAACAAAGACCTGTTAATCTGG +ACCTACAGACCATCCGGTTCCCCATCACGGCGATAGCGTCCATTCTCCATCGCGTTTCCG +GTGTGATCACCTTTGTTGCAGTGGGCATCCTGCTGTGGCTTCTGGGTACCAGCCTCTCTT +CCCCTGAAGGTTTCGAGCAAGCTTCCGCGATTATGGGCAGCTTCTTCGTCAAATTTATCA +TGTGGGGCATCCTTACCGCTCTGGCGTATCACGTCGTCGTAGGTATTCGCCACATGATGA +TGGATTTTGGCTATCTGGAAGAAACATTCGAAGCGGGTAAACGCTCCGCCAAAATCTCCT +TTGTTATTACTGTCGTGCTTTCACTTCTCGCAGGAGTCCTCGTATGGTAAGCAACGCCTC +CGCATTAGGACGCAATGGCGTACATGATTTCATCCTCGTTCGCGCTACCGCTATCGTCCT +GACGCTCTACATCATTTATATGGTCGGTTTTTTCGCTACCAGTGGCGAGCTGACATATGA +AGTCTGGATCGGTTTCTTCGCCTCTGCGTTCACCAAAGTGTTCACCCTGCTGGCGCTGTT +TTCTATCTTGATCCATGCCTGGATCGGCATGTGGCAGGTGTTGACCGACTACGTTAAACC +GCTGGCTTTGCGCCTGATGCTGCAACTGGTGATTGTCGTTGCACTGGTGGTTTACGTGAT +TTATGGATTCGTTGTGGTGTGGGGTGTGTGATGAAATTGCCAGTCAGAGAATTTGATGCA +GTTGTGATTG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/trimmed_2 Mon Mar 25 12:55:16 2019 -0400 @@ -0,0 +1,78 @@ +>gi|145231|gb|M33724.1|ECOALPHOA Escherichia coli K-12 truncated PhoA (phoA) gene, partial cds; and transposon Mu dI, partial sequence +CAAAGCTCCGGGCCTCACCCAGGCGCTAAATACCAAAGATGGCGCAGTGATGGTGATGAG +TTACGGGAACTCCGAAGAGGATTCACAAGAACATACCGGCAGTCAGTTGCGTATTGCGGC +GTATGGCCCGCATGCCGCCAATGAAGCGGCGCACGAAAAACGCGAAAGCGT +>gi|145232|gb|M33725.1|ECOALPHOB Escherichia coli K12 phoA pseudogene and transposon Mu dl-R, partial sequence +CTGTCATAAAGTTGTCACGGCCGAGACTTATAGTCGCTTTGTTTTTATTTTTTAATGTAT +TTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATTGCACTGGCACTCTTACCGTTAC +TGTTTACCCCTGTGACAAAAGCCCGGACACCAGTGAAGCGGCGCACGAAAAACGCGAAAG +CGT +>gi|145234|gb|M33727.1|ECOALPHOE Escherichia coli K12 upstream sequence of psiA5::Mu dI. is identical to psiA30 upstream sequence; putative (phoA) pseudogene and transposon Mu dl-R, partial sequence +TTGTTTTTATTTTTTAATGTATTTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATT +GCACTGGTGAAGCGGCGCACGAAAAACGCGAAAGCGT +>gi|146195|gb|J01619.1|ECOGLTA Eschericia coli gltA gene, sdhCDAB operon and sucABCD operons, complete sequence +GAATTCGACCGCCATTGCGCAAGGCATCGCCATGACCAGGCAGGATACAAAAGAGAGTCG +ATAAATATTCACGGTGTCCATACCTGATAAATATTTTATGAAAGGCGGCGATGATGCCGC +AAAATAATACTTATTTATAATCCAGCACGTAGGTTGCGTTAGCGGTTACTTCACCTGCCG +TGACATCGACTGCATTATCAATTTGTTCCATCCAGGCGAAAAAGTTCAGCGTCTGTTCTG +ATGAGCTTGCATCCAGGTCAAGATCTGGCGCGGCTGAACCTAATACGATGTTACCGTCAT +TTTTGTCCATCAGTCGTACACCGACCCCAGTTGCTTCGCCTGCACTGGTGTTGCTCAACA +AAGGCGTAGCACCAGTTGTCTTAGCCGTGCTATCGAAGGTTACGCCAAACTTTGGATACC +GGCATTCCGCTACCGTTGTCAGAAGCAGGCAGATCACAGTTGATCAAGCGAATGTCGACG +GCCACTTTATTGCTATGATGCTCCCGGTTTATATGGGTTGTCGTGACTTGTCCAAGATCT +ATGTTTTTATCAATATCTTCTGGATGAATTTCACAAGGTGCTTCAATAACCTCCCCCTTA +AAGTGAATTTCGCCAGAACCTTCATCAGCAGCATAAACAGGTGCAGTGAACAGCAGAGAT +ACGGCCAGTGCGGCCAATGTTTTTTGTCCTTTAAACATAACAGAGTCCTTTAAGGATATA +GAATAGGGGTATAGCTACGCCAGAATATCGTATTTGATTATTGCTAGTTTTTAGTTTTGC +TTAAAAAATATTGTTAGTTTTATTAAATTGGAAAACTAAATTATTGGTATCATGAATTGT +TGTATGATGATAAATATAGGGGGGATATGATAGACGTCATTTTCATAGGGTTATAAAATG +CGACTACCATGAAGTTTTTAATTCAAAGTATTGGGTTGCTGATAATTTGAGCTGTTCTAT +TCTTTTTAAATATCTATATAGGTCTGTTAATGGATTTTATTTTTACAAGTTTTTTGTGTT +TAGGCATATAAAAATCAAGCCCGCCATATGAACGGCGGGTTAAAATATTTACAACTTAGC +AATCGAACCATTAACGCTTGATATCGCTTTTAAAGTCGCGTTTTTCATATCCTGTATACA +GCTGACGCGGACGGGCAATCTTCATACCGTCACTGTGCATTTCGCTCCAGTGGGCGATCC +AGCCAACGGTACGTGCCATTGCGAAAATGACGGTGAACATGGAAGACGGAATACCCATCG +CTTTCAGGATGATACCAGAGTAGAAATCGACGTTCGGGTACAGTTTCTTCTCGATAAAGT +ACGGGTCGTTCAGCGCGATGTTTTCCAGCTCCATAGCCACTTCCAGCAGGTCATCCTTCG +TGCCCAGCTCTTTCAGCACTTCATGGCAGGTTTCACGCATTACGGTGGCGCGCGGGTCGT +AATTTTTGTACACGCGGTGACCGAAGCCCATCAGGCGGAAAGAATCATTTTTGTCTTTCG +CACGACGAAAAAATTCCGGAATGTGTTTAACGGAGCTGATTTCTTCCAGCATTTTCAGCG +CCGCTTCGTTAGCACCGCCGTGCGCAGGTCCCCACAGTGAAGCAATACCTGCTGCGATAC +AGGCAAACGGGTTCGCACCCGAAGAGCCAGCGGTACGCACGGTGGAGGTAGAGGCGTTCT +GTTCATGGTCAGCGTGCAGGATCAGAATACGGTCCATAGCACGTTCCAGAATCGGATTAA +CTTCATACGGTTCGCACGGCGTGGAGAACATCATATTCAGGAAGTTACCGGCGTAGGAGA +GATCGTTGCGCGGGTAAACAAATGGCTGACCAATGGAATACTTGTAACACATCGCGGCCA +TGGTCGGCATTTTCGACAGCAGGCGGAACGCGGCAATTTCACGGTGACGAGGATTGTTAA +CATCCAGCGAGTCGTGATAGAACGCCGCCAGCGCGCCGGTAATACCACACATGACTGCCA +TTGGATGCGAGTCGCGACGGAAAGCATGGAACAGACGGGTAATCTGCTCGTGGATCATGG +TATGACGGGTCACCGTAGTTTTAAATTCGTCATACTGTTCCTGAGTCGGTTTTTCACCAT +TCAGCAGGATGTAACAAACTTCCAGGTAGTTAGAATCGGTCGCCAGCTGATCGATCGGGA +AACCGCGGTGCAGCAAAATACCTTCATCACCATCAATAAAAGTAATTTTAGATTCGCAGG +ATGCGGTTGAAGTGAAGCCTGGGTCAAAGGTGAACACACCTTTTGAACCGAGAGTACGGA +TATCAATAACATCTTGACCCAGCGTGCCTTTCAGCACATCCAGTTCAACAGCTGTATCCC +CGTTGAGGGTGAGTTTTGCTTTTGTATCAGCCATTTAAGGTCTCCTTAGCGCCTTATTGC +GTAAGACTGCCGGAACTTAAATTTGCCTTCGCACATCAACCTGGCTTTACCCGTTTTTTA +TTTGGCTCGCCGCTCTGTGAAAGAGGGGAAAACCTGGGTACAGAGCTCTGGGCGCTTGCA +GGTAAAGGATCCATTGATGACGAATAAATGGCGAATCAAGTACTTAGCAATCCGAATTAT +TAAACTTGTCTACCACTAATAACTGTCCCGAATGAATTGGTCAATACTCCACACTGTTAC +ATAAGTTAATCTTAGGTGAAATACCGACTTCATAACTTTTACGCATTATATGCTTTTCCT +GGTAATGTTTGTAACAACTTTGTTGAATGATTGTCAAATTAGATGATTAAAAATTAAATA +AATGTTGTTATCGTGACCTGGATCACTGTTCAGGATAAAACCCGACAAACTATATGTAGG +TTAATTGTAATGATTTTGTGAACAGCCTATACTGCCGCCAGTCTCCGGAACACCCTGCAA +TCCCGAGCCACCCAGCGTTGTAACGTGTCGTTTTCGCATCTGGAAGCAGTGTTTTGCATG +ACGCGCAGTTATAGAAAGGACGCTGTCTGACCCGCAAGCAGACCGGAGGAAGGAAATCCC +GACGTCTCCAGGTAACAGAAAGTTAACCTCTGTGCCCGTAGTCCCCAGGGAATAATAAGA +ACAGCATGTGGGCGTTATTCATGATAAGAAATGTGAAAAAACAAAGACCTGTTAATCTGG +ACCTACAGACCATCCGGTTCCCCATCACGGCGATAGCGTCCATTCTCCATCGCGTTTCCG +GTGTGATCACCTTTGTTGCAGTGGGCATCCTGCTGTGGCTTCTGGGTACCAGCCTCTCTT +CCCCTGAAGGTTTCGAGCAAGCTTCCGCGATTATGGGCAGCTTCTTCGTCAAATTTATCA +TGTGGGGCATCCTTACCGCTCTGGCGTATCACGTCGTCGTAGGTATTCGCCACATGATGA +TGGATTTTGGCTATCTGGAAGAAACATTCGAAGCGGGTAAACGCTCCGCCAAAATCTCCT +TTGTTATTACTGTCGTGCTTTCACTTCTCGCAGGAGTCCTCGTATGGTAAGCAACGCCTC +CGCATTAGGACGCAATGGCGTACATGATTTCATCCTCGTTCGCGCTACCGCTATCGTCCT +GACGCTCTACATCATTTATATGGTCGGTTTTTTCGCTACCAGTGGCGAGCTGACATATGA +AGTCTGGATCGGTTTCTTCGCCTCTGCGTTCACCAAAGTGTTCACCCTGCTGGCGCTGTT +TTCTATCTTGATCCATGCCTGGATCGGCATGTGGCAGGTGTTGACCGACTACGTTAAACC +GCTGGCTTTGCGCCTGATGCTGCAACTGGTGATTGTCGTTGCACTGGTGGTTTACGTGAT +TTATGGATTCGTTGTGGTGTGGGGTGTGTGATGAAATTGCCAGTCAGAGAATTTGATGCA +GTTGTGATTG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/trimmed_3 Mon Mar 25 12:55:16 2019 -0400 @@ -0,0 +1,78 @@ +>gi|145231|gb|M33724.1|ECOALPHOA Escherichia coli K-12 truncated PhoA (phoA) gene, partial cds; and transposon Mu dI, partial sequence +CAAAGCTCCGGGCCTCACCCAGGCGCTAAATACCAAAGATGGCGCAGTGATGGTGATGAG +TTACGGGAACTCCGAAGAGGATTCACAAGAACATACCGGCAGTCAGTTGCGTATTGCGGC +GTATGGCCCGCATGCCGCCAATGAAGCGGCGCACGAAAAACGCGAAAGCGT +>gi|145232|gb|M33725.1|ECOALPHOB Escherichia coli K12 phoA pseudogene and transposon Mu dl-R, partial sequence +CTGTCATAAAGTTGTCACGGCCGAGACTTATAGTCGCTTTGTTTTTATTTTTTAATGTAT +TTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATTGCACTGGCACTCTTACCGTTAC +TGTTTACCCCTGTGACAAAAGCCCGGACACCAGTGAAGCGGCGCACGAAAAACGCGAAAG +CGT +>gi|145234|gb|M33727.1|ECOALPHOE Escherichia coli K12 upstream sequence of psiA5::Mu dI. is identical to psiA30 upstream sequence; putative (phoA) pseudogene and transposon Mu dl-R, partial sequence +TTGTTTTTATTTTTTAATGTATTTGTACATGGAGAAAATAAAGTGAAACAAAGCACTATT +GCACTGGTGAAGCGGCGCACGAAAAACGCGAAAGCGT +>gi|146195|gb|J01619.1|ECOGLTA Eschericia coli gltA gene, sdhCDAB operon and sucABCD operons, complete sequence +GAATTCGACCGCCATTGCGCAAGGCATCGCCATGACCAGGCAGGATACAAAAGAGAGTCG +ATAAATATTCACGGTGTCCATACCTGATAAATATTTTATGAAAGGCGGCGATGATGCCGC +AAAATAATACTTATTTATAATCCAGCACGTAGGTTGCGTTAGCGGTTACTTCACCTGCCG +TGACATCGACTGCATTATCAATTTGTTCCATCCAGGCGAAAAAGTTCAGCGTCTGTTCTG +ATGAGCTTGCATCCAGGTCAAGATCTGGCGCGGCTGAACCTAATACGATGTTACCGTCAT +TTTTGTCCATCAGTCGTACACCGACCCCAGTTGCTTCGCCTGCACTGGTGTTGCTCAACA +AAGGCGTAGCACCAGTTGTCTTAGCCGTGCTATCGAAGGTTACGCCAAACTTTGGATACC +GGCATTCCGCTACCGTTGTCAGAAGCAGGCAGATCACAGTTGATCAAGCGAATGTCGACG +GCCACTTTATTGCTATGATGCTCCCGGTTTATATGGGTTGTCGTGACTTGTCCAAGATCT +ATGTTTTTATCAATATCTTCTGGATGAATTTCACAAGGTGCTTCAATAACCTCCCCCTTA +AAGTGAATTTCGCCAGAACCTTCATCAGCAGCATAAACAGGTGCAGTGAACAGCAGAGAT +ACGGCCAGTGCGGCCAATGTTTTTTGTCCTTTAAACATAACAGAGTCCTTTAAGGATATA +GAATAGGGGTATAGCTACGCCAGAATATCGTATTTGATTATTGCTAGTTTTTAGTTTTGC +TTAAAAAATATTGTTAGTTTTATTAAATTGGAAAACTAAATTATTGGTATCATGAATTGT +TGTATGATGATAAATATAGGGGGGATATGATAGACGTCATTTTCATAGGGTTATAAAATG +CGACTACCATGAAGTTTTTAATTCAAAGTATTGGGTTGCTGATAATTTGAGCTGTTCTAT +TCTTTTTAAATATCTATATAGGTCTGTTAATGGATTTTATTTTTACAAGTTTTTTGTGTT +TAGGCATATAAAAATCAAGCCCGCCATATGAACGGCGGGTTAAAATATTTACAACTTAGC +AATCGAACCATTAACGCTTGATATCGCTTTTAAAGTCGCGTTTTTCATATCCTGTATACA +GCTGACGCGGACGGGCAATCTTCATACCGTCACTGTGCATTTCGCTCCAGTGGGCGATCC +AGCCAACGGTACGTGCCATTGCGAAAATGACGGTGAACATGGAAGACGGAATACCCATCG +CTTTCAGGATGATACCAGAGTAGAAATCGACGTTCGGGTACAGTTTCTTCTCGATAAAGT +ACGGGTCGTTCAGCGCGATGTTTTCCAGCTCCATAGCCACTTCCAGCAGGTCATCCTTCG +TGCCCAGCTCTTTCAGCACTTCATGGCAGGTTTCACGCATTACGGTGGCGCGCGGGTCGT +AATTTTTGTACACGCGGTGACCGAAGCCCATCAGGCGGAAAGAATCATTTTTGTCTTTCG +CACGACGAAAAAATTCCGGAATGTGTTTAACGGAGCTGATTTCTTCCAGCATTTTCAGCG +CCGCTTCGTTAGCACCGCCGTGCGCAGGTCCCCACAGTGAAGCAATACCTGCTGCGATAC +AGGCAAACGGGTTCGCACCCGAAGAGCCAGCGGTACGCACGGTGGAGGTAGAGGCGTTCT +GTTCATGGTCAGCGTGCAGGATCAGAATACGGTCCATAGCACGTTCCAGAATCGGATTAA +CTTCATACGGTTCGCACGGCGTGGAGAACATCATATTCAGGAAGTTACCGGCGTAGGAGA +GATCGTTGCGCGGGTAAACAAATGGCTGACCAATGGAATACTTGTAACACATCGCGGCCA +TGGTCGGCATTTTCGACAGCAGGCGGAACGCGGCAATTTCACGGTGACGAGGATTGTTAA +CATCCAGCGAGTCGTGATAGAACGCCGCCAGCGCGCCGGTAATACCACACATGACTGCCA +TTGGATGCGAGTCGCGACGGAAAGCATGGAACAGACGGGTAATCTGCTCGTGGATCATGG +TATGACGGGTCACCGTAGTTTTAAATTCGTCATACTGTTCCTGAGTCGGTTTTTCACCAT +TCAGCAGGATGTAACAAACTTCCAGGTAGTTAGAATCGGTCGCCAGCTGATCGATCGGGA +AACCGCGGTGCAGCAAAATACCTTCATCACCATCAATAAAAGTAATTTTAGATTCGCAGG +ATGCGGTTGAAGTGAAGCCTGGGTCAAAGGTGAACACACCTTTTGAACCGAGAGTACGGA +TATCAATAACATCTTGACCCAGCGTGCCTTTCAGCACATCCAGTTCAACAGCTGTATCCC +CGTTGAGGGTGAGTTTTGCTTTTGTATCAGCCATTTAAGGTCTCCTTAGCGCCTTATTGC +GTAAGACTGCCGGAACTTAAATTTGCCTTCGCACATCAACCTGGCTTTACCCGTTTTTTA +TTTGGCTCGCCGCTCTGTGAAAGAGGGGAAAACCTGGGTACAGAGCTCTGGGCGCTTGCA +GGTAAAGGATCCATTGATGACGAATAAATGGCGAATCAAGTACTTAGCAATCCGAATTAT +TAAACTTGTCTACCACTAATAACTGTCCCGAATGAATTGGTCAATACTCCACACTGTTAC +ATAAGTTAATCTTAGGTGAAATACCGACTTCATAACTTTTACGCATTATATGCTTTTCCT +GGTAATGTTTGTAACAACTTTGTTGAATGATTGTCAAATTAGATGATTAAAAATTAAATA +AATGTTGTTATCGTGACCTGGATCACTGTTCAGGATAAAACCCGACAAACTATATGTAGG +TTAATTGTAATGATTTTGTGAACAGCCTATACTGCCGCCAGTCTCCGGAACACCCTGCAA +TCCCGAGCCACCCAGCGTTGTAACGTGTCGTTTTCGCATCTGGAAGCAGTGTTTTGCATG +ACGCGCAGTTATAGAAAGGACGCTGTCTGACCCGCAAGCAGACCGGAGGAAGGAAATCCC +GACGTCTCCAGGTAACAGAAAGTTAACCTCTGTGCCCGTAGTCCCCAGGGAATAATAAGA +ACAGCATGTGGGCGTTATTCATGATAAGAAATGTGAAAAAACAAAGACCTGTTAATCTGG +ACCTACAGACCATCCGGTTCCCCATCACGGCGATAGCGTCCATTCTCCATCGCGTTTCCG +GTGTGATCACCTTTGTTGCAGTGGGCATCCTGCTGTGGCTTCTGGGTACCAGCCTCTCTT +CCCCTGAAGGTTTCGAGCAAGCTTCCGCGATTATGGGCAGCTTCTTCGTCAAATTTATCA +TGTGGGGCATCCTTACCGCTCTGGCGTATCACGTCGTCGTAGGTATTCGCCACATGATGA +TGGATTTTGGCTATCTGGAAGAAACATTCGAAGCGGGTAAACGCTCCGCCAAAATCTCCT +TTGTTATTACTGTCGTGCTTTCACTTCTCGCAGGAGTCCTCGTATGGTAAGCAACGCCTC +CGCATTAGGACGCAATGGCGTACATGATTTCATCCTCGTTCGCGCTACCGCTATCGTCCT +GACGCTCTACATCATTTATATGGTCGGTTTTTTCGCTACCAGTGGCGAGCTGACATATGA +AGTCTGGATCGGTTTCTTCGCCTCTGCGTTCACCAAAGTGTTCACCCTGCTGGCGCTGTT +TTCTATCTTGATCCATGCCTGGATCGGCATGTGGCAGGTGTTGACCGACTACGTTAAACC +GCTGGCTTTGCGCCTGATGCTGCAACTGGTGATTGTCGTTGCACTGGTGGTTTACGTGAT +TTATGGATTCGTTGTGGTGTGGGGTGTGTGATGAAATTGCCAGTCAGAGAATTTGATGCA +GTTGTGATTG