Mercurial > repos > althonos > gecco
changeset 14:56b924f62165 draft
"Update tests files for Galaxy tool wrapper"
author | althonos |
---|---|
date | Tue, 05 Apr 2022 23:18:49 +0000 |
parents | fde43648cba0 |
children | 64528877558f |
files | CHANGELOG.md gecco.xml test-data/BGC0001866.1_cluster_1.gbk test-data/clusters.tsv test-data/features.tsv test-data/genes.tsv test-data/sideload.json |
diffstat | 7 files changed, 147 insertions(+), 106 deletions(-) [+] |
line wrap: on
line diff
--- a/CHANGELOG.md Thu Mar 31 18:00:15 2022 +0000 +++ b/CHANGELOG.md Tue Apr 05 23:18:49 2022 +0000 @@ -5,7 +5,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] -[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1-alpha4...master +[Unreleased]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1...master + +## [v0.9.1] - 2022-04-05 +[v0.9.1]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1-alpha4...v0.9.1 + +### Changed +- Make the `genes.tsv` and `features.tsv` table contain all genes even when they come from a contig too short to be processed by the CRF sliding window. +- Replaced the `--force-clusters-tsv` flag with a `--force-tsv` flag to force writing TSV tables even when no genes or clusters were found in `gecco run` or `gecco annotate`. ## [v0.9.1-alpha4] - 2022-03-31 [v0.9.1-alpha4]: https://git.embl.de/grp-zeller/GECCO/compare/v0.9.1-alpha3...v0.9.1-alpha4 @@ -15,7 +22,7 @@ $ python -m gecco -vv train --c1 0.4 --c2 0 --select 0.25 --window-size 20 \ -f mibig-2.0.proG2.Pfam-v35.0.features.tsv \ -c mibig-2.0.proG2.clusters.tsv \ - -g GECCO-data/data/embeddings/mibig-2.0.proG2.genes.gff \ + -g GECCO-data/data/embeddings/mibig-2.0.proG2.genes.tsv \ -o models/v0.9.1-alpha4 ```
--- a/gecco.xml Thu Mar 31 18:00:15 2022 +0000 +++ b/gecco.xml Tue Apr 05 23:18:49 2022 +0000 @@ -1,8 +1,8 @@ <?xml version='1.0' encoding='utf-8'?> -<tool id="gecco" name="GECCO" version="0.8.10" python_template_version="3.5"> +<tool id="gecco" name="GECCO" version="0.9.1" python_template_version="3.5"> <description>is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description> <requirements> - <requirement type="package" version="0.8.10">gecco</requirement> + <requirement type="package" version="0.9.1">gecco</requirement> </requirements> <version_command>gecco --version</version_command> <command detect_errors="aggressive"><![CDATA[ @@ -18,8 +18,10 @@ --format $input.ext --genome input_tempfile.$file_extension --postproc $postproc - --edge-distance $edge_distance - --force-clusters-tsv + --force-tsv + #if $edge_distance + --edge-distance $edge_distance + #end if #if $mask --mask #end if @@ -33,6 +35,7 @@ --antismash-sideload #end if + && mv input_tempfile.genes.tsv '$genes' && mv input_tempfile.features.tsv '$features' && mv input_tempfile.clusters.tsv '$clusters' #if $antismash_sideload @@ -49,13 +52,14 @@ <option value="antismash">antiSMASH</option> <option value="gecco" selected="true">GECCO</option> </param> - <param argument="--edge-distance" type="integer" min="0" value="10" label="Number of genes from the contig edges to filter out"/> + <param argument="--edge-distance" type="integer" min="0" optional="true" value="" label="Number of genes from the contig edges to filter out"/> <param argument="--antismash-sideload" type="boolean" checked="false" label="Generate an antiSMASH v6 sideload JSON file"/> </inputs> <outputs> <collection name="records" type="list" label="${tool.name} detected Biosynthetic Gene Clusters on ${on_string} (GenBank)"> <discover_datasets pattern="(?P<designation>.*)\.gbk" ext="genbank" visible="false" /> </collection> + 
<data name="genes" format="tabular" label="${tool.name} summary of detected genes on ${on_string} (TSV)"/> <data name="features" format="tabular" label="${tool.name} summary of detected features on ${on_string} (TSV)"/> <data name="clusters" format="tabular" label="${tool.name} summary of detected BGCs on ${on_string} (TSV)"/> <data name="sideload" format="json" label="antiSMASH v6 sideload file with ${tool.name} detected BGCs on ${on_string} (JSON)"> @@ -66,12 +70,14 @@ <test> <param name="input" value="BGC0001866.fna"/> <output name="features" file="features.tsv"/> + <output name="genes" file="genes.tsv"/> <output name="clusters" file="clusters.tsv"/> </test> <test> <param name="input" value="BGC0001866.fna"/> <param name="edge_distance" value="0"/> <output name="features" file="features.tsv"/> + <output name="genes" file="genes.tsv"/> <output name="clusters" file="clusters.tsv"/> <output_collection name="records" type="list"> <element name="BGC0001866.1_cluster_1" file="BGC0001866.1_cluster_1.gbk" ftype="genbank" compare="diff" lines_diff="4"/> @@ -82,6 +88,7 @@ <param name="antismash_sideload" value="True"/> <param name="edge_distance" value="0"/> <output name="features" file="features.tsv"/> + <output name="genes" file="genes.tsv"/> <output name="clusters" file="clusters.tsv"/> <output name="sideload" file="sideload.json"/> <output_collection name="records" type="list"> @@ -107,8 +114,9 @@ GECCO will create the following files once done (using the same prefix as the input file): -- ``features.tsv``: The features file, containing the identified proteins and domains in the input sequences. -- ``clusters.tsv``: If any were found, a clusters file, containing the coordinates of the predicted clusters, along their putative biosynthetic type. +- ``genes.tsv``: The genes file, containing the genes identified in the input sequences. +- ``features.tsv``: The features file, containing the protein domains identified in the input sequences. 
+- ``clusters.tsv``: A clusters file, containing the coordinates of the predicted clusters, along with their putative biosynthetic type. - ``{sequence}_cluster_{N}.gbk``: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains. Contact
--- a/test-data/BGC0001866.1_cluster_1.gbk Thu Mar 31 18:00:15 2022 +0000 +++ b/test-data/BGC0001866.1_cluster_1.gbk Tue Apr 05 23:18:49 2022 +0000 @@ -1,4 +1,4 @@ -LOCUS BGC0001866.1_cluster_1 32633 bp DNA linear UNK 21-NOV-2021 +LOCUS BGC0001866.1_cluster_1 32633 bp DNA linear UNK 06-APR-2022 DEFINITION BGC0001866.1 Byssochlamys spectabilis strain CBS 101075 chromosome Unknown C8Q69scaffold_14, whole genome shotgun sequence. ACCESSION BGC0001866.1_cluster_1 @@ -15,15 +15,15 @@ JOURNAL bioRxiv (2021.05.03.442509) REMARK doi:10.1101/2021.05.03.442509 COMMENT ##GECCO-Data-START## - version :: GECCO v0.8.10 - creation_date :: 2021-11-21T16:33:58.470847 + version :: GECCO v0.9.1 + creation_date :: 2022-04-06T01:08:36.965708 biosyn_class :: Polyketide - alkaloid_probability :: 0.0 - polyketide_probability :: 0.98 + alkaloid_probability :: 0.010000000000000009 + polyketide_probability :: 0.96 ripp_probability :: 0.0 saccharide_probability :: 0.0 - terpene_probability :: 0.0 - nrp_probability :: 0.09999999999999998 + terpene_probability :: 0.010000000000000009 + nrp_probability :: 0.14 ##GECCO-Data-END## FEATURES Location/Qualifiers CDS complement(1..1143) @@ -41,7 +41,7 @@ /inference="protein motif" /db_xref="PFAM:PF00394" /db_xref="InterPro:IPR001117" - /note="e-value: 2.1941888078432915e-08" + /note="e-value: 2.262067179461254e-08" /note="p-value: 8.178117062405111e-12" /function="Multicopper oxidase" /standard_name="PF00394" @@ -49,7 +49,7 @@ /inference="protein motif" /db_xref="PFAM:PF07731" /db_xref="InterPro:IPR011706" - /note="e-value: 3.9374169295176556e-23" + /note="e-value: 4.059222969454281e-23" /note="p-value: 1.467542649838858e-26" /function="Multicopper oxidase" /standard_name="PF07731" @@ -93,7 +93,7 @@ /inference="protein motif" /db_xref="PFAM:PF00891" /db_xref="InterPro:IPR001077" - /note="e-value: 4.743887678074703e-16" + /note="e-value: 4.890642309934635e-16" /note="p-value: 1.7681280946979883e-19" /function="O-methyltransferase domain" 
/standard_name="PF00891" @@ -108,7 +108,7 @@ /inference="protein motif" /db_xref="PFAM:PF00135" /db_xref="InterPro:IPR002018" - /note="e-value: 4.674605664377319e-21" + /note="e-value: 4.819217021121008e-21" /note="p-value: 1.7423055029360116e-24" /function="Carboxylesterase family" /standard_name="PF00135" @@ -123,7 +123,7 @@ /inference="protein motif" /db_xref="PFAM:PF00135" /db_xref="InterPro:IPR002018" - /note="e-value: 3.9706994470948554e-30" + /note="e-value: 4.0935350990176556e-30" /note="p-value: 1.4799476135277136e-33" /function="Carboxylesterase family" /standard_name="PF00135" @@ -140,7 +140,7 @@ /inference="protein motif" /db_xref="PFAM:PF00135" /db_xref="InterPro:IPR002018" - /note="e-value: 1.4185801852307574e-15" + /note="e-value: 1.4624647008379705e-15" /note="p-value: 5.287291037013632e-19" /function="Carboxylesterase family" /standard_name="PF00135" @@ -160,7 +160,7 @@ /inference="protein motif" /db_xref="PFAM:PF13434" /db_xref="InterPro:IPR025700" - /note="e-value: 5.777178703900199e-08" + /note="e-value: 5.955898730893757e-08" /note="p-value: 2.153253337271785e-11" /function="L-lysine 6-monooxygenase (NADPH-requiring)" /standard_name="PF13434" @@ -168,7 +168,7 @@ /inference="protein motif" /db_xref="PFAM:PF00743" /db_xref="InterPro:IPR020946" - /note="e-value: 5.089108077410868e-07" + /note="e-value: 5.246542281818287e-07" /note="p-value: 1.8967976434628658e-10" /function="Flavin-binding monooxygenase-like" /standard_name="PF00743" @@ -202,7 +202,7 @@ /inference="protein motif" /db_xref="PFAM:PF07690" /db_xref="InterPro:IPR011701" - /note="e-value: 5.839871260376694e-37" + /note="e-value: 6.020530714201243e-37" /note="p-value: 2.1766199255969786e-40" /function="Major Facilitator Superfamily" /standard_name="PF07690" @@ -210,7 +210,7 @@ /inference="protein motif" /db_xref="PFAM:PF06609" /db_xref="InterPro:IPR010573" - /note="e-value: 9.543170598318239e-09" + /note="e-value: 9.83839354265682e-09" /note="p-value: 3.55690294383833e-12" 
/function="Fungal trichothecene efflux pump (TRI12)" /standard_name="PF06609" @@ -235,8 +235,8 @@ /inference="protein motif" /db_xref="PFAM:PF08493" /db_xref="InterPro:IPR013700" - /note="e-value: 2.6165794251055913e-17" - /note="p-value: 9.752439154325723e-21" + /note="e-value: 2.686865976406516e-17" + /note="p-value: 9.713904470016327e-21" /function="Aflatoxin regulatory protein" /standard_name="PF08493" CDS 16827..18797 @@ -259,7 +259,7 @@ /inference="protein motif" /db_xref="PFAM:PF00109" /db_xref="InterPro:IPR014030" - /note="e-value: 9.025888536170949e-60" + /note="e-value: 9.30510909096118e-60" /note="p-value: 3.364103069761815e-63" /function="Beta-ketoacyl synthase, N-terminal domain" /standard_name="PF00109" @@ -267,23 +267,23 @@ /inference="protein motif" /db_xref="PFAM:PF02801" /db_xref="InterPro:IPR014031" - /note="e-value: 2.2171445990751238e-35" + /note="e-value: 2.2857331200304854e-35" /note="p-value: 8.263677223537547e-39" /function="Beta-ketoacyl synthase, C-terminal domain" /standard_name="PF02801" - misc_feature 17937..18287 + misc_feature 17937..18290 /inference="protein motif" /db_xref="PFAM:PF16197" /db_xref="InterPro:IPR032821" - /note="e-value: 3.8698172759236842e-25" - /note="p-value: 1.4423471024687604e-28" + /note="e-value: 4.800730099641783e-25" + /note="p-value: 1.7356218726109122e-28" /function="Ketoacyl-synthetase C-terminal extension" /standard_name="PF16197" misc_feature 18360..18770 /inference="protein motif" /db_xref="PFAM:PF00698" /db_xref="InterPro:IPR014043" - /note="e-value: 1.0799913424517567e-26" + /note="e-value: 1.113401436161595e-26" /note="p-value: 4.025312495161225e-30" /function="Acyl transferase domain" /standard_name="PF00698" @@ -314,7 +314,7 @@ /inference="protein motif" /db_xref="PFAM:PF00698" /db_xref="InterPro:IPR014043" - /note="e-value: 2.639223271303753e-16" + /note="e-value: 2.7208690154402465e-16" /note="p-value: 9.836836642950999e-20" /function="Acyl transferase domain" /standard_name="PF00698" @@ -322,14 
+322,14 @@ /inference="protein motif" /db_xref="PFAM:PF14765" /db_xref="InterPro:IPR020807" - /note="e-value: 2.520598829779557e-60" + /note="e-value: 2.598574865139864e-60" /note="p-value: 9.394703055458656e-64" /function="Polyketide synthase dehydratase" /standard_name="PF14765" misc_feature 20786..21256 /inference="protein motif" /db_xref="PFAM:PF13489" - /note="e-value: 1.0131254482174088e-12" + /note="e-value: 1.04446701072283e-12" /note="p-value: 3.776091868123029e-16" /function="Methyltransferase domain" /standard_name="PF13489" @@ -337,23 +337,23 @@ /inference="protein motif" /db_xref="PFAM:PF13847" /db_xref="InterPro:IPR025714" - /note="e-value: 8.939870258494623e-11" - /note="p-value: 3.332042586095648e-14" + /note="e-value: 8.752004453621267e-11" + /note="p-value: 3.1641375465008194e-14" /function="Methyltransferase domain" /standard_name="PF13847" misc_feature 20804..21097 /inference="protein motif" /db_xref="PFAM:PF13649" /db_xref="InterPro:IPR041698" - /note="e-value: 2.319131521369124e-13" - /note="p-value: 8.643799930559537e-17" + /note="e-value: 2.4253465299984994e-13" + /note="p-value: 8.76842563267715e-17" /function="Methyltransferase domain" /standard_name="PF13649" misc_feature 20807..21103 /inference="protein motif" /db_xref="PFAM:PF08242" /db_xref="InterPro:IPR013217" - /note="e-value: 3.6288099491186147e-22" + /note="e-value: 3.7410690716593694e-22" /note="p-value: 1.3525195486837923e-25" /function="Methyltransferase domain" /standard_name="PF08242" @@ -361,7 +361,7 @@ /inference="protein motif" /db_xref="PFAM:PF08241" /db_xref="InterPro:IPR013216" - /note="e-value: 5.245291385894328e-12" + /note="e-value: 5.4075572021556884e-12" /note="p-value: 1.9550098344742185e-15" /function="Methyltransferase domain" /standard_name="PF08241" @@ -376,7 +376,7 @@ /inference="protein motif" /db_xref="PFAM:PF00107" /db_xref="InterPro:IPR013149" - /note="e-value: 1.0960342036668699e-15" + /note="e-value: 1.1299405916297285e-15" /note="p-value: 
4.085106983476965e-19" /function="Zinc-binding dehydrogenase" /standard_name="PF00107" @@ -396,7 +396,7 @@ /inference="protein motif" /db_xref="PFAM:PF08659" /db_xref="InterPro:IPR013968" - /note="e-value: 1.5141662612831146e-61" + /note="e-value: 1.5610077818520667e-61" /note="p-value: 5.643556695054471e-65" /function="KR domain" /standard_name="PF08659" @@ -404,7 +404,7 @@ /inference="protein motif" /db_xref="PFAM:PF00106" /db_xref="InterPro:IPR002347" - /note="e-value: 1.1379002942545491e-07" + /note="e-value: 1.1731018314976082e-07" /note="p-value: 4.2411490654288077e-11" /function="short chain dehydrogenase" /standard_name="PF00106" @@ -412,7 +412,7 @@ /inference="protein motif" /db_xref="PFAM:PF00550" /db_xref="InterPro:IPR009081" - /note="e-value: 3.359618716013185e-10" + /note="e-value: 3.463550267794435e-10" /note="p-value: 1.2521873708584363e-13" /function="Phosphopantetheine attachment site" /standard_name="PF00550" @@ -426,8 +426,8 @@ /inference="protein motif" /db_xref="PFAM:PF16073" /db_xref="InterPro:IPR032088" - /note="e-value: 1.3071857188363548e-23" - /note="p-value: 4.872104803713585e-27" + /note="e-value: 9.422238725791962e-24" + /note="p-value: 3.406449286258844e-27" /function="Starter unit:ACP transacylase in aflatoxin biosynthesis" /standard_name="PF16073" @@ -459,8 +459,8 @@ /inference="protein motif" /db_xref="PFAM:PF16073" /db_xref="InterPro:IPR032088" - /note="e-value: 8.208876065249628e-11" - /note="p-value: 3.059588544632735e-14" + /note="e-value: 4.380197593141013e-11" + /note="p-value: 1.5835855362042708e-14" /function="Starter unit:ACP transacylase in aflatoxin biosynthesis" /standard_name="PF16073" @@ -468,7 +468,7 @@ /inference="protein motif" /db_xref="PFAM:PF00109" /db_xref="InterPro:IPR014030" - /note="e-value: 2.667462237983852e-82" + /note="e-value: 2.7499815692371726e-82" /note="p-value: 9.942088102809735e-86" /function="Beta-ketoacyl synthase, N-terminal domain" /standard_name="PF00109" @@ -476,7 +476,7 @@ 
/inference="protein motif" /db_xref="PFAM:PF02801" /db_xref="InterPro:IPR014031" - /note="e-value: 2.4031043351141288e-34" + /note="e-value: 2.4774456171918303e-34" /note="p-value: 8.956780973217029e-38" /function="Beta-ketoacyl synthase, C-terminal domain" /standard_name="PF02801" @@ -484,15 +484,15 @@ /inference="protein motif" /db_xref="PFAM:PF16197" /db_xref="InterPro:IPR032821" - /note="e-value: 2.535893425129411e-07" - /note="p-value: 9.451708628883381e-11" + /note="e-value: 8.475099126640419e-07" + /note="p-value: 3.0640271607521397e-10" /function="Ketoacyl-synthetase C-terminal extension" /standard_name="PF16197" misc_feature 28322..29233 /inference="protein motif" /db_xref="PFAM:PF00698" /db_xref="InterPro:IPR014043" - /note="e-value: 4.597134671955754e-38" + /note="e-value: 4.739349423268586e-38" /note="p-value: 1.7134307387088164e-41" /function="Acyl transferase domain" /standard_name="PF00698" @@ -509,7 +509,7 @@ /inference="protein motif" /db_xref="PFAM:PF14765" /db_xref="InterPro:IPR020807" - /note="e-value: 7.778696660229127e-11" + /note="e-value: 8.019334685871699e-11" /note="p-value: 2.8992533209948296e-14" /function="Polyketide synthase dehydratase" /standard_name="PF14765" @@ -533,7 +533,7 @@ /inference="protein motif" /db_xref="PFAM:PF00550" /db_xref="InterPro:IPR009081" - /note="e-value: 5.884377030377924e-14" + /note="e-value: 6.066413293337807e-14" /note="p-value: 2.193207987468477e-17" /function="Phosphopantetheine attachment site" /standard_name="PF00550" @@ -541,7 +541,7 @@ /inference="protein motif" /db_xref="PFAM:PF00550" /db_xref="InterPro:IPR009081" - /note="e-value: 3.9212317886052276e-10" + /note="e-value: 4.042537132792419e-10" /note="p-value: 1.461510170930014e-13" /function="Phosphopantetheine attachment site" /standard_name="PF00550" @@ -549,7 +549,7 @@ /inference="protein motif" /db_xref="PFAM:PF00550" /db_xref="InterPro:IPR009081" - /note="e-value: 1.367829688372301e-08" + /note="e-value: 1.4101442109719659e-08" /note="p-value: 
5.098135252971677e-12" /function="Phosphopantetheine attachment site" /standard_name="PF00550" @@ -557,7 +557,7 @@ /inference="protein motif" /db_xref="PFAM:PF00975" /db_xref="InterPro:IPR001031" - /note="e-value: 6.711355516947163e-24" + /note="e-value: 6.91897478936856e-24" /note="p-value: 2.5014370171252933e-27" /function="Thioesterase domain" /standard_name="PF00975"
--- a/test-data/clusters.tsv Thu Mar 31 18:00:15 2022 +0000 +++ b/test-data/clusters.tsv Tue Apr 05 23:18:49 2022 +0000 @@ -1,2 +1,2 @@ sequence_id bgc_id start end average_p max_p type alkaloid_probability polyketide_probability ripp_probability saccharide_probability terpene_probability nrp_probability proteins domains -BGC0001866.1 BGC0001866.1_cluster_1 347 32979 0.9969495815733557 0.9999999447224028 Polyketide 0.0 0.98 0.0 0.0 0.0 0.09999999999999998 BGC0001866.1_1;BGC0001866.1_2;BGC0001866.1_3;BGC0001866.1_4;BGC0001866.1_5;BGC0001866.1_6;BGC0001866.1_7;BGC0001866.1_8;BGC0001866.1_9;BGC0001866.1_10;BGC0001866.1_11;BGC0001866.1_12;BGC0001866.1_13;BGC0001866.1_14;BGC0001866.1_15;BGC0001866.1_16;BGC0001866.1_17;BGC0001866.1_18;BGC0001866.1_19;BGC0001866.1_20;BGC0001866.1_21;BGC0001866.1_22;BGC0001866.1_23 PF00106;PF00107;PF00109;PF00135;PF00394;PF00550;PF00698;PF00743;PF00891;PF00975;PF02801;PF06609;PF07690;PF07731;PF08241;PF08242;PF08493;PF08659;PF13434;PF13489;PF13649;PF13847;PF14765;PF16073;PF16197 +BGC0001866.1 BGC0001866.1_cluster_1 347 32979 0.9958958770931704 0.9999999976946022 Polyketide 0.010000000000000009 0.96 0.0 0.0 0.010000000000000009 0.14 BGC0001866.1_1;BGC0001866.1_2;BGC0001866.1_3;BGC0001866.1_4;BGC0001866.1_5;BGC0001866.1_6;BGC0001866.1_7;BGC0001866.1_8;BGC0001866.1_9;BGC0001866.1_10;BGC0001866.1_11;BGC0001866.1_12;BGC0001866.1_13;BGC0001866.1_14;BGC0001866.1_15;BGC0001866.1_16;BGC0001866.1_17;BGC0001866.1_18;BGC0001866.1_19;BGC0001866.1_20;BGC0001866.1_21;BGC0001866.1_22;BGC0001866.1_23 PF00106;PF00107;PF00109;PF00135;PF00394;PF00550;PF00698;PF00743;PF00891;PF00975;PF02801;PF06609;PF07690;PF07731;PF08241;PF08242;PF08493;PF08659;PF13434;PF13489;PF13649;PF13847;PF14765;PF16073;PF16197
--- a/test-data/features.tsv Thu Mar 31 18:00:15 2022 +0000 +++ b/test-data/features.tsv Tue Apr 05 23:18:49 2022 +0000 @@ -1,38 +1,38 @@ sequence_id protein_id start end strand domain hmm i_evalue pvalue domain_start domain_end bgc_probability -BGC0001866.1 BGC0001866.1_1 347 1489 - PF00394 Pfam 2.1941888078432915e-08 8.178117062405111e-12 1 63 0.9852038761627908 -BGC0001866.1 BGC0001866.1_1 347 1489 - PF07731 Pfam 3.9374169295176556e-23 1.467542649838858e-26 150 281 0.9852038761627908 -BGC0001866.1 BGC0001866.1_6 3946 4389 + PF00891 Pfam 4.743887678074703e-16 1.7681280946979883e-19 17 121 0.9910535094227727 -BGC0001866.1 BGC0001866.1_7 4683 5138 + PF00135 Pfam 4.674605664377319e-21 1.7423055029360116e-24 48 140 0.9913598896683397 -BGC0001866.1 BGC0001866.1_8 5384 5812 + PF00135 Pfam 3.9706994470948554e-30 1.4799476135277136e-33 2 114 0.9925093258822111 -BGC0001866.1 BGC0001866.1_9 5823 6599 + PF00135 Pfam 1.4185801852307574e-15 5.287291037013632e-19 2 209 0.9946019708257335 -BGC0001866.1 BGC0001866.1_10 7758 9029 + PF13434 Pfam 5.777178703900199e-08 2.153253337271785e-11 13 124 0.9978201609931655 -BGC0001866.1 BGC0001866.1_10 7758 9029 + PF00743 Pfam 5.089108077410868e-07 1.8967976434628658e-10 36 102 0.9978201609931655 -BGC0001866.1 BGC0001866.1_13 11550 12662 + PF07690 Pfam 5.839871260376694e-37 2.1766199255969786e-40 1 362 0.9990971143689635 -BGC0001866.1 BGC0001866.1_13 11550 12662 + PF06609 Pfam 9.543170598318239e-09 3.55690294383833e-12 17 244 0.9990971143689635 -BGC0001866.1 BGC0001866.1_15 14920 15912 + PF08493 Pfam 2.6165794251055913e-17 9.752439154325723e-21 139 224 0.9999977987864139 -BGC0001866.1 BGC0001866.1_16 17173 19143 + PF00109 Pfam 9.025888536170949e-60 3.364103069761815e-63 2 248 0.9999994272691842 -BGC0001866.1 BGC0001866.1_16 17173 19143 + PF02801 Pfam 2.2171445990751238e-35 8.263677223537547e-39 257 368 0.9999994272691842 -BGC0001866.1 BGC0001866.1_16 17173 19143 + PF16197 Pfam 3.8698172759236842e-25 1.4423471024687604e-28 371 487 
0.9999994272691842 -BGC0001866.1 BGC0001866.1_16 17173 19143 + PF00698 Pfam 1.0799913424517567e-26 4.025312495161225e-30 512 648 0.9999994272691842 -BGC0001866.1 BGC0001866.1_17 19152 22424 + PF00698 Pfam 2.639223271303753e-16 9.836836642950999e-20 2 151 0.9999940983719267 -BGC0001866.1 BGC0001866.1_17 19152 22424 + PF14765 Pfam 2.520598829779557e-60 9.394703055458656e-64 228 504 0.9999940983719267 -BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13489 Pfam 1.0131254482174088e-12 3.776091868123029e-16 661 817 0.9999940983719267 -BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13847 Pfam 8.939870258494623e-11 3.332042586095648e-14 666 776 0.9999940983719267 -BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13649 Pfam 2.319131521369124e-13 8.643799930559537e-17 667 764 0.9999940983719267 -BGC0001866.1 BGC0001866.1_17 19152 22424 + PF08242 Pfam 3.6288099491186147e-22 1.3525195486837923e-25 668 766 0.9999940983719267 -BGC0001866.1 BGC0001866.1_17 19152 22424 + PF08241 Pfam 5.245291385894328e-12 1.9550098344742185e-15 668 767 0.9999940983719267 -BGC0001866.1 BGC0001866.1_18 22762 23235 + PF00107 Pfam 1.0960342036668699e-15 4.085106983476965e-19 12 117 0.9999176675645223 -BGC0001866.1 BGC0001866.1_19 23268 24623 + PF08659 Pfam 1.5141662612831146e-61 5.643556695054471e-65 65 239 0.9999724741067139 -BGC0001866.1 BGC0001866.1_19 23268 24623 + PF00106 Pfam 1.1379002942545491e-07 4.2411490654288077e-11 68 221 0.9999724741067139 -BGC0001866.1 BGC0001866.1_19 23268 24623 + PF00550 Pfam 3.359618716013185e-10 1.2521873708584363e-13 384 437 0.9999724741067139 -BGC0001866.1 BGC0001866.1_20 25769 26056 + PF16073 Pfam 1.3071857188363548e-23 4.872104803713585e-27 8 94 0.999988513111687 -BGC0001866.1 BGC0001866.1_21 26544 29999 + PF16073 Pfam 8.208876065249628e-11 3.059588544632735e-14 2 47 0.9999999447224028 -BGC0001866.1 BGC0001866.1_21 26544 29999 + PF00109 Pfam 2.667462237983852e-82 9.942088102809735e-86 178 426 0.9999999447224028 -BGC0001866.1 BGC0001866.1_21 26544 29999 + PF02801 Pfam 
2.4031043351141288e-34 8.956780973217029e-38 434 555 0.9999999447224028 -BGC0001866.1 BGC0001866.1_21 26544 29999 + PF16197 Pfam 2.535893425129411e-07 9.451708628883381e-11 567 673 0.9999999447224028 -BGC0001866.1 BGC0001866.1_21 26544 29999 + PF00698 Pfam 4.597134671955754e-38 1.7134307387088164e-41 709 1012 0.9999999447224028 -BGC0001866.1 BGC0001866.1_22 30150 30890 + PF14765 Pfam 7.778696660229127e-11 2.8992533209948296e-14 39 244 0.9999460955852995 -BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 5.884377030377924e-14 2.193207987468477e-17 67 128 0.9997314383315643 -BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 3.9212317886052276e-10 1.461510170930014e-13 174 238 0.9997314383315643 -BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 1.367829688372301e-08 5.098135252971677e-12 299 360 0.9997314383315643 -BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00975 Pfam 6.711355516947163e-24 2.5014370171252933e-27 443 550 0.9997314383315643 +BGC0001866.1 BGC0001866.1_1 347 1489 - PF00394 Pfam 2.262067179461254e-08 8.178117062405111e-12 1 63 0.9791890143072265 +BGC0001866.1 BGC0001866.1_1 347 1489 - PF07731 Pfam 4.059222969454281e-23 1.467542649838858e-26 150 281 0.9791890143072265 +BGC0001866.1 BGC0001866.1_6 3946 4389 + PF00891 Pfam 4.890642309934635e-16 1.7681280946979883e-19 17 121 0.9955095513800687 +BGC0001866.1 BGC0001866.1_7 4683 5138 + PF00135 Pfam 4.819217021121008e-21 1.7423055029360116e-24 48 140 0.995982045872177 +BGC0001866.1 BGC0001866.1_8 5384 5812 + PF00135 Pfam 4.0935350990176556e-30 1.4799476135277136e-33 2 114 0.9966491071789748 +BGC0001866.1 BGC0001866.1_9 5823 6599 + PF00135 Pfam 1.4624647008379705e-15 5.287291037013632e-19 2 209 0.9975265367646511 +BGC0001866.1 BGC0001866.1_10 7758 9029 + PF13434 Pfam 5.955898730893757e-08 2.153253337271785e-11 13 124 0.9986351193337516 +BGC0001866.1 BGC0001866.1_10 7758 9029 + PF00743 Pfam 5.246542281818287e-07 1.8967976434628658e-10 36 102 0.9986351193337516 +BGC0001866.1 BGC0001866.1_13 
11550 12662 + PF07690 Pfam 6.020530714201243e-37 2.1766199255969786e-40 1 362 0.9994485509803548 +BGC0001866.1 BGC0001866.1_13 11550 12662 + PF06609 Pfam 9.83839354265682e-09 3.55690294383833e-12 17 244 0.9994485509803548 +BGC0001866.1 BGC0001866.1_15 14920 15912 + PF08493 Pfam 2.686865976406516e-17 9.713904470016327e-21 139 224 0.9999999296901834 +BGC0001866.1 BGC0001866.1_16 17173 19143 + PF00109 Pfam 9.30510909096118e-60 3.364103069761815e-63 2 248 0.9999998571963613 +BGC0001866.1 BGC0001866.1_16 17173 19143 + PF02801 Pfam 2.2857331200304854e-35 8.263677223537547e-39 257 368 0.9999998571963613 +BGC0001866.1 BGC0001866.1_16 17173 19143 + PF16197 Pfam 4.800730099641783e-25 1.7356218726109122e-28 371 488 0.9999998571963613 +BGC0001866.1 BGC0001866.1_16 17173 19143 + PF00698 Pfam 1.113401436161595e-26 4.025312495161225e-30 512 648 0.9999998571963613 +BGC0001866.1 BGC0001866.1_17 19152 22424 + PF00698 Pfam 2.7208690154402465e-16 9.836836642950999e-20 2 151 0.9999990994944158 +BGC0001866.1 BGC0001866.1_17 19152 22424 + PF14765 Pfam 2.598574865139864e-60 9.394703055458656e-64 228 504 0.9999990994944158 +BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13489 Pfam 1.04446701072283e-12 3.776091868123029e-16 661 817 0.9999990994944158 +BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13847 Pfam 8.752004453621267e-11 3.1641375465008194e-14 666 776 0.9999990994944158 +BGC0001866.1 BGC0001866.1_17 19152 22424 + PF13649 Pfam 2.4253465299984994e-13 8.76842563267715e-17 667 764 0.9999990994944158 +BGC0001866.1 BGC0001866.1_17 19152 22424 + PF08242 Pfam 3.7410690716593694e-22 1.3525195486837923e-25 668 766 0.9999990994944158 +BGC0001866.1 BGC0001866.1_17 19152 22424 + PF08241 Pfam 5.4075572021556884e-12 1.9550098344742185e-15 668 767 0.9999990994944158 +BGC0001866.1 BGC0001866.1_18 22762 23235 + PF00107 Pfam 1.1299405916297285e-15 4.085106983476965e-19 12 117 0.9999802025553775 +BGC0001866.1 BGC0001866.1_19 23268 24623 + PF08659 Pfam 1.5610077818520667e-61 5.643556695054471e-65 65 239 
0.9999913868972266 +BGC0001866.1 BGC0001866.1_19 23268 24623 + PF00106 Pfam 1.1731018314976082e-07 4.2411490654288077e-11 68 221 0.9999913868972266 +BGC0001866.1 BGC0001866.1_19 23268 24623 + PF00550 Pfam 3.463550267794435e-10 1.2521873708584363e-13 384 437 0.9999913868972266 +BGC0001866.1 BGC0001866.1_20 25769 26056 + PF16073 Pfam 9.422238725791962e-24 3.406449286258844e-27 8 94 0.9999994733759681 +BGC0001866.1 BGC0001866.1_21 26544 29999 + PF16073 Pfam 4.380197593141013e-11 1.5835855362042708e-14 2 47 0.9999999976946022 +BGC0001866.1 BGC0001866.1_21 26544 29999 + PF00109 Pfam 2.7499815692371726e-82 9.942088102809735e-86 178 426 0.9999999976946022 +BGC0001866.1 BGC0001866.1_21 26544 29999 + PF02801 Pfam 2.4774456171918303e-34 8.956780973217029e-38 434 555 0.9999999976946022 +BGC0001866.1 BGC0001866.1_21 26544 29999 + PF16197 Pfam 8.475099126640419e-07 3.0640271607521397e-10 567 673 0.9999999976946022 +BGC0001866.1 BGC0001866.1_21 26544 29999 + PF00698 Pfam 4.739349423268586e-38 1.7134307387088164e-41 709 1012 0.9999999976946022 +BGC0001866.1 BGC0001866.1_22 30150 30890 + PF14765 Pfam 8.019334685871699e-11 2.8992533209948296e-14 39 244 0.9999912059124727 +BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 6.066413293337807e-14 2.193207987468477e-17 67 128 0.9998703656415205 +BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 4.042537132792419e-10 1.461510170930014e-13 174 238 0.9998703656415205 +BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00550 Pfam 1.4101442109719659e-08 5.098135252971677e-12 299 360 0.9998703656415205 +BGC0001866.1 BGC0001866.1_23 30937 32979 + PF00975 Pfam 6.91897478936856e-24 2.5014370171252933e-27 443 550 0.9998703656415205
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/genes.tsv Tue Apr 05 23:18:49 2022 +0000 @@ -0,0 +1,24 @@ +sequence_id protein_id start end strand average_p max_p +BGC0001866.1 BGC0001866.1_1 347 1489 - 0.9791890143072265 0.9791890143072265 +BGC0001866.1 BGC0001866.1_2 1525 2016 + 0.9816626269970528 0.9816626269970528 +BGC0001866.1 BGC0001866.1_3 2513 2722 - 0.9844997726878899 0.9844997726878899 +BGC0001866.1 BGC0001866.1_4 2905 3378 + 0.9877300777686966 0.9877300777686966 +BGC0001866.1 BGC0001866.1_5 3353 3922 + 0.9913872741253911 0.9913872741253911 +BGC0001866.1 BGC0001866.1_6 3946 4389 + 0.9955095513800687 0.9955095513800687 +BGC0001866.1 BGC0001866.1_7 4683 5138 + 0.995982045872177 0.995982045872177 +BGC0001866.1 BGC0001866.1_8 5384 5812 + 0.9966491071789748 0.9966491071789748 +BGC0001866.1 BGC0001866.1_9 5823 6599 + 0.9975265367646511 0.9975265367646511 +BGC0001866.1 BGC0001866.1_10 7758 9029 + 0.9986351193337516 0.9986351193337516 +BGC0001866.1 BGC0001866.1_11 9800 10384 + 0.9988029392597757 0.9988029392597757 +BGC0001866.1 BGC0001866.1_12 11109 11537 + 0.999073142625125 0.999073142625125 +BGC0001866.1 BGC0001866.1_13 11550 12662 + 0.9994485509803548 0.9994485509803548 +BGC0001866.1 BGC0001866.1_14 12681 13127 + 0.9996778954036583 0.9996778954036583 +BGC0001866.1 BGC0001866.1_15 14920 15912 + 0.9999999296901834 0.9999999296901834 +BGC0001866.1 BGC0001866.1_16 17173 19143 + 0.9999998571963613 0.9999998571963613 +BGC0001866.1 BGC0001866.1_17 19152 22424 + 0.9999990994944158 0.9999990994944158 +BGC0001866.1 BGC0001866.1_18 22762 23235 + 0.9999802025553775 0.9999802025553775 +BGC0001866.1 BGC0001866.1_19 23268 24623 + 0.9999913868972266 0.9999913868972266 +BGC0001866.1 BGC0001866.1_20 25769 26056 + 0.9999994733759681 0.9999994733759681 +BGC0001866.1 BGC0001866.1_21 26544 29999 + 0.9999999976946022 0.9999999976946022 +BGC0001866.1 BGC0001866.1_22 30150 30890 + 0.9999912059124727 0.9999912059124727 +BGC0001866.1 BGC0001866.1_23 30937 32979 + 
0.9998703656415205 0.9998703656415205
--- a/test-data/sideload.json Thu Mar 31 18:00:15 2022 +0000 +++ b/test-data/sideload.json Tue Apr 05 23:18:49 2022 +0000 @@ -5,14 +5,14 @@ "subregions": [ { "details": { - "alkaloid_probability": "0.000", - "average_p": "0.997", + "alkaloid_probability": "0.010", + "average_p": "0.996", "max_p": "1.000", - "nrp_probability": "0.100", - "polyketide_probability": "0.980", + "nrp_probability": "0.140", + "polyketide_probability": "0.960", "ripp_probability": "0.000", "saccharide_probability": "0.000", - "terpene_probability": "0.000" + "terpene_probability": "0.010" }, "end": 32979, "label": "Polyketide", @@ -25,11 +25,13 @@ "configuration": { "cds": "3", "e-filter": "None", + "edge-distance": "0", + "mask": "False", "postproc": "'gecco'", - "threshold": "0.3" + "threshold": "0.8" }, "description": "Biosynthetic Gene Cluster prediction with Conditional Random Fields.", "name": "GECCO", - "version": "0.8.10" + "version": "0.9.1" } } \ No newline at end of file