Mercurial > repos > onnodg > blast_annotations_processor

diff tests/test_blast_annotations_processor.py @ 3:ca2f07b71581 draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 600e5a50a13a3a16a1970d6d4d31cb4f7bd549bf-dirty
author: onnodg
date: Thu, 12 Feb 2026 13:52:07 +0000
parents: 9ca209477dfd
--- a/tests/test_blast_annotations_processor.py	Mon Dec 15 16:43:36 2025 +0000
+++ b/tests/test_blast_annotations_processor.py	Thu Feb 12 13:52:07 2026 +0000
@@ -50,13 +50,13 @@
 read4(25)	subject4	id4.2	subject4   	90.0	87	1e-50	160	database1	Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii
 """
 
-        fasta_content = """>read1(100) count=100;
+        fasta_content = """>read1(100) count=100; obiclean_count={'XXX': 100};
 ATCGATCGATCGATCG
->read2(50) count=50;
+>read2(50) count=50; obiclean_count={'XXX': 50}
 GCTAGCTAGCTAGCTA
->read3(25) count=25;
+>read3(25) count=25; obiclean_count={'XXX': 25}
 TGACTGACTGACTGAC
->read4(25) count=25;
+>read4(25) count=25; obiclean_count={'XXX': 25}
 TGAAAAAAACACCAC
 """
 
@@ -259,11 +259,11 @@
 read3(25)	source=NCBI   sequenceID=KR737595   superkingdom=Eukaryota   kingdom=Viridiplantae   phylum=Streptophyta   subphylum=Streptophytina   class=Magnoliopsida   subclass=NA   infraclass=NA   order=Malvales   suborder=NA   infraorder=NA   superfamily=NA   family=Malvaceae   genus=Hibiscus   species=Hibiscus trionum   markercode=trnL   lat=0.304   lon=36.87	source=NCBI	N/A	97.561	87	1.68e-14	71.3	Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum
 """
 
-        fasta_content = """>read1(100) count=100;
+        fasta_content = """>read1(100) count=100; obiclean_count={'XXX': 100};
 ATCG
->read2(50) merged_sample={}; count=1011; direction=right; seq_b_insertion=0; sminR=40.0; ali_length=53; seq_b_deletion=248; seq_a_deletion=248; seq_a_insertion=0; mode=alignment; sminL=40.0; seq_a_single=0; seq_b_single=0; 
+>read2(50) merged_sample={}; count=1011; obiclean_count={'XXX': 1011}; direction=right; seq_b_insertion=0; sminR=40.0; ali_length=53; seq_b_deletion=248; seq_a_deletion=248; seq_a_insertion=0; mode=alignment; sminL=40.0; seq_a_single=0; seq_b_single=0; 
 gggcaatcctgagccaagtgactggagttcagataggtgcagagactcaatgg
->read3(25) merged_sample={}; count=179; direction=right; sminR=40.0; ali_length=49; seq_b_deletion=252; seq_a_deletion=252; seq_b_insertion=0; seq_a_insertion=0; mode=alignment; sminL=40.0; seq_a_single=0; seq_b_single=0; 
+>read3(25) merged_sample={}; count=179; obiclean_count={'XXX': 179}; direction=right; sminR=40.0; ali_length=49; seq_b_deletion=252; seq_a_deletion=252; seq_b_insertion=0; seq_a_insertion=0; mode=alignment; sminL=40.0; seq_a_single=0; seq_b_single=0; 
 gggcaatcctgagccaactggagttcagataggtgcagagactcaatgg
 """
 
@@ -411,18 +411,18 @@
         blast = input_dir / "combined_filters.tabular"
 
         fasta.write_text(
-            ">lowSupport(1) count=1;\nACGT\n"
-            ">obicleanFail(10) count=10; obiclean_status={'XXX': 's'};\nACGT\n"
-            ">pairendFail_PairEnd(10) count=10;\nACGT\n"
-            ">identityFail(10) count=10;\nACGT\n"
-            ">coverageFail(10) count=10;\nACGT\n"
-            ">bitscoreFail(10) count=10;\nACGT\n"
-            ">bscutoffHigh(10) count=10;\nACGT\n"
-            ">envTaxFail(10) count=10;\nACGT\n"
-            ">rankFail(10) count=10;\nACGT\n"
-            ">seqidFail(10) count=10;\nACGT\n"
-            ">readOK(10) count=10;\nACGT\n"
-            ">readOK_multiple_id(10) count=10;\nACGT\n"
+            ">lowSupport(1) obiclean_count={'XXX': 1}; count=1;\nACGT\n"
+            ">obicleanFail(10) count=10; obiclean_count={'XXX': 10}; obiclean_status={'XXX': 's'};\nACGT\n"
+            ">pairendFail_PairEnd(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
+            ">identityFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
+            ">coverageFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
+            ">bitscoreFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
+            ">bscutoffHigh(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
+            ">envTaxFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
+            ">rankFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
+            ">seqidFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
+            ">readOK(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
+            ">readOK_multiple_id(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n"
         )
 
         blast.write_text(
@@ -495,8 +495,8 @@
 
         with open(args.filtered_fasta) as f:
             headers = [l.strip() for l in f if l.startswith(">")]
-        assert '>obicleanFail(10) count=10;' not in headers
-        assert '>pairendFail_PairEnd(10) count=10;' not in headers
+        assert ">obicleanFail(10) count=10; obiclean_count={'XXX': 10};" not in headers
+        assert ">pairendFail_PairEnd(10) count=10; obiclean_count={'XXX': 10};" not in headers
         assert len(headers) == 9, "FASTA filtering only applies to header-based rules"
 
         df = pd.read_excel(args.header_anno, sheet_name="Individual_Reads")
author	onnodg
date	Thu, 12 Feb 2026 13:52:07 +0000
parents	9ca209477dfd
children