Mercurial > repos > onnodg > blast_annotations_processor
comparison tests/test_blast_annotations_processor.py @ 3:ca2f07b71581 draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 600e5a50a13a3a16a1970d6d4d31cb4f7bd549bf-dirty
| author | onnodg |
|---|---|
| date | Thu, 12 Feb 2026 13:52:07 +0000 |
| parents | 9ca209477dfd |
| children | |
comparison (legend: equal, deleted, inserted, replaced)
| 2:9ca209477dfd | 3:ca2f07b71581 |
|---|---|
| 48 read4(25) subject4 id4.1 subject4 80.0 85 1e-40 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_eclhi | 48 read4(25) subject4 id4.1 subject4 80.0 85 1e-40 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_eclhi |
| 49 read4(25) subject4 id4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_elchi | 49 read4(25) subject4 id4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_elchi |
| 50 read4(25) subject4 id4.2 subject4 90.0 87 1e-50 160 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii | 50 read4(25) subject4 id4.2 subject4 90.0 87 1e-50 160 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii |
| 51 """ | 51 """ |
| 52 | 52 |
| 53 fasta_content = """>read1(100) count=100; | 53 fasta_content = """>read1(100) count=100; obiclean_count={'XXX': 100}; |
| 54 ATCGATCGATCGATCG | 54 ATCGATCGATCGATCG |
| 55 >read2(50) count=50; | 55 >read2(50) count=50; obiclean_count={'XXX': 50} |
| 56 GCTAGCTAGCTAGCTA | 56 GCTAGCTAGCTAGCTA |
| 57 >read3(25) count=25; | 57 >read3(25) count=25; obiclean_count={'XXX': 25} |
| 58 TGACTGACTGACTGAC | 58 TGACTGACTGACTGAC |
| 59 >read4(25) count=25; | 59 >read4(25) count=25; obiclean_count={'XXX': 25} |
| 60 TGAAAAAAACACCAC | 60 TGAAAAAAACACCAC |
| 61 """ | 61 """ |
| 62 | 62 |
| 63 blast_file = input_dir / "test_blast.tabular" | 63 blast_file = input_dir / "test_blast.tabular" |
| 64 fasta_file = input_dir / "test_sequences.fasta" | 64 fasta_file = input_dir / "test_sequences.fasta" |
| 257 read1(100) source=NCBI sequenceID=KR738670 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-14 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum | 257 read1(100) source=NCBI sequenceID=KR738670 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-14 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum |
| 258 read2.1(50) 1 2 3 4 5 6 7 8 9 | 258 read2.1(50) 1 2 3 4 5 6 7 8 9 |
| 259 read3(25) source=NCBI sequenceID=KR737595 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 97.561 87 1.68e-14 71.3 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum | 259 read3(25) source=NCBI sequenceID=KR737595 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 97.561 87 1.68e-14 71.3 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum |
| 260 """ | 260 """ |
| 261 | 261 |
| 262 fasta_content = """>read1(100) count=100; | 262 fasta_content = """>read1(100) count=100; obiclean_count={'XXX': 100}; |
| 263 ATCG | 263 ATCG |
| 264 >read2(50) merged_sample={}; count=1011; direction=right; seq_b_insertion=0; sminR=40.0; ali_length=53; seq_b_deletion=248; seq_a_deletion=248; seq_a_insertion=0; mode=alignment; sminL=40.0; seq_a_single=0; seq_b_single=0; | 264 >read2(50) merged_sample={}; count=1011; obiclean_count={'XXX': 1011}; direction=right; seq_b_insertion=0; sminR=40.0; ali_length=53; seq_b_deletion=248; seq_a_deletion=248; seq_a_insertion=0; mode=alignment; sminL=40.0; seq_a_single=0; seq_b_single=0; |
| 265 gggcaatcctgagccaagtgactggagttcagataggtgcagagactcaatgg | 265 gggcaatcctgagccaagtgactggagttcagataggtgcagagactcaatgg |
| 266 >read3(25) merged_sample={}; count=179; direction=right; sminR=40.0; ali_length=49; seq_b_deletion=252; seq_a_deletion=252; seq_b_insertion=0; seq_a_insertion=0; mode=alignment; sminL=40.0; seq_a_single=0; seq_b_single=0; | 266 >read3(25) merged_sample={}; count=179; obiclean_count={'XXX': 179}; direction=right; sminR=40.0; ali_length=49; seq_b_deletion=252; seq_a_deletion=252; seq_b_insertion=0; seq_a_insertion=0; mode=alignment; sminL=40.0; seq_a_single=0; seq_b_single=0; |
| 267 gggcaatcctgagccaactggagttcagataggtgcagagactcaatgg | 267 gggcaatcctgagccaactggagttcagataggtgcagagactcaatgg |
| 268 """ | 268 """ |
| 269 | 269 |
| 270 blast_file = input_dir / "test_sync.tabular" | 270 blast_file = input_dir / "test_sync.tabular" |
| 271 fasta_file = input_dir / "test_sync.fasta" | 271 fasta_file = input_dir / "test_sync.fasta" |
| 409 | 409 |
| 410 fasta = input_dir / "combined_filters.fasta" | 410 fasta = input_dir / "combined_filters.fasta" |
| 411 blast = input_dir / "combined_filters.tabular" | 411 blast = input_dir / "combined_filters.tabular" |
| 412 | 412 |
| 413 fasta.write_text( | 413 fasta.write_text( |
| 414 ">lowSupport(1) count=1;\nACGT\n" | 414 ">lowSupport(1) obiclean_count={'XXX': 1}; count=1;\nACGT\n" |
| 415 ">obicleanFail(10) count=10; obiclean_status={'XXX': 's'};\nACGT\n" | 415 ">obicleanFail(10) count=10; obiclean_count={'XXX': 10}; obiclean_status={'XXX': 's'};\nACGT\n" |
| 416 ">pairendFail_PairEnd(10) count=10;\nACGT\n" | 416 ">pairendFail_PairEnd(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n" |
| 417 ">identityFail(10) count=10;\nACGT\n" | 417 ">identityFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n" |
| 418 ">coverageFail(10) count=10;\nACGT\n" | 418 ">coverageFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n" |
| 419 ">bitscoreFail(10) count=10;\nACGT\n" | 419 ">bitscoreFail(10) count=10; obiclean_count={'XXX': 10}; ACGT\n" |
| 420 ">bscutoffHigh(10) count=10;\nACGT\n" | 420 ">bscutoffHigh(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n" |
| 421 ">envTaxFail(10) count=10;\nACGT\n" | 421 ">envTaxFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n" |
| 422 ">rankFail(10) count=10;\nACGT\n" | 422 ">rankFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n" |
| 423 ">seqidFail(10) count=10;\nACGT\n" | 423 ">seqidFail(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n" |
| 424 ">readOK(10) count=10;\nACGT\n" | 424 ">readOK(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n" |
| 425 ">readOK_multiple_id(10) count=10;\nACGT\n" | 425 ">readOK_multiple_id(10) count=10; obiclean_count={'XXX': 10}; \nACGT\n" |
| 426 ) | 426 ) |
| 427 | 427 |
| 428 blast.write_text( | 428 blast.write_text( |
| 429 # min_support (count=1 < 5) | 429 # min_support (count=1 < 5) |
| 430 "lowSupport(1)\ts\tid1\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | 430 "lowSupport(1)\ts\tid1\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" |
| 493 args = Args() | 493 args = Args() |
| 494 process_single_file(args.input_anno, args.input_unanno, args, log_messages=[]) | 494 process_single_file(args.input_anno, args.input_unanno, args, log_messages=[]) |
| 495 | 495 |
| 496 with open(args.filtered_fasta) as f: | 496 with open(args.filtered_fasta) as f: |
| 497 headers = [l.strip() for l in f if l.startswith(">")] | 497 headers = [l.strip() for l in f if l.startswith(">")] |
| 498 assert '>obicleanFail(10) count=10;' not in headers | 498 assert ">obicleanFail(10) count=10; obiclean_count={'XXX': 10};" not in headers |
| 499 assert '>pairendFail_PairEnd(10) count=10;' not in headers | 499 assert ">pairendFail_PairEnd(10) count=10; obiclean_count={'XXX': 10};" not in headers |
| 500 assert len(headers) == 9, "FASTA filtering only applies to header-based rules" | 500 assert len(headers) == 9, "FASTA filtering only applies to header-based rules" |
| 501 | 501 |
| 502 df = pd.read_excel(args.header_anno, sheet_name="Individual_Reads") | 502 df = pd.read_excel(args.header_anno, sheet_name="Individual_Reads") |
| 503 seq_ids = { | 503 seq_ids = { |
| 504 sid | 504 sid |
