Mercurial > repos > public-health-bioinformatics > adjust_bracken_for_unclassified_reads
changeset 3:899a650587ed draft default tip
planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/adjust_bracken_for_unclassified_reads commit 24535690aedb81353cf5e036dc4577022d9604ad
author | public-health-bioinformatics |
---|---|
date | Thu, 27 Oct 2022 19:13:25 +0000 |
parents | 87459bd1615a |
children | |
files | adjust_bracken_kreport_for_unclassified_reads.py adjust_bracken_kreport_for_unclassified_reads.xml test-data/input/SRR17907745_kraken_style_bracken_report.txt |
diffstat | 3 files changed, 245 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# diff: /dev/null Thu Jan 01 00:00:00 1970 +0000 -> b/adjust_bracken_kreport_for_unclassified_reads.py Thu Oct 27 19:13:25 2022 +0000
"""Adjust a kraken-style bracken report to account for unclassified reads.

The kraken-style report written by bracken only covers classified reads, so
its percentages are relative to the classified total.  This script takes the
number of unclassified reads from the original kraken report, prepends an
``unclassified`` row to the bracken report and recomputes every percentage
relative to (bracken root total + unclassified reads).  The adjusted report
is written to standard output as tab-separated text.
"""

import argparse
import csv
import json
import sys


# Column order shared by the reports that are read and the report written.
OUTPUT_FIELDNAMES = [
    'percentage',
    'seqs_total',
    'seqs_this_level',
    'taxonomic_level',
    'ncbi_taxid',
    'taxon_name',
]


def parse_kraken_report(kraken_report_path):
    """Parse a six-column, tab-separated kraken-style report.

    Blank lines are skipped.

    :param kraken_report_path: path of the report file to read.
    :return: list with one dict per report line, keyed by the names in
        ``OUTPUT_FIELDNAMES``; ``percentage`` is a float, ``seqs_total`` and
        ``seqs_this_level`` are ints, the remaining values are strings.
    :raises ValueError: if a non-blank line does not have exactly six
        tab-separated fields, or a numeric column cannot be converted.
    """
    kraken_report = []
    with open(kraken_report_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # A stray blank (often trailing) line carries no data.
                continue
            fields = stripped.split('\t')
            if len(fields) != len(OUTPUT_FIELDNAMES):
                raise ValueError(
                    '{}: line {}: expected {} tab-separated fields, found {}'.format(
                        kraken_report_path,
                        line_number,
                        len(OUTPUT_FIELDNAMES),
                        len(fields),
                    )
                )
            kraken_line = dict(zip(OUTPUT_FIELDNAMES, fields))
            kraken_line['percentage'] = float(kraken_line['percentage'])
            kraken_line['seqs_total'] = int(kraken_line['seqs_total'])
            kraken_line['seqs_this_level'] = int(kraken_line['seqs_this_level'])
            kraken_report.append(kraken_line)

    return kraken_report


def _find_taxon(report, taxon_name):
    """Return the first row of *report* named *taxon_name*, or None."""
    for row in report:
        if row['taxon_name'] == taxon_name:
            return row
    return None


def _format_percentage(count, total):
    """Return *count* as a percentage of *total* with two decimals."""
    if total == 0:
        # Empty sample: report 0.00 instead of dividing by zero.
        return '0.00'
    # float() keeps this a true division on Python 2 as well.
    return '{:.2f}'.format(float(count) / total * 100)


def main(args):
    """Write the bracken report, adjusted for unclassified reads, to stdout.

    :param args: object with the attributes ``kraken_report`` and
        ``kraken_style_bracken_report`` (paths of the two input reports).
    :raises IndexError: if the kraken-style bracken report has no ``root`` row.
    """
    kraken_report = parse_kraken_report(args.kraken_report)
    kraken_style_bracken_report = parse_kraken_report(args.kraken_style_bracken_report)

    # A kraken report without an 'unclassified' row means zero unclassified reads.
    unclassified_row = _find_taxon(kraken_report, 'unclassified')
    if unclassified_row is None:
        kraken_report_unclassified_seqs = 0
    else:
        kraken_report_unclassified_seqs = unclassified_row['seqs_this_level']

    bracken_root_row = _find_taxon(kraken_style_bracken_report, 'root')
    if bracken_root_row is None:
        raise IndexError("no 'root' entry found in the kraken-style bracken report")

    total_seqs = bracken_root_row['seqs_total'] + kraken_report_unclassified_seqs

    # lineterminator='\n' overrides the CRLF default of the excel-tab dialect
    # so the output uses the same line endings as the input reports.
    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=OUTPUT_FIELDNAMES,
        dialect='excel-tab',
        lineterminator='\n',
    )

    bracken_unclassified_entry = {
        'percentage': _format_percentage(kraken_report_unclassified_seqs, total_seqs),
        'seqs_total': kraken_report_unclassified_seqs,
        'seqs_this_level': kraken_report_unclassified_seqs,
        'taxonomic_level': 'U',
        'ncbi_taxid': 0,
        'taxon_name': 'unclassified',
    }
    writer.writerow(bracken_unclassified_entry)

    for row in kraken_style_bracken_report:
        # Re-express every taxon relative to classified + unclassified reads.
        row['percentage'] = _format_percentage(row['seqs_total'], total_seqs)
        writer.writerow(row)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Adjust a kraken-style bracken report to account for unclassified reads.'
    )
    parser.add_argument(
        '-k', '--kraken-report', required=True,
        help='kraken report that holds the unclassified read count',
    )
    parser.add_argument(
        '-b', '--kraken-style-bracken-report', required=True,
        help='kraken-style report written by bracken',
    )
    args = parser.parse_args()
    main(args)
<!-- diff: /dev/null Thu Jan 01 00:00:00 1970 +0000 => b/adjust_bracken_kreport_for_unclassified_reads.xml Thu Oct 27 19:13:25 2022 +0000 -->
<tool id="adjust_bracken_kreport_for_unclassified_reads" name="Adjust Kraken-Style Bracken Report for Unclassified Reads" version="0.1.0+galaxy0">
    <description>Adjust kraken-style bracken report to account for unclassified reads</description>
    <requirements>
    </requirements>
    <!-- The script is run through the interpreter so that it does not depend on
         the executable bit surviving installation; the output path is quoted
         like the input paths. -->
    <command detect_errors="exit_code"><![CDATA[
        python '$__tool_directory__/adjust_bracken_kreport_for_unclassified_reads.py'
          --kraken-report '${kraken_report}'
          --kraken-style-bracken-report '${kraken_style_bracken_report}'
          > '${adjusted_report}'
    ]]></command>
    <inputs>
        <param name="kraken_report" type="data" format="txt" />
        <param name="kraken_style_bracken_report" type="data" format="txt" />
    </inputs>
    <outputs>
        <data name="adjusted_report" label="Adjusted Report" format="txt"/>
    </outputs>
    <tests>
        <!-- TODO(review): this test is incomplete. The bracken report value and
             the expected output file are empty, and the kraken report it names
             is not among the files added with this tool. -->
        <test>
            <param name="kraken_report" value="input/SRR17619849_kraken2.txt"/>
            <param name="kraken_style_bracken_report" value=""/>
            <!-- ftype matches the format declared for adjusted_report above. -->
            <output name="adjusted_report" file="" ftype="txt"/>
        </test>
    </tests>
    <help><![CDATA[
Adds an ``unclassified`` row to a kraken-style bracken report, using the
unclassified read count from the matching kraken report, and recalculates
the percentage column of every row relative to the total of classified
plus unclassified reads.
    ]]></help>
    <citations>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input/SRR17907745_kraken_style_bracken_report.txt Thu Oct 27 19:13:25 2022 +0000 @@ -0,0 +1,143 @@ +100.00 2371444 0 R 1 root +100.00 2371444 0 R1 131567 cellular organisms +99.99 2371306 0 D 2 Bacteria +99.99 2371287 0 P 1224 Proteobacteria +99.99 2371241 0 C 1236 Gammaproteobacteria +99.99 2371241 0 O 91347 Enterobacterales +99.25 2353760 0 F 543 Enterobacteriaceae +53.04 1257733 0 G 570 Klebsiella +44.15 1046984 1046984 S 1463165 Klebsiella quasipneumoniae +7.73 183381 183381 S 573 Klebsiella pneumoniae +0.26 6092 0 G1 2608929 unclassified Klebsiella +0.19 4605 4605 S 2015795 Klebsiella sp. LY +0.03 757 757 S 2267618 Klebsiella sp. P1CD1 +0.01 318 318 S 2787706 Klebsiella sp. BDA134-6 +0.02 410 410 S 2488567 Klebsiella sp. FDAARGOS_511 +0.32 7585 7585 S 244366 Klebsiella variicola +0.08 1853 1853 S 571 Klebsiella oxytoca +0.05 1294 1294 S 548 Klebsiella aerogenes +0.05 1261 1261 S 2026240 Klebsiella quasivariicola +0.19 4517 4517 S 1134687 Klebsiella michiganensis +0.17 3937 3937 S 2058152 Klebsiella grimontii +0.03 659 659 S 2489010 Klebsiella africana +0.01 164 164 S 2153354 Klebsiella huaxiensis +33.69 798958 0 G 561 Escherichia +33.68 798809 798809 S 562 Escherichia coli +0.01 149 149 S 564 Escherichia fergusonii +6.10 144567 0 G 547 Enterobacter +6.09 144531 0 G1 354276 Enterobacter cloacae complex +3.68 87241 87241 S 158836 Enterobacter hormaechei +2.40 56903 56903 S 550 Enterobacter cloacae +0.01 282 282 S 1812935 Enterobacter roggenkampii +0.00 91 91 S 208224 Enterobacter kobei +0.00 13 13 S 61645 Enterobacter asburiae +0.00 35 0 G1 2608935 unclassified Enterobacter +0.00 18 18 S 2596949 Enterobacter sp. E76 +0.00 17 17 S 2500132 Enterobacter sp. 
N18-03635 +5.77 136773 0 G 544 Citrobacter +5.65 134026 0 G1 1344959 Citrobacter freundii complex +5.65 133878 133878 S 546 Citrobacter freundii +0.01 148 148 S 1639133 Citrobacter portucalensis +0.12 2728 0 G1 2644389 unclassified Citrobacter +0.10 2352 2352 S 2742632 Citrobacter sp. RHBSTW-00053 +0.02 375 375 S 2742638 Citrobacter sp. RHBSTW-00137 +0.00 19 19 S 67825 Citrobacter rodentium +0.58 13709 0 G 590 Salmonella +0.21 5059 0 G1 2614656 unclassified Salmonella +0.09 2074 2074 S 2500542 Salmonella sp. SSDFZ54 +0.13 2985 2985 S 599 Salmonella sp. +0.36 8649 8649 S 28901 Salmonella enterica +0.03 746 0 G 620 Shigella +0.03 746 746 S 621 Shigella boydii +0.02 554 0 G 160674 Raoultella +0.02 431 431 S 54291 Raoultella ornithinolytica +0.00 89 89 S 577 Raoultella terrigena +0.00 33 33 S 575 Raoultella planticola +0.02 409 0 G 1330547 Kosakonia +0.02 387 387 S 283686 Kosakonia radicincitans +0.00 22 22 S 497725 Kosakonia oryzae +0.01 182 0 G 83654 Leclercia +0.01 182 0 G1 2627398 unclassified Leclercia +0.01 182 182 S 2815358 Leclercia sp. 4-9-1-25 +0.00 46 0 G 1330546 Pluralibacter +0.00 35 35 S 1334193 [Enterobacter] lignolyticus +0.00 11 11 S 61647 Pluralibacter gergoviae +0.00 21 0 G 2815296 Jejubacter +0.00 21 21 S 2579935 Jejubacter calystegiae +0.00 13 0 G 158483 Cedecea +0.00 13 13 S 158822 Cedecea neteri +0.00 11 0 G 1330545 Lelliottia +0.00 11 11 S 61646 Lelliottia amnigena +0.00 11 0 G 2726810 Scandinavium +0.00 11 11 S 1851514 Scandinavium goeteborgense +0.00 11 0 G 1335483 Shimwellia +0.00 11 11 S 563 Shimwellia blattae +0.00 10 0 G 2055880 Pseudescherichia +0.00 10 10 S 566 Pseudescherichia vulneris +0.71 16876 0 F 1903409 Erwiniaceae +0.71 16788 0 G 82986 Tatumella +0.71 16788 0 G1 2649542 unclassified Tatumella +0.71 16788 16788 S 2487345 Tatumella sp. 
TA1 +0.00 53 0 G 53335 Pantoea +0.00 33 33 S 553 Pantoea ananatis +0.00 19 19 S 66269 Pantoea stewartii +0.00 34 0 G 551 Erwinia +0.00 34 34 S 79967 Erwinia pyrifoliae +0.02 525 0 F 1903411 Yersiniaceae +0.01 341 0 G 629 Yersinia +0.01 190 0 G1 1649845 Yersinia pseudotuberculosis complex +0.01 190 190 S 633 Yersinia pseudotuberculosis +0.01 151 151 S 28152 Yersinia kristensenii +0.01 151 0 G 34037 Rahnella +0.00 79 79 S 34038 Rahnella aquatilis +0.00 71 0 G1 2635087 unclassified Rahnella +0.00 71 71 S 657334 Rahnella sp. WMR42 +0.00 31 0 G 613 Serratia +0.00 31 31 S 615 Serratia marcescens +0.00 79 0 F 1903410 Pectobacteriaceae +0.00 61 0 G 122277 Pectobacterium +0.00 61 61 S 2488639 Pectobacterium versatile +0.00 18 0 G 71655 Brenneria +0.00 18 18 S 1109412 Brenneria goodwinii +0.00 46 0 C 28211 Alphaproteobacteria +0.00 46 0 O 356 Hyphomicrobiales +0.00 46 0 F 41294 Bradyrhizobiaceae +0.00 46 0 G 1073 Rhodopseudomonas +0.00 46 46 S 1076 Rhodopseudomonas palustris +0.00 19 0 D1 1783272 Terrabacteria group +0.00 19 0 P 1239 Firmicutes +0.00 19 0 C 91061 Bacilli +0.00 19 0 O 1385 Bacillales +0.00 19 0 F 90964 Staphylococcaceae +0.00 19 0 G 1279 Staphylococcus +0.00 19 19 S 1280 Staphylococcus aureus +0.01 138 0 D 2759 Eukaryota +0.01 138 0 D1 33154 Opisthokonta +0.01 138 0 K 33208 Metazoa +0.01 138 0 K1 6072 Eumetazoa +0.01 138 0 K2 33213 Bilateria +0.01 138 0 K3 33511 Deuterostomia +0.01 138 0 P 7711 Chordata +0.01 138 0 P1 89593 Craniata +0.01 138 0 P2 7742 Vertebrata +0.01 138 0 P3 7776 Gnathostomata +0.01 138 0 P4 117570 Teleostomi +0.01 138 0 P5 117571 Euteleostomi +0.01 138 0 P6 8287 Sarcopterygii +0.01 138 0 C 1338369 Dipnotetrapodomorpha +0.01 138 0 C1 32523 Tetrapoda +0.01 138 0 C2 32524 Amniota +0.01 138 0 C 40674 Mammalia +0.01 138 0 C1 32525 Theria +0.01 138 0 C2 9347 Eutheria +0.01 138 0 C3 1437010 Boreoeutheria +0.01 138 0 C4 314146 Euarchontoglires +0.01 138 0 O 9443 Primates +0.01 138 0 O1 376913 Haplorrhini +0.01 138 0 O2 314293 Simiiformes +0.01 
138 0 O3 9526 Catarrhini +0.01 138 0 O4 314295 Hominoidea +0.01 138 0 F 9604 Hominidae +0.01 138 0 F1 207598 Homininae +0.01 138 0 G 9605 Homo +0.01 138 138 S 9606 Homo sapiens