# HG changeset patch
# User matnguyen
# Date 1536043789 14400
# Node ID 45533fb9d2f4d45ed8ad15ce672e6307a8511472
Primary version
diff -r 000000000000 -r 45533fb9d2f4 test-data/refseq_parser/output.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/refseq_parser/output.tsv Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,2 @@
+SRR6152717
+SRR6153036
diff -r 000000000000 -r 45533fb9d2f4 test-data/refseq_parser/reference.fna.gz
Binary file test-data/refseq_parser/reference.fna.gz has changed
diff -r 000000000000 -r 45533fb9d2f4 test-data/refseq_parser/test.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/refseq_parser/test.tsv Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,31 @@
+sample top_taxonomy_name distance pvalue matching full_taxonomy taxonomic_species taxonomic_genus taxonomic_family taxonomic_order taxonomic_class taxonomic_phylum taxonomic_superkingdom subspecies serovar plasmid bioproject biosample taxid assembly_accession match_id taxonomic_species group
+SRR6152717 Mycobacterium tuberculosis TKK_05MA_0040 0.07493939999999999 5.471479999999998e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_05MA_0040 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02586004 1448509 GCF_000653175.1 ./rcn/refseq-NZ-1448509-PRJNA224116-SAMN02586004-GCF_000653175.1-.-Mycobacterium_tuberculosis_TKK_05MA_0040.fna Mycobacterium tuberculosis complex
+SRR6152717 Mycobacterium tuberculosis MD17647 0.07493939999999999 5.152709999999997e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17647 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584623 1447472 GCF_000650435.1 ./rcn/refseq-NZ-1447472-PRJNA224116-SAMN02584623-GCF_000650435.1-.-Mycobacterium_tuberculosis_MD17647.fna Mycobacterium tuberculosis complex
+SRR6152717 Mycobacterium tuberculosis MD17646 0.07493939999999999 5.719649999999997e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17646 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584624 1447473 GCF_000706405.1 ./rcn/refseq-NZ-1447473-PRJNA224116-SAMN02584624-GCF_000706405.1-.-Mycobacterium_tuberculosis_MD17646.fna Mycobacterium tuberculosis complex
+SRR6152717 Mycobacterium tuberculosis MD17240 0.07493939999999999 5.256599999999999e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17240 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584627 1447476 GCF_000650495.1 ./rcn/refseq-NZ-1447476-PRJNA224116-SAMN02584627-GCF_000650495.1-.-Mycobacterium_tuberculosis_MD17240.fna Mycobacterium tuberculosis complex
+SRR6152717 Mycobacterium tuberculosis MD17749 0.07493939999999999 5.147799999999998e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17749 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584628 1447477 NZ_JLFC ./rcn/refseq-NZ-1447477-PRJNA224116-SAMN02584628-NZ_JLFC-.-Mycobacterium_tuberculosis_MD17749.fna Mycobacterium tuberculosis complex
+SRR6152731 Mycobacterium tuberculosis UG-D 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; UG-D Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02360630 1402483 GCF_000674655.1 ./rcn/refseq-NZ-1402483-PRJNA224116-SAMN02360630-GCF_000674655.1-.-Mycobacterium_tuberculosis_UG_D.fna Mycobacterium tuberculosis complex
+SRR6152731 Mycobacterium tuberculosis TKK_04_0148 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_04_0148 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02586095 1448600 GCF_000656935.1 ./rcn/refseq-NZ-1448600-PRJNA224116-SAMN02586095-GCF_000656935.1-.-Mycobacterium_tuberculosis_TKK_04_0148.fna Mycobacterium tuberculosis complex
+SRR6152731 Mycobacterium tuberculosis UT0058 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; UT0058 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02381013 1408936 GCF_000668595.1 ./rcn/refseq-NZ-1408936-PRJNA224116-SAMN02381013-GCF_000668595.1-.-Mycobacterium_tuberculosis_UT0058.fna Mycobacterium tuberculosis complex
+SRR6152731 Mycobacterium tuberculosis BTB10-001 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB10-001 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02414929 1423513 GCF_000678695.1 ./rcn/refseq-NZ-1423513-PRJNA224116-SAMN02414929-GCF_000678695.1-.-Mycobacterium_tuberculosis_BTB10_001.fna Mycobacterium tuberculosis complex
+SRR6152731 Mycobacterium tuberculosis BTB10-142 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB10-142 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02414934 1423518 GCF_000678775.1 ./rcn/refseq-NZ-1423518-PRJNA224116-SAMN02414934-GCF_000678775.1-.-Mycobacterium_tuberculosis_BTB10_142.fna Mycobacterium tuberculosis complex
+SRR6152844 Mycobacterium tuberculosis XTB13-092 7.82718e-05 0.0 399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; XTB13-092 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02419542 1427186 GCF_000680035.1 ./rcn/refseq-NZ-1427186-PRJNA224116-SAMN02419542-GCF_000680035.1-.-Mycobacterium_tuberculosis_XTB13_092.fna Mycobacterium tuberculosis complex
+SRR6152844 Mycobacterium tuberculosis TKK_03_0065 7.82718e-05 0.0 399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_03_0065 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02585907 1448412 GCF_000651735.1 ./rcn/refseq-NZ-1448412-PRJNA224116-SAMN02585907-GCF_000651735.1-.-Mycobacterium_tuberculosis_TKK_03_0065.fna Mycobacterium tuberculosis complex
+SRR6152844 Mycobacterium tuberculosis TKK_03_0072 7.82718e-05 0.0 399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_03_0072 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02585909 1448414 GCF_000651755.1 ./rcn/refseq-NZ-1448414-PRJNA224116-SAMN02585909-GCF_000651755.1-.-Mycobacterium_tuberculosis_TKK_03_0072.fna Mycobacterium tuberculosis complex
+SRR6152844 Mycobacterium tuberculosis 7.82718e-05 0.0 399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN03338743 1773 GCF_000972425.1 ./rcn/refseq-NZ-1773-PRJNA224116-SAMN03338743-GCF_000972425.1-.-Mycobacterium_tuberculosis.fna Mycobacterium tuberculosis complex
+SRR6152844 Mycobacterium tuberculosis X122 7.82718e-05 0.0 399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; X122 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02471089 747368 GCF_000184025.1 ./rcn/refseq-NZ-747368-PRJNA224116-SAMN02471089-GCF_000184025.1-.-Mycobacterium_tuberculosis_X122.fna Mycobacterium tuberculosis complex
+SRR6152991 Mycobacterium tuberculosis 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN03177496 1773 NZ_JUFU ./rcn/refseq-NZ-1773-PRJNA224116-SAMN03177496-NZ_JUFU-.-Mycobacterium_tuberculosis.fna Mycobacterium tuberculosis complex
+SRR6152991 Mycobacterium tuberculosis UT0110 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; UT0110 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02381044 1408967 GCF_000669155.1 ./rcn/refseq-NZ-1408967-PRJNA224116-SAMN02381044-GCF_000669155.1-.-Mycobacterium_tuberculosis_UT0110.fna Mycobacterium tuberculosis complex
+SRR6152991 Mycobacterium tuberculosis KT-0022 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; KT-0022 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02360562 1400884 GCF_000673755.1 ./rcn/refseq-NZ-1400884-PRJNA224116-SAMN02360562-GCF_000673755.1-.-Mycobacterium_tuberculosis_KT_0022.fna Mycobacterium tuberculosis complex
+SRR6152991 Mycobacterium tuberculosis BTB07-275 0.000235701 0.0 397/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB07-275 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02414885 1423469 GCF_000677935.1 ./rcn/refseq-NZ-1423469-PRJNA224116-SAMN02414885-GCF_000677935.1-.-Mycobacterium_tuberculosis_BTB07_275.fna Mycobacterium tuberculosis complex
+SRR6152991 Mycobacterium tuberculosis TKK_05SA_0021 0.000235701 0.0 397/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_05SA_0021 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02586030 1448535 GCF_000653515.1 ./rcn/refseq-NZ-1448535-PRJNA224116-SAMN02586030-GCF_000653515.1-.-Mycobacterium_tuberculosis_TKK_05SA_0021.fna Mycobacterium tuberculosis complex
+SRR6153036 Staphylococcus epidermidis AU12-03 0.0227453 0.0 213/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; AU12-03 Staphylococcus epidermidis Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria PRJNA180900 SAMN01103171 1220510 NZ_AMCS ./rcn/refseq-NZ-1220510-PRJNA180900-SAMN01103171-NZ_AMCS-.-Staphylococcus_epidermidis_AU12_03.fna
+SRR6153036 Staphylococcus epidermidis SK135 0.022937400000000004 0.0 212/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; SK135 Staphylococcus epidermidis Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria PRJNA42967 SAMN00008358 596317 NZ_ADEY ./rcn/refseq-NZ-596317-PRJNA42967-SAMN00008358-NZ_ADEY-.-Staphylococcus_epidermidis_SK135.fna
+SRR6153036 Staphylococcus epidermidis VCU109 0.022937400000000004 0.0 212/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; VCU109 Staphylococcus epidermidis Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria PRJNA179852 SAMN00116832 904330 NZ_AFUA ./rcn/refseq-NZ-904330-PRJNA179852-SAMN00116832-NZ_AFUA-.-Staphylococcus_epidermidis_VCU109.fna
+SRR6153036 Staphylococcus epidermidis 0.0233253 0.0 210/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis Staphylococcus epidermidis Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria PRJNA224116 SAMN02640611 1282 NZ_JMIF ./rcn/refseq-NZ-1282-PRJNA224116-SAMN02640611-NZ_JMIF-.-Staphylococcus_epidermidis.fna
+SRR6153036 Staphylococcus epidermidis VCU125 0.024116 0.0 206/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; VCU125 Staphylococcus epidermidis Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria PRJNA180069 SAMN00116836 904341 NZ_AHLF ./rcn/refseq-NZ-904341-PRJNA180069-SAMN00116836-NZ_AHLF-.-Staphylococcus_epidermidis_VCU125.fna
+SRR6153231 Mycobacterium tuberculosis MD19051 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD19051 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584606 1447455 GCF_000650115.1 ./rcn/refseq-NZ-1447455-PRJNA224116-SAMN02584606-GCF_000650115.1-.-Mycobacterium_tuberculosis_MD19051.fna Mycobacterium tuberculosis complex
+SRR6153231 Mycobacterium tuberculosis TKK-01-0080 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK-01-0080 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN01828251 1267364 GCF_000659185.1 ./rcn/refseq-NZ-1267364-PRJNA224116-SAMN01828251-GCF_000659185.1-.-Mycobacterium_tuberculosis_TKK_01_0080.fna Mycobacterium tuberculosis complex
+SRR6153231 Mycobacterium tuberculosis BTB12-315 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB12-315 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02414981 1423565 GCF_000679635.1 ./rcn/refseq-NZ-1423565-PRJNA224116-SAMN02414981-GCF_000679635.1-.-Mycobacterium_tuberculosis_BTB12_315.fna Mycobacterium tuberculosis complex
+SRR6153231 Mycobacterium tuberculosis BTB05-552 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB05-552 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02472041 882099 GCF_000220435.1 ./rcn/refseq-NZ-882099-PRJNA224116-SAMN02472041-GCF_000220435.1-.-Mycobacterium_tuberculosis_BTB05_552.fna Mycobacterium tuberculosis complex
+SRR6153231 Mycobacterium tuberculosis BTB05-559 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB05-559 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02472033 882100 GCF_000220455.1 ./rcn/refseq-NZ-882100-PRJNA224116-SAMN02472033-GCF_000220455.1-.-Mycobacterium_tuberculosis_BTB05_559.fna Mycobacterium tuberculosis complex
diff -r 000000000000 -r 45533fb9d2f4 tools/kraken_filter/kraken_filter
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/kraken_filter/kraken_filter Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+import sys
+import argparse as ap
+from ete3 import NCBITaxa
+
+def parse_kraken_results(db, report, taxid):
+    """Classify every read listed in a Kraken per-read output file.
+
+    db: path handed to ete3's NCBITaxa; report: tab-separated Kraken output
+    with five columns per line; taxid: target taxonomic ID.
+    Returns a dict mapping read ID to "unclassified", "target" or "other".
+    """
+    ncbi = NCBITaxa(db)
+    target = int(taxid)
+    # The target itself counts as a hit alongside its descendants.
+    wanted = set(ncbi.get_descendant_taxa(taxid)) | {target}
+
+    kraken_class = {}
+    with open(report, 'r') as classification:
+        for line in classification:
+            classified, read_id, tax_id, length, details = line.strip().split("\t")
+            tax_id = int(tax_id)  # read as text; must be numeric to match 0
+            if tax_id == 0:
+                kraken_class[read_id] = "unclassified"
+            else:
+                kraken_class[read_id] = "target" if tax_id in wanted else "other"
+    return kraken_class
+
+
+def kraken_trim(db, report, taxid, paired, fastq, fastq2):
+    """Write input_1.fastq (and input_2.fastq when paired) without "other" reads.
+
+    Reads that parse_kraken_results labels "target" or "unclassified" are
+    kept; reads labelled "other" or absent from the Kraken report are dropped.
+    """
+    kraken = parse_kraken_results(db, report, taxid)
+    files = [fastq, fastq2] if paired else [fastq]
+
+    for index, fastq_in in enumerate(files):
+        with open(fastq_in, 'r') as f_in, open('input_%d.fastq' % (index + 1), 'w') as f_out:
+            # readline() throughout: Python 2 files reject readline() during iteration.
+            for header in iter(f_in.readline, ''):
+                record = [header] + [f_in.readline() for _ in range(3)]
+                # ID is the first whitespace-delimited token, minus the leading
+                # "@" and any "/1" or "/2" suffix; split() also drops the newline
+                # that a header without a description would otherwise keep.
+                fields = header.split()
+                read_id = fields[0].split("/")[0][1:] if fields else ""
+                if kraken.get(read_id, "other") != "other":
+                    f_out.writelines(record)
+
+parser = ap.ArgumentParser(prog='kraken_trim', conflict_handler='resolve',
+                           description="Trims contaminated reads using Kraken reports")
+# The group is named input_group so the builtin input() is not shadowed.
+input_group = parser.add_argument_group('Input', '')
+input_group.add_argument('db', help="sqlite formatted ETE3 taxa database")
+input_group.add_argument('report', help="Kraken report")
+input_group.add_argument('taxid', type=int, help="Target taxonomic ID")
+input_group.add_argument('fastq', help="FASTQ file")
+input_group.add_argument('fastq2', nargs='?', help="Reverse FASTQ mate ")
+input_group.add_argument('--p', '--paired', action='store_true', help="Paired FASTQ files")
+
+if len(sys.argv) == 1:
+    parser.print_usage()
+    sys.exit(1)
+args = parser.parse_args()
+if args.p and args.fastq2 is None:  # paired mode would otherwise call open(None)
+    parser.error("--p/--paired requires the reverse FASTQ mate (fastq2)")
+kraken_trim(args.db, args.report, args.taxid, args.p, args.fastq, args.fastq2)
\ No newline at end of file
diff -r 000000000000 -r 45533fb9d2f4 tools/kraken_filter/kraken_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/kraken_filter/kraken_filter.xml Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,90 @@
+
+ by parsing Kraken reports
+
+ ete3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fastq_input['input_type'] == 'single'
+
+
+
+
+
+ fastq_input['input_type'] == 'paired_collection'
+
+
+
+ fastq_input['input_type'] == 'paired'
+
+
+ fastq_input['input_type'] == 'paired'
+
+
+
+
+
+
+
+
+
+ [fastq_reverse]
+
+ ]]>
+
+
+ Manuscript in preparation
+
+
diff -r 000000000000 -r 45533fb9d2f4 tools/qualimap_parser/qualimap_parser
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/qualimap_parser/qualimap_parser Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+
+# Usage: qualimap_parser -n NAME [NAME ...] -i REPORT [REPORT ...]
+
+import sys
+import argparse as ap
+
+
+### Command line: one sample name (-n) per Qualimap report (-i), in matching order
+parser = ap.ArgumentParser(prog='outlier-parser', conflict_handler='resolve',
+                           description="Parses Qualimap output file to eliminate outliers")
+input_group = parser.add_argument_group('Input', '')
+input_group.add_argument('-n', '--name', nargs='+', required=True, help="Sample name")
+input_group.add_argument('-i', '--input', nargs='+', required=True, help="Qualimap Genome Report File")
+if len(sys.argv) == 1:
+    parser.print_usage()
+    sys.exit(1)
+args = parser.parse_args()
+if len(args.name) < len(args.input):
+    parser.error("every Qualimap report needs a matching sample name")
+
+# Append mode is kept so repeated runs accumulate into one list; the with
+# block guarantees the file is flushed and closed.
+with open("outlier_list.txt", "a") as output:
+    for name, report in zip(args.name, args.input):
+        # Reset per report so a file lacking a metric cannot reuse stale values.
+        mapped_percentage = mean_mapping_quality = None
+        with open(report) as handle:
+            for line in handle:
+                if "number of mapped reads" in line:
+                    mapped_percentage = line.split()[-1].strip('()%')
+                if "mean mapping quality" in line:
+                    mean_mapping_quality = line.split()[-1]
+        if mapped_percentage is None or mean_mapping_quality is None:
+            sys.exit("%s: mapped-read percentage or mean mapping quality not found" % report)
+        if float(mapped_percentage) < 90 or float(mean_mapping_quality) < 10:
+            output.write("%s\n" % name)
+
diff -r 000000000000 -r 45533fb9d2f4 tools/qualimap_parser/qualimap_parser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/qualimap_parser/qualimap_parser.xml Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,31 @@
+
+ and output a list of outliers
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -i
+
+ ]]>
+
+
+ Manuscript in preparation
+
+
diff -r 000000000000 -r 45533fb9d2f4 tools/refseq_parser/refseq_parser
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/refseq_parser/refseq_parser Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+import argparse as ap
+from ete3 import NCBITaxa
+
+parser = ap.ArgumentParser(prog='outlier-parser', conflict_handler='resolve',
+                           description="Parses output file to eliminate outliers")
+
+input_group = parser.add_argument_group('Input', '')
+input_group.add_argument('db', help="sqlite formatted ETE3 taxa database")
+input_group.add_argument('taxid', metavar="INT", help='Target taxonomic ID')
+input_group.add_argument('input', nargs='+', help="Tab-delimited RefSeq Masher reports")
+
+if len(sys.argv) == 1:
+    parser.print_usage()
+    sys.exit(1)
+
+args = parser.parse_args()
+# Taxa accepted as on-target: the requested taxid plus every node below it.
+ncbi = NCBITaxa(args.db)
+accepted = set(ncbi.get_descendant_taxa(args.taxid, intermediate_nodes=True))
+accepted.add(int(args.taxid))
+
+# Samples already written, so each one is listed once and its remaining rows
+# are skipped without affecting the rows of any other sample.
+flagged = set()
+
+with open("outlier_list.txt", "w") as output:
+    for report in args.input:
+        with open(report) as csvfile:
+            # DictReader consumes the header row itself; every row it yields is
+            # data, so none may be discarded here.
+            for row in csv.DictReader(csvfile, delimiter='\t'):
+                accession = row['sample']
+                if accession in flagged:
+                    continue
+                # Outlier: a hit outside the target clade or farther than 0.05.
+                if int(row['taxid']) not in accepted or float(row['distance']) > 0.05:
+                    output.write("%s\n" % accession)
+                    flagged.add(accession)
diff -r 000000000000 -r 45533fb9d2f4 tools/refseq_parser/refseq_parser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/refseq_parser/refseq_parser.xml Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,38 @@
+
+ and output a list of outliers
+
+ ete3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.05
+
+ Command line:
+ refseq_parser DB TAXID [input refseq_masher reports ...]
+ ]]>
+
+
+ Manuscript in preparation
+
+