changeset 0:45533fb9d2f4 draft default tip

Primary version
author matnguyen
date Tue, 04 Sep 2018 02:49:49 -0400
parents
children
files test-data/refseq_parser/output.tsv test-data/refseq_parser/reference.fna.gz test-data/refseq_parser/test.tsv tools/kraken_filter/kraken_filter tools/kraken_filter/kraken_filter.xml tools/qualimap_parser/qualimap_parser tools/qualimap_parser/qualimap_parser.xml tools/refseq_parser/refseq_parser tools/refseq_parser/refseq_parser.xml
diffstat 9 files changed, 342 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/refseq_parser/output.tsv	Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,2 @@
+SRR6152717
+SRR6153036
Binary file test-data/refseq_parser/reference.fna.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/refseq_parser/test.tsv	Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,31 @@
+sample	top_taxonomy_name	distance	pvalue	matching	full_taxonomy	taxonomic_species	taxonomic_genus	taxonomic_family	taxonomic_order	taxonomic_class	taxonomic_phylum	taxonomic_superkingdom	subspecies	serovar	plasmid	bioproject	biosample	taxid	assembly_accession	match_id	taxonomic_species group
+SRR6152717	Mycobacterium tuberculosis TKK_05MA_0040	0.07493939999999999	5.471479999999998e-187	71/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_05MA_0040	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02586004	1448509	GCF_000653175.1	./rcn/refseq-NZ-1448509-PRJNA224116-SAMN02586004-GCF_000653175.1-.-Mycobacterium_tuberculosis_TKK_05MA_0040.fna	Mycobacterium tuberculosis complex
+SRR6152717	Mycobacterium tuberculosis MD17647	0.07493939999999999	5.152709999999997e-187	71/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17647	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02584623	1447472	GCF_000650435.1	./rcn/refseq-NZ-1447472-PRJNA224116-SAMN02584623-GCF_000650435.1-.-Mycobacterium_tuberculosis_MD17647.fna	Mycobacterium tuberculosis complex
+SRR6152717	Mycobacterium tuberculosis MD17646	0.07493939999999999	5.719649999999997e-187	71/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17646	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02584624	1447473	GCF_000706405.1	./rcn/refseq-NZ-1447473-PRJNA224116-SAMN02584624-GCF_000706405.1-.-Mycobacterium_tuberculosis_MD17646.fna	Mycobacterium tuberculosis complex
+SRR6152717	Mycobacterium tuberculosis MD17240	0.07493939999999999	5.256599999999999e-187	71/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17240	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02584627	1447476	GCF_000650495.1	./rcn/refseq-NZ-1447476-PRJNA224116-SAMN02584627-GCF_000650495.1-.-Mycobacterium_tuberculosis_MD17240.fna	Mycobacterium tuberculosis complex
+SRR6152717	Mycobacterium tuberculosis MD17749	0.07493939999999999	5.147799999999998e-187	71/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17749	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02584628	1447477	NZ_JLFC	./rcn/refseq-NZ-1447477-PRJNA224116-SAMN02584628-NZ_JLFC-.-Mycobacterium_tuberculosis_MD17749.fna	Mycobacterium tuberculosis complex
+SRR6152731	Mycobacterium tuberculosis UG-D	0.000156838	0.0	398/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; UG-D	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02360630	1402483	GCF_000674655.1	./rcn/refseq-NZ-1402483-PRJNA224116-SAMN02360630-GCF_000674655.1-.-Mycobacterium_tuberculosis_UG_D.fna	Mycobacterium tuberculosis complex
+SRR6152731	Mycobacterium tuberculosis TKK_04_0148	0.000156838	0.0	398/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_04_0148	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02586095	1448600	GCF_000656935.1	./rcn/refseq-NZ-1448600-PRJNA224116-SAMN02586095-GCF_000656935.1-.-Mycobacterium_tuberculosis_TKK_04_0148.fna	Mycobacterium tuberculosis complex
+SRR6152731	Mycobacterium tuberculosis UT0058	0.000156838	0.0	398/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; UT0058	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02381013	1408936	GCF_000668595.1	./rcn/refseq-NZ-1408936-PRJNA224116-SAMN02381013-GCF_000668595.1-.-Mycobacterium_tuberculosis_UT0058.fna	Mycobacterium tuberculosis complex
+SRR6152731	Mycobacterium tuberculosis BTB10-001	0.000156838	0.0	398/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB10-001	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02414929	1423513	GCF_000678695.1	./rcn/refseq-NZ-1423513-PRJNA224116-SAMN02414929-GCF_000678695.1-.-Mycobacterium_tuberculosis_BTB10_001.fna	Mycobacterium tuberculosis complex
+SRR6152731	Mycobacterium tuberculosis BTB10-142	0.000156838	0.0	398/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB10-142	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02414934	1423518	GCF_000678775.1	./rcn/refseq-NZ-1423518-PRJNA224116-SAMN02414934-GCF_000678775.1-.-Mycobacterium_tuberculosis_BTB10_142.fna	Mycobacterium tuberculosis complex
+SRR6152844	Mycobacterium tuberculosis XTB13-092	7.82718e-05	0.0	399/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; XTB13-092	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02419542	1427186	GCF_000680035.1	./rcn/refseq-NZ-1427186-PRJNA224116-SAMN02419542-GCF_000680035.1-.-Mycobacterium_tuberculosis_XTB13_092.fna	Mycobacterium tuberculosis complex
+SRR6152844	Mycobacterium tuberculosis TKK_03_0065	7.82718e-05	0.0	399/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_03_0065	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02585907	1448412	GCF_000651735.1	./rcn/refseq-NZ-1448412-PRJNA224116-SAMN02585907-GCF_000651735.1-.-Mycobacterium_tuberculosis_TKK_03_0065.fna	Mycobacterium tuberculosis complex
+SRR6152844	Mycobacterium tuberculosis TKK_03_0072	7.82718e-05	0.0	399/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_03_0072	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02585909	1448414	GCF_000651755.1	./rcn/refseq-NZ-1448414-PRJNA224116-SAMN02585909-GCF_000651755.1-.-Mycobacterium_tuberculosis_TKK_03_0072.fna	Mycobacterium tuberculosis complex
+SRR6152844	Mycobacterium tuberculosis	7.82718e-05	0.0	399/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN03338743	1773	GCF_000972425.1	./rcn/refseq-NZ-1773-PRJNA224116-SAMN03338743-GCF_000972425.1-.-Mycobacterium_tuberculosis.fna	Mycobacterium tuberculosis complex
+SRR6152844	Mycobacterium tuberculosis X122	7.82718e-05	0.0	399/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; X122	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02471089	747368	GCF_000184025.1	./rcn/refseq-NZ-747368-PRJNA224116-SAMN02471089-GCF_000184025.1-.-Mycobacterium_tuberculosis_X122.fna	Mycobacterium tuberculosis complex
+SRR6152991	Mycobacterium tuberculosis	0.000156838	0.0	398/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN03177496	1773	NZ_JUFU	./rcn/refseq-NZ-1773-PRJNA224116-SAMN03177496-NZ_JUFU-.-Mycobacterium_tuberculosis.fna	Mycobacterium tuberculosis complex
+SRR6152991	Mycobacterium tuberculosis UT0110	0.000156838	0.0	398/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; UT0110	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02381044	1408967	GCF_000669155.1	./rcn/refseq-NZ-1408967-PRJNA224116-SAMN02381044-GCF_000669155.1-.-Mycobacterium_tuberculosis_UT0110.fna	Mycobacterium tuberculosis complex
+SRR6152991	Mycobacterium tuberculosis KT-0022	0.000156838	0.0	398/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; KT-0022	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02360562	1400884	GCF_000673755.1	./rcn/refseq-NZ-1400884-PRJNA224116-SAMN02360562-GCF_000673755.1-.-Mycobacterium_tuberculosis_KT_0022.fna	Mycobacterium tuberculosis complex
+SRR6152991	Mycobacterium tuberculosis BTB07-275	0.000235701	0.0	397/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB07-275	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02414885	1423469	GCF_000677935.1	./rcn/refseq-NZ-1423469-PRJNA224116-SAMN02414885-GCF_000677935.1-.-Mycobacterium_tuberculosis_BTB07_275.fna	Mycobacterium tuberculosis complex
+SRR6152991	Mycobacterium tuberculosis TKK_05SA_0021	0.000235701	0.0	397/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_05SA_0021	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02586030	1448535	GCF_000653515.1	./rcn/refseq-NZ-1448535-PRJNA224116-SAMN02586030-GCF_000653515.1-.-Mycobacterium_tuberculosis_TKK_05SA_0021.fna	Mycobacterium tuberculosis complex
+SRR6153036	Staphylococcus epidermidis AU12-03	0.0227453	0.0	213/400	Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; AU12-03	Staphylococcus epidermidis	Staphylococcus	Staphylococcaceae	Bacillales	Bacilli	Firmicutes	Bacteria				PRJNA180900	SAMN01103171	1220510	NZ_AMCS	./rcn/refseq-NZ-1220510-PRJNA180900-SAMN01103171-NZ_AMCS-.-Staphylococcus_epidermidis_AU12_03.fna	
+SRR6153036	Staphylococcus epidermidis SK135	0.022937400000000004	0.0	212/400	Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; SK135	Staphylococcus epidermidis	Staphylococcus	Staphylococcaceae	Bacillales	Bacilli	Firmicutes	Bacteria				PRJNA42967	SAMN00008358	596317	NZ_ADEY	./rcn/refseq-NZ-596317-PRJNA42967-SAMN00008358-NZ_ADEY-.-Staphylococcus_epidermidis_SK135.fna	
+SRR6153036	Staphylococcus epidermidis VCU109	0.022937400000000004	0.0	212/400	Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; VCU109	Staphylococcus epidermidis	Staphylococcus	Staphylococcaceae	Bacillales	Bacilli	Firmicutes	Bacteria				PRJNA179852	SAMN00116832	904330	NZ_AFUA	./rcn/refseq-NZ-904330-PRJNA179852-SAMN00116832-NZ_AFUA-.-Staphylococcus_epidermidis_VCU109.fna	
+SRR6153036	Staphylococcus epidermidis	0.0233253	0.0	210/400	Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis	Staphylococcus epidermidis	Staphylococcus	Staphylococcaceae	Bacillales	Bacilli	Firmicutes	Bacteria				PRJNA224116	SAMN02640611	1282	NZ_JMIF	./rcn/refseq-NZ-1282-PRJNA224116-SAMN02640611-NZ_JMIF-.-Staphylococcus_epidermidis.fna	
+SRR6153036	Staphylococcus epidermidis VCU125	0.024116	0.0	206/400	Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; VCU125	Staphylococcus epidermidis	Staphylococcus	Staphylococcaceae	Bacillales	Bacilli	Firmicutes	Bacteria				PRJNA180069	SAMN00116836	904341	NZ_AHLF	./rcn/refseq-NZ-904341-PRJNA180069-SAMN00116836-NZ_AHLF-.-Staphylococcus_epidermidis_VCU125.fna	
+SRR6153231	Mycobacterium tuberculosis MD19051	0.0317811	0.0	172/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD19051	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02584606	1447455	GCF_000650115.1	./rcn/refseq-NZ-1447455-PRJNA224116-SAMN02584606-GCF_000650115.1-.-Mycobacterium_tuberculosis_MD19051.fna	Mycobacterium tuberculosis complex
+SRR6153231	Mycobacterium tuberculosis TKK-01-0080	0.0317811	0.0	172/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK-01-0080	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN01828251	1267364	GCF_000659185.1	./rcn/refseq-NZ-1267364-PRJNA224116-SAMN01828251-GCF_000659185.1-.-Mycobacterium_tuberculosis_TKK_01_0080.fna	Mycobacterium tuberculosis complex
+SRR6153231	Mycobacterium tuberculosis BTB12-315	0.0317811	0.0	172/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB12-315	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02414981	1423565	GCF_000679635.1	./rcn/refseq-NZ-1423565-PRJNA224116-SAMN02414981-GCF_000679635.1-.-Mycobacterium_tuberculosis_BTB12_315.fna	Mycobacterium tuberculosis complex
+SRR6153231	Mycobacterium tuberculosis BTB05-552	0.0317811	0.0	172/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB05-552	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02472041	882099	GCF_000220435.1	./rcn/refseq-NZ-882099-PRJNA224116-SAMN02472041-GCF_000220435.1-.-Mycobacterium_tuberculosis_BTB05_552.fna	Mycobacterium tuberculosis complex
+SRR6153231	Mycobacterium tuberculosis BTB05-559	0.0317811	0.0	172/400	Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB05-559	Mycobacterium tuberculosis	Mycobacterium	Mycobacteriaceae	Corynebacteriales	Actinobacteria	Actinobacteria	Bacteria				PRJNA224116	SAMN02472033	882100	GCF_000220455.1	./rcn/refseq-NZ-882100-PRJNA224116-SAMN02472033-GCF_000220455.1-.-Mycobacterium_tuberculosis_BTB05_559.fna	Mycobacterium tuberculosis complex
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/kraken_filter/kraken_filter	Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+import sys
+import argparse as ap
+from ete3 import NCBITaxa
+
+def parse_kraken_results(db, report, taxid):
+    """Label every read of a Kraken per-read output relative to a target taxon.
+
+    Args:
+        db: Path of the sqlite taxonomy database handed to ``ete3.NCBITaxa``.
+        report: Path of the tab-separated Kraken output. The second column is
+            read as the read ID and the third as the assigned taxonomic ID.
+        taxid: Target taxonomic ID (int or numeric string).
+
+    Returns:
+        dict mapping each read ID to one of:
+        "unclassified" (assigned taxid 0), "target" (the target taxon itself
+        or any taxon below it) or "other".
+
+    Raises:
+        ValueError: if a taxonomic ID column is not an integer.
+    """
+    target = int(taxid)
+    ncbi = NCBITaxa(db)
+    # intermediate_nodes=True also returns the internal ranks below the target;
+    # without it only leaf taxa come back and reads assigned to e.g. a genus
+    # or species-complex under the target would be labelled "other".
+    descendants = set(ncbi.get_descendant_taxa(target, intermediate_nodes=True))
+    descendants.add(target)
+
+    kraken_class = {}
+
+    with open(report, 'r') as classification:
+        for line in classification:
+            # Strip only the line terminator: a full strip() would also drop
+            # trailing tabs and change the column count when the last column
+            # is empty.
+            fields = line.rstrip("\r\n").split("\t")
+            if len(fields) < 3:
+                continue  # blank or truncated line carries no classification
+
+            read_id = fields[1]
+            # The column is text; compare as int so that "0" is recognised.
+            tax_id = int(fields[2])
+
+            if tax_id == 0:
+                kraken_class[read_id] = "unclassified"
+            elif tax_id in descendants:
+                kraken_class[read_id] = "target"
+            else:
+                kraken_class[read_id] = "other"
+
+    return kraken_class
+
+
+def kraken_trim(db, report, taxid, paired, fastq, fastq2):
+    """Write copies of the FASTQ input(s) without reads classified as "other".
+
+    Reads whose ID is labelled "target" or "unclassified" by
+    ``parse_kraken_results`` are kept; reads labelled "other" and reads absent
+    from the Kraken output are dropped.  Output goes to ``input_1.fastq`` (and
+    ``input_2.fastq`` in paired mode) in the current working directory.
+
+    Args:
+        db: Path of the sqlite ETE3 taxonomy database.
+        report: Path of the Kraken per-read output.
+        taxid: Target taxonomic ID.
+        paired: True to process both ``fastq`` and ``fastq2``.
+        fastq: Path of the (forward) FASTQ file.
+        fastq2: Path of the reverse FASTQ file; required when ``paired``.
+
+    Raises:
+        ValueError: if ``paired`` is set but ``fastq2`` is missing.
+    """
+    kraken = parse_kraken_results(db, report, taxid)
+
+    if paired:
+        if fastq2 is None:
+            raise ValueError("paired mode requires a reverse FASTQ file")
+        files = [fastq, fastq2]
+    else:
+        files = [fastq]
+
+    for index, fastq_in in enumerate(files):
+        with open(fastq_in, 'r') as f_in, open('input_%d.fastq' % (index + 1), 'w') as f_out:
+            # Read with readline() only: mixing file iteration with readline()
+            # raises "Mixing iteration and read methods" on Python 2.
+            while True:
+                header = f_in.readline()
+                if not header:
+                    break
+                # A FASTQ record is the header plus sequence, separator, quality.
+                record_tail = [f_in.readline() for _ in range(3)]
+
+                # Take the first whitespace-delimited token (this also removes
+                # the trailing newline of headers without a description), then
+                # drop a "/1" or "/2" mate suffix and the leading "@".
+                tokens = header.split()
+                read_id = tokens[0].split("/")[0][1:] if tokens else ""
+
+                if kraken.get(read_id, "other") != "other":
+                    f_out.write(header)
+                    f_out.writelines(record_tail)
+
+# Command-line entry point: build the parser, then run the trimming step.
+parser = ap.ArgumentParser(
+    prog='kraken_trim',
+    conflict_handler='resolve',
+    description="Trims contaminated reads using Kraken reports",
+)
+
+# Positional arguments are consumed in the order they are registered here.
+cli_inputs = parser.add_argument_group('Input', '')
+cli_inputs.add_argument('db', help="sqlite formatted ETE3 taxa database")
+cli_inputs.add_argument('report', help="Kraken report")
+cli_inputs.add_argument('taxid', type=int, help="Target taxonomic ID")
+cli_inputs.add_argument('fastq', help="FASTQ file")
+cli_inputs.add_argument('fastq2', nargs='?', help="Reverse FASTQ mate ")
+# The destination name comes from the first long option, hence args.p below.
+cli_inputs.add_argument('--p', '--paired', action='store_true', help="Paired FASTQ files")
+
+# Invoked with no arguments at all: show the short usage line and fail.
+if len(sys.argv) == 1:
+    parser.print_usage()
+    sys.exit(1)
+
+args = parser.parse_args()
+
+kraken_trim(
+    db=args.db,
+    report=args.report,
+    taxid=args.taxid,
+    paired=args.p,
+    fastq=args.fastq,
+    fastq2=args.fastq2,
+)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/kraken_filter/kraken_filter.xml	Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,90 @@
+<tool id="kraken_trim" name="Trim contaminated reads" version="0.1.0">
+    <description>by parsing Kraken reports</description>
+    <requirements>
+        <requirement type="package" version="3.1.1">ete3</requirement>
+    </requirements>
+    <!-- Command line: the optional "-\-p" flag (paired input) comes first, then
+         database, Kraken report and target taxid, then one or two FASTQ paths
+         depending on the selected input type. -->
+    <command><![CDATA[
+        $__tool_directory__/kraken_filter
+
+        #if $fastq_input.input_type == "paired" or $fastq_input.input_type == "paired_collection"
+            "--p"
+        #end if
+
+        "${db}"
+        "${report}"
+        "${taxid}"
+
+        #if $fastq_input.input_type == "single"
+            "${fastq_input.fastq1}"
+        #elif $fastq_input.input_type == "paired"
+            "${fastq_input.fastq1}" "${fastq_input.fastq2}"
+        #elif $fastq_input.input_type == "paired_collection"
+            "${fastq_input.fastq1.forward}" "${fastq_input.fastq1.reverse}"
+        #end if
+    ]]></command>
+    <inputs>
+        <!-- Reads can be given as two datasets, one dataset, or one paired collection. -->
+        <conditional name="fastq_input">
+            <param name="input_type" type="select" label="Single or Paired-end reads">
+                <option value="paired">Paired</option>
+                <option value="single">Single</option>
+                <option value="paired_collection">Paired Collection</option>
+            </param>
+            <when value="paired">
+                <param name="fastq1" type="data" format="fastqsanger,fastq"
+                       label="Select fastq dataset with forward reads"/>
+                <param name="fastq2" type="data" format="fastqsanger,fastq"
+                       label="Select fastq dataset with reverse reads"/>
+            </when>
+            <when value="single">
+                <param name="fastq1" type="data" format="fastqsanger,fastq"
+                       label="Select fastq dataset"/>
+            </when>
+            <when value="paired_collection">
+                <param name="fastq1" type="data_collection" collection_type="paired"
+                       format="fastqsanger,fastq" label="Select paired collection"/>
+            </when>
+        </conditional>
+        <param name="db" type="data" format="sqlite" label="(ETE3) Taxonomy Database"/>
+        <param name="report" type="data" format="tabular" label="Kraken report" help="" optional="false"/>
+        <!-- NOTE(review): the default of 0 is only a placeholder; the script treats taxid 0 as "unclassified". -->
+        <param name="taxid" type="integer" value="0" label="Taxonomic ID of target taxonomic rank" optional="false"/>
+    </inputs>
+    <!-- The script writes input_1.fastq (and input_2.fastq for paired input) into the
+         working directory; each output below collects those files and is enabled
+         only for the matching input type. -->
+    <outputs>
+        <data name="fastq_trim" label="${tool.name} on ${on_string}" format_source="fastq1"
+              from_work_dir="input_1.fastq">
+            <filter>fastq_input['input_type'] == 'single'</filter>
+        </data>
+
+        <!-- NOTE(review): the reverse element takes its format from fastq1['forward'];
+             confirm whether fastq1['reverse'] was intended. -->
+        <collection name="trimmed_paired_collection" type="paired" label="${tool.name} on ${on_string}">
+            <data name="forward" format_source="fastq1['forward']" from_work_dir="input_1.fastq"/>
+            <data name="reverse" format_source="fastq1['forward']" from_work_dir="input_2.fastq"/>
+            <filter>fastq_input['input_type'] == 'paired_collection'</filter>
+        </collection>
+
+        <data name="fastq1_trim" label="${tool.name} on ${on_string}" format_source="fastq1"
+              from_work_dir="input_1.fastq">
+            <filter>fastq_input['input_type'] == 'paired'</filter>
+        </data>
+        <data name="fastq2_trim" label="${tool.name} on ${on_string}: reverse mate" format_source="fastq2"
+              from_work_dir="input_2.fastq">
+            <filter>fastq_input['input_type'] == 'paired'</filter>
+        </data>
+    </outputs>
+
+    <!-- NOTE(review): the test below is an empty placeholder; no inputs or expected outputs are defined. -->
+    <tests>
+        <test>
+
+        </test>
+    </tests>
+
+    <help><![CDATA[
+    Trims contaminated reads from Next-Generation Sequencing data using Kraken outputs.
+
+    Command line:
+    kraken_filter [--p] <ETE3 taxonomy database> <kraken report> <taxid> <fastq> [fastq_reverse]
+
+    ]]></help>
+
+    <!-- NOTE(review): free text instead of <citation> child elements; replace once a reference exists. -->
+    <citations>
+        Manuscript in preparation
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/qualimap_parser/qualimap_parser	Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+
+# Usage: qualimap_parser -i <reports>
+
+import sys
+import argparse as ap
+
+
+### Thresholds below which a sample is reported as an outlier
+MIN_MAPPED_PERCENTAGE = 90
+MIN_MEAN_MAPPING_QUALITY = 10
+
+parser = ap.ArgumentParser(prog='outlier-parser', conflict_handler='resolve',
+                           description="Parses Qualimap output file to eliminate outliers")
+
+# Named input_group so the builtin input() is not shadowed.
+input_group = parser.add_argument_group('Input', '')
+input_group.add_argument('-n', '--name', nargs='+', required=True, help="Sample name")
+input_group.add_argument('-i', '--input', nargs='+', required=True, help="Qualimap Genome Report File")
+
+if len(sys.argv) == 1:
+    parser.print_usage()
+    sys.exit(1)
+
+args = parser.parse_args()
+
+# Names and reports are paired by position, so every report needs a name.
+if len(args.name) < len(args.input):
+    parser.error("%d report(s) given but only %d sample name(s)"
+                 % (len(args.input), len(args.name)))
+
+# Append mode is kept from the original script; the with-block guarantees the
+# list is flushed and closed even if a report turns out to be malformed.
+with open("outlier_list.txt", "a") as output:
+    for name, report in zip(args.name, args.input):
+        # Reset per report so a value parsed from the previous report can
+        # never be reused for this one.
+        mapped_percentage = None
+        mean_mapping_quality = None
+
+        with open(report) as report_file:
+            for line in report_file:
+                if "number of mapped reads" in line:
+                    # Last token looks like "(98.5%)"; keep only the number.
+                    mapped_percentage = line.split()[-1].strip('()%')
+                if "mean mapping quality" in line:
+                    mean_mapping_quality = line.split()[-1]
+
+        if mapped_percentage is None or mean_mapping_quality is None:
+            sys.exit("%s: could not find 'number of mapped reads' and/or "
+                     "'mean mapping quality' in the report" % report)
+
+        if (float(mapped_percentage) < MIN_MAPPED_PERCENTAGE
+                or float(mean_mapping_quality) < MIN_MEAN_MAPPING_QUALITY):
+            output.write("%s\n" % name)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/qualimap_parser/qualimap_parser.xml	Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,31 @@
+<tool id="parse_qualimap" name="Parse Qualimap reports" version="0.1.0">
+    <description>and output a list of outliers</description>
+    <!-- Sample names (-n) and report paths (-i) are emitted in the same order so the
+         script can pair them by position; both are quoted so that identifiers or
+         paths containing whitespace stay a single argument. -->
+    <command><![CDATA[
+        $__tool_directory__/qualimap_parser -n #for $report in $reports# '${report.element_identifier}' #end for#
+        -i #for $report in $reports# '${report}' #end for#
+    ]]></command>
+    <inputs>
+        <param name="reports" type="data" format="txt" label="Qualimap Genome Report" help="" optional="false" multiple="True" />
+    </inputs>
+    <!-- The script writes outlier_list.txt into the working directory. -->
+    <outputs>
+        <data name="output_file" label="Outliers from dataset after mapping" format="txt" from_work_dir="outlier_list.txt"/>
+    </outputs>
+
+    <!-- NOTE(review): the test below is an empty placeholder; no inputs or expected outputs are defined. -->
+    <tests>
+        <test>
+
+        </test>
+    </tests>
+
+    <help><![CDATA[
+    Parses Qualimap reports and outputs a list of outliers: samples with fewer than 90% of reads mapped or a mean mapping quality below 10.
+
+    Command line:
+    qualimap_parser -n <accessions> -i <reports>
+
+    ]]></help>
+
+    <!-- NOTE(review): free text instead of <citation> child elements; replace once a reference exists. -->
+    <citations>
+        Manuscript in preparation
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/refseq_parser/refseq_parser	Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+import argparse as ap
+from ete3 import NCBITaxa
+
+parser = ap.ArgumentParser(prog='outlier-parser', conflict_handler='resolve',
+                           description="Parses output file to eliminate outliers")
+
+
+# Named input_group so the builtin input() is not shadowed.
+input_group = parser.add_argument_group('Input', '')
+input_group.add_argument('db', help="sqlite formatted ETE3 taxa database")
+# type=int rejects a non-numeric taxid up front instead of failing mid-run.
+input_group.add_argument('taxid', metavar="INT", type=int, help='Target taxonomic ID')
+input_group.add_argument('input', nargs='+', help="Tab-delimited RefSeq Masher reports")
+
+if len(sys.argv) == 1:
+    parser.print_usage()
+    sys.exit(1)
+
+args = parser.parse_args()
+
+# Maximum mash distance tolerated between a sample and its match.
+MAX_DISTANCE = 0.05
+
+ncbi = NCBITaxa(args.db)
+# Accepted taxa: the target itself plus everything below it, internal ranks
+# included.  A set keeps the per-row membership test constant-time.
+accepted_taxids = set(ncbi.get_descendant_taxa(args.taxid, intermediate_nodes=True))
+accepted_taxids.add(args.taxid)
+
+# Samples already written, so each outlier is listed exactly once even when
+# its rows are not contiguous or are spread over several reports.
+flagged = set()
+
+with open("outlier_list.txt", "w") as output:
+    for report in args.input:
+        with open(report) as csvfile:
+            # DictReader consumes the header row itself; skipping another row
+            # here would silently discard the first match of every report.
+            reader = csv.DictReader(csvfile, delimiter='\t')
+
+            for row in reader:
+                accession = row['sample']
+                if accession in flagged:
+                    continue
+
+                off_target = int(row['taxid']) not in accepted_taxids
+                too_distant = float(row['distance']) > MAX_DISTANCE
+                if off_target or too_distant:
+                    output.write("%s\n" % accession)
+                    flagged.add(accession)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/refseq_parser/refseq_parser.xml	Tue Sep 04 02:49:49 2018 -0400
@@ -0,0 +1,38 @@
+<tool id="parse_refseq_masher" name="Parse refseq_masher matches collection" version="0.1.0">
+    <description>and output a list of outliers</description>
+    <requirements>
+        <requirement type="package" version="3.1.1">ete3</requirement>
+    </requirements>
+    <!-- Command line: taxonomy database, target taxid, then every selected report. -->
+    <command><![CDATA[
+        $__tool_directory__/refseq_parser $db $taxid #for $report in $reports# $report #end for#
+    ]]></command>
+    <inputs>
+        <param name="db" type="data" format="sqlite" label="(ETE3) Taxonomy Database"/>
+        <param name="reports" type="data" format="tabular" label="Refseq_masher report" help="" optional="false" multiple="True" />
+        <param name="taxid" type="integer" value="0" label="Taxonomic ID of target taxonomic rank" optional="false"/>
+    </inputs>
+    <!-- The script writes outlier_list.txt into the working directory. -->
+    <outputs>
+        <data name="output_file" label="Outliers from dataset" format="txt" from_work_dir="outlier_list.txt"/>
+    </outputs>
+
+    <!-- Test parameter and output names must match the declared names above
+         ("reports", "output_file"), and ftype must match the declared output format.
+         NOTE(review): the test still needs values for "db" and "taxid" before it can
+         run; no taxonomy database fixture is available here. -->
+    <tests>
+        <test>
+            <param name="reports" value="test.tsv" ftype="tabular" />
+            <output name="output_file" value="output.tsv" ftype="txt" />
+        </test>
+    </tests>
+
+    <help><![CDATA[
+        Parses refseq_masher output and returns a list of outliers based on two criteria:
+
+        1. If the mash match is not the same or a descendant of the target taxonomic ID
+        2. If the distance between the match and the sample is >0.05
+
+        Command line:
+        refseq_parser <ETE3 taxonomy database> <taxid> [input refseq_masher reports ...]
+    ]]></help>
+
+    <!-- NOTE(review): free text instead of <citation> child elements; replace once a reference exists. -->
+    <citations>
+        Manuscript in preparation
+    </citations>
+</tool>