Mercurial > repos > matnguyen > ngsweep
changeset 0:45533fb9d2f4 draft default tip
Primary version
author | matnguyen |
---|---|
date | Tue, 04 Sep 2018 02:49:49 -0400 |
parents | |
children | |
files | test-data/refseq_parser/output.tsv test-data/refseq_parser/reference.fna.gz test-data/refseq_parser/test.tsv tools/kraken_filter/kraken_filter tools/kraken_filter/kraken_filter.xml tools/qualimap_parser/qualimap_parser tools/qualimap_parser/qualimap_parser.xml tools/refseq_parser/refseq_parser tools/refseq_parser/refseq_parser.xml |
diffstat | 9 files changed, 342 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/refseq_parser/output.tsv Tue Sep 04 02:49:49 2018 -0400 @@ -0,0 +1,2 @@ +SRR6152717 +SRR6153036
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/refseq_parser/test.tsv Tue Sep 04 02:49:49 2018 -0400 @@ -0,0 +1,31 @@ +sample top_taxonomy_name distance pvalue matching full_taxonomy taxonomic_species taxonomic_genus taxonomic_family taxonomic_order taxonomic_class taxonomic_phylum taxonomic_superkingdom subspecies serovar plasmid bioproject biosample taxid assembly_accession match_id taxonomic_species group +SRR6152717 Mycobacterium tuberculosis TKK_05MA_0040 0.07493939999999999 5.471479999999998e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_05MA_0040 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02586004 1448509 GCF_000653175.1 ./rcn/refseq-NZ-1448509-PRJNA224116-SAMN02586004-GCF_000653175.1-.-Mycobacterium_tuberculosis_TKK_05MA_0040.fna Mycobacterium tuberculosis complex +SRR6152717 Mycobacterium tuberculosis MD17647 0.07493939999999999 5.152709999999997e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17647 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584623 1447472 GCF_000650435.1 ./rcn/refseq-NZ-1447472-PRJNA224116-SAMN02584623-GCF_000650435.1-.-Mycobacterium_tuberculosis_MD17647.fna Mycobacterium tuberculosis complex +SRR6152717 Mycobacterium tuberculosis MD17646 0.07493939999999999 5.719649999999997e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17646 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584624 1447473 GCF_000706405.1 
./rcn/refseq-NZ-1447473-PRJNA224116-SAMN02584624-GCF_000706405.1-.-Mycobacterium_tuberculosis_MD17646.fna Mycobacterium tuberculosis complex +SRR6152717 Mycobacterium tuberculosis MD17240 0.07493939999999999 5.256599999999999e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17240 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584627 1447476 GCF_000650495.1 ./rcn/refseq-NZ-1447476-PRJNA224116-SAMN02584627-GCF_000650495.1-.-Mycobacterium_tuberculosis_MD17240.fna Mycobacterium tuberculosis complex +SRR6152717 Mycobacterium tuberculosis MD17749 0.07493939999999999 5.147799999999998e-187 71/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD17749 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584628 1447477 NZ_JLFC ./rcn/refseq-NZ-1447477-PRJNA224116-SAMN02584628-NZ_JLFC-.-Mycobacterium_tuberculosis_MD17749.fna Mycobacterium tuberculosis complex +SRR6152731 Mycobacterium tuberculosis UG-D 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; UG-D Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02360630 1402483 GCF_000674655.1 ./rcn/refseq-NZ-1402483-PRJNA224116-SAMN02360630-GCF_000674655.1-.-Mycobacterium_tuberculosis_UG_D.fna Mycobacterium tuberculosis complex +SRR6152731 Mycobacterium tuberculosis TKK_04_0148 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; 
TKK_04_0148 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02586095 1448600 GCF_000656935.1 ./rcn/refseq-NZ-1448600-PRJNA224116-SAMN02586095-GCF_000656935.1-.-Mycobacterium_tuberculosis_TKK_04_0148.fna Mycobacterium tuberculosis complex +SRR6152731 Mycobacterium tuberculosis UT0058 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; UT0058 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02381013 1408936 GCF_000668595.1 ./rcn/refseq-NZ-1408936-PRJNA224116-SAMN02381013-GCF_000668595.1-.-Mycobacterium_tuberculosis_UT0058.fna Mycobacterium tuberculosis complex +SRR6152731 Mycobacterium tuberculosis BTB10-001 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB10-001 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02414929 1423513 GCF_000678695.1 ./rcn/refseq-NZ-1423513-PRJNA224116-SAMN02414929-GCF_000678695.1-.-Mycobacterium_tuberculosis_BTB10_001.fna Mycobacterium tuberculosis complex +SRR6152731 Mycobacterium tuberculosis BTB10-142 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB10-142 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02414934 1423518 GCF_000678775.1 ./rcn/refseq-NZ-1423518-PRJNA224116-SAMN02414934-GCF_000678775.1-.-Mycobacterium_tuberculosis_BTB10_142.fna Mycobacterium tuberculosis complex +SRR6152844 Mycobacterium tuberculosis XTB13-092 7.82718e-05 0.0 
399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; XTB13-092 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02419542 1427186 GCF_000680035.1 ./rcn/refseq-NZ-1427186-PRJNA224116-SAMN02419542-GCF_000680035.1-.-Mycobacterium_tuberculosis_XTB13_092.fna Mycobacterium tuberculosis complex +SRR6152844 Mycobacterium tuberculosis TKK_03_0065 7.82718e-05 0.0 399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_03_0065 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02585907 1448412 GCF_000651735.1 ./rcn/refseq-NZ-1448412-PRJNA224116-SAMN02585907-GCF_000651735.1-.-Mycobacterium_tuberculosis_TKK_03_0065.fna Mycobacterium tuberculosis complex +SRR6152844 Mycobacterium tuberculosis TKK_03_0072 7.82718e-05 0.0 399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_03_0072 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02585909 1448414 GCF_000651755.1 ./rcn/refseq-NZ-1448414-PRJNA224116-SAMN02585909-GCF_000651755.1-.-Mycobacterium_tuberculosis_TKK_03_0072.fna Mycobacterium tuberculosis complex +SRR6152844 Mycobacterium tuberculosis 7.82718e-05 0.0 399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN03338743 1773 GCF_000972425.1 
./rcn/refseq-NZ-1773-PRJNA224116-SAMN03338743-GCF_000972425.1-.-Mycobacterium_tuberculosis.fna Mycobacterium tuberculosis complex +SRR6152844 Mycobacterium tuberculosis X122 7.82718e-05 0.0 399/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; X122 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02471089 747368 GCF_000184025.1 ./rcn/refseq-NZ-747368-PRJNA224116-SAMN02471089-GCF_000184025.1-.-Mycobacterium_tuberculosis_X122.fna Mycobacterium tuberculosis complex +SRR6152991 Mycobacterium tuberculosis 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN03177496 1773 NZ_JUFU ./rcn/refseq-NZ-1773-PRJNA224116-SAMN03177496-NZ_JUFU-.-Mycobacterium_tuberculosis.fna Mycobacterium tuberculosis complex +SRR6152991 Mycobacterium tuberculosis UT0110 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; UT0110 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02381044 1408967 GCF_000669155.1 ./rcn/refseq-NZ-1408967-PRJNA224116-SAMN02381044-GCF_000669155.1-.-Mycobacterium_tuberculosis_UT0110.fna Mycobacterium tuberculosis complex +SRR6152991 Mycobacterium tuberculosis KT-0022 0.000156838 0.0 398/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; KT-0022 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria 
Actinobacteria Bacteria PRJNA224116 SAMN02360562 1400884 GCF_000673755.1 ./rcn/refseq-NZ-1400884-PRJNA224116-SAMN02360562-GCF_000673755.1-.-Mycobacterium_tuberculosis_KT_0022.fna Mycobacterium tuberculosis complex +SRR6152991 Mycobacterium tuberculosis BTB07-275 0.000235701 0.0 397/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB07-275 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02414885 1423469 GCF_000677935.1 ./rcn/refseq-NZ-1423469-PRJNA224116-SAMN02414885-GCF_000677935.1-.-Mycobacterium_tuberculosis_BTB07_275.fna Mycobacterium tuberculosis complex +SRR6152991 Mycobacterium tuberculosis TKK_05SA_0021 0.000235701 0.0 397/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK_05SA_0021 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02586030 1448535 GCF_000653515.1 ./rcn/refseq-NZ-1448535-PRJNA224116-SAMN02586030-GCF_000653515.1-.-Mycobacterium_tuberculosis_TKK_05SA_0021.fna Mycobacterium tuberculosis complex +SRR6153036 Staphylococcus epidermidis AU12-03 0.0227453 0.0 213/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; AU12-03 Staphylococcus epidermidis Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria PRJNA180900 SAMN01103171 1220510 NZ_AMCS ./rcn/refseq-NZ-1220510-PRJNA180900-SAMN01103171-NZ_AMCS-.-Staphylococcus_epidermidis_AU12_03.fna +SRR6153036 Staphylococcus epidermidis SK135 0.022937400000000004 0.0 212/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; SK135 Staphylococcus epidermidis Staphylococcus Staphylococcaceae 
Bacillales Bacilli Firmicutes Bacteria PRJNA42967 SAMN00008358 596317 NZ_ADEY ./rcn/refseq-NZ-596317-PRJNA42967-SAMN00008358-NZ_ADEY-.-Staphylococcus_epidermidis_SK135.fna +SRR6153036 Staphylococcus epidermidis VCU109 0.022937400000000004 0.0 212/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; VCU109 Staphylococcus epidermidis Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria PRJNA179852 SAMN00116832 904330 NZ_AFUA ./rcn/refseq-NZ-904330-PRJNA179852-SAMN00116832-NZ_AFUA-.-Staphylococcus_epidermidis_VCU109.fna +SRR6153036 Staphylococcus epidermidis 0.0233253 0.0 210/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis Staphylococcus epidermidis Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria PRJNA224116 SAMN02640611 1282 NZ_JMIF ./rcn/refseq-NZ-1282-PRJNA224116-SAMN02640611-NZ_JMIF-.-Staphylococcus_epidermidis.fna +SRR6153036 Staphylococcus epidermidis VCU125 0.024116 0.0 206/400 Bacteria; Terrabacteria group; Firmicutes; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus; epidermidis; VCU125 Staphylococcus epidermidis Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria PRJNA180069 SAMN00116836 904341 NZ_AHLF ./rcn/refseq-NZ-904341-PRJNA180069-SAMN00116836-NZ_AHLF-.-Staphylococcus_epidermidis_VCU125.fna +SRR6153231 Mycobacterium tuberculosis MD19051 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; MD19051 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02584606 1447455 GCF_000650115.1 ./rcn/refseq-NZ-1447455-PRJNA224116-SAMN02584606-GCF_000650115.1-.-Mycobacterium_tuberculosis_MD19051.fna Mycobacterium tuberculosis complex +SRR6153231 Mycobacterium tuberculosis 
TKK-01-0080 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; TKK-01-0080 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN01828251 1267364 GCF_000659185.1 ./rcn/refseq-NZ-1267364-PRJNA224116-SAMN01828251-GCF_000659185.1-.-Mycobacterium_tuberculosis_TKK_01_0080.fna Mycobacterium tuberculosis complex +SRR6153231 Mycobacterium tuberculosis BTB12-315 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB12-315 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02414981 1423565 GCF_000679635.1 ./rcn/refseq-NZ-1423565-PRJNA224116-SAMN02414981-GCF_000679635.1-.-Mycobacterium_tuberculosis_BTB12_315.fna Mycobacterium tuberculosis complex +SRR6153231 Mycobacterium tuberculosis BTB05-552 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB05-552 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02472041 882099 GCF_000220435.1 ./rcn/refseq-NZ-882099-PRJNA224116-SAMN02472041-GCF_000220435.1-.-Mycobacterium_tuberculosis_BTB05_552.fna Mycobacterium tuberculosis complex +SRR6153231 Mycobacterium tuberculosis BTB05-559 0.0317811 0.0 172/400 Bacteria; Terrabacteria group; Actinobacteria; ; Corynebacteriales; Mycobacteriaceae; Mycobacterium; tuberculosis complex; Mycobacterium tuberculosis; BTB05-559 Mycobacterium tuberculosis Mycobacterium Mycobacteriaceae Corynebacteriales Actinobacteria Actinobacteria Bacteria PRJNA224116 SAMN02472033 882100 GCF_000220455.1 
./rcn/refseq-NZ-882100-PRJNA224116-SAMN02472033-GCF_000220455.1-.-Mycobacterium_tuberculosis_BTB05_559.fna Mycobacterium tuberculosis complex
#!/usr/bin/env python
# Origin: tools/kraken_filter/kraken_filter, added in changeset 0:45533fb9d2f4
# (Tue Sep 04 02:49:49 2018 -0400).
"""Trim contaminated reads from FASTQ files using Kraken classifications.

Usage:
    kraken_filter [--p] <ETE3 taxonomy database> <kraken output> <taxid>
                  <fastq> [fastq_reverse]

Every read whose Kraken taxid is the target taxid, one of its descendants,
or 0 (unclassified) is copied to ``input_1.fastq`` (and ``input_2.fastq``
for the reverse mate).  Reads assigned to any other taxon, and reads that
do not appear in the Kraken output at all, are dropped.
"""

import argparse as ap
import itertools
import sys


def classify_taxid(tax_id, target_taxid, descendants):
    """Return the category of a single Kraken taxid.

    tax_id        -- taxid as read from the Kraken output (string or int)
    target_taxid  -- integer taxid of the organism being kept
    descendants   -- container of integer taxids below ``target_taxid``

    Returns "unclassified" for taxid 0, "target" for the target taxid or
    one of its descendants, and "other" for everything else.
    """
    tax_id = int(tax_id)  # the Kraken column is text; compare as integers
    if tax_id == 0:
        return "unclassified"
    if tax_id == target_taxid or tax_id in descendants:
        return "target"
    return "other"


def read_classifications(report):
    """Return a dict mapping read id -> taxid string from a Kraken output.

    Each non-blank line is expected to be tab separated with the read id in
    the second column and the taxid in the third.  Raises ValueError when a
    line has fewer than three columns.
    """
    kraken = {}
    with open(report, 'r') as classification:
        for number, line in enumerate(classification, 1):
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            fields = line.split("\t")
            if len(fields) < 3:
                raise ValueError("%s: line %d is not a Kraken classification "
                                 "line" % (report, number))
            kraken[fields[1]] = fields[2]
    return kraken


def parse_kraken_results(db, report, taxid):
    """Classify every read of a Kraken output relative to ``taxid``.

    db      -- path to the sqlite formatted ETE3 taxonomy database
    report  -- path to the Kraken per-read output
    taxid   -- target taxonomic ID

    Returns a dict mapping read id -> "unclassified", "target" or "other".
    """
    # Imported here so the pure helpers of this module stay importable
    # without ete3 being installed.
    from ete3 import NCBITaxa

    ncbi = NCBITaxa(db)
    # intermediate_nodes=True: Kraken assigns reads to internal nodes of the
    # taxonomy as well as to leaves, so internal descendants must count too.
    descendants = set(ncbi.get_descendant_taxa(taxid, intermediate_nodes=True))
    target = int(taxid)

    kraken_class = {}
    for read_id, tax_id in read_classifications(report).items():
        kraken_class[read_id] = classify_taxid(tax_id, target, descendants)
    return kraken_class


def read_id_from_header(header):
    """Return the read id of a FASTQ header line.

    Takes the text before the first space, removes a "/1" or "/2" style
    suffix and the leading "@".  The line terminator is removed first so
    that headers without any space still match the Kraken read id.
    """
    return header.rstrip("\r\n").split(" ")[0].split("/")[0][1:]


def filter_fastq(fastq_in, fastq_out, kraken_class):
    """Copy the records of ``fastq_in`` that are not "other" to ``fastq_out``.

    Records are read as groups of four lines.  A record is kept only when
    its read id is present in ``kraken_class`` with a category other than
    "other".
    """
    with open(fastq_in, 'r') as f_in, open(fastq_out, 'w') as f_out:
        lines = iter(f_in)
        for header in lines:
            # Pull the remaining three lines of the record from the same
            # iterator instead of mixing iteration with readline().
            rest = list(itertools.islice(lines, 3))
            if kraken_class.get(read_id_from_header(header), "other") != "other":
                f_out.write(header)
                f_out.writelines(rest)


def kraken_trim(db, report, taxid, paired, fastq, fastq2):
    """Write the filtered FASTQ file(s) into the current directory.

    The forward file is written to ``input_1.fastq`` and, when ``paired``
    is true, the reverse file to ``input_2.fastq``.  Raises ValueError when
    ``paired`` is true but ``fastq2`` is missing.
    """
    if paired and fastq2 is None:
        raise ValueError("paired mode requires a reverse FASTQ mate")

    kraken = parse_kraken_results(db, report, taxid)

    files = [fastq, fastq2] if paired else [fastq]
    for index, fastq_in in enumerate(files, 1):
        filter_fastq(fastq_in, 'input_%d.fastq' % index, kraken)


def main(argv=None):
    """Parse the command line (``sys.argv`` by default) and run the trim."""
    if argv is None:
        argv = sys.argv[1:]

    parser = ap.ArgumentParser(prog='kraken_trim', conflict_handler='resolve',
                               description="Trims contaminated reads using Kraken reports")

    inputs = parser.add_argument_group('Input', '')
    inputs.add_argument('db', help="sqlite formatted ETE3 taxa database")
    inputs.add_argument('report', help="Kraken report")
    inputs.add_argument('taxid', type=int, help="Target taxonomic ID")
    inputs.add_argument('fastq', help="FASTQ file")
    inputs.add_argument('fastq2', nargs='?', help="Reverse FASTQ mate ")
    inputs.add_argument('--p', '--paired', action='store_true', help="Paired FASTQ files")

    if not argv:
        parser.print_usage()
        sys.exit(1)

    args = parser.parse_args(argv)
    if args.p and args.fastq2 is None:
        parser.error("--p/--paired requires a reverse FASTQ mate")

    kraken_trim(args.db, args.report, args.taxid, args.p, args.fastq, args.fastq2)


if __name__ == "__main__":
    main()
<!-- Origin: tools/kraken_filter/kraken_filter.xml, added in changeset 0:45533fb9d2f4. -->
<!-- Galaxy wrapper for the kraken_filter script: passes the taxonomy database,
     the Kraken output, the target taxid and one or two FASTQ files, and
     collects input_1.fastq / input_2.fastq from the working directory. -->
<tool id="kraken_trim" name="Trim contaminated reads" version="0.1.0">
    <description>by parsing Kraken reports</description>
    <requirements>
        <requirement type="package" version="3.1.1">ete3</requirement>
    </requirements>
    <command><![CDATA[
        $__tool_directory__/kraken_filter

        #if $fastq_input.input_type == "paired" or $fastq_input.input_type == "paired_collection"
            "--p"
        #end if

        "${db}"
        "${report}"
        "${taxid}"

        #if $fastq_input.input_type == "single"
            "${fastq_input.fastq1}"
        #elif $fastq_input.input_type == "paired"
            "${fastq_input.fastq1}" "${fastq_input.fastq2}"
        #elif $fastq_input.input_type == "paired_collection"
            "${fastq_input.fastq1.forward}" "${fastq_input.fastq1.reverse}"
        #end if
    ]]></command>
    <inputs>
        <conditional name="fastq_input">
            <param name="input_type" type="select" label="Single or Paired-end reads">
                <option value="paired">Paired</option>
                <option value="single">Single</option>
                <option value="paired_collection">Paired Collection</option>
            </param>
            <when value="paired">
                <param name="fastq1" type="data" format="fastqsanger,fastq"
                       label="Select fastq dataset with forward reads"/>
                <param name="fastq2" type="data" format="fastqsanger,fastq"
                       label="Select fastq dataset with reverse reads"/>
            </when>
            <when value="single">
                <param name="fastq1" type="data" format="fastqsanger,fastq"
                       label="Select fastq dataset"/>
            </when>
            <when value="paired_collection">
                <param name="fastq1" type="data_collection" collection_type="paired"
                       format="fastqsanger,fastq" label="Select paired collection"/>
            </when>
        </conditional>
        <param name="db" type="data" format="sqlite" label="(ETE3) Taxonomy Database"/>
        <param name="report" type="data" format="tabular" label="Kraken report" help="" optional="false"/>
        <param name="taxid" type="integer" value="0" label="Taxonomic ID of target taxonomic rank" optional="false"/>
    </inputs>
    <outputs>
        <!-- Exactly one of the three output groups below is produced,
             selected by the input_type filters. -->
        <data name="fastq_trim" label="${tool.name} on ${on_string}" format_source="fastq1"
              from_work_dir="input_1.fastq">
            <filter>fastq_input['input_type'] == 'single'</filter>
        </data>

        <collection name="trimmed_paired_collection" type="paired" label="${tool.name} on ${on_string}">
            <data name="forward" format_source="fastq1['forward']" from_work_dir="input_1.fastq"/>
            <!-- The reverse element takes its format from the reverse input. -->
            <data name="reverse" format_source="fastq1['reverse']" from_work_dir="input_2.fastq"/>
            <filter>fastq_input['input_type'] == 'paired_collection'</filter>
        </collection>

        <data name="fastq1_trim" label="${tool.name} on ${on_string}" format_source="fastq1"
              from_work_dir="input_1.fastq">
            <filter>fastq_input['input_type'] == 'paired'</filter>
        </data>
        <data name="fastq2_trim" label="${tool.name} on ${on_string}: reverse mate" format_source="fastq2"
              from_work_dir="input_2.fastq">
            <filter>fastq_input['input_type'] == 'paired'</filter>
        </data>
    </outputs>

    <!-- TODO: no test case is defined yet. -->
    <tests>
        <test>

        </test>
    </tests>

    <help><![CDATA[
        Trims contaminated reads from Next-Generation Sequencing data using Kraken outputs.

        Command line:
        kraken_filter [--p] <ETE3 taxonomy database> <kraken report> <taxid> <fastq> [fastq_reverse]

    ]]></help>

    <citations>
        Manuscript in preparation
    </citations>
</tool>
#!/usr/bin/python
# Origin: tools/qualimap_parser/qualimap_parser, added in changeset
# 0:45533fb9d2f4 (Tue Sep 04 02:49:49 2018 -0400).
"""List the samples whose Qualimap report marks them as mapping outliers.

Usage: qualimap_parser -n <names> -i <reports>

The n-th name belongs to the n-th report.  A sample is an outlier when its
percentage of mapped reads is below MIN_MAPPED_PERCENTAGE or its mean
mapping quality is below MIN_MEAN_MAPPING_QUALITY.  Outlier names are
appended, one per line, to ``outlier_list.txt`` in the current directory.
"""

import argparse as ap
import sys

# Thresholds below which a sample is reported as an outlier.
MIN_MAPPED_PERCENTAGE = 90
MIN_MEAN_MAPPING_QUALITY = 10


def parse_report(report):
    """Return ``(mapped_percentage, mean_mapping_quality)`` as floats.

    The values are taken from the last whitespace-separated token of the
    lines containing "number of mapped reads" and "mean mapping quality";
    when a phrase occurs several times the last occurrence wins.

    Raises ValueError when either metric is absent from the file or is not
    a number.  The values are local to one report, so a report that lacks a
    metric can never inherit the value parsed from a previous report.
    """
    mapped_percentage = None
    mean_mapping_quality = None

    with open(report) as handle:
        for line in handle:
            if "number of mapped reads" in line:
                # The last token looks like "(98.5%)".
                mapped_percentage = line.split()[-1].strip('()%')
            if "mean mapping quality" in line:
                mean_mapping_quality = line.split()[-1]

    if mapped_percentage is None or mean_mapping_quality is None:
        raise ValueError("%s: missing 'number of mapped reads' or "
                         "'mean mapping quality' line" % report)

    return float(mapped_percentage), float(mean_mapping_quality)


def is_outlier(mapped_percentage, mean_mapping_quality):
    """Return True when either metric is below its threshold."""
    return (mapped_percentage < MIN_MAPPED_PERCENTAGE
            or mean_mapping_quality < MIN_MEAN_MAPPING_QUALITY)


def main(argv=None):
    """Parse the command line (``sys.argv`` by default) and write the list."""
    if argv is None:
        argv = sys.argv[1:]

    parser = ap.ArgumentParser(prog='outlier-parser', conflict_handler='resolve',
                               description="Parses Qualimap output file to eliminate outliers")

    inputs = parser.add_argument_group('Input', '')
    inputs.add_argument('-n', '--name', nargs='+', required=True, help="Sample name")
    inputs.add_argument('-i', '--input', nargs='+', required=True, help="Qualimap Genome Report File")

    if not argv:
        parser.print_usage()
        sys.exit(1)

    args = parser.parse_args(argv)
    if len(args.name) != len(args.input):
        parser.error("expected one sample name per report, got %d names and "
                     "%d reports" % (len(args.name), len(args.input)))

    # Append mode is kept from the original script: repeated invocations in
    # the same directory accumulate into a single list.
    with open("outlier_list.txt", "a") as output:
        for name, report in zip(args.name, args.input):
            mapped_percentage, mean_mapping_quality = parse_report(report)
            if is_outlier(mapped_percentage, mean_mapping_quality):
                output.write("%s\n" % name)


if __name__ == "__main__":
    main()
<!-- Origin: tools/qualimap_parser/qualimap_parser.xml, added in changeset 0:45533fb9d2f4. -->
<!-- Galaxy wrapper for the qualimap_parser script: passes one sample name
     (the dataset's element identifier) and one report path per selected
     dataset, and collects outlier_list.txt from the working directory. -->
<tool id="parse_qualimap" name="Parse Qualimap reports" version="0.1.0">
    <description>and output a list of outliers</description>
    <command><![CDATA[
        $__tool_directory__/qualimap_parser -n #for $report in $reports# '${report.element_identifier}' #end for#
        -i #for $report in $reports# '$report' #end for#
    ]]></command>
    <inputs>
        <param name="reports" type="data" format="txt" label="Qualimap Genome Report" help="" optional="false" multiple="True" />
    </inputs>
    <outputs>
        <data name="output_file" label="Outliers from dataset after mapping" format="txt" from_work_dir="outlier_list.txt"/>
    </outputs>

    <!-- TODO: no test case is defined yet. -->
    <tests>
        <test>

        </test>
    </tests>

    <help><![CDATA[
        Parses Qualimap reports and outputs a list of outliers based on the percentage of mapped reads (below 90%) and the mean mapping quality (below 10).

        Command line:
        qualimap_parser -n <accessions> -i <reports>

    ]]></help>

    <citations>
        Manuscript in preparation
    </citations>
</tool>
#!/usr/bin/env python
# Origin: tools/refseq_parser/refseq_parser, added in changeset 0:45533fb9d2f4
# (Tue Sep 04 02:49:49 2018 -0400).
"""List the samples whose RefSeq Masher matches mark them as outliers.

Usage:
    refseq_parser <ETE3 taxonomy database> <taxid> <report> [report ...]

A match is an outlier when its taxid is neither the target taxid nor one
of its descendants, or when its distance is greater than MAX_DISTANCE.  A
sample is reported once, as soon as one of its matches is an outlier.  The
sample names are written, one per line, to ``outlier_list.txt`` in the
current directory.
"""

import argparse as ap
import csv
import sys

# Matches farther away than this are outliers.
MAX_DISTANCE = 0.05


def is_outlier(row, taxid, descendants):
    """Return True when one report row is an outlier match.

    row          -- mapping with at least the 'taxid' and 'distance' keys
    taxid        -- integer target taxonomic ID
    descendants  -- container of integer taxids below ``taxid``
    """
    match_taxid = int(row['taxid'])
    off_target = match_taxid != taxid and match_taxid not in descendants
    return off_target or float(row['distance']) > MAX_DISTANCE


def find_outliers(rows, taxid, descendants, reported=None):
    """Yield each sample name of ``rows`` that has an outlier match.

    Every row is examined until its sample has been reported, so a sample is
    yielded at most once and no row is skipped because of an earlier,
    unrelated sample.  ``reported`` is the set of sample names already
    yielded; pass the same set again to avoid duplicates across several
    calls.
    """
    if reported is None:
        reported = set()
    for row in rows:
        accession = row['sample']
        if accession in reported:
            continue
        if is_outlier(row, taxid, descendants):
            reported.add(accession)
            yield accession


def report_outliers(report, taxid, descendants, reported=None):
    """Return the list of outlier sample names of one tab-delimited report.

    The first line of the file is the header; csv.DictReader consumes it
    itself, so every following line is treated as data.
    """
    with open(report) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        return list(find_outliers(reader, taxid, descendants, reported))


def main(argv=None):
    """Parse the command line (``sys.argv`` by default) and write the list."""
    if argv is None:
        argv = sys.argv[1:]

    parser = ap.ArgumentParser(prog='outlier-parser', conflict_handler='resolve',
                               description="Parses output file to eliminate outliers")

    inputs = parser.add_argument_group('Input', '')
    inputs.add_argument('db', help="sqlite formatted ETE3 taxa database")
    inputs.add_argument('taxid', metavar="INT", type=int, help='Target taxonomic ID')
    inputs.add_argument('input', nargs='+', help="Tab-delimited RefSeq Masher reports")

    if not argv:
        parser.print_usage()
        sys.exit(1)

    args = parser.parse_args(argv)

    # Imported here so the pure helpers of this module stay importable
    # without ete3 being installed.
    from ete3 import NCBITaxa

    ncbi = NCBITaxa(args.db)
    # A set gives constant-time membership tests for every report row.
    descendants = set(ncbi.get_descendant_taxa(args.taxid, intermediate_nodes=True))

    reported = set()
    with open("outlier_list.txt", "w") as output:
        for report in args.input:
            for accession in report_outliers(report, args.taxid, descendants, reported):
                output.write("%s\n" % accession)


if __name__ == "__main__":
    main()
<!-- Origin: tools/refseq_parser/refseq_parser.xml, added in changeset 0:45533fb9d2f4. -->
<!-- Galaxy wrapper for the refseq_parser script: passes the taxonomy
     database, the target taxid and every selected report, and collects
     outlier_list.txt from the working directory. -->
<tool id="parse_refseq_masher" name="Parse refseq_masher matches collection" version="0.1.0">
    <description>and output a list of outliers</description>
    <requirements>
        <requirement type="package" version="3.1.1">ete3</requirement>
    </requirements>
    <command><![CDATA[
        $__tool_directory__/refseq_parser '$db' '$taxid' #for $report in $reports# '$report' #end for#
    ]]></command>
    <inputs>
        <param name="db" type="data" format="sqlite" label="(ETE3) Taxonomy Database"/>
        <param name="reports" type="data" format="tabular" label="Refseq_masher report" help="" optional="false" multiple="True" />
        <param name="taxid" type="integer" value="0" label="Taxonomic ID of target taxonomic rank" optional="false"/>
    </inputs>
    <outputs>
        <data name="output_file" label="Outliers from dataset" format="txt" from_work_dir="outlier_list.txt"/>
    </outputs>

    <tests>
        <test>
            <!-- Names match the declared "reports" input and "output_file" output. -->
            <!-- TODO: the test still needs values for "db" and "taxid" to be runnable. -->
            <param name="reports" value="test.tsv" ftype="tabular" />
            <output name="output_file" value="output.tsv" ftype="txt" />
        </test>
    </tests>

    <help><![CDATA[
        Parses refseq_masher output and returns a list of outliers based on two criteria:

        1. If the mash match is not the same or a descendant of the target taxonomic ID
        2. If the distance between the match and the sample is >0.05

        Command line:
        refseq_parser <ETE3 taxonomy database> <taxid> [input refseq_masher reports ...]
    ]]></help>

    <citations>
        Manuscript in preparation
    </citations>
</tool>