Mercurial > repos > greg > vsnp_determine_ref_from_data
changeset 0:ebc08e5ce646 draft
Uploaded
| author   | greg |
|----------|------|
| date     | Tue, 21 Apr 2020 10:08:28 -0400 |
| parents  | |
| children | bca267738b33 |
| files    | .shed.yml vsnp_determine_ref_from_data.py vsnp_determine_ref_from_data.xml |
| diffstat | 3 files changed, 544 insertions(+), 0 deletions(-) |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Tue Apr 21 10:08:28 2020 -0400
@@ -0,0 +1,11 @@
+name: vsnp_determine_ref_from_data
+owner: greg
+description: |
+  Contains a tool that sniffs bacterial genome samples to discover the primary species within the sample.
+homepage_url: https://github.com/USDA-VS/vSNP
+long_description: |
+  Contains a tool that sniffs bacterial genome samples to discover the primary species within the sample.
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_determine_ref_from_data
+type: unrestricted
+categories:
+  - Sequence Analysis
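Note: the `.shed.yml` above is the repository metadata that planemo reads when creating or updating the Tool Shed repository. As a quick illustrative sanity check (a minimal sketch, assuming PyYAML is installed and the file sits in the current directory; the field list below just mirrors this file, not an exhaustive Tool Shed schema):

import yaml

# Parse the repository metadata shown in the diff above.  The path is
# relative to a local checkout of the repository.
with open(".shed.yml", "rt") as fh:
    shed = yaml.safe_load(fh)

# 'name' and 'owner' identify the Tool Shed repository; 'categories'
# places it in the "Sequence Analysis" category.
for field in ("name", "owner", "description", "categories"):
    print(field, "->", shed.get(field))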
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vsnp_determine_ref_from_data.py	Tue Apr 21 10:08:28 2020 -0400
@@ -0,0 +1,367 @@
+#!/usr/bin/env python
+
+import argparse
+import gzip
+import multiprocessing
+import os
+import queue
+import yaml
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+from collections import OrderedDict
+
+INPUT_READS_DIR = 'input_reads'
+OUTPUT_DBKEY_DIR = 'output_dbkey'
+OUTPUT_METRICS_DIR = 'output_metrics'
+
+
+def get_base_file_name(file_path):
+    base_file_name = os.path.basename(file_path)
+    if base_file_name.find(".") > 0:
+        # Eliminate the extension.
+        return os.path.splitext(base_file_name)[0]
+    elif base_file_name.find("_") > 0:
+        # The dot extension was likely changed to
+        # the "_" character.
+        items = base_file_name.split("_")
+        no_ext = "_".join(items[0:-2])
+        if len(no_ext) > 0:
+            return no_ext
+    return base_file_name
+
+
+def get_dbkey(dnaprints_dict, key, s):
+    # dnaprints_dict looks something like this:
+    # {'brucella': {'NC_002945v4': ['11001110', '11011110', '11001100']},
+    #  'bovis': {'NC_006895': ['11111110', '00010010', '01111011']}}
+    d = dnaprints_dict.get(key, {})
+    for data_table_value, v_list in d.items():
+        if s in v_list:
+            return data_table_value
+    return ""
+
+
+def get_dnaprints_dict(dnaprint_fields):
+    # A dnaprint_fields entry looks something like this:
+    # [['AF2122', '/galaxy/tool-data/vsnp/AF2122/dnaprints/NC_002945v4.yml']]
+    dnaprints_dict = {}
+    for item in dnaprint_fields:
+        # Here item is a 2-element list of data
+        # table components, value and path.
+        value = item[0]
+        path = item[1].strip()
+        with open(path, "rt") as fh:
+            # The format of all dnaprints yaml
+            # files is something like this:
+            # brucella:
+            # - 0111111111111111
+            print_dict = yaml.load(fh, Loader=yaml.Loader)
+        for print_dict_k, print_dict_v in print_dict.items():
+            dnaprints_v_dict = dnaprints_dict.get(print_dict_k, {})
+            if len(dnaprints_v_dict) > 0:
+                # dnaprints_dict already contains k (e.g., 'brucella'),
+                # and dnaprints_v_dict will be a dictionary that
+                # looks something like this:
+                # {'NC_002945v4': ['11001110', '11011110', '11001100']}
+                value_list = dnaprints_v_dict.get(value, [])
+                value_list = value_list + print_dict_v
+                dnaprints_v_dict[value] = value_list
+            else:
+                # dnaprints_v_dict is an empty dictionary.
+                dnaprints_v_dict[value] = print_dict_v
+            dnaprints_dict[print_dict_k] = dnaprints_v_dict
+    # dnaprints_dict now looks something like this:
+    # {'brucella': {'NC_002945v4': ['11001110', '11011110', '11001100']},
+    #  'bovis': {'NC_006895': ['11111110', '00010010', '01111011']}}
+    return dnaprints_dict
+
+
+def get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum):
+    if brucella_sum > 3:
+        group = "Brucella"
+        dbkey = get_dbkey(dnaprints_dict, "brucella", brucella_string)
+    elif bovis_sum > 3:
+        group = "TB"
+        dbkey = get_dbkey(dnaprints_dict, "bovis", bovis_string)
+    elif para_sum >= 1:
+        group = "paraTB"
+        dbkey = get_dbkey(dnaprints_dict, "para", para_string)
+    else:
+        group = ""
+        dbkey = ""
+    return group, dbkey
+
+
+def get_group_and_dbkey_for_collection(task_queue, finished_queue, dnaprints_dict, timeout):
+    while True:
+        try:
+            tup = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        fastq_file, count_list, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum = tup
+        group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum)
+        finished_queue.put((fastq_file, count_list, group, dbkey))
+        task_queue.task_done()
+
+
+def get_oligo_dict():
+    oligo_dict = {}
+    oligo_dict["01_ab1"] = "AATTGTCGGATAGCCTGGCGATAACGACGC"
+    oligo_dict["02_ab3"] = "CACACGCGGGCCGGAACTGCCGCAAATGAC"
+    oligo_dict["03_ab5"] = "GCTGAAGCGGCAGACCGGCAGAACGAATAT"
+    oligo_dict["04_mel"] = "TGTCGCGCGTCAAGCGGCGTGAAATCTCTG"
+    oligo_dict["05_suis1"] = "TGCGTTGCCGTGAAGCTTAATTCGGCTGAT"
+    oligo_dict["06_suis2"] = "GGCAATCATGCGCAGGGCTTTGCATTCGTC"
+    oligo_dict["07_suis3"] = "CAAGGCAGATGCACATAATCCGGCGACCCG"
+    oligo_dict["08_ceti1"] = "GTGAATATAGGGTGAATTGATCTTCAGCCG"
+    oligo_dict["09_ceti2"] = "TTACAAGCAGGCCTATGAGCGCGGCGTGAA"
+    oligo_dict["10_canis4"] = "CTGCTACATAAAGCACCCGGCGACCGAGTT"
+    oligo_dict["11_canis"] = "ATCGTTTTGCGGCATATCGCTGACCACAGC"
+    oligo_dict["12_ovis"] = "CACTCAATCTTCTCTACGGGCGTGGTATCC"
+    oligo_dict["13_ether2"] = "CGAAATCGTGGTGAAGGACGGGACCGAACC"
+    oligo_dict["14_63B1"] = "CCTGTTTAAAAGAATCGTCGGAACCGCTCT"
+    oligo_dict["15_16M0"] = "TCCCGCCGCCATGCCGCCGAAAGTCGCCGT"
+    oligo_dict["16_mel1b"] = "TCTGTCCAAACCCCGTGACCGAACAATAGA"
+    oligo_dict["17_tb157"] = "CTCTTCGTATACCGTTCCGTCGTCACCATGGTCCT"
+    oligo_dict["18_tb7"] = "TCACGCAGCCAACGATATTCGTGTACCGCGACGGT"
+    oligo_dict["19_tbbov"] = "CTGGGCGACCCGGCCGACCTGCACACCGCGCATCA"
+    oligo_dict["20_tb5"] = "CCGTGGTGGCGTATCGGGCCCCTGGATCGCGCCCT"
+    oligo_dict["21_tb2"] = "ATGTCTGCGTAAAGAAGTTCCATGTCCGGGAAGTA"
+    oligo_dict["22_tb3"] = "GAAGACCTTGATGCCGATCTGGGTGTCGATCTTGA"
+    oligo_dict["23_tb4"] = "CGGTGTTGAAGGGTCCCCCGTTCCAGAAGCCGGTG"
+    oligo_dict["24_tb6"] = "ACGGTGATTCGGGTGGTCGACACCGATGGTTCAGA"
+    oligo_dict["25_para"] = "CCTTTCTTGAAGGGTGTTCG"
+    oligo_dict["26_para_sheep"] = "CGTGGTGGCGACGGCGGCGGGCCTGTCTAT"
+    oligo_dict["27_para_cattle"] = "TCTCCTCGGTCGGTGATTCGGGGGCGCGGT"
+    return oligo_dict
+
+
+def get_seq_counts(value, fastq_list, gzipped):
+    count = 0
+    for fastq_file in fastq_list:
+        if gzipped == "true":
+            with gzip.open(fastq_file, 'rt') as fh:
+                for title, seq, qual in FastqGeneralIterator(fh):
+                    count += seq.count(value)
+        else:
+            with open(fastq_file, 'r') as fh:
+                for title, seq, qual in FastqGeneralIterator(fh):
+                    count += seq.count(value)
+    return (value, count)
+
+
+def get_species_counts(fastq_list, gzipped):
+    count_summary = {}
+    oligo_dict = get_oligo_dict()
+    for v1 in oligo_dict.values():
+        returned_value, count = get_seq_counts(v1, fastq_list, gzipped)
+        for key, v2 in oligo_dict.items():
+            if returned_value == v2:
+                count_summary.update({key: count})
+    count_list = []
+    for v in count_summary.values():
+        count_list.append(v)
+    brucella_sum = sum(count_list[:16])
+    bovis_sum = sum(count_list[16:24])
+    para_sum = sum(count_list[24:])
+    return count_summary, count_list, brucella_sum, bovis_sum, para_sum
+
+
+def get_species_counts_for_collection(task_queue, finished_queue, gzipped, timeout):
+    while True:
+        try:
+            fastq_file = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts([fastq_file], gzipped)
+        finished_queue.put((fastq_file, count_summary, count_list, brucella_sum, bovis_sum, para_sum))
+        task_queue.task_done()
+
+
+def get_species_strings(count_summary):
+    binary_dictionary = {}
+    for k, v in count_summary.items():
+        if v > 1:
+            binary_dictionary.update({k: 1})
+        else:
+            binary_dictionary.update({k: 0})
+    binary_dictionary = OrderedDict(sorted(binary_dictionary.items()))
+    binary_list = []
+    for v in binary_dictionary.values():
+        binary_list.append(v)
+    brucella_binary = binary_list[:16]
+    brucella_string = ''.join(str(e) for e in brucella_binary)
+    bovis_binary = binary_list[16:24]
+    bovis_string = ''.join(str(e) for e in bovis_binary)
+    para_binary = binary_list[24:]
+    para_string = ''.join(str(e) for e in para_binary)
+    return brucella_string, bovis_string, para_string
+
+
+def get_species_strings_for_collection(task_queue, finished_queue, timeout):
+    while True:
+        try:
+            tup = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        fastq_file, count_summary, count_list, brucella_sum, bovis_sum, para_sum = tup
+        brucella_string, bovis_string, para_string = get_species_strings(count_summary)
+        finished_queue.put((fastq_file, count_list, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum))
+        task_queue.task_done()
+
+
+def output_dbkey(file_name, dbkey, output_file=None):
+    # Output the dbkey.
+    if output_file is None:
+        # We're producing a dataset collection.
+        output_file = os.path.join(OUTPUT_DBKEY_DIR, "%s.txt" % file_name)
+    with open(output_file, "w") as fh:
+        fh.write("%s" % dbkey)
+
+
+def output_files(fastq_file, count_list, group, dbkey, dbkey_file=None, metrics_file=None):
+    base_file_name = get_base_file_name(fastq_file)
+    if dbkey_file is not None:
+        # We're dealing with a single read or
+        # a set of paired reads.  If the latter,
+        # the following will hopefully produce a
+        # good sample string.
+        if base_file_name.find("_") > 0:
+            base_file_name = base_file_name.split("_")[0]
+    output_dbkey(base_file_name, dbkey, dbkey_file)
+    output_metrics(base_file_name, count_list, group, dbkey, metrics_file)
+
+
+def output_files_for_collection(task_queue, timeout):
+    while True:
+        try:
+            tup = task_queue.get(block=True, timeout=timeout)
+        except queue.Empty:
+            break
+        fastq_file, count_list, group, dbkey = tup
+        output_files(fastq_file, count_list, group, dbkey)
+        task_queue.task_done()
+
+
+def output_metrics(file_name, count_list, group, dbkey, output_file=None):
+    # Output the metrics.
+    if output_file is None:
+        # We're producing a dataset collection.
+        output_file = os.path.join(OUTPUT_METRICS_DIR, "%s.txt" % file_name)
+    with open(output_file, "w") as fh:
+        fh.write("Sample: %s\n" % file_name)
+        fh.write("Brucella counts: ")
+        for i in count_list[:16]:
+            fh.write("%d," % i)
+        fh.write("\nTB counts: ")
+        for i in count_list[16:24]:
+            fh.write("%d," % i)
+        fh.write("\nPara counts: ")
+        for i in count_list[24:]:
+            fh.write("%d," % i)
+        fh.write("\nGroup: %s" % group)
+        fh.write("\ndbkey: %s\n" % dbkey)
+
+
+def set_num_cpus(num_files, processes):
+    num_cpus = int(multiprocessing.cpu_count())
+    if num_files < num_cpus and num_files < processes:
+        return num_files
+    if num_cpus < processes:
+        half_cpus = int(num_cpus / 2)
+        if num_files < half_cpus:
+            return num_files
+        return half_cpus
+    return processes
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--dnaprint_fields', action='append', dest='dnaprint_fields', nargs=2, help="List of dnaprints data table value and path fields")
+    parser.add_argument('--read1', action='store', dest='read1', required=False, default=None, help='Required: single read')
+    parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
+    parser.add_argument('--gzipped', action='store', dest='gzipped', help='Input files are gzipped')
+    parser.add_argument('--output_dbkey', action='store', dest='output_dbkey', required=False, default=None, help='Output reference file')
+    parser.add_argument('--output_metrics', action='store', dest='output_metrics', required=False, default=None, help='Output metrics file')
+    parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting')
+
+    args = parser.parse_args()
+
+    collection = False
+    fastq_list = []
+    if args.read1 is not None:
+        fastq_list.append(args.read1)
+        if args.read2 is not None:
+            fastq_list.append(args.read2)
+    else:
+        collection = True
+        for file_name in sorted(os.listdir(INPUT_READS_DIR)):
+            file_path = os.path.abspath(os.path.join(INPUT_READS_DIR, file_name))
+            fastq_list.append(file_path)
+
+    # The value of dnaprint_fields is a list of lists, where each list is
+    # the [value, path] components of the vsnp_dnaprints data table entry.
+    # The data_manager_vsnp_dnaprints tool assigns the dbkey column from the
+    # all_fasta data table to the value column in the vsnp_dnaprints data
+    # table to ensure a proper mapping for discovering the dbkey.
+    dnaprints_dict = get_dnaprints_dict(args.dnaprint_fields)
+
+    if collection:
+        # Here fastq_list consists of any number of
+        # reads, so each file will be processed and
+        # dataset collections will be produced as outputs.
+        multiprocessing.set_start_method('spawn')
+        queue1 = multiprocessing.JoinableQueue()
+        queue2 = multiprocessing.JoinableQueue()
+        num_files = len(fastq_list)
+        cpus = set_num_cpus(num_files, args.processes)
+        # Set a timeout for get()s in the queue.
+        timeout = 0.05
+
+        for fastq_file in fastq_list:
+            queue1.put(fastq_file)
+
+        # Complete the get_species_counts task.
+        processes = [multiprocessing.Process(target=get_species_counts_for_collection, args=(queue1, queue2, args.gzipped, timeout, )) for _ in range(cpus)]
+        for p in processes:
+            p.start()
+        for p in processes:
+            p.join()
+        queue1.join()
+
+        # Complete the get_species_strings task.
+        processes = [multiprocessing.Process(target=get_species_strings_for_collection, args=(queue2, queue1, timeout, )) for _ in range(cpus)]
+        for p in processes:
+            p.start()
+        for p in processes:
+            p.join()
+        queue2.join()
+
+        # Complete the get_group_and_dbkey task.
+        processes = [multiprocessing.Process(target=get_group_and_dbkey_for_collection, args=(queue1, queue2, dnaprints_dict, timeout, )) for _ in range(cpus)]
+        for p in processes:
+            p.start()
+        for p in processes:
+            p.join()
+        queue1.join()
+
+        # Complete the output_files task.
+        processes = [multiprocessing.Process(target=output_files_for_collection, args=(queue2, timeout, )) for _ in range(cpus)]
+        for p in processes:
+            p.start()
+        for p in processes:
+            p.join()
+        queue2.join()
+
+        if queue1.empty() and queue2.empty():
+            queue1.close()
+            queue1.join_thread()
+            queue2.close()
+            queue2.join_thread()
+    else:
+        # Here fastq_list consists of either a single read
+        # or a set of paired reads, producing single outputs.
+        count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts(fastq_list, args.gzipped)
+        brucella_string, bovis_string, para_string = get_species_strings(count_summary)
+        group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum)
+        output_files(args.read1, count_list, group, dbkey, dbkey_file=args.output_dbkey, metrics_file=args.output_metrics)
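Note: to make the script's core flow concrete, the sketch below condenses the count-to-dbkey logic (get_species_strings() followed by get_dbkey()) into a few lines. The oligo counts and print lists here are invented for illustration; the real script uses 27 oligos and per-reference print lists loaded from the dnaprints YAML files:

from collections import OrderedDict

# Hypothetical per-oligo hit counts, keyed like get_oligo_dict().
count_summary = {"01_ab1": 12, "02_ab3": 0, "03_ab5": 7}

# Threshold counts into bits exactly as get_species_strings() does:
# an oligo with more than one hit contributes a 1, otherwise a 0.
bits = OrderedDict(sorted((k, 1 if v > 1 else 0) for k, v in count_summary.items()))
dna_print = "".join(str(b) for b in bits.values())  # -> "101"

# Look up the print in a dnaprints_dict-style structure, mirroring
# get_dbkey(); the print lists below are made up.
dnaprints_dict = {"brucella": {"NC_002945v4": ["101", "111"]}}
dbkey = ""
for data_table_value, v_list in dnaprints_dict["brucella"].items():
    if dna_print in v_list:
        dbkey = data_table_value
        break
print(dna_print, dbkey)  # 101 NC_002945v4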
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vsnp_determine_ref_from_data.xml	Tue Apr 21 10:08:28 2020 -0400
@@ -0,0 +1,166 @@
+<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
+    <description>from input data</description>
+    <requirements>
+        <requirement type="package" version="1.76">biopython</requirement>
+        <requirement type="package" version="5.3">pyyaml</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+#import os
+#import re
+#set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
+#set gzipped = 'false'
+#set input_type = $input_type_cond.input_type
+#set input_reads_dir = 'input_reads'
+#set output_dbkey_dir = 'output_dbkey'
+#set output_metrics_dir = 'output_metrics'
+mkdir -p $input_reads_dir &&
+mkdir -p $output_dbkey_dir &&
+mkdir -p $output_metrics_dir &&
+#if str($input_type) == "single":
+    #set read_type_cond = $input_type_cond.read_type_cond
+    #set read1 = $read_type_cond.read1
+    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
+    #if str($read_type_cond.read_type) == "single":
+        ln -s '${read1}' '${read1_identifier}' &&
+        #if $read1.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+    #else:
+        #set read2 = $read_type_cond.read2
+        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
+        ln -s '${read1}' '${read1_identifier}' &&
+        ln -s '${read2}' '${read2_identifier}' &&
+        #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+    #end if
+#else:
+    #for $i in $input_type_cond.reads_collection:
+        #if $i.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+        #set filename = $i.file_name
+        #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
+        ln -s '$filename' '$input_reads_dir/$identifier' &&
+    #end for
+#end if
+python '$__tool_directory__/vsnp_determine_ref_from_data.py'
+#if str($input_type) == "single":
+    #if str($read_type_cond.read_type) == "single":
+        --read1 '${read1_identifier}'
+    #else:
+        --read1 '${read1_identifier}'
+        --read2 '${read2_identifier}'
+    #end if
+    --output_dbkey '$output_dbkey'
+    --output_metrics '$output_metrics'
+#end if
+--gzipped $gzipped
+--processes $processes
+#for $i in $dnaprint_fields:
+    --dnaprint_fields '${i[0]}' '${i[2]}'
+#end for
+]]></command>
+    <inputs>
+        <conditional name="input_type_cond">
+            <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
+                <option value="single" selected="true">Single files</option>
+                <option value="collection">Collections of files</option>
+            </param>
+            <when value="single">
+                <conditional name="read_type_cond">
+                    <param name="read_type" type="select" label="Choose the read type">
+                        <option value="paired" selected="true">Paired</option>
+                        <option value="single">Single</option>
+                    </param>
+                    <when value="paired">
+                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                        <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
+                    </when>
+                    <when value="single">
+                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                    </when>
+                </conditional>
+            </when>
+            <when value="collection">
+                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
+            </when>
+        </conditional>
+        <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
+    </inputs>
+    <outputs>
+        <data name="output_dbkey" format="txt" label="${tool.name} (dbkey) on ${on_string}">
+            <filter>input_type_cond['input_type'] == 'single'</filter>
+        </data>
+        <data name="output_metrics" format="txt" label="${tool.name} (metrics) on ${on_string}">
+            <filter>input_type_cond['input_type'] == 'single'</filter>
+        </data>
+        <collection name="output_dbkey_collection" type="list">
+            <discover_datasets pattern="__name__" directory="output_dbkey" format="txt"/>
+            <filter>input_type_cond['input_type'] == 'collection'</filter>
+        </collection>
+        <collection name="output_metrics_collection" type="list">
+            <discover_datasets pattern="__name__" directory="output_metrics" format="txt"/>
+            <filter>input_type_cond['input_type'] == 'collection'</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <!-- Need to figure out how to test installed data tables -->
+            <param name="read1" value="reads.fastqsanger" ftype="fastqsanger" dbkey="89"/>
+            <param name="read2" value="read2.fastqsanger" ftype="fastqsanger" dbkey="89"/>
+            <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
+            <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
+best reference genome for aligning the reads. This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
+perform this task. While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
+the complete string a "DNA print". All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
+manager** tool are then inspected to find a match for the compiled "DNA print" string. These files are each associated
+with a Galaxy "dbkey" (i.e., genome build), so when a match is found, the associated "dbkey" is passed to a mapper (e.g.,
+**Map with BWA-MEM**) to align the reads to the associated reference.
+
+The tool produces two text files: a "dbkey" file that contains the dbkey string and a "metrics" file that provides
+information used to compile the "DNA print" string.
+
+This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
+and discovering the primary species is critical. DNA print matching is currently supported for the following genomes.
+
+ * Mycobacterium bovis AF2122/97
+ * Brucella abortus bv. 1 str. 9-941
+ * Brucella abortus strain BER
+ * Brucella canis ATCC 23365
+ * Brucella ceti TE10759-12
+ * Brucella melitensis bv. 1 str. 16M
+ * Brucella melitensis bv. 3 str. Ether
+ * Brucella melitensis BwIM_SOM_36b
+ * Brucella melitensis ATCC 23457
+ * Brucella ovis ATCC 25840
+ * Brucella suis 1330
+ * Mycobacterium tuberculosis H37Rv
+ * Mycobacterium avium subsp. paratuberculosis strain Telford
+ * Mycobacterium avium subsp. paratuberculosis K-10
+ * Brucella suis ATCC 23445
+ * Brucella suis bv. 3 str. 686
+
+**Required Options**
+
+ * **Choose the category of the files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single or paired fastqsanger reads or collections of fastqsanger reads) based on the selected option.
+ * **Number of processes for job splitting** - select the number of processes for splitting the job to shorten execution time.
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{None,
+            journal = {None},
+            author = {Stuber T},
+            title = {Manuscript in preparation},
+            year = {None},
+            url = {https://github.com/USDA-VS/vSNP},}
+        </citation>
+    </citations>
+</tool>
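Note: the collection branch of the Python script above chains its four stages by bouncing work between two JoinableQueues, with workers that exit once a get() times out. The sketch below isolates that pattern; the stage functions (square, increment) are hypothetical stand-ins for the real tasks, and the fixed 0.05-second timeout, rather than an explicit sentinel, is what shuts the workers down, exactly as in the script:

import multiprocessing
import queue

TIMEOUT = 0.05  # matches the script's get() timeout


def worker(task_queue, finished_queue, func, timeout):
    # Drain task_queue through func into finished_queue until the
    # queue has been empty for `timeout` seconds, then exit.
    while True:
        try:
            item = task_queue.get(block=True, timeout=timeout)
        except queue.Empty:
            break
        finished_queue.put(func(item))
        task_queue.task_done()


def square(n):  # hypothetical stage 1
    return n * n


def increment(n):  # hypothetical stage 2
    return n + 1


def run_stage(q_in, q_out, func, cpus):
    # Start a pool of workers for one stage, wait for them to finish,
    # then join the input queue, mirroring the script's per-stage loop.
    processes = [multiprocessing.Process(target=worker, args=(q_in, q_out, func, TIMEOUT)) for _ in range(cpus)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    q_in.join()


if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    queue1 = multiprocessing.JoinableQueue()
    queue2 = multiprocessing.JoinableQueue()
    for n in range(8):
        queue1.put(n)
    run_stage(queue1, queue2, square, cpus=2)     # queue1 -> queue2
    run_stage(queue2, queue1, increment, cpus=2)  # queue2 -> queue1
    while True:
        try:
            print(queue1.get_nowait())            # n*n + 1 for n in 0..7
        except queue.Empty:
            break

One caveat of this timeout-based design, present in the original script as well: a worker exits whenever its queue stays empty for the timeout interval, so the approach relies on all items being enqueued before the stage starts rather than on a poison-pill shutdown.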