comparison vsnp_determine_ref_from_data.py @ 3:85587c8eb25f draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 524a39e08f2bea8b8754284df606ff8dd27ed24b"
author iuc
date Wed, 02 Dec 2020 09:10:07 +0000
parents
children efb86aade548
comparison
equal deleted inserted replaced
2:a52b819aa990 3:85587c8eb25f
1 #!/usr/bin/env python
2
3 import argparse
4 import gzip
5 import os
6 from collections import OrderedDict
7
8 import yaml
9 from Bio.SeqIO.QualityIO import FastqGeneralIterator
10
11 OUTPUT_DBKEY_DIR = 'output_dbkey'
12 OUTPUT_METRICS_DIR = 'output_metrics'
13
14
15 def get_base_file_name(file_path):
16 base_file_name = os.path.basename(file_path)
17 if base_file_name.find(".") > 0:
18 # Eliminate the extension.
19 return os.path.splitext(base_file_name)[0]
20 elif base_file_name.find("_fq") > 0:
21 # The "." character has likely
22 # changed to an "_" character.
23 return base_file_name.split("_fq")[0]
24 elif base_file_name.find("_fastq") > 0:
25 return base_file_name.split("_fastq")[0]
26 return base_file_name
27
28
29 def get_dbkey(dnaprints_dict, key, s):
30 # dnaprints_dict looks something like this:
31 # {'brucella': {'NC_002945v4': ['11001110', '11011110', '11001100']}
32 # {'bovis': {'NC_006895': ['11111110', '00010010', '01111011']}}
33 d = dnaprints_dict.get(key, {})
34 for data_table_value, v_list in d.items():
35 if s in v_list:
36 return data_table_value
37 return ""
38
39
40 def get_dnaprints_dict(dnaprint_fields):
41 # A dndprint_fields entry looks something liek this.
42 # [['AF2122', '/galaxy/tool-data/vsnp/AF2122/dnaprints/NC_002945v4.yml']]
43 dnaprints_dict = {}
44 for item in dnaprint_fields:
45 # Here item is a 2-element list of data
46 # table components, # value and path.
47 value = item[0]
48 path = item[1].strip()
49 with open(path, "rt") as fh:
50 # The format of all dnaprints yaml
51 # files is something like this:
52 # brucella:
53 # - 0111111111111111
54 print_dict = yaml.load(fh, Loader=yaml.Loader)
55 for print_dict_k, print_dict_v in print_dict.items():
56 dnaprints_v_dict = dnaprints_dict.get(print_dict_k, {})
57 if len(dnaprints_v_dict) > 0:
58 # dnaprints_dict already contains k (e.g., 'brucella',
59 # and dnaprints_v_dict will be a dictionary # that
60 # looks something like this:
61 # {'NC_002945v4': ['11001110', '11011110', '11001100']}
62 value_list = dnaprints_v_dict.get(value, [])
63 value_list = value_list + print_dict_v
64 dnaprints_v_dict[value] = value_list
65 else:
66 # dnaprints_v_dict is an empty dictionary.
67 dnaprints_v_dict[value] = print_dict_v
68 dnaprints_dict[print_dict_k] = dnaprints_v_dict
69 # dnaprints_dict looks something like this:
70 # {'brucella': {'NC_002945v4': ['11001110', '11011110', '11001100']}
71 # {'bovis': {'NC_006895': ['11111110', '00010010', '01111011']}}
72 return dnaprints_dict
73
74
75 def get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum):
76 if brucella_sum > 3:
77 group = "Brucella"
78 dbkey = get_dbkey(dnaprints_dict, "brucella", brucella_string)
79 elif bovis_sum > 3:
80 group = "TB"
81 dbkey = get_dbkey(dnaprints_dict, "bovis", bovis_string)
82 elif para_sum >= 1:
83 group = "paraTB"
84 dbkey = get_dbkey(dnaprints_dict, "para", para_string)
85 else:
86 group = ""
87 dbkey = ""
88 return group, dbkey
89
90
91 def get_oligo_dict():
92 oligo_dict = {}
93 oligo_dict["01_ab1"] = "AATTGTCGGATAGCCTGGCGATAACGACGC"
94 oligo_dict["02_ab3"] = "CACACGCGGGCCGGAACTGCCGCAAATGAC"
95 oligo_dict["03_ab5"] = "GCTGAAGCGGCAGACCGGCAGAACGAATAT"
96 oligo_dict["04_mel"] = "TGTCGCGCGTCAAGCGGCGTGAAATCTCTG"
97 oligo_dict["05_suis1"] = "TGCGTTGCCGTGAAGCTTAATTCGGCTGAT"
98 oligo_dict["06_suis2"] = "GGCAATCATGCGCAGGGCTTTGCATTCGTC"
99 oligo_dict["07_suis3"] = "CAAGGCAGATGCACATAATCCGGCGACCCG"
100 oligo_dict["08_ceti1"] = "GTGAATATAGGGTGAATTGATCTTCAGCCG"
101 oligo_dict["09_ceti2"] = "TTACAAGCAGGCCTATGAGCGCGGCGTGAA"
102 oligo_dict["10_canis4"] = "CTGCTACATAAAGCACCCGGCGACCGAGTT"
103 oligo_dict["11_canis"] = "ATCGTTTTGCGGCATATCGCTGACCACAGC"
104 oligo_dict["12_ovis"] = "CACTCAATCTTCTCTACGGGCGTGGTATCC"
105 oligo_dict["13_ether2"] = "CGAAATCGTGGTGAAGGACGGGACCGAACC"
106 oligo_dict["14_63B1"] = "CCTGTTTAAAAGAATCGTCGGAACCGCTCT"
107 oligo_dict["15_16M0"] = "TCCCGCCGCCATGCCGCCGAAAGTCGCCGT"
108 oligo_dict["16_mel1b"] = "TCTGTCCAAACCCCGTGACCGAACAATAGA"
109 oligo_dict["17_tb157"] = "CTCTTCGTATACCGTTCCGTCGTCACCATGGTCCT"
110 oligo_dict["18_tb7"] = "TCACGCAGCCAACGATATTCGTGTACCGCGACGGT"
111 oligo_dict["19_tbbov"] = "CTGGGCGACCCGGCCGACCTGCACACCGCGCATCA"
112 oligo_dict["20_tb5"] = "CCGTGGTGGCGTATCGGGCCCCTGGATCGCGCCCT"
113 oligo_dict["21_tb2"] = "ATGTCTGCGTAAAGAAGTTCCATGTCCGGGAAGTA"
114 oligo_dict["22_tb3"] = "GAAGACCTTGATGCCGATCTGGGTGTCGATCTTGA"
115 oligo_dict["23_tb4"] = "CGGTGTTGAAGGGTCCCCCGTTCCAGAAGCCGGTG"
116 oligo_dict["24_tb6"] = "ACGGTGATTCGGGTGGTCGACACCGATGGTTCAGA"
117 oligo_dict["25_para"] = "CCTTTCTTGAAGGGTGTTCG"
118 oligo_dict["26_para_sheep"] = "CGTGGTGGCGACGGCGGCGGGCCTGTCTAT"
119 oligo_dict["27_para_cattle"] = "TCTCCTCGGTCGGTGATTCGGGGGCGCGGT"
120 return oligo_dict
121
122
123 def get_seq_counts(value, fastq_list, gzipped):
124 count = 0
125 for fastq_file in fastq_list:
126 if gzipped:
127 with gzip.open(fastq_file, 'rt') as fh:
128 for title, seq, qual in FastqGeneralIterator(fh):
129 count += seq.count(value)
130 else:
131 with open(fastq_file, 'r') as fh:
132 for title, seq, qual in FastqGeneralIterator(fh):
133 count += seq.count(value)
134 return(value, count)
135
136
137 def get_species_counts(fastq_list, gzipped):
138 count_summary = {}
139 oligo_dict = get_oligo_dict()
140 for v1 in oligo_dict.values():
141 returned_value, count = get_seq_counts(v1, fastq_list, gzipped)
142 for key, v2 in oligo_dict.items():
143 if returned_value == v2:
144 count_summary.update({key: count})
145 count_list = []
146 for v in count_summary.values():
147 count_list.append(v)
148 brucella_sum = sum(count_list[:16])
149 bovis_sum = sum(count_list[16:24])
150 para_sum = sum(count_list[24:])
151 return count_summary, count_list, brucella_sum, bovis_sum, para_sum
152
153
154 def get_species_strings(count_summary):
155 binary_dictionary = {}
156 for k, v in count_summary.items():
157 if v > 1:
158 binary_dictionary.update({k: 1})
159 else:
160 binary_dictionary.update({k: 0})
161 binary_dictionary = OrderedDict(sorted(binary_dictionary.items()))
162 binary_list = []
163 for v in binary_dictionary.values():
164 binary_list.append(v)
165 brucella_binary = binary_list[:16]
166 brucella_string = ''.join(str(e) for e in brucella_binary)
167 bovis_binary = binary_list[16:24]
168 bovis_string = ''.join(str(e) for e in bovis_binary)
169 para_binary = binary_list[24:]
170 para_string = ''.join(str(e) for e in para_binary)
171 return brucella_string, bovis_string, para_string
172
173
174 def output_dbkey(file_name, dbkey, output_file):
175 # Output the dbkey.
176 with open(output_file, "w") as fh:
177 fh.write("%s" % dbkey)
178
179
180 def output_files(fastq_file, count_list, group, dbkey, dbkey_file, metrics_file):
181 base_file_name = get_base_file_name(fastq_file)
182 output_dbkey(base_file_name, dbkey, dbkey_file)
183 output_metrics(base_file_name, count_list, group, dbkey, metrics_file)
184
185
186 def output_metrics(file_name, count_list, group, dbkey, output_file):
187 # Output the metrics.
188 with open(output_file, "w") as fh:
189 fh.write("Sample: %s\n" % file_name)
190 fh.write("Brucella counts: ")
191 for i in count_list[:16]:
192 fh.write("%d," % i)
193 fh.write("\nTB counts: ")
194 for i in count_list[16:24]:
195 fh.write("%d," % i)
196 fh.write("\nPara counts: ")
197 for i in count_list[24:]:
198 fh.write("%d," % i)
199 fh.write("\nGroup: %s" % group)
200 fh.write("\ndbkey: %s\n" % dbkey)
201
202
203 if __name__ == '__main__':
204 parser = argparse.ArgumentParser()
205
206 parser.add_argument('--dnaprint_fields', action='append', dest='dnaprint_fields', nargs=2, required=False, default=None, help="List of dnaprints data table value, name and path fields")
207 parser.add_argument('--read1', action='store', dest='read1', required=True, default=None, help='Required: single read')
208 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
209 parser.add_argument('--gzipped', action='store_true', dest='gzipped', default=False, help='Input files are gzipped')
210 parser.add_argument('--output_dbkey', action='store', dest='output_dbkey', required=True, default=None, help='Output reference file')
211 parser.add_argument('--output_metrics', action='store', dest='output_metrics', required=True, default=None, help='Output metrics file')
212
213 args = parser.parse_args()
214
215 fastq_list = [args.read1]
216 if args.read2 is not None:
217 fastq_list.append(args.read2)
218
219 # The value of dnaprint_fields is a list of lists, where each list is
220 # the [value, name, path] components of the vsnp_dnaprints data table.
221 # The data_manager_vsnp_dnaprints tool assigns the dbkey column from the
222 # all_fasta data table to the value column in the vsnp_dnaprints data
223 # table to ensure a proper mapping for discovering the dbkey.
224 dnaprints_dict = get_dnaprints_dict(args.dnaprint_fields)
225
226 # Here fastq_list consists of either a single read
227 # or a set of paired reads, producing single outputs.
228 count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts(fastq_list, args.gzipped)
229 brucella_string, bovis_string, para_string = get_species_strings(count_summary)
230 group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum)
231 output_files(args.read1, count_list, group, dbkey, dbkey_file=args.output_dbkey, metrics_file=args.output_metrics)