annotate vsnp_get_snps.py @ 0:ee4ef1fc23c6 draft

Uploaded
author greg
date Tue, 21 Apr 2020 10:14:11 -0400
parents
children 14285a94fb13
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
2
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
3 # Collect quality parsimonious SNPs from vcf files and output alignment files in fasta format.
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
4
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
5 import argparse
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
6 import multiprocessing
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
7 import os
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
8 import pandas
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
9 import queue
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
10 import shutil
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
11 import sys
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
12 import time
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
13 import vcf
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
14 from collections import OrderedDict
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
15 from datetime import datetime
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
16
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
# Names of the working directories used by this script.

# Directory into which setup_all_vcfs links every input vcf file.
ALL_VCFS_DIR = 'all_vcf'
# NOTE(review): presumably the directory holding the input vcf files;
# it is not referenced in the visible part of the file - confirm.
INPUT_VCF_DIR = 'input_vcf_dir'
# Directory receiving the per-group "<group>.json" average map quality files.
OUTPUT_JSON_AVG_MQ_DIR = 'output_json_avg_mq_dir'
# Directory receiving the per-group "<group>.json" parsimonious SNP files.
OUTPUT_JSON_SNPS_DIR = 'output_json_snps_dir'
# Directory receiving the per-group "<group>.fasta" SNP alignment files.
OUTPUT_SNPS_DIR = 'output_snps_dir'
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
22
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
23
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def get_time_stamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH-MM-SS'."""
    current_time = datetime.now()
    return current_time.strftime('%Y-%m-%d %H-%M-%S')
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
26
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
27
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def set_num_cpus(num_files, processes):
    """Decide how many worker processes to use.

    Args:
        num_files: Number of files that have to be processed.
        processes: Number of processes requested by the caller.

    Returns:
        int: ``num_files`` when it is smaller than both the CPU count and
        ``processes``; otherwise, when more processes were requested than
        there are CPUs, the smaller of ``num_files`` and half the CPU count
        (never less than 1); otherwise ``processes``.
    """
    num_cpus = multiprocessing.cpu_count()
    if num_files < num_cpus and num_files < processes:
        return num_files
    if num_cpus < processes:
        # Floor at 1: on a single-core host num_cpus // 2 is 0, which
        # would ask the caller to start zero worker processes.
        half_cpus = max(1, num_cpus // 2)
        if num_files < half_cpus:
            return num_files
        return half_cpus
    return processes
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
38
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
39
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def setup_all_vcfs(vcf_files, vcf_dirs):
    """Create the all_vcfs directory and link every input vcf file into it.

    Args:
        vcf_files: Paths of the input vcf files.
        vcf_dirs: List of vcf directories; ALL_VCFS_DIR is appended to it
            in place.

    Returns:
        The same ``vcf_dirs`` list, now including ALL_VCFS_DIR.
    """
    os.makedirs(ALL_VCFS_DIR)
    vcf_dirs.append(ALL_VCFS_DIR)
    for source_path in vcf_files:
        # The link keeps the file name of the vcf file it points to.
        link_path = os.path.join(ALL_VCFS_DIR, os.path.basename(source_path))
        os.symlink(source_path, link_path)
    return vcf_dirs
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
50
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
51
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
52 class SnpFinder:
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
53
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def __init__(self, num_files, reference, excel_grouper_file,
             all_isolates, ac, mq_val, n_threshold, qual_threshold, output_summary):
    """Store the run settings and start the summary.

    Args:
        num_files: Stored as ``self.num_files``.
        reference: Stored as ``self.reference``.
        excel_grouper_file: Excel file used for filtering, or None.
        all_isolates: Stored as ``self.all_isolates``.
        ac: Allele count compared against each record's AC value.
        mq_val: Map quality value a record must exceed.
        n_threshold: Quality threshold used when calling a base.
        qual_threshold: Quality value a record must exceed.
        output_summary: Passed to ``self.initiate_summary``.
    """
    # Settings handed in by the caller.
    self.num_files = num_files
    self.reference = reference
    # Filter based on the contents of an Excel file.
    self.excel_grouper_file = excel_grouper_file
    self.all_isolates = all_isolates
    self.ac = ac
    self.mq_val = mq_val
    self.n_threshold = n_threshold
    self.qual_threshold = qual_threshold
    # State filled in while processing.
    self.all_positions = None
    # Group names, added through check_add_group.
    self.groups = []
    self.summary_str = ""
    # Wall-clock bookkeeping for the run.
    self.start_time = get_time_stamp()
    self.timer_start = datetime.now()
    self.initiate_summary(output_summary)
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
74
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def append_to_summary(self, html_str):
    """Append ``html_str`` to the end of the accumulated summary text."""
    self.summary_str = f"{self.summary_str}{html_str}"
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
77
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def bin_input_files(self, filename, samples_groups_dict, defining_snps, inverted_defining_snps, found_positions, found_positions_mix):
    """Copy a vcf file into the directory of every group it belongs to.

    Args:
        filename: Path of the vcf file being binned.
        samples_groups_dict: Dict updated in place with one entry for
            this file (table name -> list of groups or a message).
        defining_snps: Dict of absolute position -> group; the file joins
            the group when the position was found in it.
        inverted_defining_snps: Dict of absolute position -> group; the
            file joins these groups only when none of the positions was
            found in it.
        found_positions: Dict keyed by the positions found in the file.
        found_positions_mix: Dict keyed by the mixed positions found in
            the file.

    Returns:
        The updated ``samples_groups_dict``.
    """
    sample_groups_list = []
    table_name = self.get_base_file_name(filename)
    try:
        has_defining_snp = False
        # Every position found in the file, mixed or not.
        all_found = found_positions.keys() | found_positions_mix.keys()
        # A defining position that is mixed flags the sample in the table.
        is_mixed = bool(defining_snps.keys() & found_positions_mix.keys())
        for abs_position in list(defining_snps.keys() & all_found):
            group = defining_snps[abs_position]
            sample_groups_list.append(group)
            self.check_add_group(group)
            if is_mixed:
                table_name = '%s<font color="red">[[MIXED]]</font>' % self.get_base_file_name(filename)
            self.copy_file(filename, group)
            has_defining_snp = True
        if not set(inverted_defining_snps.keys()).intersection(all_found):
            for group in list(inverted_defining_snps.values()):
                sample_groups_list.append(group)
                self.check_add_group(group)
                self.copy_file(filename, group)
                has_defining_snp = True
        if has_defining_snp:
            samples_groups_dict[table_name] = sorted(sample_groups_list)
        else:
            samples_groups_dict[table_name] = ['<font color="red">No defining SNP</font>']
    except TypeError as e:
        msg = "<br/>Error processing file %s to generate samples_groups_dict: %s<br/>" % (filename, str(e))
        self.append_to_summary(msg)
        samples_groups_dict[table_name] = [msg]
    return samples_groups_dict
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
109
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def check_add_group(self, group):
    """Record ``group`` in ``self.groups`` unless it is already there."""
    known_groups = self.groups
    if group in known_groups:
        return
    known_groups.append(group)
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
113
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def copy_file(self, filename, dir):
    """Copy ``filename`` into directory ``dir``, creating it if needed.

    Args:
        filename: Path of the file to copy.
        dir: Destination directory; missing parent directories are
            created as well.
    """
    # exist_ok avoids the check-then-create race: another process may
    # create the same group directory between a separate existence test
    # and the makedirs call.
    os.makedirs(dir, exist_ok=True)
    shutil.copy(filename, dir)
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
118
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def decide_snps(self, filename):
    """Find the SNPs in a vcf file.

    Every record whose "CHROM:POS" key is present in
    ``self.all_positions`` is turned into the character to report for
    this sample at that position.

    Args:
        filename: Path of the vcf file to read.

    Returns:
        tuple: ``(sample_df, file_name_base, sample_map_qualities)`` where
        ``sample_df`` is a one-row pandas data frame indexed by the base
        file name (all positions, overridden by this sample's calls),
        ``file_name_base`` is the file name without path or extension,
        and ``sample_map_qualities`` maps position -> map quality.
    """
    # IUPAC ambiguity code for each alt/ref pair, in either order.
    iupac_codes = {
        "AG": "R", "GA": "R",
        "CT": "Y", "TC": "Y",
        "GC": "S", "CG": "S",
        "AT": "W", "TA": "W",
        "GT": "K", "TG": "K",
        "AC": "M", "CA": "M",
    }
    positions_dict = self.all_positions
    sample_map_qualities = {}
    sample_dict = {}
    # Eliminate the path.
    file_name_base = self.get_base_file_name(filename)
    # The context manager closes the vcf file once all records are read.
    with open(filename, 'r') as fh:
        for record in vcf.Reader(fh):
            alt = str(record.ALT[0])
            record_position = "%s:%s" % (str(record.CHROM), str(record.POS))
            if record_position not in positions_dict:
                continue
            if alt == "None":
                sample_dict[record_position] = "-"
                continue
            # Not sure this is the best place to capture the MQM average;
            # it may be faster after parsimony SNPs are decided, but then
            # it would require opening the files again.
            # On rare occasions MQM gets called "NaN", which would pass a
            # string where a number is expected when averaging.
            mq_val = self.get_mq_val(record.INFO, filename)
            if str(mq_val).lower() != "nan":
                sample_map_qualities[record_position] = mq_val
            # Only single-character alternates are called below.
            if len(alt) != 1:
                continue
            qual_val = self.val_as_int(record.QUAL)
            ac = record.INFO['AC'][0]
            ref = str(record.REF[0])
            if ac == 2 and qual_val > self.n_threshold:
                sample_dict[record_position] = alt
            elif ac == 1 and qual_val > self.n_threshold:
                # Pairs not in the table are reported as "N".
                sample_dict[record_position] = iupac_codes.get("%s%s" % (alt, ref), "N")
            # Poor calls
            elif qual_val <= 50:
                # Do not coerce record.REF[0] to a string!
                sample_dict[record_position] = record.REF[0]
            elif qual_val <= self.n_threshold:
                sample_dict[record_position] = "N"
            else:
                # Insurance -- will still report on a possible SNP even
                # if missed by the conditions above.
                # Do not coerce record.REF[0] to a string!
                sample_dict[record_position] = record.REF[0]
    # Merge dictionaries: start from abs_pos:REF for all positions, then
    # let this sample's abs_pos:ALT calls replace them (keys are unique).
    merge_dict = dict(positions_dict)
    merge_dict.update(sample_dict)
    sample_df = pandas.DataFrame(merge_dict, index=[file_name_base])
    return sample_df, file_name_base, sample_map_qualities
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
201
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def df_to_fasta(self, parsimonious_df, group):
    """Generate a SNP alignment file from the parsimonious_df data frame.

    The file "<group>.fasta" is written into OUTPUT_SNPS_DIR with one
    record per row; a row whose name was already written is skipped.
    Nothing is written when no cell of the data frame has any content.

    Args:
        parsimonious_df: Data frame with one row per sample.
        group: Group name used as the base name of the fasta file.

    Returns:
        bool: True when the data frame held sequence data.
    """
    snps_file = os.path.join(OUTPUT_SNPS_DIR, "%s.fasta" % group)
    # Stop scanning at the first non-empty cell.
    has_sequence_data = any(
        len(pos) > 0
        for _, row in parsimonious_df.iterrows()
        for pos in row
    )
    if has_sequence_data:
        # A set keeps the duplicate-name check constant time per row.
        written_names = set()
        with open(snps_file, 'w') as fh:
            for _, row in parsimonious_df.iterrows():
                if row.name in written_names:
                    continue
                written_names.add(row.name)
                print(f'>{row.name}', file=fh)
                print("".join(str(pos) for pos in row), file=fh)
    return has_sequence_data
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
223
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def find_initial_positions(self, filename):
    """Find SNP positions in a vcf file.

    A record qualifies when it has an alternate allele, a
    single-character REF, a quality above ``self.qual_threshold`` and a
    map quality above ``self.mq_val``.

    Args:
        filename: Path of the vcf file to read.

    Returns:
        tuple: ``(found_positions, found_positions_mix)``, two dicts of
        "CHROM:POS" -> REF. The first holds qualifying records whose AC
        equals ``self.ac``, the second those whose AC equals 1. On a
        handled error the message is appended to the summary and
        ``({'': ''}, {'': ''})`` is returned.
    """
    found_positions = {}
    found_positions_mix = {}
    try:
        # The context manager closes the vcf file on every exit path.
        with open(filename, 'r') as fh:
            vcf_reader = vcf.Reader(fh)
            try:
                for record in vcf_reader:
                    qual_val = self.val_as_int(record.QUAL)
                    absolute_position = "%s:%s" % (str(record.CHROM), str(record.POS))
                    alt = str(record.ALT[0])
                    if alt == "None":
                        continue
                    mq_val = self.get_mq_val(record.INFO, filename)
                    ac = record.INFO['AC'][0]
                    len_ref = len(record.REF)
                    if ac == self.ac and len_ref == 1 and qual_val > self.qual_threshold and mq_val > self.mq_val:
                        found_positions[absolute_position] = record.REF
                    if ac == 1 and len_ref == 1 and qual_val > self.qual_threshold and mq_val > self.mq_val:
                        found_positions_mix[absolute_position] = record.REF
                return found_positions, found_positions_mix
            except (ZeroDivisionError, ValueError, UnboundLocalError, TypeError) as e:
                self.append_to_summary("<br/>Error parsing record in file %s: %s<br/>" % (filename, str(e)))
                return {'': ''}, {'': ''}
    except (SyntaxError, AttributeError) as e:
        self.append_to_summary("<br/>Error attempting to read file %s: %s<br/>" % (filename, str(e)))
        return {'': ''}, {'': ''}
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
252
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def gather_and_filter(self, prefilter_df, mq_averages, group_dir):
    """Filter a data frame of SNPs for a group and write its outputs.

    When an Excel grouper file is configured, the positions it lists
    for all groups and for ``group_dir`` are dropped first. If the
    parsimonious data frame then has at least 4 rows and holds sequence
    data, the fasta file and both JSON files are written; otherwise a
    message is appended to the summary.

    Args:
        prefilter_df: Data frame of SNPs before filtering.
        mq_averages: Object whose ``to_json`` writes the average map
            qualities.
        group_dir: Group name, used for the output file names and for
            selecting the group's Excel column.
    """
    if self.excel_grouper_file is None:
        filtered_all_df = prefilter_df
    else:
        # Filter positions to be removed from all.
        # The context manager closes the workbook after reading names.
        with pandas.ExcelFile(self.excel_grouper_file) as xl:
            sheet_names = xl.sheet_names
        # Use the first column to filter "all" positions.
        exclusion_list_all = self.get_position_list(sheet_names, 0)
        exclusion_list_group = self.get_position_list(sheet_names, group_dir)
        exclusion_list = exclusion_list_all + exclusion_list_group
        # Listed positions that are not columns are simply ignored.
        filtered_all_df = prefilter_df.drop(columns=exclusion_list, errors='ignore')
    parsimonious_df = self.get_parsimonious_df(filtered_all_df)
    samples_number, _ = parsimonious_df.shape
    if samples_number < 4:
        msg = "<br/>Too few samples to build tree"
    elif self.df_to_fasta(parsimonious_df, group_dir):
        json_avg_mq_file = os.path.join(OUTPUT_JSON_AVG_MQ_DIR, "%s.json" % group_dir)
        mq_averages.to_json(json_avg_mq_file, orient='split')
        json_snps_file = os.path.join(OUTPUT_JSON_SNPS_DIR, "%s.json" % group_dir)
        parsimonious_df.to_json(json_snps_file, orient='split')
        return
    else:
        msg = "<br/>No sequence data"
    if group_dir is not None:
        msg = "%s for group: %s" % (msg, group_dir)
    self.append_to_summary("%s<br/>\n" % msg)
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
287
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def get_base_file_name(self, file_path):
    """Return the file name of ``file_path`` without path or extension.

    When the name has no dot after its first character but has an
    underscore after its first character, the part after the last
    underscore is treated as the extension.
    """
    name = os.path.basename(file_path)
    before_dot, dot, _ = name.partition(".")
    if dot and before_dot:
        # Eliminate the extension.
        return os.path.splitext(name)[0]
    before_underscore, underscore, _ = name.partition("_")
    if underscore and before_underscore:
        # The dot extension was likely changed to
        # the "_" character.
        return name.rsplit("_", 1)[0]
    return name
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
300
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def get_mq_val(self, record_info, filename):
    """Get the MQ (gatk) or MQM (freebayes) value of a vcf record.

    Args:
        record_info: The record.INFO component of the vcf record.
        filename: Path of the vcf file, used in the error message only.

    Returns:
        The result of ``self.return_val`` for the 'MQM' entry, or for
        the 'MQ' entry when 'MQM' cannot be used.

    Raises:
        SystemExit: When neither entry yields a value.
    """
    for key in ('MQM', 'MQ'):
        try:
            return self.return_val(record_info[key])
        except Exception:
            # Missing or unusable entry: try the next key.
            continue
    sys.exit("Invalid or unsupported vcf header %s in file: %s\n" % (str(record_info), filename))
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
314
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def get_parsimonious_df(self, filtered_all_df):
    """Get the parsimonious SNPs data frame from filtered SNPs.

    Only the columns whose value differs between sample rows are kept.
    The 'root' row, when present, is left out of that comparison and
    appended as the last row of the result.

    Args:
        filtered_all_df: Data frame of filtered SNPs, one row per
            sample plus an optional 'root' row.

    Returns:
        pandas.DataFrame: The informative columns for every sample row,
        followed by the 'root' row when the input had one.
    """
    try:
        ref_series = filtered_all_df.loc['root']
    except KeyError:
        # No root row: nothing to set aside or to append afterwards.
        ref_series = None
    else:
        # In all_vcf root needs to be removed.
        filtered_all_df = filtered_all_df.drop(['root'])
    if filtered_all_df.shape[0] == 0:
        # Without sample rows no column can be informative, and
        # iloc[0] would raise an IndexError.
        parse_df = filtered_all_df.iloc[:, 0:0]
    else:
        differs_from_first = (filtered_all_df != filtered_all_df.iloc[0]).any()
        parse_df = filtered_all_df.loc[:, differs_from_first]
    if ref_series is None:
        return parse_df
    ref_df = ref_series.to_frame().T
    # The inner join restricts the root row to the informative columns.
    return pandas.concat([parse_df, ref_df], join='inner')
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
331
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def get_position_list(self, sheet_names, group):
    """Get a list of positions defined by an excel file.

    Args:
        sheet_names: Not used by this method.
        group: Column selector handed to ``pandas.read_excel`` through
            ``usecols``.

    Returns:
        list: Position strings read from the column. An entry whose
        part after the last ":" contains "-" is expanded into one
        "chrom:position" string per position of the inclusive range.
        An empty list is returned when a ValueError is raised.
    """
    positions = []
    try:
        column_df = pandas.read_excel(self.excel_grouper_file, header=1, usecols=[group])
        for cell in column_df.values:
            entry = str(cell[0])
            if "-" not in entry.split(":")[-1]:
                # A single position.
                positions.append(entry)
                continue
            try:
                chrom, sequence_range = entry.split(":")
            except Exception as e:
                sys.exit(str(e))
            bounds = sequence_range.split("-")
            # Thousands separators are dropped before conversion.
            first = int(bounds[0].replace(',', ''))
            last = int(bounds[1].replace(',', ''))
            for position in range(first, last + 1):
                positions.append(chrom + ":" + str(position))
    except ValueError:
        return []
    return positions
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
353
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def get_snps(self, task_queue, timeout):
    """Worker loop: process every group directory pulled from task_queue.

    For each directory taken from the queue, the vcf files it contains
    are parsed to collect the candidate positions, a per-sample data
    frame is built with decide_snps(), and the combined table is handed
    to gather_and_filter() together with the per-position average map
    quality.

    task_queue: joinable queue of directory paths.
    timeout: seconds to wait on the queue before the worker gives up
        and returns.

    Any exception raised while processing a directory still propagates
    (as before), but the task is now always marked done first, so a
    failing group no longer leaves task_queue.join() waiting forever on
    that task.
    """
    while True:
        try:
            group_dir = task_queue.get(block=True, timeout=timeout)
        except queue.Empty:
            # No more work for this worker.
            break
        try:
            group_files = [
                os.path.abspath(os.path.join(group_dir, file_name))
                for file_name in os.listdir(os.path.abspath(group_dir))
            ]
            # Parse all vcf files to accumulate SNPs into a
            # data frame.
            positions_dict = {}
            for file_name in group_files:
                try:
                    found_positions, _ = self.find_initial_positions(file_name)
                    positions_dict.update(found_positions)
                except Exception as e:
                    self.append_to_summary("Error updating the positions_dict dictionary when processing file %s:\n%s\n" % (file_name, str(e)))
            # Order before adding to file to match
            # with ordering of individual samples.
            # all_positions is abs_pos:REF
            self.all_positions = OrderedDict(sorted(positions_dict.items()))
            ref_positions_df = pandas.DataFrame(self.all_positions, index=['root'])
            all_map_qualities = {}
            df_list = []
            for file_name in group_files:
                sample_df, file_name_base, sample_map_qualities = self.decide_snps(file_name)
                df_list.append(sample_df)
                all_map_qualities[file_name_base] = sample_map_qualities
            all_sample_df = pandas.concat(df_list)
            # All positions have now been selected for each sample,
            # so select parsimony informative SNPs.  This removes
            # columns where all fields are the same.
            # Add reference to top row.
            prefilter_df = pandas.concat([ref_positions_df, all_sample_df], join='inner')
            all_mq_df = pandas.DataFrame.from_dict(all_map_qualities)
            mq_averages = all_mq_df.mean(axis=1).astype(int)
            self.gather_and_filter(prefilter_df, mq_averages, group_dir)
        finally:
            # Always balance the get() above, even when processing
            # fails, so the queue's unfinished-task count stays accurate.
            task_queue.task_done()
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
394
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def group_vcfs(self, vcf_files):
    """Bin the vcf files into groups defined by the Excel grouper file.

    The first data row of the first worksheet is read as a mapping of
    column header -> group.  Headers containing '!' are recorded
    (with the '!' removed) as inverted defining SNPs; all others as
    defining SNPs.  Each vcf file is then passed to bin_input_files()
    with both mappings and the positions returned for it by
    find_initial_positions().  Finally an HTML table listing every
    sample and its groups is appended to the summary.

    vcf_files: iterable of vcf file paths.
    Returns None.
    """
    # sheet_name=0 selects the first worksheet directly, so the workbook
    # does not have to be opened a second time just to look up its name.
    ws = pandas.read_excel(self.excel_grouper_file, sheet_name=0)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    defsnp_iterator = iter(ws.iloc[0].items())
    # Skip the first column (presumably a row label, not a position).
    next(defsnp_iterator)
    defining_snps = {}
    inverted_defining_snps = {}
    for abs_pos, group in defsnp_iterator:
        if '!' in abs_pos:
            inverted_defining_snps[abs_pos.replace('!', '')] = group
        else:
            defining_snps[abs_pos] = group
    samples_groups_dict = {}
    for vcf_file in vcf_files:
        found_positions, found_positions_mix = self.find_initial_positions(vcf_file)
        samples_groups_dict = self.bin_input_files(vcf_file, samples_groups_dict, defining_snps, inverted_defining_snps, found_positions, found_positions_mix)
    # Output summary grouping table.
    self.append_to_summary('<br/>')
    self.append_to_summary('<b>Groupings with %d listed:</b><br/>\n' % len(samples_groups_dict))
    self.append_to_summary('<table cellpadding="5" cellspacing="5" border="1">\n')
    for key, value in samples_groups_dict.items():
        self.append_to_summary('<tr align="left"><th>Sample Name</th>\n')
        self.append_to_summary('<td>%s</td>' % key)
        for group in value:
            self.append_to_summary('<td>%s</td>\n' % group)
        self.append_to_summary('</tr>\n')
    self.append_to_summary('</table><br/>\n')
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
426
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def initiate_summary(self, output_summary):
    """Append the opening HTML and the run header to the summary.

    Emits the html/head/body opening tags followed by the start time,
    the number of VCF inputs, the reference and the all-isolates
    setting.  The output_summary argument is accepted but not used by
    this method.
    """
    write = self.append_to_summary
    # Static page preamble.
    for markup in ('<html>\n', '<head></head>\n', '<body style=\"font-size:12px;">'):
        write(markup)
    # Run details, one entry per line of the header.
    started = str(get_time_stamp())
    write("<b>Time started:</b> %s<br/>" % started)
    write("<b>Number of VCF inputs:</b> %d<br/>" % self.num_files)
    reference = str(self.reference)
    write("<b>Reference:</b> %s<br/>" % reference)
    all_isolates = str(self.all_isolates)
    write("<b>All isolates:</b> %s<br/>" % all_isolates)
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
436
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def return_val(self, val, index=0):
    """Return val[index] when val is a list, otherwise val unchanged."""
    return val[index] if isinstance(val, list) else val
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
442
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
def val_as_int(self, val):
    """Convert val to an int, falling back to 0 on a TypeError.

    A TypeError is what int() raises for values such as None; any
    other conversion error (e.g. ValueError) is left to propagate.
    """
    try:
        converted = int(val)
    except TypeError:
        # val is likely None here.
        converted = 0
    return converted
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
450
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
451
ee4ef1fc23c6 Uploaded
greg
parents:
diff changeset
if __name__ == '__main__':
    # Command line entry point: collect the input vcf files, split them
    # into per-group directories, run get_snps over those directories in
    # worker processes and write the HTML summary.
    parser = argparse.ArgumentParser()

    parser.add_argument('--all_isolates', action='store', dest='all_isolates', required=False, default="No", help='Create table with all isolates')
    parser.add_argument('--excel_grouper_file', action='store', dest='excel_grouper_file', required=False, default=None, help='Optional Excel filter file')
    parser.add_argument('--output_summary', action='store', dest='output_summary', help='Output summary html file')
    parser.add_argument('--reference', action='store', dest='reference', help='Reference file')
    parser.add_argument('--processes', action='store', dest='processes', type=int, help='User-selected number of processes to use for job splitting')

    args = parser.parse_args()

    # Initializations - TODO: should these be passed in as command line args?
    ac = 2
    mq_val = 56
    n_threshold = 50
    qual_threshold = 150

    # Build the list of sample vcf files for the current run.
    vcf_files = []
    for file_name in os.listdir(INPUT_VCF_DIR):
        file_path = os.path.abspath(os.path.join(INPUT_VCF_DIR, file_name))
        vcf_files.append(file_path)

    multiprocessing.set_start_method('spawn')
    queue1 = multiprocessing.JoinableQueue()
    num_files = len(vcf_files)
    cpus = set_num_cpus(num_files, args.processes)
    # Set a timeout for get()s in the queue.
    timeout = 0.05

    # Initialize the snp_finder object.
    snp_finder = SnpFinder(num_files, args.reference, args.excel_grouper_file, args.all_isolates,
                           ac, mq_val, n_threshold, qual_threshold, args.output_summary)

    # Initialize the set of directories containing vcf files for analysis.
    vcf_dirs = []
    if args.excel_grouper_file is None:
        vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs)
    else:
        if args.all_isolates.lower() == "yes":
            vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs)
        # Parse the Excel file to determine groups for filtering.
        snp_finder.group_vcfs(vcf_files)
        # Append the list of group directories created by
        # the above call to the set of directories containing
        # vcf files for analysis
        group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups]
        vcf_dirs.extend(group_dirs)

    # Populate the queue for job splitting.
    for vcf_dir in vcf_dirs:
        queue1.put(vcf_dir)

    # Complete the get_snps task.
    processes = [multiprocessing.Process(target=snp_finder.get_snps, args=(queue1, timeout, )) for _ in range(cpus)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    # A worker that died part-way through can leave tasks unfinished on
    # the queue, in which case queue1.join() below would block forever.
    # Fail loudly instead of hanging.
    failed_workers = [p for p in processes if p.exitcode != 0]
    if failed_workers:
        sys.exit("%d of %d get_snps worker process(es) exited abnormally." % (len(failed_workers), len(processes)))
    queue1.join()

    # Finish summary log.
    snp_finder.append_to_summary("<br/><b>Time finished:</b> %s<br/>\n" % get_time_stamp())
    total_run_time = datetime.now() - snp_finder.timer_start
    snp_finder.append_to_summary("<br/><b>Total run time:</b> %s<br/>\n" % str(total_run_time))
    snp_finder.append_to_summary('</body>\n</html>\n')
    with open(args.output_summary, "w") as fh:
        fh.write("%s" % snp_finder.summary_str)