comparison build/scripts-3.6/SeqSero2_package.py @ 10:e6437d423693 draft

planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
author cstrittmatter
date Fri, 01 May 2020 13:30:43 -0400
comparison 9:43f6b7f6ebb3 vs 10:e6437d423693
1 #!/usr/bin/env python
2
3 import sys
4 import time
5 import random
6 import os
7 import subprocess
8 import gzip
9 import io
10 import pickle
11 import argparse
12 import itertools
13 from distutils.version import LooseVersion
14 from distutils.spawn import find_executable
15 sys.path.insert(1,sys.path[0]+'/..')
16
17 try:
18 from .version import SeqSero2_version
19 except Exception: #ImportError
20 from version import SeqSero2_version
21
22 ### SeqSero Kmer
23 def parse_args():
24 "Parse the input arguments, use '-h' for help."
25 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]\n\nDeveloper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email: seqsero@gmail.com\n\nVersion: v1.1.1')#add "-m <data_type>" in future
26 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data",type=os.path.abspath) ### add 'type=os.path.abspath' to generate absolute path of input data.
27 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore fasta, '6' for nanopore fastq")
28 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode")
29 parser.add_argument("-p",default="1",help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1")
30 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a")
31 parser.add_argument("-n",help="<string>: optional, to specify a sample name in the report output")
32 parser.add_argument("-d",help="<string>: optional, to specify an output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
33 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files")
34 parser.add_argument("-s",action="store_true",help="<flag>: if '-s' was flagged, SeqSero2 will not output header in SeqSero_result.tsv")
35 parser.add_argument("--check",action="store_true",help="<flag>: use '--check' flag to check the required dependencies")
36 parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SeqSero2_version)
37 return parser.parse_args()
38
39 ### check paths of dependencies
40 check_dependencies = parse_args().check
41 dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py']
42 if check_dependencies:
43 for item in dependencies:
44 ext_path = find_executable(item)
45 if ext_path is not None:
46 print ("Using "+item+" - "+ext_path)
47 else:
48 print ("ERROR: can not find "+item+" in PATH")
49 sys.exit()
50 ### end of --check
51
52 def reverse_complement(sequence):
53 complement = {
54 'A': 'T',
55 'C': 'G',
56 'G': 'C',
57 'T': 'A',
58 'N': 'N',
59 'M': 'K',
60 'R': 'Y',
61 'W': 'W',
62 'S': 'S',
63 'Y': 'R',
64 'K': 'M',
65 'V': 'B',
66 'H': 'D',
67 'D': 'H',
68 'B': 'V'
69 }
70 return "".join(complement[base] for base in reversed(sequence))
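# Minimal usage sketch (hypothetical demo helper, not called anywhere in this script):
# reverse_complement() also handles IUPAC ambiguity codes, mapping each to its complement.
def _demo_reverse_complement():
    assert reverse_complement("ATGCN") == "NGCAT"
    assert reverse_complement("AM") == "KT"  # M (A/C) complements to K (G/T)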
71
72
73 def createKmerDict_reads(list_of_strings, kmer):
74 kmer_table = {}
75 for string in list_of_strings:
76 sequence = string.strip('\n')
77 for i in range(len(sequence) - kmer + 1):
78 new_mer = sequence[i:i + kmer].upper()
79 new_mer_rc = reverse_complement(new_mer)
80 if new_mer in kmer_table:
81 kmer_table[new_mer.upper()] += 1
82 else:
83 kmer_table[new_mer.upper()] = 1
84 if new_mer_rc in kmer_table:
85 kmer_table[new_mer_rc.upper()] += 1
86 else:
87 kmer_table[new_mer_rc.upper()] = 1
88 return kmer_table
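# Minimal usage sketch (hypothetical demo helper, not called anywhere in this script): every
# k-mer and its reverse complement are both counted, so one 5-mer gives two table entries.
def _demo_createKmerDict_reads():
    assert createKmerDict_reads(["ATGCA"], 5) == {"ATGCA": 1, "TGCAT": 1}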
89
90
91 def multifasta_dict(multifasta):
92 multifasta_list = [
93 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
94 ]
95 headers = [i for i in multifasta_list if i[0] == '>']
96 multifasta_dict = {}
97 for h in headers:
98 start = multifasta_list.index(h)
99 for element in multifasta_list[start + 1:]:
100 if element[0] == '>':
101 break
102 else:
103 if h[1:] in multifasta_dict:
104 multifasta_dict[h[1:]] += element
105 else:
106 multifasta_dict[h[1:]] = element
107 return multifasta_dict
108
109
110 def multifasta_single_string(multifasta):
111 multifasta_list = [
112 line.strip() for line in open(multifasta, 'r')
113 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
114 ]
115 return ''.join(multifasta_list)
116
117
118 def chunk_a_long_sequence(long_sequence, chunk_size=60):
119 chunk_list = []
120 steps = len(long_sequence) // chunk_size #how many chunks
121 for i in range(steps):
122 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
123 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
124 return chunk_list
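# Minimal usage sketch (hypothetical demo helper, not called anywhere in this script): a 130-base
# sequence with the default chunk_size of 60 yields two full chunks plus a 10-base remainder.
def _demo_chunk_a_long_sequence():
    assert [len(c) for c in chunk_a_long_sequence("A" * 130)] == [60, 60, 10]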
125
126
127 def target_multifasta_kmerizer(multifasta, k, kmerDict):
128 forward_length = 300 #if the target is found, also take 300 bases upstream
129 reverse_length = 2200 #if the target is found, also take 2200 bases downstream
130 chunk_size = 60 #the single long sequence is first chunked into smaller sequences; this controls their size
131 target_mers = []
132 long_single_string = multifasta_single_string(multifasta)
133 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
134 unit_length = len(multifasta_list[0])
135 forward_lines = int(forward_length / unit_length) + 1
136 reverse_lines = int(reverse_length / unit_length) + 1
137 start_num = 0
138 end_num = 0
139 for i in range(len(multifasta_list)):
140 if i not in range(start_num, end_num): #avoid computational repetition
141 line = multifasta_list[i]
142 start = int((len(line) - k) // 2)
143 s1 = line[start:k + start]
144 if s1 in kmerDict: #use the middle k-mer to check whether this chunk is a potential target
145 if i - forward_lines >= 0:
146 start_num = i - forward_lines
147 else:
148 start_num = 0
149 if i + reverse_lines <= len(multifasta_list) - 1:
150 end_num = i + reverse_lines
151 else:
152 end_num = len(multifasta_list) - 1
153 target_list = [
154 x.strip() for x in multifasta_list[start_num:end_num]
155 ]
156 target_line = "".join(target_list)
157 target_mers += [
158 k1 for k1 in createKmerDict_reads([str(target_line)], k)
159 ] ##changed k to k1 to avoid confusing this k-mer variable with the k-mer length parameter k above
160 else:
161 pass
162 return set(target_mers)
163
164
165 def target_read_kmerizer(file, k, kmerDict):
166 i = 1
167 n_reads = 0
168 total_coverage = 0
169 target_mers = []
170 if file.endswith(".gz"):
171 file_content = io.BufferedReader(gzip.open(file))
172 else:
173 file_content = open(file, "r").readlines()
174 for line in file_content:
175 start = int((len(line) - k) // 2)
176 if i % 4 == 2:
177 if file.endswith(".gz"):
178 s1 = line[start:k + start].decode()
179 line = line.decode()
180 else:
181 s1 = line[start:k + start]
182 if s1 in kmerDict: #use the middle k-mer to check whether this read is a potential target
183 n_reads += 1
184 total_coverage += len(line)
185 target_mers += [
186 k1 for k1 in createKmerDict_reads([str(line)], k)
187 ] #changed k to k1 to avoid confusing this k-mer variable with the k-mer length parameter k above
188 i += 1
189 if total_coverage >= 4000000:
190 break
191 return set(target_mers)
192
193
194 def minion_fasta_kmerizer(file, k, kmerDict):
195 i = 1
196 n_reads = 0
197 total_coverage = 0
198 target_mers = {}
199 for line in open(file):
200 if i % 2 == 0:
201 for kmer, rc_kmer in kmers(line.strip().upper(), k):
202 if (kmer in kmerDict) or (rc_kmer in kmerDict):
203 if kmer in target_mers:
204 target_mers[kmer] += 1
205 else:
206 target_mers[kmer] = 1
207 if rc_kmer in target_mers:
208 target_mers[rc_kmer] += 1
209 else:
210 target_mers[rc_kmer] = 1
211 i += 1
212 return set([h for h in target_mers])
213
214
215 def minion_fastq_kmerizer(file, k, kmerDict):
216 i = 1
217 n_reads = 0
218 total_coverage = 0
219 target_mers = {}
220 for line in open(file):
221 if i % 4 == 2:
222 for kmer, rc_kmer in kmers(line.strip().upper(), k):
223 if (kmer in kmerDict) or (rc_kmer in kmerDict):
224 if kmer in target_mers:
225 target_mers[kmer] += 1
226 else:
227 target_mers[kmer] = 1
228 if rc_kmer in target_mers:
229 target_mers[rc_kmer] += 1
230 else:
231 target_mers[rc_kmer] = 1
232 i += 1
233 return set([h for h in target_mers])
234
235
236 def multifasta_single_string2(multifasta):
237 single_string = ''
238 with open(multifasta, 'r') as f:
239 for line in f:
240 if line.strip().startswith('>'): #startswith avoids an IndexError on blank lines
241 pass
242 else:
243 single_string += line.strip()
244 return single_string
245
246
247 def kmers(seq, k):
248 rev_comp = reverse_complement(seq)
249 for start in range(1, len(seq) - k + 1):
250 yield seq[start:start + k], rev_comp[-(start + k):-start]
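# Minimal usage sketch (hypothetical demo helper, not called anywhere in this script): kmers()
# yields each k-mer paired with its reverse complement; the window at position 0 is skipped
# because the range starts at 1.
def _demo_kmers():
    assert list(kmers("ATGCA", 3)) == [("TGC", "GCA"), ("GCA", "TGC")]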
251
252
253 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
254 multi_seq_dict = multifasta_dict(multifasta)
255 lib_dict = {}
256 for h in multi_seq_dict:
257 lib_dict[h] = set(
258 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
259 return lib_dict
260
261
262 def Combine(b, c):
263 fliC_combinations = []
264 fliC_combinations.append(",".join(c))
265 temp_combinations = []
266 for i in range(len(b)):
267 for x in itertools.combinations(b, i + 1):
268 temp_combinations.append(",".join(x))
269 for x in temp_combinations:
270 temp = []
271 for y in c:
272 temp.append(y)
273 temp.append(x)
274 temp = ",".join(temp)
275 temp = temp.split(",")
276 temp.sort()
277 temp = ",".join(temp)
278 fliC_combinations.append(temp)
279 return fliC_combinations
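# Minimal usage sketch (hypothetical demo helper, not called anywhere in this script): for a
# formula like f,[g],t the optional factor g (argument b) is expanded against the fixed factors
# f,t (argument c), returning both the bare and the bracket-included combinations.
def _demo_Combine():
    assert Combine(["g"], ["f", "t"]) == ["f,t", "f,g,t"]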
280
281
282 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
283 #like test_output_06012017.txt
284 #can add more variables like sdf-type and sub-species-type in the future (they can be folded into a special-gene-list)
285 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
286 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
287 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used to decide whether a serotype has already been merged or renamed
288 seronames = []
289 seronames_none_subspecies=[]
290 for i in range(len(phase1)):
291 fliC_combine = []
292 fljB_combine = []
293 if phaseO[i] == Otype: # no VII in KW, but it's there
294 ### for fliC, detect every possible combinations to avoid the effect of "["
295 if phase1[i].count("[") == 0:
296 fliC_combine.append(phase1[i])
297 elif phase1[i].count("[") >= 1:
298 c = []
299 b = []
300 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
301 "[") == 1:
302 content = phase1[i].replace("[", "").replace("]", "")
303 fliC_combine.append(content)
304 fliC_combine.append("-")
305 else:
306 for x in phase1[i].split(","):
307 if "[" in x:
308 b.append(x.replace("[", "").replace("]", ""))
309 else:
310 c.append(x)
311 fliC_combine = Combine(
312 b, c
313 ) #Combine will offer every possible combination of the formula, e.g. f,[g],t gives f,t and f,g,t
314 ### end of fliC "[" detect
315 ### for fljB, detect every possible combinations to avoid the effect of "["
316 if phase2[i].count("[") == 0:
317 fljB_combine.append(phase2[i])
318 elif phase2[i].count("[") >= 1:
319 d = []
320 e = []
321 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
322 "[") == 1:
323 content = phase2[i].replace("[", "").replace("]", "")
324 fljB_combine.append(content)
325 fljB_combine.append("-")
326 else:
327 for x in phase2[i].split(","):
328 if "[" in x:
329 d.append(x.replace("[", "").replace("]", ""))
330 else:
331 e.append(x)
332 fljB_combine = Combine(d, e)
333 ### end of fljB "[" detect
334 new_fliC = fliC.split(
335 ","
336 ) #some antigens, like r,[i], do not follow alphabetical order, so sort before comparing to avoid misses
337 new_fliC.sort()
338 new_fliC = ",".join(new_fliC)
339 new_fljB = fljB.split(",")
340 new_fljB.sort()
341 new_fljB = ",".join(new_fljB)
342 if (new_fliC in fliC_combine
343 or fliC in fliC_combine) and (new_fljB in fljB_combine
344 or fljB in fljB_combine):
345 ######start, remove_list,rename_dict, added on 11/11/2018
346 if sero[i] not in remove_list:
347 temp_sero=sero[i]
348 if temp_sero in rename_dict:
349 temp_sero=rename_dict[temp_sero] #rename if in the rename list
350 if temp_sero not in seronames:#the renamed serotype may already be included; if so, skip it
351 if subs[i] == subspecies:
352 seronames.append(temp_sero)
353 seronames_none_subspecies.append(temp_sero)
354 else:
355 pass
356 else:
357 pass
358 ######end, added on 11/11/2018
359 #analyze seronames
360 subspecies_pointer=""
361 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
362 # ed_SL_12182019: modified to fix the subspecies output problem
363 #seronames=seronames_none_subspecies
364 seronames=["N/A"]
365 #subspecies_pointer="1"
366 subspecies_pointer="0"
367 if len(seronames) == 0:
368 seronames = [
369 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
370 ]
371 star = ""
372 star_line = ""
373 if len(seronames) > 1: #there is more than one possible serotype prediction
374 star = "*"
375 #changed 04072019
376 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
377 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
378 star="*"
379 star_line="The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme. " + star_line
380 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
381 if Otype=="":
382 Otype="-"
383 predict_form = Otype + ":" + fliC + ":" + fljB
384 predict_sero = (" or ").join(seronames)
385 ###special test for Enteritidis
386 if predict_form == "9:g,m:-":
387 sdf = "-"
388 for x in special_gene_list:
389 if x.startswith("sdf"):
390 sdf = "+"
391 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
392 star_line="sdf gene detected. "
393 #predict_form = predict_form + " Sdf prediction:" + sdf
394 predict_form = predict_form #changed 04072019
395 if sdf == "-":
396 star = "*"
397 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
398 star_line="sdf gene not detected. "
399 #changed in 04072019, for new output
400 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
401 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
402 predict_sero = "Gallinarum or Enteritidis"
403 ###end of special test for Enteritidis
404 elif predict_form == "4:i:-":
405 predict_sero = "I 4,[5],12:i:-" # change serotype name
406 elif predict_form == "4:r:-":
407 predict_sero = "N/A (4:r:-)"
408 elif predict_form == "4:b:-":
409 predict_sero = "N/A (4:b:-)"
410 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
411 #predict_sero = "Newport"
412 #star = "*"
413 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
414 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
415 if "N/A" in predict_sero:
416 claim = ""
417 #special test for Typhimurium
418 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
419 normal = 0
420 mutation = 0
421 for x in special_gene_list:
422 if "oafA-O-4_full" in x:
423 normal = float(special_gene_list[x])
424 elif "oafA-O-4_5-" in x:
425 mutation = float(special_gene_list[x])
426 if normal > mutation:
427 pass
428 elif normal < mutation:
429 #predict_sero = predict_sero.strip() + "(O5-)"
430 predict_sero = predict_sero.strip() #disable special sero for new output requirement, 04132019
431 star = "*"
432 #star_line = "Detected the deletion of O5-."
433 star_line = "Detected a deletion that causes the O5- variant of Typhimurium. "
434 else:
435 pass
436 #special test for Paratyphi B
437 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
438 normal = 0
439 mutation = 0
440 for x in special_gene_list:
441 if "gntR-family-regulatory-protein_dt-positive" in x:
442 normal = float(special_gene_list[x])
443 elif "gntR-family-regulatory-protein_dt-negative" in x:
444 mutation = float(special_gene_list[x])
445 #print(normal,mutation)
446 if normal > mutation:
447 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
448 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip()
449 star = "*"
450 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
451 star_line = "The SNP that causes the d-Tartrate nonfermenting phenotype of Paratyphi B was not detected. "
452 elif normal < mutation:
453 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
454 predict_sero = predict_sero.strip()
455 star = "*"
456 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
457 star_line = "Detected the SNP for d-Tartrate nonfermenting phenotype of Paratyphi B. "
458 else:
459 star = "*"
460 #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
461 star_line = " " ## ed_SL_05152019: do not report this situation.
462 #special test for O13,22 and O13,23
463 if Otype=="13":
464 #ex_dir = os.path.dirname(os.path.realpath(__file__))
465 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019
466 f = open(ex_dir + '/special.pickle', 'rb')
467 special = pickle.load(f)
468 O22_O23=special['O22_O23']
469 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
470 O22_score=0
471 O23_score=0
472 for x in special_gene_list:
473 if "O:22" in x:
474 O22_score = O22_score+float(special_gene_list[x])
475 elif "O:23" in x:
476 O23_score = O23_score+float(special_gene_list[x])
477 #print(O22_score,O23_score)
478 for z in O22_O23[0]:
479 if predict_sero.split(" or ")[0] in z:
480 if O22_score > O23_score:
481 star = "*"
482 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
483 predict_sero = z[0]
484 elif O22_score < O23_score:
485 star = "*"
486 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
487 predict_sero = z[1]
488 else:
489 star = "*"
490 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
491 if " or " in predict_sero:
492 star_line = star_line + "The predicted serotypes share the same general formula: " + Otype + ":" + fliC + ":" + fljB + "."
493 #special test for O6,8
494 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
495 #for x in merge_O68_list:
496 # if x in predict_sero:
497 # predict_sero=x
498 # star=""
499 # star_line=""
500 #special test for Montevideo; most of them are monophasic
501 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
502 #star="*"
503 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
504 return predict_form, predict_sero, star, star_line, claim
505 ### End of SeqSero Kmer part
506
507 ### Begin of SeqSero2 allele prediction and output
508 def xml_parse_score_comparision_seqsero(xmlfile):
509 #used to do seqsero xml analysis
510 from Bio.Blast import NCBIXML
511 handle=open(xmlfile)
512 handle=NCBIXML.parse(handle)
513 handle=list(handle)
514 List=[]
515 List_score=[]
516 List_ids=[]
517 List_query_region=[]
518 for i in range(len(handle)):
519 if len(handle[i].alignments)>0:
520 for j in range(len(handle[i].alignments)):
521 score=0
522 ids=0
523 cover_region=set() #fixed problem that repeated calculation leading percentage > 1
524 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
525 for z in range(len(handle[i].alignments[j].hsps)):
526 hsp=handle[i].alignments[j].hsps[z]
527 temp=set(range(hsp.query_start,hsp.query_end))
528 if len(cover_region)==0:
529 cover_region=cover_region|temp
530 fraction=1
531 else:
532 fraction=1-len(cover_region&temp)/float(len(temp))
533 cover_region=cover_region|temp
534 if "last" in handle[i].query or "first" in handle[i].query:
535 score+=hsp.bits*fraction
536 ids+=float(hsp.identities)/handle[i].query_length*fraction
537 else:
538 score+=hsp.bits*fraction
539 ids+=float(hsp.identities)/handle[i].query_length*fraction
540 List_score.append(score)
541 List_ids.append(ids)
542 List_query_region.append(cover_region)
543 temp=zip(List,List_score,List_ids,List_query_region)
544 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
545 return Final_list
546
547
548 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
549 Old=L
550 L.sort()
551 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
552 count=[]
553 for j in range(len(L)):
554 y=0
555 for x in Old:
556 if L[j]==x:
557 y+=1
558 count.append(y)
559 if sort_on_fre!="none":
560 d=list(zip(*sorted(zip(count, L)))) #list() so the zip result can be indexed under Python 3
561 L=d[1]
562 count=d[0]
563 return (L,count)
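# Minimal usage sketch (hypothetical demo helper, not called anywhere in this script): Uniq()
# returns the de-duplicated, sorted items together with how often each occurred in the input.
def _demo_Uniq():
    items, counts = Uniq(["fljB", "fliC", "fljB"])
    assert items == ["fliC", "fljB"] and counts == [1, 2]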
564
565 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
566 #used to predict whether one contig is fliC or fljB based on head and tail scores; also returns the score difference
567 #if the difference is very small the call is not reliable, and the whole-contig blast score is used instead
568 a=nodes_vs_score_list
569 fliC_score=0
570 fljB_score=0
571 for z in a:
572 if "fliC" in z[0]:
573 fliC_score+=z[1]
574 elif "fljB" in z[0]:
575 fljB_score+=z[1]
576 if fliC_score>=fljB_score:
577 role="fliC"
578 else:
579 role="fljB"
580 return (role,abs(fliC_score-fljB_score))
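# Minimal usage sketch (hypothetical demo helper and toy scores, not called anywhere in this
# script): head/tail blast hits of one contig are summed per gene and the larger total wins;
# the returned difference indicates how reliable that call is.
def _demo_judge_fliC_or_fljB_from_head_tail_for_one_contig():
    hits = [("fliC_first_part", 55.0), ("fliC_last_part", 40.0), ("fljB_first_part", 30.0)]
    assert judge_fliC_or_fljB_from_head_tail_for_one_contig(hits) == ("fliC", 65.0)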
581
582 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
583 #used to predict whether a contig is fliC or fljB when the head_and_tail score difference above is small
584 #also used when no head or tail got blasted score for the contig
585 role=""
586 for z in Final_list_passed:
587 if node_name in z[0]:
588 role=z[0].split("_")[0]
589 break
590 return role
591
592 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
593 #nodes_list is the c created by c,d=Uniq(nodes) in the function below
594 first_target=""
595 role_list=[]
596 for x in nodes_list:
597 a=[]
598 role=""
599 for y in tail_head_list:
600 if x in y[0]:
601 a.append(y)
602 if len(a)==4:
603 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
604 if diff<20:
605 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
606 elif len(a)==3:
607 ###however, if the one with highest score is the fewer one, compare their accumulation score
608 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
609 if diff<20:
610 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
611 ###end of above score comparison
612 elif len(a)==2:
613 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
614 temp=[]
615 for z in a:
616 temp.append(z[0].split("_")[0])
617 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
618 if len(m)==1:
619 pass
620 else:
621 pass
622 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
623 if diff<20:
624 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
625 ###need to design an algorithm to guess the most likely situation for nodes_list; see the situations of test evaluation
626 elif len(a)==1:
627 #that one
628 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
629 if diff<20:
630 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
631 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
632 else:#a==0
633 #use Final_list_passed best match
634 for z in Final_list_passed:
635 if x in z[0]:
636 role=z[0].split("_")[0]
637 break
638 #print x,role,len(a)
639 role_list.append((role,x))
640 if len(role_list)==2:
641 if role_list[0][0]==role_list[1][0]:#this is the most common error: two antigens were assigned to the same phase
642 #just use score to do a final test
643 role_list=[]
644 for x in nodes_list:
645 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
646 role_list.append((role,x))
647 return role_list
648
649 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
650 #used to decide which contig is FliC and which one is fljB
651 contigs=[]
652 nodes=[]
653 for x in Final_list_passed:
654 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
655 nodes.append(x[0].split("___")[1].strip())
656 c,d=Uniq(nodes)#c is node_list
657 #print c
658 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
659 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
660 return roles
661
662 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
663 #decide O based on Final_list
664 O_choice="?"
665 O_list=[]
666 special_genes={}
667 nodes=[]
668 for x in Final_list_passed:
669 if x[0].startswith("O-"):
670 nodes.append(x[0].split("___")[1].strip())
671 elif not x[0].startswith("fl"):
672 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
673 #print "special_genes:",special_genes
674 c,d=Uniq(nodes)
675 #print "potential O antigen contig",c
676 final_O=[]
677 O_nodes_list=[]
678 for x in c:#c is the list for contigs
679 temp=0
680 for y in Final_list_passed:
681 if x in y[0] and y[0].startswith("O-"):
682 final_O.append(y)
683 break
684 ### O contig has the problem of two genes on same contig, so do additional test
685 potenial_new_gene=""
686 for x in final_O:
687 pointer=0 #for genes merged or not
688 #not consider O-1,3,19_not_in_3,10, too short compared with others
689 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region
690 pointer=x[0].split("___")[1].strip()#store the contig name
691 print(pointer)
692 if pointer!=0:#it has potential merge event
693 for y in Final_list:
694 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
695 potenial_new_gene=y
696 #print(potenial_new_gene)
697 break
698 if potenial_new_gene!="":
699 print("two differnt genes in same contig, fix it for O antigen")
700 print(potenial_new_gene[:3])
701 pointer=0
702 for y in final_O:
703 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
704 pointer=1
705 if pointer!=0: #changed to consider two genes in same contig
706 final_O.append(potenial_new_gene)
707 ### end of the two genes on same contig test
708 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
709 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
710 #print "$$$No Otype, due to no hit"#may need to be changed
711 O_choice="-"
712 else:
713 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
714 O_list=[]
715 O_list_less_contamination=[]
716 for x in final_O:
717 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
718 O_list.append(x[0].split("__")[0])
719 O_nodes_list.append(x[0].split("___")[1])
720 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
721 O_list_less_contamination.append(x[0].split("__")[0])
722 ### special test for O9,46 and O3,10 family
723 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
724 if "O-9,46_wzy" in O_list or "O-9,46_wzy_partial" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
725 O_choice="O-9,46"
726 #print "$$$Most possilble Otype: O-9,46"
727 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
728 O_choice="O-9,46,27"
729 #print "$$$Most possilble Otype: O-9,46,27"
730 else:
731 O_choice="O-9"#next, detect O9 vs O2?
732 O2=0
733 O9=0
734 for z in special_genes:
735 if "tyr-O-9" in z:
736 O9=special_genes[z]
737 elif "tyr-O-2" in z:
738 O2=special_genes[z]
739 if O2>O9:
740 O_choice="O-2"
741 elif O2<O9:
742 pass
743 else:
744 pass
745 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
746 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
747 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
748 O_choice="O-3,10"
749 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
750 else:
751 O_choice="O-1,3,19"
752 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
753 ### end of special test for O9,46 and O3,10 family
754 else:
755 try:
756 max_score=0
757 for x in final_O:
758 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
759 max_score=x[2]#change from x[-1] to x[2],08172018
760 O_choice=x[0].split("_")[0]
761 if O_choice=="O-1,3,19":
762 O_choice=final_O[1][0].split("_")[0]
763 #print "$$$Most possilble Otype: ",O_choice
764 except:
765 pass
766 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
767 if O_choice=="O-9,46,27" and len(O_list)==2 and "O-4_wzx" in O_list: #special for very low chance situation between O4 and O9,27,46, this is for serotypes like Bredeney and Schwarzengrund (normally O-4 will have higher score, but sometimes sequencing quality may affect the prediction)
768 O_choice="O-4"
769 #print "O:",O_choice,O_nodes_list
770 Otypes=[]
771 for x in O_list:
772 if x!="O-1,3,19_not_in_3,10":
773 if "O-9,46_" not in x:
774 Otypes.append(x.split("_")[0])
775 else:
776 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
777 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
778 Otypes_uniq,Otypes_fre=Uniq(Otypes)
779 contamination_O=""
780 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
781 if len(Otypes_uniq)>2:
782 contamination_O="potential contamination from O antigen signals"
783 else:
784 if len(Otypes_uniq)>1:
785 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
786 contamination_O=""
787 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
788 contamination_O=""
789 else:
790 contamination_O="potential contamination from O antigen signals"
791 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
792 ### End of SeqSero2 allele prediction and output
793
794 def get_input_files(make_dir,input_file,data_type,dirpath):
795 #tell input files from datatype
796 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
797 for_fq=""
798 rev_fq=""
799 os.chdir(make_dir)
800 if data_type=="1":
801 input_file=input_file[0].split("/")[-1]
802 if input_file.endswith(".sra"):
803 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
804 for_fq=input_file.replace(".sra","_1.fastq")
805 rev_fq=input_file.replace(".sra","_2.fastq")
806 else:
807 core_id=input_file.split(".fastq")[0].split(".fq")[0]
808 for_fq=core_id+"_1.fastq"
809 rev_fq=core_id+"_2.fastq"
810 if input_file.endswith(".gz"):
811 subprocess.check_call("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
812 else:
813 subprocess.check_call("cat "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
814 elif data_type=="2":
815 for_fq=input_file[0].split("/")[-1]
816 rev_fq=input_file[1].split("/")[-1]
817 elif data_type=="3":
818 input_file=input_file[0].split("/")[-1]
819 if input_file.endswith(".sra"):
820 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
821 for_fq=input_file.replace(".sra","_1.fastq")
822 else:
823 for_fq=input_file
824 elif data_type in ["4","5","6"]:
825 for_fq=input_file[0].split("/")[-1]
826 os.chdir("..")
827 return for_fq,rev_fq
828
829 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
830 #get O and H types from Final_list from blast parsing; allele mode
831 from Bio import SeqIO
832 fliC_choice="-"
833 fljB_choice="-"
834 fliC_contig="NA"
835 fljB_contig="NA"
836 fliC_region=set([0])
837 fljB_region=set([0,])
838 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to aligned length
839 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to aligned length
840 O_choice="-"#no need to decide O contig for now, should be only one
841 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
842 O_choice=O_choice.split("-")[-1].strip()
843 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
844 O_choice="-"
845 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
846 #add alignment locations, used for further selection, 03312019
847 for i in range(len(H_contig_roles)):
848 x=H_contig_roles[i]
849 for y in Final_list_passed:
850 if x[1] in y[0] and y[0].startswith(x[0]):
851 H_contig_roles[i]=H_contig_roles[i]+(y[-1],) #append the alignment region to the (role, contig) tuple
852 break
853 log_file=open("SeqSero_log.txt","a")
854 extract_file=open("Extracted_antigen_alleles.fasta","a")
855 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
856
857 #print("O_contigs:")
858 log_file.write("O_contigs:\n")
859 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
860 extract_file.write("#O_contigs:\n")
861 for x in O_nodes_roles:
862 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
863 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
864 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
865 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
866 seqs=""
867 for z in handle_fasta:
868 if x[0].split("___")[-1]==z.description:
869 seqs=str(z.seq)
870 extract_file.write(title+seqs+"\n")
871 if len(H_contig_roles)!=0:
872 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
873 else:
874 highest_H_coverage=0
875 for x in H_contig_roles:
876 #if multiple choices, temporarily select the one with the longest length for now; will revise in a future change
877 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
878 fliC_contig=x[1]
879 fliC_length=len(x[-1])
880 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
881 fljB_contig=x[1]
882 fljB_length=len(x[-1])
883 for x in Final_list_passed:
884 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
885 fliC_choice=x[0].split("_")[1]
886 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
887 fljB_choice=x[0].split("_")[1]
888 elif fliC_choice!="-" and fljB_choice!="-":
889 break
890 #now remove contigs not in middle core part
891 first_allele="NA"
892 first_allele_percentage=0
893 for x in Final_list:
894 if x[0].startswith("fliC") or x[0].startswith("fljB"):
895 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
896 first_allele_percentage=x[2]
897 break
898 additional_contigs=[]
899 for x in Final_list:
900 if first_allele in x[0]:
901 if (fliC_contig == x[0].split("___")[-1]):
902 fliC_region=x[3]
903 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
904 fljB_region=x[3]
905 else:
906 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
907 additional_contigs.append(x)
908 #else:
909 #print x[:3]
910 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
911 if first_allele_percentage>0.9:
912 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
913 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
914 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
915 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
916 else:
917 target_region=set()#doesn't do anything
918 else:
919 target_region=set()#doesn't do anything
920 #print(target_region)
921 #print(additional_contigs)
922 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
923 target_region=target_region2|target_region
924 for x in additional_contigs:
925 removal=0
926 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
927 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
928 removal=1
929 else:
930 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
931 removal=1
932 else:
933 pass
934 if removal==1:
935 for y in H_contig_roles:
936 if y[1] in x[0]:
937 H_contig_roles.remove(y)
938 else:
939 pass
940 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
941 #end of removing none-middle contigs
942 #print("H_contigs:")
943 log_file.write("H_contigs:\n")
944 extract_file.write("#H_contigs:\n")
945 H_contig_stat=[]
946 H1_cont_stat={}
947 H2_cont_stat={}
948 for i in range(len(H_contig_roles)):
949 x=H_contig_roles[i]
950 a=0
951 for y in Final_list_passed:
952 if x[1] in y[0] and y[0].startswith(x[0]):
953 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
954 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
955 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
956 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
957 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
958 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
959 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
960 seqs=""
961 for z in handle_fasta:
962 if x[1]==z.description:
963 seqs=str(z.seq)
964 extract_file.write(title+seqs+"\n")
965 break
966 else:
967 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
968 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
969 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
970 seqs=""
971 for z in handle_fasta:
972 if x[1]==z.description:
973 seqs=str(z.seq)
974 extract_file.write(title+seqs+"\n")
975 if x[0]=="fliC":
976 if y[0].split("_")[1] not in H1_cont_stat:
977 H1_cont_stat[y[0].split("_")[1]]=y[2]
978 else:
979 H1_cont_stat[y[0].split("_")[1]]+=y[2]
980 if x[0]=="fljB":
981 if y[0].split("_")[1] not in H2_cont_stat:
982 H2_cont_stat[y[0].split("_")[1]]=y[2]
983 else:
984 H2_cont_stat[y[0].split("_")[1]]+=y[2]
985 break
986 #detect contaminations
987 #print(H1_cont_stat)
988 #print(H2_cont_stat)
989 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
990 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
991 contamination_H=""
992 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
993 contamination_H="potential contamination from H antigen signals"
994 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
995 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
996 #get additional antigens
997 """
998 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
999 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
1000 O_choice="O-9,46"
1001 #print "$$$Most possilble Otype: O-9,46"
1002 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
1003 O_choice="O-9,46,27"
1004 #print "$$$Most possilble Otype: O-9,46,27"
1005 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
1006 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
1007 O_choice="O-3,10"
1008 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
1009 else:
1010 O_choice="O-1,3,19"
1011 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
1012 ### end of special test for O9,46 and O3,10 family
1013
1014 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
1015 if len(Otypes_uniq)>2:
1016 contamination_O="potential contamination from O antigen signals"
1017 else:
1018 if len(Otypes_uniq)>1:
1019 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
1020 contamination_O=""
1021 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
1022 contamination_O=""
1023 """
1024 additonal_antigents=[]
1025 #print(contamination_O)
1026 #print(contamination_H)
1027 log_file.write(contamination_O+"\n")
1028 log_file.write(contamination_H+"\n")
1029 log_file.close()
1030 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
1031
1032 def get_input_K(input_file,lib_dict,data_type,k_size):
1033 #kmer mode; get input_Ks from dict and data_type
1034 kmers = []
1035 for h in lib_dict:
1036 kmers += lib_dict[h]
1037 if data_type == '4':
1038 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers))
1039 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later
1040 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers))
1041 elif data_type == '5':#minion_2d_fasta
1042 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers))
1043 if data_type == '6':#minion_2d_fastq
1044 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers))
1045 return input_Ks
1046
1047 def get_kmer_dict(lib_dict,input_Ks):
1048 #kmer mode; get predicted types
1049 O_dict = {}
1050 H_dict = {}
1051 Special_dict = {}
1052 for h in lib_dict:
1053 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100
1054 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases
1055 if h.startswith('O-') and score > 25:
1056 O_dict[h] = score
1057 if h.startswith('fl') and score > 40:
1058 H_dict[h] = score
1059 if (h[:2] != 'fl') and (h[:2] != 'O-'):
1060 Special_dict[h] = score
1061 return O_dict,H_dict,Special_dict
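# Minimal usage sketch (hypothetical toy k-mer sets, not called anywhere in this script): the
# score of each database entry is the percentage of its k-mers found in the input k-mer set,
# with per-category thresholds (O > 25, H > 40) applied before an entry is reported.
def _demo_get_kmer_dict():
    lib = {"O-4_wzx": {"AAA", "CCC", "GGG", "TTT"}, "fliC_i": {"ACG", "CGT"}}
    O_dict, H_dict, Special_dict = get_kmer_dict(lib, {"AAA", "CCC", "GGG", "ACG", "CGT"})
    assert O_dict == {"O-4_wzx": 75.0} and H_dict == {"fliC_i": 100.0} and Special_dict == {}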
1062
1063 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
1064 log_file=open("SeqSero_log.txt","a")
1065 log_file.write("O_scores:\n")
1066 #call O:
1067 highest_O = '-'
1068 if len(O_dict) == 0:
1069 pass
1070 else:
1071 for x in O_dict:
1072 log_file.write(x+"\t"+str(O_dict[x])+"\n")
1073 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
1074 #if 'O-9,46_wzy__1191' in O_dict or "O-9,46_wzy_partial__216" in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
1075 #modified to fix miscall of O-9,46
1076 if ('O-9,46_wzy__1191' in O_dict and O_dict['O-9,46_wzy__1191']>40) or ("O-9,46_wzy_partial__216" in O_dict and O_dict["O-9,46_wzy_partial__216"]>40): # and float(O946_wzy)/float(num_1) > 0.1
1077 highest_O = "O-9,46"
1078 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
1079 highest_O = "O-9,46,27"
1080 else:
1081 highest_O = "O-9" # next, detect O9 vs O2?
1082 O2 = 0
1083 O9 = 0
1084 for z in Special_dict:
1085 if "tyr-O-9" in z:
1086 O9 = float(Special_dict[z])
1087 if "tyr-O-2" in z:
1088 O2 = float(Special_dict[z])
1089 if O2 > O9:
1090 highest_O = "O-2"
1091 elif ("O-3,10_wzx__1539" in O_dict) and (
1092 "O-9,46_wzy__1191" in O_dict
1093 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
1094 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
1095 highest_O = "O-3,10"
1096 else:
1097 highest_O = "O-1,3,19"
1098 ### end of special test for O9,46 and O3,10 family
1099 else:
1100 try:
1101 max_score = 0
1102 for x in O_dict:
1103 if float(O_dict[x]) >= max_score:
1104 max_score = float(O_dict[x])
1105 #highest_O = x.split("_")[0]
1106 # ed_SL_12182019: modified to fix the O-9,46 error example1
1107 if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict):
1108 highest_O = "O-9"
1109 else:
1110 highest_O = x.split("_")[0]
1111 if highest_O == "O-1,3,19":
1112 highest_O = '-'
1113 max_score = 0
1114 for x in O_dict:
1115 if x == 'O-1,3,19_not_in_3,10__130':
1116 pass
1117 else:
1118 if float(O_dict[x]) >= max_score:
1119 max_score = float(O_dict[x])
1120 #highest_O = x.split("_")[0]
1121 # ed_SL_12182019: modified to fix the O-9,46 error example1
1122 if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict):
1123 highest_O = "O-9"
1124 else:
1125 highest_O = x.split("_")[0]
1126 except:
1127 pass
1128 #call_fliC:
1129 if len(H_dict)!=0:
1130 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
1131 else:
1132 highest_H_score_both_BC=0
1133 highest_fliC = '-'
1134 highest_fliC_raw = '-'
1135 highest_Score = 0
1136 log_file.write("\nH_scores:\n")
1137 for s in H_dict:
1138 log_file.write(s+"\t"+str(H_dict[s])+"\n")
1139 if s.startswith('fliC'):
1140 if float(H_dict[s]) > highest_Score:
1141 highest_fliC = s.split('_')[1]
1142 highest_fliC_raw = s
1143 highest_Score = float(H_dict[s])
1144 #call_fljB
1145 highest_fljB = '-'
1146 highest_fljB_raw = '-'
1147 highest_Score = 0
1148 for s in H_dict:
1149 if s.startswith('fljB'):
1150 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
1151 #highest_fljB = s.split('_')[1]
1152 #highest_fljB_raw = s
1153 #highest_Score = float(H_dict[s])
1154 if s.split('_')[1]!=highest_fliC:
1155 highest_fljB = s.split('_')[1]
1156 highest_fljB_raw = s
1157 highest_Score = float(H_dict[s])
1158 log_file.write("\nSpecial_scores:\n")
1159 for s in Special_dict:
1160 log_file.write(s+"\t"+str(Special_dict[s])+"\n")
1161 log_file.close()
1162 return highest_O,highest_fliC,highest_fljB
1163
1164 def get_temp_file_names(for_fq,rev_fq):
1165 #seqsero2 -a; get temp file names
1166 sam=for_fq+".sam"
1167 bam=for_fq+".bam"
1168 sorted_bam=for_fq+"_sorted.bam"
1169 mapped_fq1=for_fq+"_mapped.fq"
1170 mapped_fq2=rev_fq+"_mapped.fq"
1171 combined_fq=for_fq+"_combined.fq"
1172 for_sai=for_fq+".sai"
1173 rev_sai=rev_fq+".sai"
1174 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
1175
1176 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
1177 #seqsero2 -a; do mapping and sort
1178 print("building database...")
1179 subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True)
1180 print("mapping...")
1181 if mapping_mode=="mem":
1182 subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
1183 elif mapping_mode=="sam":
1184 if fnameB!="":
1185 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
1186 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True)
1187 subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
1188 else:
1189 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
1190 subprocess.check_call("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam)
1191 subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True)
1192 ### check the version of samtools, then use different commands
1193 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
1194 out, err = samtools_version.communicate()
1195 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
1196 print("check samtools version:",version)
1197 ### end of samtools version check and its analysis
1198 if LooseVersion(version)<=LooseVersion("1.2"):
1199 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True)
1200 else:
1201 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True)
1202
1203 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
1204 #seqsero2 -a; extract, assembly and blast
1205 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True)
1206 #print("fnameA:",fnameA)
1207 #print("fnameB:",fnameB)
1208 if fnameB!="":
1209 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt",shell=True)#2> /dev/null if want no output
1210 else:
1211 pass
1212 outdir=current_time+"_temp"
1213 print("assembling...")
1214 if int(threads)>4:
1215 t="4"
1216 else:
1217 t=threads
1218 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
1219 if fnameB!="":
1220 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
1221 else:
1222 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
1223 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
1224 #new_fasta=fnameA+"_"+database.split('/')[-1]+"_"+mapping_mode+".fasta" # change path to databse for packaging
1225 subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True)
1226 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
1227 subprocess.check_call("rm -rf "+outdir+ " 2> /dev/null",shell=True)
1228 print("blasting...","\n")
1229 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
1230 subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1',shell=True) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
1231 subprocess.check_call("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)###1/27/2015; 08272018, remove "-word_size 10"
1232 else:
1233 xmlfile=new_fasta="NA" #also set new_fasta so the return statement below does not raise NameError
1234 return xmlfile,new_fasta
1235
1236 def judge_subspecies(fnameA):
1237 #seqsero2 -a; judge subspecies on just forward raw reads fastq
1238 salmID_output=subprocess.Popen("SalmID.py -i "+fnameA,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
1239 out, err = salmID_output.communicate()
1240 out=out.decode("utf-8")
1241 file=open("data_log.txt","a")
1242 file.write(out)
1243 file.close()
1244 salm_species_scores=out.split("\n")[1].split("\t")[6:]
1245 salm_species_results=out.split("\n")[0].split("\t")[6:]
1246 max_score=0
1247 max_score_index=1 #default is 1, means "I"
1248 for i in range(len(salm_species_scores)):
1249 if max_score<float(salm_species_scores[i]):
1250 max_score=float(salm_species_scores[i])
1251 max_score_index=i
1252 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
1253 #if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
1254 if float(out.split("\n")[1].split("\t")[4]) > 10 and float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): ## ed_SL_0318: change SalmID_ssp_threshold
1255 prediction="bongori" #otherwise the prediction would always be enterica, since the enterica columns appear later in the SalmID output
1256 #if max_score<10: ## ed_SL_0318: change SalmID_ssp_threshold
1257 if max_score<60:
1258 prediction="-"
1259 return prediction
1260
1261 def judge_subspecies_Kmer(Special_dict):
1262 #seqsero2 -k;
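#entries of Special_dict whose key contains "mer" are subspecies marker k-mer sets; keep the
#best score above 60, and short-circuit to bongori if its score exceeds 95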
1263 max_score=0
1264 prediction="-" #default is "-" (undetermined); set to a subspecies only if a k-mer score passes the threshold below
1265 for x in Special_dict:
1266 #if "mer" in x: ## ed_SL_0318: change ssp_threshold
1267 if "mer" in x and float(Special_dict[x]) > 60:
1268 if max_score<float(Special_dict[x]):
1269 max_score=float(Special_dict[x])
1270 prediction=x.split("_")[-1].strip()
1271 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica
1272 prediction="bongori"
1273 break
1274 return prediction
1275
1276 ## ed_SL_11232019: add notes for missing antigen
1277 def check_antigens(ssp,O_antigen,H1_antigen,H2_antigen,NA_note):
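#given the predicted subspecies and the three antigen calls, return an explanatory note for
#missing or atypical antigen patterns (or a non-Salmonella warning when no subspecies marker
#was found); NA_note is cleared whenever one of these more specific notes applies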
1278 antigen_note = ''
1279 if ssp != '-':
1280 if O_antigen != '-' and H1_antigen == '-' and H2_antigen == '-': # O:-:-
1281 antigen_note = 'H antigens were not detected. This is an atypical result that should be further investigated. Most Salmonella strains have at least fliC, encoding the Phase 1 H antigen, even if it is not expressed. '
1282 NA_note = ''
1283 elif O_antigen != '-' and H1_antigen == '-' and H2_antigen != '-': # O:-:H2
1284 antigen_note = 'fliC was not detected. This is an atypical result that should be further investigated. Most Salmonella strains have fliC, encoding the Phase 1 H antigen, even if it is not expressed. '
1285 NA_note = ''
1286 elif O_antigen == '-' and H1_antigen != '-': # -:H1:X
1287 antigen_note = 'O antigen was not detected. This result may be due to a rough strain that has deleted the rfb region. For raw reads input, the k-mer workflow is sometimes more sensitive than the microassembly workflow in detecting O antigen. Caution should be used with this approach because the k-mer result may be due to low levels of contamination. '
1288 NA_note = ''
1289 elif O_antigen == '-' and H1_antigen == '-' and H2_antigen == '-': # -:-:-
1290 antigen_note = 'No serotype antigens were detected. This is an atypical result that should be further investigated. '
1291 NA_note = ''
1292 else:
1293 antigen_note = 'The input genome cannot be identified as Salmonella. Check the input for taxonomic ID, contamination, or sequencing quality. '
1294 NA_note = ''
1295 # if [O_antigen, H1_antigen, H2_antigen].count('-') >= 2:
1296 # antigen_note = 'No subspecies marker was detected and less than 2 serotype antigens were detected; further, this genome was not identified as Salmonella. This is an atypical result that should be further investigated. '
1297 # else:
1298 # antigen_note = 'No subspecies marker was detected. This genome may not be Salmonella. This is an atypical result that should be further investigated. '
1299 return (antigen_note,NA_note)
1300
1301 def main():
1302 #combine SeqSeroK and SeqSero2, also with SalmID
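#overall flow: parse arguments, create the output directory, symlink the antigen database and
#input files into it, then branch on -m: "a" runs the read-mapping/micro-assembly workflow,
#"k" runs the k-mer workflow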
1303 args = parse_args()
1304 input_file = args.i
1305 data_type = args.t
1306 analysis_mode = args.m
1307 mapping_mode=args.b
1308 threads=args.p
1309 make_dir=args.d
1310 clean_mode=args.c
1311 sample_name=args.n
1312 ignore_header=args.s
1313 k_size=27 #will change for bug fixing
1314 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
1315 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: add ex_dir for packaging
1316 seqsero2_db=ex_dir+"/H_and_O_and_specific_genes.fasta" # ed_SL_11092019: change path to database for packaging
1317 database="H_and_O_and_specific_genes.fasta"
1318 note="Note: "
1319 NA_note="This predicted serotype is not in the Kauffmann-White scheme. " # ed_SL_09272019: add for new output format
1320 if len(sys.argv)==1:
1321 subprocess.check_call(dirpath+"/SeqSero2_package.py -h",shell=True)#change name of python file
1322 else:
1323 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
1324 request_id += str(random.randint(1, 10000000))
1325 if make_dir is None:
1326 make_dir="SeqSero_result_"+request_id
1327 make_dir=os.path.abspath(make_dir)
1328 if os.path.isdir(make_dir):
1329 pass
1330 else:
1331 subprocess.check_call("mkdir -p "+make_dir,shell=True)
1332 #subprocess.check_call("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
1333 #subprocess.check_call("ln -sr "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
1334 subprocess.check_call("ln -f -s "+seqsero2_db+" "+" ".join(input_file)+" "+make_dir,shell=True) # ed_SL_11092019: change path to database for packaging
1335 #subprocess.check_call("ln -f -s "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument).
1336 ############################begin the real analysis
1337 if analysis_mode=="a":
1338 if data_type in ["1","2","3"]:#use allele mode
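#allele mode pipeline: map reads to the antigen database and sort (map_and_sort), extract the
#mapped reads, micro-assemble and blast them, parse the BLAST XML, predict O/H antigens, call
#the subspecies with SalmID, then translate the antigenic formula into a serotype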
1339 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
1340 os.chdir(make_dir)
1341 ###add a function to tell input files
1342 fnameA=for_fq.split("/")[-1]
1343 fnameB=rev_fq.split("/")[-1]
1344 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
1345 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
1346 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
1347 ### avoid error out when micro assembly fails. ed_SL_03172020
1348 try:
1349 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
1350 except (UnboundLocalError, subprocess.CalledProcessError):
1351 xmlfile="NA"
1352 H1_cont_stat_list=[]
1353 H2_cont_stat_list=[]
1354 ###
1355 if xmlfile=="NA":
1356 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
1357 else:
1358 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
1359 file=open("data_log.txt","a")
1360 for x in Final_list:
1361 file.write("\t".join(str(y) for y in x)+"\n")
1362 file.close()
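#keep only BLAST hits whose identifier encodes a coverage fraction >=0.9 (the "_cov_" field)
#and whose score reaches a threshold encoded after "__"/"___" in the identifier, or exceeds 1000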
1363 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
1364 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
1365 subspecies=judge_subspecies(fnameA) #predict subspecies
1366 ###output
1367 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
1368 claim="" #04132019, disable claim for new report requirement
1369 contamination_report=""
1370 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
1371 if contamination_O!="" and contamination_H=="":
1372 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
1373 elif contamination_O=="" and contamination_H!="":
1374 contamination_report="#Potential inter-serotype contamination detected or potential third H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
1375 elif contamination_O!="" and contamination_H!="":
1376 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
1377 if contamination_report!="":
1378 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
1379 contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles. "
1380 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version
1381 #if contamination_report+star_line+claim=="": #0413, new output style
1382 # note=""
1383 #else:
1384 # note="Note:"
1385
1386 ### ed_SL_11232019: add notes for missing antigen
1387 if O_choice=="":
1388 O_choice="-"
1389 antigen_note,NA_note=check_antigens(subspecies,O_choice,fliC_choice,fljB_choice,NA_note)
1390 if sample_name:
1391 print ("Sample name:\t"+sample_name)
1392 ###
1393
1394 if clean_mode:
1395 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
1396 make_dir="no output directory due to '-c' flag"
1397 else:
1398 new_file=open("SeqSero_result.txt","w")
1399 ### ed_SL_01152020: add new output
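#two outputs are written: SeqSero_result.txt (human-readable report) and SeqSero_result.tsv
#(a single tab-separated row, with an optional header controlled by -s)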
1400 conta_note="yes" if "inter-serotype contamination" in contamination_report else "no"
1401 tsv_file=open("SeqSero_result.tsv","w")
1402 if ignore_header:
1403 pass
1404 else:
1405 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted subspecies\tPredicted antigenic profile\tPredicted serotype\tPotential inter-serotype contamination\tNote\n")
1406 if sample_name:
1407 new_file.write("Sample name:\t"+sample_name+"\n")
1408 tsv_file.write(sample_name+'\t')
1409 else:
1410 tsv_file.write(input_file[0].split('/')[-1]+'\t')
1411 ###
1412 if "N/A" not in predict_sero:
1413 new_file.write("Output directory:\t"+make_dir+"\n"+
1414 "Input files:\t"+"\t".join(input_file)+"\n"+
1415 "O antigen prediction:\t"+O_choice+"\n"+
1416 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1417 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1418 "Predicted subspecies:\t"+subspecies+"\n"+
1419 "Predicted antigenic profile:\t"+predict_form+"\n"+
1420 "Predicted serotype:\t"+predict_sero+"\n"+
1421 note+contamination_report+star_line+claim+antigen_note+"\n")#+##
1422 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies+"\t"+predict_form+"\t"+predict_sero+"\t"+conta_note+"\t"+contamination_report+star_line+claim+antigen_note+"\n")
1423 else:
1424 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
1425 star_line="" #04132019, for new output requirement, disable star_line if "NA" in output
1426 new_file.write("Output directory:\t"+make_dir+"\n"+
1427 "Input files:\t"+"\t".join(input_file)+"\n"+
1428 "O antigen prediction:\t"+O_choice+"\n"+
1429 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1430 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1431 "Predicted subspecies:\t"+subspecies+"\n"+
1432 "Predicted antigenic profile:\t"+predict_form+"\n"+
1433 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, add subspecies
1434 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")#+##
1435 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+conta_note+"\t"+NA_note+contamination_report+star_line+claim+antigen_note+"\n")
1436 new_file.close()
1437 tsv_file.close()
1438 #subprocess.check_call("cat Seqsero_result.txt",shell=True)
1439 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True)
1440 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True)
1441 if "N/A" not in predict_sero:
1442 #print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\nNote:"+contamination_report+star+star_line+claim+"\n")#+##
1443 print("Output directory:\t"+make_dir+"\n"+
1444 "Input files:\t"+"\t".join(input_file)+"\n"+
1445 "O antigen prediction:\t"+O_choice+"\n"+
1446 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1447 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1448 "Predicted subspecies:\t"+subspecies+"\n"+
1449 "Predicted antigenic profile:\t"+predict_form+"\n"+
1450 "Predicted serotype:\t"+predict_sero+"\n"+
1451 note+contamination_report+star_line+claim+antigen_note+"\n")#+##
1452 else:
1453 print("Output directory:\t"+make_dir+"\n"+
1454 "Input files:\t"+"\t".join(input_file)+"\n"+
1455 "O antigen prediction:\t"+O_choice+"\n"+
1456 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1457 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1458 "Predicted subspecies:\t"+subspecies+"\n"+
1459 "Predicted antigenic profile:\t"+predict_form+"\n"+
1460 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
1461 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")
1462 else:
1463 print("Allele mode only supports raw-read data types, i.e. '-t 1, 2 or 3'; please use '-m k'")
1464 elif analysis_mode=="k":
1465 #ex_dir = os.path.dirname(os.path.realpath(__file__))
1466 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: change ex_dir for packaging
1467 #output_mode = args.mode
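#k-mer workflow: load the pre-built antigen k-mer library (antigens.pickle), k-merize the input
#at k=27, score the O/H/special k-mer sets, pick the highest-scoring types, and call the
#subspecies from the special markers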
1468 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
1469 input_file = for_fq #k-mer mode only uses the forward reads
1470 os.chdir(make_dir)
1471 f = open(ex_dir + '/antigens.pickle', 'rb')
1472 lib_dict = pickle.load(f)
1473 f.close()
1474 input_Ks=get_input_K(input_file,lib_dict,data_type,k_size)
1475 O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks)
1476 highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
1477 subspecies=judge_subspecies_Kmer(Special_dict)
1478 if subspecies=="IIb" or subspecies=="IIa":
1479 subspecies="II"
1480 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
1481 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
1482 claim="" #no claim any more based on new output requirement
1483 #if star_line+claim=="": #0413, new output style
1484 # note=""
1485 #else:
1486 # note="Note:"
1487
1488 ### ed_SL_11232019: add notes for missing antigen
1489 if highest_O.split('-')[-1]=="":
1490 O_choice="-"
1491 else:
1492 O_choice=highest_O.split('-')[-1]
1493 antigen_note,NA_note=check_antigens(subspecies,O_choice,highest_fliC,highest_fljB,NA_note)
1494 if sample_name:
1495 print ("Sample name:\t"+sample_name)
1496 ###
1497
1498 if clean_mode:
1499 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
1500 make_dir="no output directory due to '-c' flag"
1501 # ### ed_SL_05282019, fix the assignment issue of variable 'O_choice' using "-m k -c"
1502 # if highest_O.split('-')[-1]=="":
1503 # O_choice="-"
1504 # else:
1505 # O_choice=highest_O.split('-')[-1]
1506 # ###
1507 else:
1508 # if highest_O.split('-')[-1]=="":
1509 # O_choice="-"
1510 # else:
1511 # O_choice=highest_O.split('-')[-1]
1512 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
1513 new_file=open("SeqSero_result.txt","w")
1514 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
1515 ### ed_SL_01152020: add new output
1516 tsv_file=open("SeqSero_result.tsv","w")
1517 if ignore_header:
1518 pass
1519 else:
1520 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted subspecies\tPredicted antigenic profile\tPredicted serotype\tNote\n")
1521 if sample_name:
1522 new_file.write("Sample name:\t"+sample_name+"\n")
1523 tsv_file.write(sample_name+'\t')
1524 else:
1525 tsv_file.write(input_file.split('/')[-1]+'\t')
1526 ###
1527 if "N/A" not in predict_sero:
1528 new_file.write("Output directory:\t"+make_dir+"\n"+
1529 "Input files:\t"+input_file+"\n"+
1530 "O antigen prediction:\t"+O_choice+"\n"+
1531 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1532 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1533 "Predicted subspecies:\t"+subspecies+"\n"+
1534 "Predicted antigenic profile:\t"+predict_form+"\n"+
1535 "Predicted serotype:\t"+predict_sero+"\n"+
1536 note+star_line+claim+antigen_note+"\n")#+##
1537 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies+"\t"+predict_form+"\t"+predict_sero+"\t"+star_line+claim+antigen_note+"\n")
1538 else:
1539 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
1540 star_line = "" #changed for new output requirement, 04132019
1541 new_file.write("Output directory:\t"+make_dir+"\n"+
1542 "Input files:\t"+input_file+"\n"+
1543 "O antigen prediction:\t"+O_choice+"\n"+
1544 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1545 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1546 "Predicted subspecies:\t"+subspecies+"\n"+
1547 "Predicted antigenic profile:\t"+predict_form+"\n"+
1548 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
1549 note+NA_note+star_line+claim+antigen_note+"\n")#+##
1550 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+NA_note+star_line+claim+antigen_note+"\n")
1551 new_file.close()
1552 tsv_file.close()
1553 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True)
1554 if "N/A" not in predict_sero:
1555 print("Output directory:\t"+make_dir+"\n"+
1556 "Input files:\t"+input_file+"\n"+
1557 "O antigen prediction:\t"+O_choice+"\n"+
1558 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1559 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1560 "Predicted subspecies:\t"+subspecies+"\n"+
1561 "Predicted antigenic profile:\t"+predict_form+"\n"+
1562 "Predicted serotype:\t"+predict_sero+"\n"+
1563 note+star_line+claim+antigen_note+"\n")#+##
1564 else:
1565 print("Output directory:\t"+make_dir+"\n"+
1566 "Input files:\t"+input_file+"\n"+
1567 "O antigen prediction:\t"+O_choice+"\n"+
1568 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1569 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1570 "Predicted subspecies:\t"+subspecies+"\n"+
1571 "Predicted antigenic profile:\t"+predict_form+"\n"+
1572 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
1573 note+NA_note+star_line+claim+antigen_note+"\n")#+##
1574
1575 if __name__ == '__main__':
1576 main()