annotate bin/SeqSero2_package.py @ 13:e3b74e412f40 draft

planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
author cstrittmatter
date Fri, 15 May 2020 10:37:52 -0400
parents 08832c0d3cbd
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1 #!/usr/bin/env python3
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
2
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
3 import sys
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
4 import time
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
5 import random
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
6 import os
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
7 import subprocess
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
8 import gzip
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
9 import io
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
10 import pickle
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
11 import argparse
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
12 import itertools
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
13 from distutils.version import LooseVersion
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
14 from distutils.spawn import find_executable
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
15 sys.path.insert(1,sys.path[0]+'/..')
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
16
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
17 try:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
18 from .version import SeqSero2_version
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
19 except Exception: #ImportError
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
20 from version import SeqSero2_version
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
21
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
22 ### SeqSero Kmer
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def parse_args():
    """Build the SeqSero2 command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``i``, ``t``, ``b``, ``p``,
        ``m``, ``n``, ``d``, ``c``, ``s`` and ``check``.  ``-v/--version``
        prints the program name followed by ``SeqSero2_version`` and exits.
    """
    usage_text = (
        'SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]'
        '\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)'
        '\n\nContact email:seqsero@gmail.com'
        '\n\nVersion: v1.1.1'
    )  # add "-m <data_type>" in future
    parser = argparse.ArgumentParser(usage=usage_text)
    add = parser.add_argument

    # Input paths are converted to absolute paths while parsing.
    add("-i", nargs="+", type=os.path.abspath,
        help="<string>: path/to/input_data")
    add("-t", choices=['1', '2', '3', '4', '5', '6'],
        help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore fasta, '6' for nanopore fastq")
    add("-b", choices=['sam', 'mem'], default="mem",
        help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode")
    # The thread count is kept as a string, exactly as given on the command line.
    add("-p", default="1",
        help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1")
    add("-m", choices=['k', 'a'], default="a",
        help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a")
    add("-n",
        help="<string>: optional, to specify a sample name in the report output")
    add("-d",
        help="<string>: optional, to specify an output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
    add("-c", action="store_true",
        help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files")
    add("-s", action="store_true",
        help="<flag>: if '-s' was flagged, SeqSero2 will not output header in SeqSero_result.tsv")
    add("--check", action="store_true",
        help="<flag>: use '--check' flag to check the required dependencies")
    add('-v', '--version', action='version',
        version='%(prog)s ' + SeqSero2_version)

    return parser.parse_args()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
38
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
### check paths of dependencies
# Runs when the module is loaded: the command line is parsed and, if '--check'
# was given, the PATH location of every external tool is reported before the
# program exits.
check_dependencies = parse_args().check
dependencies = [
    'bwa',
    'samtools',
    'blastn',
    'fastq-dump',
    'spades.py',
    'bedtools',
    'SalmID.py',
]
if check_dependencies:
    for item in dependencies:
        ext_path = find_executable(item)
        if ext_path is None:
            print("ERROR: can not find " + item + " in PATH")
        else:
            print("Using " + item + " - " + ext_path)
    # NOTE(review): the exit happens once, after all tools were reported, and
    # with status 0 even when a tool is missing -- confirm this is intended.
    sys.exit()
### end of --check
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
51
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
# IUPAC nucleotide codes mapped to their complements.  Built once at import
# time because reverse_complement() is called for every k-mer.
_COMPLEMENT = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A',
    'N': 'N',
    'M': 'K',
    'R': 'Y',
    'W': 'W',
    'S': 'S',
    'Y': 'R',
    'K': 'M',
    'V': 'B',
    'H': 'D',
    'D': 'H',
    'B': 'V',
}
# Lower-case bases are complemented to lower-case bases.
_COMPLEMENT.update(
    {base.lower(): comp.lower() for base, comp in list(_COMPLEMENT.items())}
)


def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    Args:
        sequence: string of IUPAC nucleotide codes (upper or lower case).

    Returns:
        The complemented sequence in reverse order; the case of each base is
        preserved.

    Raises:
        KeyError: if the sequence contains a character that is not an IUPAC
            nucleotide code.
    """
    return "".join([_COMPLEMENT[base] for base in reversed(sequence)])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
71
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
72
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def createKmerDict_reads(list_of_strings, kmer):
    """Count the k-mers of the given sequences on both strands.

    Every window of length ``kmer`` is upper-cased and counted once under
    itself and once under its reverse complement (so a k-mer that equals its
    own reverse complement is counted twice per occurrence).

    Args:
        list_of_strings: iterable of sequence strings.  Surrounding
            whitespace is ignored, so lines with Windows line endings no
            longer leak a carriage return into the last k-mers.
        kmer: k-mer length.

    Returns:
        dict mapping upper-case k-mer -> number of times it was counted.

    Raises:
        KeyError: propagated from reverse_complement() when a k-mer contains
            a character it does not know.
    """
    kmer_table = {}
    for string in list_of_strings:
        sequence = string.strip().upper()
        for i in range(len(sequence) - kmer + 1):
            new_mer = sequence[i:i + kmer]
            # Forward strand first, then the reverse strand.
            for key in (new_mer, reverse_complement(new_mer)):
                kmer_table[key] = kmer_table.get(key, 0) + 1
    return kmer_table
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
89
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
90
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def multifasta_dict(multifasta):
    """Read a multi-FASTA file into a ``{header: sequence}`` dictionary.

    The file is read in a single pass and closed afterwards.

    Args:
        multifasta: path of the FASTA file.

    Returns:
        dict whose keys are the header lines without the leading '>' and
        whose values are the record's sequence lines joined together.
        Blank lines are ignored, lines before the first header are ignored,
        and a header that is not followed by any sequence line gets no entry.
        If the same header occurs more than once, the sequences of those
        records are concatenated in file order.
    """
    records = {}
    current_header = None
    with open(multifasta, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line[0] == '>':
                current_header = line[1:]
            elif current_header is not None:
                records[current_header] = records.get(current_header, '') + line
    return records
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
108
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
109
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def multifasta_single_string(multifasta):
    """Concatenate all sequence lines of a FASTA file into one string.

    Args:
        multifasta: path of the FASTA file.

    Returns:
        The whitespace-stripped, non-empty lines that do not start with '>',
        joined without separator (record boundaries are not kept).
    """
    with open(multifasta, 'r') as handle:
        stripped_lines = (line.strip() for line in handle)
        # Materialise inside the 'with' block so the file is still open.
        return ''.join(
            [text for text in stripped_lines if text and text[0] != '>']
        )
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
116
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
117
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def chunk_a_long_sequence(long_sequence, chunk_size=60):
    """Split a sequence into consecutive pieces of ``chunk_size`` characters.

    Args:
        long_sequence: the string to split.
        chunk_size: length of every full chunk (default 60).

    Returns:
        list of the full chunks followed by the remainder.  The remainder is
        always appended, so the last element is an empty string when the
        length is an exact multiple of ``chunk_size`` (and the result is
        ``['']`` for an empty sequence).

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    # The number of full chunks must follow chunk_size, not a fixed 60.
    full_chunks = len(long_sequence) // chunk_size
    chunk_list = [
        long_sequence[i * chunk_size:(i + 1) * chunk_size]
        for i in range(full_chunks)
    ]
    chunk_list.append(long_sequence[full_chunks * chunk_size:])
    return chunk_list
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
125
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
126
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def target_multifasta_kmerizer(multifasta, k, kmerDict):
    """Collect k-mers from the regions of an assembly that hit ``kmerDict``.

    The FASTA sequences are joined into one string and cut into 60-character
    chunks.  For every chunk whose middle k-mer is a key of ``kmerDict``, the
    surrounding window of chunks is joined and all of its k-mers (both
    strands, via createKmerDict_reads) are collected.  Chunks that fall
    inside the most recently collected window are not tested again.

    Args:
        multifasta: path of the FASTA file.
        k: k-mer length.
        kmerDict: container of target k-mers (only membership is used).

    Returns:
        set of the collected k-mers; empty when the file holds no sequence.
    """
    forward_length = 300  # if find the target, put forward 300 bases
    chunk_size = 60  # size of the pieces the single long sequence is cut into
    target_mers = set()

    long_single_string = multifasta_single_string(multifasta)
    if not long_single_string:
        # No sequence at all: the first chunk would be empty and the window
        # size below would divide by zero.
        return target_mers

    multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
    unit_length = len(multifasta_list[0])
    forward_lines = int(forward_length / unit_length) + 1
    # NOTE(review): the original comments announce a backward reach of 2200
    # bases (reverse_length), but that value was never used; both directions
    # are derived from forward_length.  Kept as is because changing the
    # window changes the k-mers reported -- confirm the intent.
    reverse_lines = int(forward_length / unit_length) + 1
    last_index = len(multifasta_list) - 1

    start_num = 0
    end_num = 0
    for i, line in enumerate(multifasta_list):
        if start_num <= i < end_num:  # avoid computational repetition
            continue
        start = (len(line) - k) // 2
        s1 = line[start:k + start]
        if s1 not in kmerDict:  # use the middle part to detect a potential hit
            continue
        # Window of chunks around the hit, clamped to the list; the slice end
        # is exclusive.
        start_num = max(i - forward_lines, 0)
        end_num = min(i + reverse_lines, last_index)
        target_line = "".join(
            [x.strip() for x in multifasta_list[start_num:end_num]]
        )
        target_mers.update(createKmerDict_reads([target_line], k))
    return target_mers
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
163
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
164
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def target_read_kmerizer(file, k, kmerDict):
    """Collect k-mers from the FASTQ reads whose middle k-mer hits ``kmerDict``.

    Only every fourth line starting with the second one (the sequence line of
    a four-line FASTQ record) is examined.  Reading stops once the summed
    length of the matching lines reaches 4,000,000 characters; the length is
    taken from the line as read, i.e. including its line terminator.

    Args:
        file: path of the FASTQ file; a name ending in '.gz' is read through
            gzip.
        k: k-mer length.
        kmerDict: container of target k-mers (only membership is used).

    Returns:
        set of all k-mers (both strands, via createKmerDict_reads) of the
        matching reads.
    """
    total_coverage = 0
    target_mers = set()
    is_gzipped = file.endswith(".gz")
    # Gzipped input is read as bytes and only sequence lines are decoded.
    handle = gzip.open(file, "rb") if is_gzipped else open(file, "r")
    with handle:
        # Stream the file: the loop usually stops long before the end.
        for line_number, line in enumerate(handle, start=1):
            if line_number % 4 == 2:
                start = (len(line) - k) // 2
                if is_gzipped:
                    s1 = line[start:k + start].decode()
                    line = line.decode()
                else:
                    s1 = line[start:k + start]
                if s1 in kmerDict:  # use the middle part to detect a potential read
                    total_coverage += len(line)
                    target_mers.update(createKmerDict_reads([str(line)], k))
            if total_coverage >= 4000000:
                break
    return target_mers
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
192
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
193
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def minion_fasta_kmerizer(file, k, kmerDict):
    """Collect the k-mers of a nanopore FASTA file that hit ``kmerDict``.

    Every second line (the even-numbered ones) is treated as a sequence line,
    stripped and upper-cased.  Whenever a k-mer or its paired reverse k-mer
    is in ``kmerDict``, both are collected.

    Args:
        file: path of the FASTA file.
        k: k-mer length passed on to ``kmers``.
        kmerDict: container of target k-mers (only membership is used).

    Returns:
        set of the collected k-mers.
    """
    target_mers = set()
    with open(file) as handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number % 2 != 0:
                continue
            # NOTE(review): kmers() is defined elsewhere in the file; it
            # presumably yields (k-mer, reverse-complement) pairs -- confirm.
            for kmer, rc_kmer in kmers(line.strip().upper(), k):
                if kmer in kmerDict or rc_kmer in kmerDict:
                    target_mers.add(kmer)
                    target_mers.add(rc_kmer)
    return target_mers
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
213
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
214
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def minion_fastq_kmerizer(file, k, kmerDict):
    """Collect the k-mers of a FASTQ file that hit a reference k-mer set.

    The second line of every four-line group (lines 2, 6, 10, ...) is
    treated as the read sequence; all other lines are ignored.

    Args:
        file: Path of the FASTQ file to read.
        k: Length of the k-mers to extract.
        kmerDict: Container of reference k-mers; only membership tests
            (``in``) are performed on it.

    Returns:
        A set holding, for every k-mer whose forward or reverse-complement
        form is in ``kmerDict``, both the forward and the reverse-complement
        string.
    """
    target_mers = set()
    # The context manager closes the handle even if k-mer extraction raises.
    with open(file) as handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number % 4 != 2:
                continue  # only the sequence line of each record is used
            for kmer, rc_kmer in kmers(line.strip().upper(), k):
                if kmer in kmerDict or rc_kmer in kmerDict:
                    # Record both orientations of a matching k-mer.
                    target_mers.add(kmer)
                    target_mers.add(rc_kmer)
    return target_mers
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
234
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
235
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def multifasta_single_string2(multifasta):
    """Concatenate every sequence line of a (multi-)FASTA file.

    Header lines (those whose first non-blank character is ``>``) and
    blank lines are skipped; all remaining lines are stripped of
    surrounding whitespace and joined without a separator.

    Args:
        multifasta: Path of the FASTA file to read.

    Returns:
        One string containing all sequence data in file order (empty if
        the file holds no sequence lines).
    """
    fragments = []
    with open(multifasta, 'r') as f:
        for line in f:
            stripped = line.strip()
            # Blank lines previously crashed with IndexError on stripped[0].
            if not stripped or stripped[0] == '>':
                continue
            fragments.append(stripped)
    # A single join avoids repeated string reallocation on large genomes.
    return ''.join(fragments)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
245
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
246
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def kmers(seq, k):
    """Yield every k-mer of ``seq`` together with its reverse complement.

    Args:
        seq: Sequence string to slide the window over.
        k: Window (k-mer) length.

    Yields:
        ``(kmer, rc_kmer)`` tuples for each start position
        ``0 .. len(seq) - k``; ``rc_kmer`` is the matching slice of
        ``reverse_complement(seq)``. Nothing is yielded when ``k`` exceeds
        ``len(seq)``.
    """
    rev_comp = reverse_complement(seq)
    # Start at 0 so the first k-mer of the sequence is not dropped
    # (the loop used to begin at 1, skipping position 0).
    for start in range(len(seq) - k + 1):
        # At start == 0 a stop index of "-0" would mean index 0 and give an
        # empty slice, so the slice is left open-ended there instead.
        rc_stop = -start if start else None
        yield seq[start:start + k], rev_comp[-(start + k):rc_stop]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
251
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
252
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def multifasta_to_kmers_dict(multifasta, k_size):
    """Build the per-record k-mer sets used as the database k-mer set.

    Args:
        multifasta: Passed unchanged to ``multifasta_dict``.
        k_size: k-mer length handed to ``createKmerDict_reads``.

    Returns:
        A dict with one entry per key of ``multifasta_dict(multifasta)``;
        each value is the set of items produced by iterating
        ``createKmerDict_reads`` called on that record's value alone.
    """
    records = multifasta_dict(multifasta)
    return {
        header: set(createKmerDict_reads([records[header]], k_size))
        for header in records
    }
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
260
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
261
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def Combine(b, c):
    """Enumerate antigen formulas formed by adding optional factors to c.

    Args:
        b: Sequence of optional factor strings (the caller passes the
            bracketed ones, e.g. the ``g`` of ``f,[g],t``).
        c: Mandatory factor strings.

    Returns:
        A list of comma-joined strings. The first entry is ``c`` joined in
        its given order; it is followed by one entry per non-empty
        combination of ``b`` (smallest combinations first), each holding
        the mandatory plus chosen optional factors sorted alphabetically.
    """
    # Every non-empty subset of the optional factors, as a comma-joined string.
    optional_subsets = [
        ",".join(subset)
        for size in range(1, len(b) + 1)
        for subset in itertools.combinations(b, size)
    ]

    formulas = [",".join(c)]
    for subset in optional_subsets:
        # Join then re-split so multi-factor subsets are flattened into
        # individual factors before sorting.
        factors = ",".join([*c, subset]).split(",")
        formulas.append(",".join(sorted(factors)))
    return formulas
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
280
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
281
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
282 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
283 #like test_output_06012017.txt
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
284 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
285 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
286 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
287 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
288 seronames = []
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
289 seronames_none_subspecies=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
290 for i in range(len(phase1)):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
291 fliC_combine = []
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
292 fljB_combine = []
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
293 if phaseO[i] == Otype: # no VII in KW, but it's there
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
294 ### for fliC, detect every possible combinations to avoid the effect of "["
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
295 if phase1[i].count("[") == 0:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
296 fliC_combine.append(phase1[i])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
297 elif phase1[i].count("[") >= 1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
298 c = []
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
299 b = []
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
300 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
301 "[") == 1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
302 content = phase1[i].replace("[", "").replace("]", "")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
303 fliC_combine.append(content)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
304 fliC_combine.append("-")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
305 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
306 for x in phase1[i].split(","):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
307 if "[" in x:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
308 b.append(x.replace("[", "").replace("]", ""))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
309 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
310 c.append(x)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
311 fliC_combine = Combine(
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
312 b, c
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
313 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
314 ### end of fliC "[" detect
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
315 ### for fljB, detect every possible combinations to avoid the effect of "["
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
316 if phase2[i].count("[") == 0:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
317 fljB_combine.append(phase2[i])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
318 elif phase2[i].count("[") >= 1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
319 d = []
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
320 e = []
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
321 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
322 "[") == 1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
323 content = phase2[i].replace("[", "").replace("]", "")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
324 fljB_combine.append(content)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
325 fljB_combine.append("-")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
326 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
327 for x in phase2[i].split(","):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
328 if "[" in x:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
329 d.append(x.replace("[", "").replace("]", ""))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
330 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
331 e.append(x)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
332 fljB_combine = Combine(d, e)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
333 ### end of fljB "[" detect
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
334 new_fliC = fliC.split(
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
335 ","
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
336 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
337 new_fliC.sort()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
338 new_fliC = ",".join(new_fliC)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
339 new_fljB = fljB.split(",")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
340 new_fljB.sort()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
341 new_fljB = ",".join(new_fljB)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
342 if (new_fliC in fliC_combine
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
343 or fliC in fliC_combine) and (new_fljB in fljB_combine
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
344 or fljB in fljB_combine):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
345 ######start, remove_list,rename_dict, added on 11/11/2018
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
346 if sero[i] not in remove_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
347 temp_sero=sero[i]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
348 if temp_sero in rename_dict:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
349 temp_sero=rename_dict[temp_sero] #rename if in the rename list
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
350 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
351 if subs[i] == subspecies:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
352 seronames.append(temp_sero)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
353 seronames_none_subspecies.append(temp_sero)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
354 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
355 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
356 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
357 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
358 ######end, added on 11/11/2018
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
359 #analyze seronames
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
360 subspecies_pointer=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
361 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
362 # ed_SL_12182019: modified to fix the subspecies output problem
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
363 #seronames=seronames_none_subspecies
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
364 seronames=["N/A"]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
365 #subspecies_pointer="1"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
366 subspecies_pointer="0"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
367 if len(seronames) == 0:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
368 seronames = [
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
369 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
370 ]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
371 star = ""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
372 star_line = ""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
373 if len(seronames) > 1: #there are two possible predictions for serotypes
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
374 star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
375 #changed 04072019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
376 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
377 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
378 star="*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
379 star_line="The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme. " + star_line
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
380 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
381 if Otype=="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
382 Otype="-"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
383 predict_form = Otype + ":" + fliC + ":" + fljB
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
384 predict_sero = (" or ").join(seronames)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
385 ###special test for Enteritidis
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
386 if predict_form == "9:g,m:-":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
387 sdf = "-"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
388 for x in special_gene_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
389 if x.startswith("sdf"):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
390 sdf = "+"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
391 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
392 star_line="sdf gene detected. "
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
393 #predict_form = predict_form + " Sdf prediction:" + sdf
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
394 predict_form = predict_form #changed 04072019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
395 if sdf == "-":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
396 star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
397 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
398 star_line="sdf gene not detected. "
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
399 #changed in 04072019, for new output
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
400 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
401 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
402 predict_sero = "Gallinarum or Enteritidis"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
403 ###end of special test for Enteritidis
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
404 elif predict_form == "4:i:-":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
405 predict_sero = "I 4,[5],12:i:-" # change serotype name
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
406 elif predict_form == "4:r:-":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
407 predict_sero = "N/A (4:r:-)"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
408 elif predict_form == "4:b:-":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
409 predict_sero = "N/A (4:b:-)"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
410 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
411 #predict_sero = "Newport"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
412 #star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
413 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
414 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
415 if "N/A" in predict_sero:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
416 claim = ""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
417 #special test for Typhimurium
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
418 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
419 normal = 0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
420 mutation = 0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
421 for x in special_gene_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
422 if "oafA-O-4_full" in x:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
423 normal = float(special_gene_list[x])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
424 elif "oafA-O-4_5-" in x:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
425 mutation = float(special_gene_list[x])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
426 if normal > mutation:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
427 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
428 elif normal < mutation:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
429 #predict_sero = predict_sero.strip() + "(O5-)"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
430 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
431 star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
432 #star_line = "Detected the deletion of O5-."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
433 star_line = "Detected a deletion that causes O5- variant of Typhimurium. "
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
434 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
435 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
436 #special test for Paratyphi B
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
437 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
438 normal = 0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
439 mutation = 0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
440 for x in special_gene_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
441 if "gntR-family-regulatory-protein_dt-positive" in x:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
442 normal = float(special_gene_list[x])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
443 elif "gntR-family-regulatory-protein_dt-negative" in x:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
444 mutation = float(special_gene_list[x])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
445 #print(normal,mutation)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
446 if normal > mutation:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
447 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
448 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
449 star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
450 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
451 star_line = "The SNP that causes d-Tartrate nonfermentating phenotype of Paratyphi B was not detected. "
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
452 elif normal < mutation:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
453 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
454 predict_sero = predict_sero.strip()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
455 star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
456 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
457 star_line = "Detected the SNP for d-Tartrate nonfermenting phenotype of Paratyphi B. "
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
458 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
459 star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
460 #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
461 star_line = " " ## ed_SL_05152019: do not report this situation.
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
462 #special test for O13,22 and O13,23
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
463 if Otype=="13":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
464 #ex_dir = os.path.dirname(os.path.realpath(__file__))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
465 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
466 f = open(ex_dir + '/special.pickle', 'rb')
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
467 special = pickle.load(f)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
468 O22_O23=special['O22_O23']
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
469 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
470 O22_score=0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
471 O23_score=0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
472 for x in special_gene_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
473 if "O:22" in x:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
474 O22_score = O22_score+float(special_gene_list[x])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
475 elif "O:23" in x:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
476 O23_score = O23_score+float(special_gene_list[x])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
477 #print(O22_score,O23_score)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
478 for z in O22_O23[0]:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
479 if predict_sero.split(" or ")[0] in z:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
480 if O22_score > O23_score:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
481 star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
482 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
483 predict_sero = z[0]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
484 elif O22_score < O23_score:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
485 star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
486 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
487 predict_sero = z[1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
488 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
489 star = "*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
490 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
491 if " or " in predict_sero:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
492 star_line = star_line + "The predicted serotypes share the same general formula: " + Otype + ":" + fliC + ":" + fljB + "."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
493 #special test for O6,8
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
494 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
495 #for x in merge_O68_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
496 # if x in predict_sero:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
497 # predict_sero=x
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
498 # star=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
499 # star_line=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
500 #special test for Montevideo; most of them are monophasic
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
501 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
502 #star="*"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
503 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
504 return predict_form, predict_sero, star, star_line, claim
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
505 ### End of SeqSero Kmer part
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
506
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
507 ### Begin of SeqSero2 allele prediction and output
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def xml_parse_score_comparision_seqsero(xmlfile):
  """Parse a BLAST XML report and rank every query/hit pair by bit score.

  For each alignment of each BLAST record the HSPs are accumulated into a
  single score. An HSP that overlaps query positions already covered by an
  earlier HSP of the same alignment only contributes the non-overlapping
  fraction of its bit score and identities, so the identity ratio cannot
  be inflated by counting the same region twice.

  Queries whose name contains "last" or "first" are scored exactly like
  all other queries.

  Args:
    xmlfile: path of the BLAST XML output file.

  Returns:
    A list of tuples ``(name, score, ids, cover_region)`` sorted by
    ``score`` in descending order, where
      * ``name`` is ``"<stripped query>___<hit_def>"``,
      * ``score`` is the overlap-weighted sum of HSP bit scores,
      * ``ids`` is the overlap-weighted sum of
        ``identities / query_length``,
      * ``cover_region`` is the set of query positions
        ``query_start .. query_end - 1`` touched by any HSP.
  """
  #used to do seqsero xml analysis
  from Bio.Blast import NCBIXML
  # Read everything inside a context manager so the file is closed even when
  # parsing fails (the handle used to be left open).
  with open(xmlfile) as xml_handle:
    # NOTE(review): looks like leftover debug output; kept so stdout is unchanged.
    print(xml_handle)
    records=list(NCBIXML.parse(xml_handle))
  names=[]
  scores=[]
  identities=[]
  query_regions=[]
  for record in records:
    for alignment in record.alignments:
      score=0
      ids=0
      cover_region=set() #fixed problem that repeated calculation leading percentage > 1
      names.append(record.query.strip()+"___"+alignment.hit_def)
      for hsp in alignment.hsps:
        span=set(range(hsp.query_start,hsp.query_end))
        if len(cover_region)==0:
          fraction=1
        else:
          # share of this HSP that is not already covered by earlier HSPs
          fraction=1-len(cover_region&span)/float(len(span))
        cover_region|=span
        score+=hsp.bits*fraction
        ids+=float(hsp.identities)/record.query_length*fraction
      scores.append(score)
      identities.append(ids)
      query_regions.append(cover_region)
  ranked=sorted(zip(names,scores,identities,query_regions), key=lambda d:d[1], reverse = True)
  return ranked
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
547
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
548
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
  """Return the distinct values of ``L`` and how often each one occurs.

  Side effect: ``L`` is sorted in place, as it always was.

  Args:
    L: list of mutually comparable items.
    sort_on_fre: when left at ``"none"`` the distinct values are returned
      in ascending value order; any other value orders them by ascending
      count instead (ties broken by value).

  Returns:
    A tuple ``(values, counts)`` of two parallel lists.
  """
  L.sort()
  values=[]
  count=[]
  # After sorting, equal items are adjacent, so one linear pass replaces
  # the former quadratic "not in L[:i]" / nested counting loops.
  for value,run in itertools.groupby(L):
    values.append(value)
    count.append(sum(1 for _ in run))
  if sort_on_fre!="none" and values:
    # zip() is a one-shot iterator on Python 3 and cannot be indexed, so
    # the ranked pairs are materialised before being split again.
    ranked=sorted(zip(count,values))
    count=[pair[0] for pair in ranked]
    values=[pair[1] for pair in ranked]
  return (values,count)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
565
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
  """Decide whether one contig looks like fliC or fljB.

  Each entry of ``nodes_vs_score_list`` is indexed as ``entry[0]`` (a name)
  and ``entry[1]`` (a score). Scores are summed per gene according to
  which gene name occurs in ``entry[0]``; "fliC" is checked first, so a
  name containing both counts for fliC only.

  Returns:
    ``(role, difference)`` where ``role`` is "fliC" when its total is
    greater than or equal to the fljB total (otherwise "fljB") and
    ``difference`` is the absolute gap between the two totals. A small gap
    means the call is not reliable.
  """
  totals={"fliC":0,"fljB":0}
  for entry in nodes_vs_score_list:
    label=entry[0]
    for gene in ("fliC","fljB"):
      if gene in label:
        totals[gene]+=entry[1]
        break
  role="fliC" if totals["fliC"]>=totals["fljB"] else "fljB"
  return (role,abs(totals["fliC"]-totals["fljB"]))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
582
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
  """Pick the role of a contig from the list of passed BLAST hits.

  Walks ``Final_list_passed`` in order and takes the first entry whose
  name (``entry[0]``) contains ``node_name``; the role is the part of that
  name before the first underscore.

  Args:
    node_name: substring identifying the contig.
    Final_list: not used by this function; kept so existing callers can
      keep passing it.
    Final_list_passed: sequence of entries whose first item is a name.

  Returns:
    The role string, or "" when no entry mentions ``node_name``.
  """
  matching_names=(hit[0] for hit in Final_list_passed if node_name in hit[0])
  first_match=next(matching_names,None)
  if first_match is None:
    return ""
  return first_match.split("_")[0]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
592
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
593 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
594 #nodes_list is the c created by c,d=Uniq(nodes) in below function
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
595 first_target=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
596 role_list=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
597 for x in nodes_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
598 a=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
599 role=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
600 for y in tail_head_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
601 if x in y[0]:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
602 a.append(y)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
603 if len(a)==4:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
604 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
605 if diff<20:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
606 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
607 elif len(a)==3:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
608 ###however, if the one with highest score is the fewer one, compare their accumulation score
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
609 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
610 if diff<20:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
611 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
612 ###end of above score comparison
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
613 elif len(a)==2:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
614 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
615 temp=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
616 for z in a:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
617 temp.append(z[0].split("_")[0])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
618 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
619 if len(m)==1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
620 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
621 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
622 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
623 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
624 if diff<20:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
625 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
626 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
627 elif len(a)==1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
628 #that one
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
629 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
630 if diff<20:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
631 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
632 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
633 else:#a==0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
634 #use Final_list_passed best match
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
635 for z in Final_list_passed:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
636 if x in z[0]:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
637 role=z[0].split("_")[0]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
638 break
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
639 #print x,role,len(a)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
640 role_list.append((role,x))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
641 if len(role_list)==2:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
642 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
643 #just use score to do a final test
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
644 role_list=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
645 for x in nodes_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
646 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
647 role_list.append((role,x))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
648 return role_list
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
649
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
    """Decide which H-antigen contig is fliC and which one is fljB.

    Every hit in Final_list_passed whose name (element 0) starts with "fl"
    and contains neither "last" nor "first" contributes its contig name,
    i.e. the stripped text after the "___" separator. The unique contig
    names, together with all hits of Final_list whose name contains "last"
    or "first" (the head/tail probes), are passed on to
    fliC_or_fljB_judge_from_head_tail_sequence.

    Parameters:
        Final_list: all hit records; element 0 of a record is the hit name.
        Final_list_passed: the hit records that passed filtering upstream
            (same record layout).

    Returns:
        The value returned by fliC_or_fljB_judge_from_head_tail_sequence
        (its role_list of (role, contig) tuples).
    """
    nodes=[
        hit[0].split("___")[1].strip()
        for hit in Final_list_passed
        if hit[0].startswith("fl") and "last" not in hit[0] and "first" not in hit[0]
    ]
    # Uniq returns (unique values, counts); only the unique contig names are needed
    node_list,_=Uniq(nodes)
    tail_head_list=[hit for hit in Final_list if ("last" in hit[0] or "first" in hit[0])]
    return fliC_or_fljB_judge_from_head_tail_sequence(node_list,tail_head_list,Final_list,Final_list_passed)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
662
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
    """Decide the O antigen type and collect the special-gene scores.

    Hit records are indexed positionally: element 0 is the hit name, of the
    form "<gene>__<gene length>___<contig name>" where the contig name carries
    "length_<n>" and "_cov_<coverage>" fields; element 1 is compared against
    the gene length as a score; element 2 is the value used for ranking
    (described in the original comments as "coverage identity").

    Steps:
      1. Passed hits starting with "O-" contribute their contig; passed hits
         starting with neither "O-" nor "fl" are recorded in special_genes.
      2. For every unique contig the first passed "O-" hit on it becomes an
         O candidate (final_O).
      3. If a candidate gene is much shorter than its contig, Final_list is
         searched for a second gene on the same contig and that gene is added
         as an extra candidate.
      4. Candidates are sorted by element 2 (descending) and the O type is
         chosen, with special handling for the O-9,46 and O-3,10 families.
      5. A contamination note is produced when more distinct O types than
         expected were seen.

    Parameters:
        Final_list: all hit records.
        Final_list_passed: the hit records that passed filtering upstream.

    Returns:
        Tuple (O_choice, O_nodes_list, special_genes, final_O,
        contamination_O, Otypes_uniq). O_choice is "-" when there is no
        usable O hit and stays "?" when no candidate could be selected.
    """
    short_marker="O-1,3,19_not_in_3,10"#too short compared with other O genes, handled separately
    contamination_note="potential contamination from O antigen signals"

    def gene_length(name):
        #the integer between "__" and "___" in a hit name
        return int(name.split("__")[1].split("___")[0])

    def contig_coverage(name):
        #the value following "_cov_" in the contig part of a hit name
        return float(name.split("_cov_")[-1].split("_")[0])

    O_choice="?"
    O_list=[]
    special_genes={}
    nodes=[]
    for x in Final_list_passed:
        if x[0].startswith("O-"):
            nodes.append(x[0].split("___")[1].strip())
        elif not x[0].startswith("fl"):
            special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
    c,_=Uniq(nodes)#c is the list of contigs carrying an O hit
    final_O=[]
    O_nodes_list=[]
    for x in c:
        #keep only the first passed O hit found on each contig
        for y in Final_list_passed:
            if x in y[0] and y[0].startswith("O-"):
                final_O.append(y)
                break
    ### O contig has the problem of two genes on same contig, so do additional test
    potenial_new_gene=""
    for x in final_O:
        pointer=0 #contig name when a merge event is possible, otherwise 0
        #gene length << contig length; 850 is the allowance for the flank regions
        if short_marker not in x[0] and gene_length(x[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):
            pointer=x[0].split("___")[1].strip()#store the contig name
            print(pointer)
        if pointer!=0:#it has potential merge event
            for y in Final_list:
                #relatively strict filter; if passed, a second gene sits on the same contig
                if pointer in y[0] and y not in final_O and (y[1]>=gene_length(y[0])*1.5 or (y[1]>=gene_length(y[0])*y[2] and y[1]>=400)):
                    potenial_new_gene=y
                    break
    if potenial_new_gene!="":
        print("two differnt genes in same contig, fix it for O antigen")
        print(potenial_new_gene[:3])
        new_gene_contig=potenial_new_gene[0].split("___")[-1]
        #only add the extra gene when its contig already holds a candidate
        if any(y[0].split("___")[-1]==new_gene_contig for y in final_O):
            final_O.append(potenial_new_gene)
    ### end of the two genes on same contig test
    final_O=sorted(final_O,key=lambda x: x[2], reverse=True)
    if all(short_marker in x[0] for x in final_O):
        #no hit at all, or nothing but the short marker gene: no O type.
        #(all() also covers several marker-only hits, which previously made
        #the max() below fail on an empty sequence)
        O_choice="-"
    else:
        highest_O_coverage=max(contig_coverage(x[0]) for x in final_O if short_marker not in x[0])
        O_list=[]
        O_list_less_contamination=[]
        for x in final_O:
            #O-1,3,19_not_in_3,10 is too small, which may affect further analysis
            if not "O-1,3,19_not_in_3,10__130" in x[0]:
                O_list.append(x[0].split("__")[0])
                O_nodes_list.append(x[0].split("___")[1])
                #to limit the effect of contamination, use 0.15 of highest coverage as cut-off
                if contig_coverage(x[0])>highest_O_coverage*0.15:
                    O_list_less_contamination.append(x[0].split("__")[0])
        #best-ranked gene above the coverage cut-off; "" when none passed it
        top_clean=O_list_less_contamination[0] if O_list_less_contamination else ""
        has_wbaV="O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list
        ### special test for O9,46 and O3,10 family
        if has_wbaV and top_clean.startswith("O-9,"):
            if "O-9,46_wzy" in O_list or "O-9,46_wzy_partial" in O_list:
                O_choice="O-9,46"
            elif "O-9,46,27_partial_wzy" in O_list:
                O_choice="O-9,46,27"
            else:
                O_choice="O-9"#next, detect O9 vs O2
                O2=0
                O9=0
                for z in special_genes:
                    if "tyr-O-9" in z:
                        O9=special_genes[z]
                    elif "tyr-O-2" in z:
                        O2=special_genes[z]
                #a tie, or O9 scoring higher, keeps the O-9 call
                if O2>O9:
                    O_choice="O-2"
        elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or top_clean.startswith("O-9,46_wzy")):
            if "O-3,10_not_in_1,3,19" in O_list:
                O_choice="O-3,10"
            else:
                O_choice="O-1,3,19"
        ### end of special test for O9,46 and O3,10 family
        else:
            try:
                max_score=0
                for x in final_O:
                    #rank on x[2] and also require the coverage threshold
                    if x[2]>=max_score and contig_coverage(x[0])>highest_O_coverage*0.15:
                        max_score=x[2]
                        O_choice=x[0].split("_")[0]
                if O_choice=="O-1,3,19":
                    O_choice=final_O[1][0].split("_")[0]
            except Exception:
                #best effort: keep whatever O_choice holds (e.g. no second
                #candidate to fall back on), but let KeyboardInterrupt and
                #SystemExit propagate
                pass
    #special for very low chance situation between O4 and O9,27,46, this is for
    #serotypes like Bredeney and Schwarzengrund (normally O-4 will have higher
    #score, but sometimes sequencing quality may affect the prediction)
    if O_choice=="O-9,46,27" and len(O_list)==2 and "O-4_wzx" in O_list:
        O_choice="O-4"
    Otypes=[]
    for x in O_list:
        if x!=short_marker:
            if "O-9,46_" not in x:
                Otypes.append(x.split("_")[0])
            else:
                #keep the gene name, drop the strain suffix, e.g.
                #O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254 -> O-9,46_wbaV
                Otypes.append(x.split("-from")[0])
    Otypes_uniq,_=Uniq(Otypes)
    contamination_O=""
    if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
        #these types tolerate two distinct O signals before being flagged
        if len(Otypes_uniq)>2:
            contamination_O=contamination_note
    elif len(Otypes_uniq)>1:
        if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq:
            #special 4,12,27 case such as Bredeney and Schwarzengrund
            contamination_O=""
        elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq:
            #the two signals are the two O-9,46 genes, not two different O types
            contamination_O=""
        else:
            contamination_O=contamination_note
    return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
793 ### End of SeqSero2 allele prediction and output
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
794
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def get_input_files(make_dir,input_file,data_type,dirpath):
    """Work out the forward and reverse read file names for the input.

    The function changes into make_dir, converts the input there when needed
    (fastq-dump for ".sra" files, deinterleave_fastq.sh for interleaved
    reads) and returns the base names of the files to analyse. The working
    directory in effect at call time is restored before returning, also when
    a conversion command fails.

    Parameters:
        make_dir: directory in which the input files are looked up and the
            converted files are written.
        input_file: list of input paths; only the part after the last "/" of
            each entry is used.
        data_type: '1' (pair-end reads, interleaved), '2' (pair-end reads,
            separated), '3' (single-end reads), '4' (assembly),
            '5' (nanopore fasta), '6' (nanopore fastq).
        dirpath: directory that contains deinterleave_fastq.sh.

    Returns:
        Tuple (for_fq, rev_fq) of file names; rev_fq is "" when there is no
        reverse file and both are "" for an unknown data_type.

    Raises:
        subprocess.CalledProcessError: if fastq-dump or the de-interleaving
            pipeline exits with a non-zero status.
    """
    import shlex #only needed to quote the arguments of the shell commands

    def base_name(path):
        return path.split("/")[-1]

    def swap_sra_suffix(name,new_suffix):
        #replace only the trailing ".sra", not every ".sra" inside the name
        return name[:-len(".sra")]+new_suffix

    for_fq=""
    rev_fq=""
    start_dir=os.getcwd()
    os.chdir(make_dir)
    try:
        if data_type=="1":
            reads=base_name(input_file[0])
            if reads.endswith(".sra"):
                subprocess.check_call("fastq-dump --split-files "+shlex.quote(reads),shell=True)
                for_fq=swap_sra_suffix(reads,"_1.fastq")
                rev_fq=swap_sra_suffix(reads,"_2.fastq")
            else:
                core_id=reads.split(".fastq")[0].split(".fq")[0]
                for_fq=core_id+"_1.fastq"
                rev_fq=core_id+"_2.fastq"
                #stream the (possibly gzipped) interleaved reads into the splitter
                reader="gzip -dc " if reads.endswith(".gz") else "cat "
                splitter=shlex.quote(dirpath+"/deinterleave_fastq.sh")
                subprocess.check_call(reader+shlex.quote(reads)+" | "+splitter+" "+shlex.quote(for_fq)+" "+shlex.quote(rev_fq),shell=True)
        elif data_type=="2":
            for_fq=base_name(input_file[0])
            rev_fq=base_name(input_file[1])
        elif data_type=="3":
            reads=base_name(input_file[0])
            if reads.endswith(".sra"):
                subprocess.check_call("fastq-dump --split-files "+shlex.quote(reads),shell=True)
                for_fq=swap_sra_suffix(reads,"_1.fastq")
            else:
                for_fq=reads
        elif data_type in ["4","5","6"]:
            for_fq=base_name(input_file[0])
    finally:
        #go back to where the caller was; a plain chdir("..") is wrong for a
        #nested or absolute make_dir and is skipped when a command fails
        os.chdir(start_dir)
    return for_fq,rev_fq
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
829
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
830 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
831 #get O and H types from Final_list from blast parsing; allele mode
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
832 from Bio import SeqIO
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
833 fliC_choice="-"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
834 fljB_choice="-"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
835 fliC_contig="NA"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
836 fljB_contig="NA"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
837 fliC_region=set([0])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
838 fljB_region=set([0,])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
839 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
840 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
841 O_choice="-"#no need to decide O contig for now, should be only one
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
842 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
843 O_choice=O_choice.split("-")[-1].strip()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
844 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
845 O_choice="-"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
846 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
847 #add alignment locations, used for further selection, 03312019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
848 for i in range(len(H_contig_roles)):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
849 x=H_contig_roles[i]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
850 for y in Final_list_passed:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
851 if x[1] in y[0] and y[0].startswith(x[0]):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
852 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
853 break
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
854 log_file=open("SeqSero_log.txt","a")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
855 extract_file=open("Extracted_antigen_alleles.fasta","a")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
856 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
857
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
858 #print("O_contigs:")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
859 log_file.write("O_contigs:\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
860 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
861 extract_file.write("#O_contigs:\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
862 for x in O_nodes_roles:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
863 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
864 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
865 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
866 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
867 seqs=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
868 for z in handle_fasta:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
869 if x[0].split("___")[-1]==z.description:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
870 seqs=str(z.seq)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
871 extract_file.write(title+seqs+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
872 if len(H_contig_roles)!=0:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
873 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
874 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
875 highest_H_coverage=0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
876 for x in H_contig_roles:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
877 #if there are multiple choices, temporarily select the one with the longest length for now; will revise in a future change
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
878 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
879 fliC_contig=x[1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
880 fliC_length=len(x[-1])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
881 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
882 fljB_contig=x[1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
883 fljB_length=len(x[-1])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
884 for x in Final_list_passed:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
885 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
886 fliC_choice=x[0].split("_")[1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
887 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
888 fljB_choice=x[0].split("_")[1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
889 elif fliC_choice!="-" and fljB_choice!="-":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
890 break
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
891 #now remove contigs not in middle core part
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
892 first_allele="NA"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
893 first_allele_percentage=0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
894 for x in Final_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
895 if x[0].startswith("fliC") or x[0].startswith("fljB"):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
896 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
897 first_allele_percentage=x[2]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
898 break
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
899 additional_contigs=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
900 for x in Final_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
901 if first_allele in x[0]:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
902 if (fliC_contig == x[0].split("___")[-1]):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
903 fliC_region=x[3]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
904 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
905 fljB_region=x[3]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
906 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
907 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
908 additional_contigs.append(x)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
909 #else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
910 #print x[:3]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
911 #we can just use the fljB region (or fliC, depending on size), no matter whether it is set() or contains a large set of locations (without the middle part); however, if neither of them is fully assembled, use 500 and 1200 as conservative cut-offs
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
912 if first_allele_percentage>0.9:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
913 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
914 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
915 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
916 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
917 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
918 target_region=set()#doesn't do anything
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
919 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
920 target_region=set()#doesn't do anything
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
921 #print(target_region)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
922 #print(additional_contigs)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
923 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
924 target_region=target_region2|target_region
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
925 for x in additional_contigs:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
926 removal=0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
927 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
928 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
929 removal=1
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
930 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
931 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
932 removal=1
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
933 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
934 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
935 if removal==1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
936 for y in H_contig_roles:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
937 if y[1] in x[0]:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
938 H_contig_roles.remove(y)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
939 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
940 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
941 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
942 #end of removing non-middle contigs
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
943 #print("H_contigs:")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
944 log_file.write("H_contigs:\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
945 extract_file.write("#H_contigs:\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
946 H_contig_stat=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
947 H1_cont_stat={}
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
948 H2_cont_stat={}
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
949 for i in range(len(H_contig_roles)):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
950 x=H_contig_roles[i]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
951 a=0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
952 for y in Final_list_passed:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
953 if x[1] in y[0] and y[0].startswith(x[0]):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
954 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
955 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
956 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
957 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
958 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
959 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
960 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
961 seqs=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
962 for z in handle_fasta:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
963 if x[1]==z.description:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
964 seqs=str(z.seq)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
965 extract_file.write(title+seqs+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
966 break
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
967 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
968 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
969 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
970 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
971 seqs=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
972 for z in handle_fasta:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
973 if x[1]==z.description:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
974 seqs=str(z.seq)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
975 extract_file.write(title+seqs+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
976 if x[0]=="fliC":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
977 if y[0].split("_")[1] not in H1_cont_stat:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
978 H1_cont_stat[y[0].split("_")[1]]=y[2]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
979 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
980 H1_cont_stat[y[0].split("_")[1]]+=y[2]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
981 if x[0]=="fljB":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
982 if y[0].split("_")[1] not in H2_cont_stat:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
983 H2_cont_stat[y[0].split("_")[1]]=y[2]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
984 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
985 H2_cont_stat[y[0].split("_")[1]]+=y[2]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
986 break
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
987 #detect contaminations
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
988 #print(H1_cont_stat)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
989 #print(H2_cont_stat)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
990 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
991 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
992 contamination_H=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
993 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
994 contamination_H="potential contamination from H antigen signals"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
995 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
996 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
997 #get additional antigens
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
998 """
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
999 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1000 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1001 O_choice="O-9,46"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1002 #print "$$$Most possilble Otype: O-9,46"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1003 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1004 O_choice="O-9,46,27"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1005 #print "$$$Most possilble Otype: O-9,46,27"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1006 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1007 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1008 O_choice="O-3,10"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1009 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1010 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1011 O_choice="O-1,3,19"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1012 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1013 ### end of special test for O9,46 and O3,10 family
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1014
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1015 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1016 if len(Otypes_uniq)>2:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1017 contamination_O="potential contamination from O antigen signals"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1018 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1019 if len(Otypes_uniq)>1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1020 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1021 contamination_O=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1022 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1023 contamination_O=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1024 """
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1025 additonal_antigents=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1026 #print(contamination_O)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1027 #print(contamination_H)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1028 log_file.write(contamination_O+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1029 log_file.write(contamination_H+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1030 log_file.close()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1031 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1032
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def get_input_K(input_file,lib_dict,data_type,k_size):
    """K-mer mode: extract from ``input_file`` the k-mers that belong to the library.

    Args:
        input_file: input file handed unchanged to the selected kmerizer.
        lib_dict: mapping of reference name -> iterable of k-mers; all values
            are pooled into one target set.
        data_type: input type code as a string. '4' selects
            ``target_multifasta_kmerizer``, '1'/'2'/'3' select
            ``target_read_kmerizer``, '5' selects ``minion_fasta_kmerizer``
            and '6' selects ``minion_fastq_kmerizer``.
        k_size: k-mer size, passed through to the kmerizer.

    Returns:
        Whatever the selected kmerizer returns (presumably the set of target
        k-mers found in the input -- confirm against the kmerizers).

    Raises:
        ValueError: if ``data_type`` is not one of the supported codes.
            Previously this case failed with an UnboundLocalError on return.
    """
    # Pool every reference k-mer once instead of rebuilding the set per branch.
    target_kmers = set()
    for ref_name in lib_dict:
        target_kmers.update(lib_dict[ref_name])
    if data_type == '4':
        return target_multifasta_kmerizer(input_file, k_size, target_kmers)
    if data_type in ('1', '2', '3'):#set it for now, will change later
        return target_read_kmerizer(input_file, k_size, target_kmers)
    if data_type == '5':#minion_2d_fasta
        return minion_fasta_kmerizer(input_file, k_size, target_kmers)
    if data_type == '6':#minion_2d_fastq
        return minion_fastq_kmerizer(input_file, k_size, target_kmers)
    raise ValueError("unsupported data_type for k-mer mode: %r" % (data_type,))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1047
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def get_kmer_dict(lib_dict,input_Ks):
    """K-mer mode: score every library entry against the input k-mers.

    The score of an entry is the percentage of its reference k-mers that are
    also present in ``input_Ks``.

    Args:
        lib_dict: mapping of reference name -> set of k-mers.
        input_Ks: set of k-mers found in the input.

    Returns:
        Tuple ``(O_dict, H_dict, Special_dict)`` of name -> score mappings:
        names starting with 'O-' are kept when score > 25, names starting
        with 'fl' when score > 40, and every other name when score > 1.
    """
    O_dict = {}
    H_dict = {}
    Special_dict = {}
    for name in lib_dict:
        ref_kmers = lib_dict[name]
        if not ref_kmers:
            # An empty reference set cannot match anything; skipping it avoids
            # a ZeroDivisionError and it would fail the cut-off below anyway.
            continue
        score = (len(ref_kmers & input_Ks) / len(ref_kmers)) * 100
        if score <= 1:  # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases
            continue
        if name.startswith('O-'):
            if score > 25:
                O_dict[name] = score
        elif name.startswith('fl'):
            if score > 40:
                H_dict[name] = score
        else:
            Special_dict[name] = score
    return O_dict,H_dict,Special_dict
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1063
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
# Library keys used by the special-case O antigen rules below.
_O9_WBAV_KEYS = ('O-9,46_wbaV__1002', 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002')
_O946_WZY_KEYS = ('O-9,46_wzy__1191', 'O-9,46_wzy_partial__216')


def _write_score_lines(log_file, score_dict):
    """Write one ``name<TAB>score`` line per entry of ``score_dict``."""
    for name in score_dict:
        log_file.write(name+"\t"+str(score_dict[name])+"\n")


def _label_for_O_hit(name, O_dict):
    """Return the O antigen label for the library entry ``name``."""
    # ed_SL_12182019: modified to fix the O-9,46 error example1
    # A wbaV hit without any O-9,46 wzy hit is reported as plain O-9.
    if name in _O9_WBAV_KEYS and not any(key in O_dict for key in _O946_WZY_KEYS):
        return "O-9"
    return name.split("_")[0]


def _call_O_type(O_dict, Special_dict):
    """Pick the O antigen from the k-mer scores; '-' when nothing is called."""
    if not O_dict:
        return '-'
    if any(key in O_dict and O_dict[key] > 70 for key in _O9_WBAV_KEYS):
        #modified to fix miscall of O-9,46
        if any(key in O_dict and O_dict[key] > 40 for key in _O946_WZY_KEYS):
            return "O-9,46"
        if "O-9,46,27_partial_wzy__1019" in O_dict:
            return "O-9,46,27"
        # next, detect O9 vs O2 from the tyr markers in Special_dict
        O2 = 0
        O9 = 0
        for marker in Special_dict:
            if "tyr-O-9" in marker:
                O9 = float(Special_dict[marker])
            if "tyr-O-2" in marker:
                O2 = float(Special_dict[marker])
        return "O-2" if O2 > O9 else "O-9"
    if ("O-3,10_wzx__1539" in O_dict) and ("O-9,46_wzy__1191" in O_dict):
        if "O-3,10_not_in_1,3,19__1519" in O_dict:
            return "O-3,10"
        return "O-1,3,19"
    ### end of special test for O9,46 and O3,10 family
    highest_O = '-'
    try:
        max_score = 0
        for name in O_dict:
            # ">=" keeps the original tie-break: the later entry wins.
            if float(O_dict[name]) >= max_score:
                max_score = float(O_dict[name])
                highest_O = _label_for_O_hit(name, O_dict)
        if highest_O == "O-1,3,19":
            # Best hit was an O-1,3,19 entry: redo the selection while
            # ignoring the 'O-1,3,19_not_in_3,10__130' entry.
            highest_O = '-'
            max_score = 0
            for name in O_dict:
                if name == 'O-1,3,19_not_in_3,10__130':
                    continue
                if float(O_dict[name]) >= max_score:
                    max_score = float(O_dict[name])
                    highest_O = _label_for_O_hit(name, O_dict)
    except (TypeError, ValueError):
        # Best effort, as before: a malformed score keeps whatever was
        # selected so far instead of aborting the whole call.
        pass
    return highest_O


def _call_H_types(H_dict):
    """Pick the fliC and fljB antigens from the H scores; '-' when absent."""
    #used to detect whether fljB existed or not
    highest_H_score_both_BC = max(H_dict.values()) if H_dict else 0
    #call_fliC:
    highest_fliC = '-'
    highest_Score = 0
    for name in H_dict:
        if name.startswith('fliC') and float(H_dict[name]) > highest_Score:
            highest_fliC = name.split('_')[1]
            highest_Score = float(H_dict[name])
    #call_fljB
    #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
    fljB_cutoff = highest_H_score_both_BC * 0.65
    highest_fljB = '-'
    highest_Score = 0
    for name in H_dict:
        if not name.startswith('fljB'):
            continue
        score = float(H_dict[name])
        # An fljB hit carrying the same antigen as the called fliC is ignored.
        if score > highest_Score and score > fljB_cutoff and name.split('_')[1] != highest_fliC:
            highest_fljB = name.split('_')[1]
            highest_Score = score
    return highest_fliC, highest_fljB


def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
    """K-mer mode: call the O, fliC and fljB antigens and log all scores.

    The scores of the three dictionaries are appended to "SeqSero_log.txt" in
    the current working directory.

    Args:
        O_dict: O antigen library name -> score.
        H_dict: H antigen library name ('fliC_...' / 'fljB_...') -> score.
        Special_dict: other library name -> score; only the "tyr-O-9" and
            "tyr-O-2" entries influence the call (O-9 versus O-2).
        make_dir: unused here; kept so existing callers keep working.

    Returns:
        Tuple ``(highest_O, highest_fliC, highest_fljB)``; each element is
        '-' when no antigen could be called.
    """
    # "with" guarantees the log is closed even if a call below raises.
    with open("SeqSero_log.txt","a") as log_file:
        log_file.write("O_scores:\n")
        _write_score_lines(log_file, O_dict)
        highest_O = _call_O_type(O_dict, Special_dict)
        log_file.write("\nH_scores:\n")
        _write_score_lines(log_file, H_dict)
        highest_fliC, highest_fljB = _call_H_types(H_dict)
        log_file.write("\nSpecial_scores:\n")
        _write_score_lines(log_file, Special_dict)
    return highest_O,highest_fliC,highest_fljB
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1164
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def get_temp_file_names(for_fq,rev_fq):
    """seqsero2 -a: build the names of the intermediate files.

    Every name is the forward (``for_fq``) or reverse (``rev_fq``) read file
    name with a fixed suffix appended.

    Returns:
        Tuple ``(sam, bam, sorted_bam, mapped_fq1, mapped_fq2, combined_fq,
        for_sai, rev_sai)``.
    """
    forward_suffixes = (".sam", ".bam", "_sorted.bam", "_mapped.fq", "_combined.fq", ".sai")
    reverse_suffixes = ("_mapped.fq", ".sai")
    sam, bam, sorted_bam, mapped_fq1, combined_fq, for_sai = [
        for_fq + suffix for suffix in forward_suffixes
    ]
    mapped_fq2, rev_sai = [rev_fq + suffix for suffix in reverse_suffixes]
    return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1176
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
    """seqsero2 -a: index the database, map the reads with bwa and name-sort the hits.

    Args:
        threads: number of threads as a string (it is concatenated into the
            command lines).
        database: reference fasta to index and map against.
        fnameA: forward (or only) read file.
        fnameB: reverse read file, or "" for single-end data.
        sam: output path of the bwa alignment.
        bam: output path of the mapped-only (-F 4) alignments.
        for_sai, rev_sai: intermediate ``bwa aln`` outputs ("sam" mode only).
        sorted_bam: output path of the name-sorted bam.
        mapping_mode: "mem" (bwa mem) or "sam" (bwa aln + sampe/samse); any
            other value runs no mapping step.

    Raises:
        subprocess.CalledProcessError: if any external command fails.
    """
    print("building database...")
    subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True)
    print("mapping...")
    if mapping_mode=="mem":
        subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
    elif mapping_mode=="sam":
        subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
        if fnameB!="":
            subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True)
            subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
        else:
            # Single-end: the reads are fnameA (the old code used the undefined
            # name for_fq), and the ">" redirect needs shell=True.
            subprocess.check_call("bwa samse "+database+" "+for_sai+" "+fnameA+" > "+sam+ " 2>> data_log.txt",shell=True)
    subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True)
    ### check the version of samtools then use different commands
    samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    _, err = samtools_version.communicate()
    # The version is read from stderr of a bare "samtools" call. Decode it and
    # split on any whitespace so a version followed directly by a newline is
    # parsed cleanly.
    version = err.decode("utf-8", "replace").split("ersion:")[1].split()[0]
    print("check samtools version:",version)
    ### end of samtools version check and its analysis
    if LooseVersion(version)<=LooseVersion("1.2"):
        # Old samtools takes an output prefix instead of writing to stdout.
        subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True)
    else:
        subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1203
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
    """Extract mapped reads, micro-assemble them and BLAST the database against the contigs.

    Steps (seqsero2 -a workflow), each run through the shell:
      1. ``bamToFastq`` dumps ``sorted_bam`` into ``combined_fq`` and, when
         ``fnameB`` is not empty, also into ``mapped_fq1``/``mapped_fq2``.
      2. If the extracted FASTQ files are larger than 100 bytes, ``spades.py``
         assembles them into ``<current_time>_temp``; ``contigs.fasta`` is
         moved to ``<fnameA>_<database>_<mapping_mode>.fasta`` and the
         temporary directory is removed.
      3. ``makeblastdb`` indexes the contigs and ``blastn`` writes
         ``blasted_output.xml`` (outfmt 5) using ``database`` as the query.

    Args:
        current_time: prefix for the temporary SPAdes output directory.
        sorted_bam: BAM file handed to bamToFastq.
        combined_fq: output path for the single combined FASTQ.
        mapped_fq1, mapped_fq2: output paths for the split FASTQ pair.
        threads: thread count as a string; SPAdes gets at most "4".
        fnameA: first read file name, used to name the contig FASTA.
        fnameB: second read file name; "" means no second file.
        database: FASTA used as BLAST query and in the contig file name.
        mapping_mode: mapping mode label, used in the contig file name.

    Returns:
        Tuple ``(xmlfile, new_fasta)``.

    Raises:
        subprocess.CalledProcessError: if any external command fails.
        UnboundLocalError: if the extracted FASTQ is too small to assemble
            (``new_fasta`` is never assigned in that case). main() catches
            this exception, so the behaviour is kept on purpose.
    """
    def run_shell(command):
        # Every external tool is launched through the shell (redirections are used).
        subprocess.check_call(command, shell=True)

    log_all = " >> data_log.txt 2>&1"
    has_second_file = fnameB != ""

    run_shell("bamToFastq -i " + sorted_bam + " -fq " + combined_fq)
    if has_second_file:
        # 2> /dev/null instead of the log file if no output is wanted
        run_shell("bamToFastq -i " + sorted_bam + " -fq " + mapped_fq1 + " -fq2 " + mapped_fq2 + " 2>> data_log.txt")

    assembly_dir = current_time + "_temp"
    print("assembling...")
    spades_threads = "4" if int(threads) > 4 else threads

    # If the extracted reads are (almost) empty the result is "-:-:-".
    enough_reads = os.path.getsize(combined_fq) > 100 and (
        not has_second_file or os.path.getsize(mapped_fq1) > 100
    )
    if not enough_reads:
        xmlfile = "NA"
    else:
        spades_inputs = " --pe1-s " + combined_fq
        if has_second_file:
            spades_inputs += " --pe1-1 " + mapped_fq1 + " --pe1-2 " + mapped_fq2
        run_shell("spades.py --careful" + spades_inputs + " -t " + spades_threads + " -o " + assembly_dir + log_all)

        new_fasta = "".join([fnameA, "_", database, "_", mapping_mode, ".fasta"])
        run_shell("mv " + assembly_dir + "/contigs.fasta " + new_fasta + " 2> /dev/null")
        run_shell("rm -rf " + assembly_dir + " 2> /dev/null")

        print("blasting...", "\n")
        xmlfile = "blasted_output.xml"
        blast_db = new_fasta + "_db"
        # Tool chatter goes to data_log.txt so it does not interrupt our own output.
        run_shell("makeblastdb -in " + new_fasta + " -out " + blast_db + " -dbtype nucl" + log_all)
        run_shell("blastn -query " + database + " -db " + blast_db + " -out " + xmlfile + " -outfmt 5" + log_all)
    # NOTE: in the "NA" branch new_fasta is unbound and this line raises
    # UnboundLocalError, which main() relies on.
    return xmlfile, new_fasta
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1236
7
aa54a94b9aeb planemo upload commit c50df40caef2fb97c178d6890961e0e527992324-dirty
cstrittmatter
parents: 0
diff changeset
def judge_subspecies(fnameA,dirpath):
    """Predict the subspecies of a read file by running SalmID (seqsero2 -a).

    Runs ``python <dirpath>/../SalmID.py -i <fnameA>`` through the shell,
    prints the command, its stdout and its stderr, and appends stdout to
    ``data_log.txt`` in the current working directory. The first line of
    stdout is treated as a tab-separated header row and the second line as
    the matching score row.

    Args:
        fnameA: read file passed to SalmID via ``-i`` (the forward raw reads
            FASTQ, according to the original comment).
        dirpath: directory whose parent directory contains ``SalmID.py``.

    Returns:
        ``"-"`` when the best subspecies score (columns 7 onwards) is below
        60; otherwise ``"bongori"`` when column 5 is above 10 and above
        column 6; otherwise the name parsed from the header of the
        best-scoring subspecies column.

    Raises:
        IndexError: if stdout has fewer than two lines or too few columns
            (for example when SalmID itself failed).
        ValueError: if a score field is not a number.
    """
    samid_strcmd = "python " + dirpath + "/../SalmID.py -i "+fnameA
    print(samid_strcmd)
    salmid_process = subprocess.Popen(samid_strcmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    out, err = salmid_process.communicate()
    out = out.decode("utf-8")
    print(out)
    print(err)
    # Context manager: the log handle is closed even if the write fails.
    with open("data_log.txt","a") as log_handle:
        log_handle.write(out)

    # Split the report once instead of re-splitting it for every field.
    report_lines = out.split("\n")
    score_fields = report_lines[1].split("\t")
    header_fields = report_lines[0].split("\t")
    subspecies_labels = header_fields[6:]

    max_score = 0
    max_score_index = 1 # default is 1, means "I"
    for index, raw_score in enumerate(score_fields[6:]):
        score = float(raw_score)
        if max_score < score: # strict comparison: the first maximum wins
            max_score = score
            max_score_index = index
    # Header labels look like "<prefix>. <name> ..."; keep only <name>.
    prediction = subspecies_labels[max_score_index].split(".")[1].strip().split(" ")[0]

    # ed_SL_0318: bongori must score above 10 and above enterica; otherwise the
    # prediction would always be an enterica subspecies (later columns).
    bongori_score = float(score_fields[4])
    if bongori_score > 10 and bongori_score > float(score_fields[5]):
        prediction = "bongori"
    # ed_SL_0318: SalmID subspecies threshold raised from 10 to 60.
    if max_score < 60:
        prediction = "-"
    return prediction
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1267
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def judge_subspecies_Kmer(Special_dict):
    """Pick the subspecies from k-mer marker scores (seqsero2 -k).

    Only entries whose key contains "mer" and whose score is above 60
    (ed_SL_0318 threshold) are considered. The subspecies name is the text
    after the last underscore of the key.

    Args:
        Special_dict: mapping of marker name to a score convertible to float.

    Returns:
        The subspecies of the highest-scoring qualifying marker, "bongori" as
        soon as a bongori marker scores above 95, or "-" if nothing qualifies.
    """
    best_score = 0
    prediction = "-" #default should be I
    for marker, raw_score in Special_dict.items():
        if "mer" not in marker:
            continue
        score = float(raw_score)
        if not score > 60:
            continue
        subspecies = marker.split("_")[-1].strip()
        if best_score < score:
            best_score = score
            prediction = subspecies
        # If it is bongori already, there is no need to test enterica.
        if subspecies == "bongori" and score > 95:
            prediction = "bongori"
            break
    return prediction
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1282
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1283 ## ed_SL_11232019: add notes for missing antigen
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
def check_antigens(ssp,O_antigen,H1_antigen,H2_antigen,NA_note):
    """Build the explanatory note for missing antigens (ed_SL_11232019).

    A value of '-' means "not detected" for the subspecies and for each
    antigen. Whenever a note is produced, the incoming ``NA_note`` is replaced
    by an empty string; otherwise it is returned unchanged.

    Args:
        ssp: predicted subspecies, '-' if none.
        O_antigen: predicted O antigen.
        H1_antigen: predicted phase 1 H antigen (fliC).
        H2_antigen: predicted phase 2 H antigen.
        NA_note: note to keep when no antigen note applies.

    Returns:
        Tuple ``(antigen_note, NA_note)``.
    """
    if ssp == '-':
        return ('The input genome cannot be identified as Salmonella. Check the input for taxonomic ID, contamination, or sequencing quality. ', '')

    o_missing = O_antigen == '-'
    h1_missing = H1_antigen == '-'
    h2_missing = H2_antigen == '-'

    if not o_missing and h1_missing:
        if h2_missing: # O:-:-
            return ('H antigens were not detected. This is an atypical result that should be further investigated. Most Salmonella strains have at least fliC, encoding the Phase 1 H antigen, even if it is not expressed. ', '')
        # O:-:H2
        return ('fliC was not detected. This is an atypical result that should be further investigated. Most Salmonella strains have fliC, encoding the Phase 1 H antigen, even if it is not expressed. ', '')

    if o_missing and not h1_missing: # -:H1:X
        return ('O antigen was not detected. This result may be due to a rough strain that has deleted the rfb region. For raw reads input, the k-mer workflow is sometimes more sensitive than the microassembly workflow in detecting O antigen. Caution should be used with this approach because the k-mer result may be due to low levels of contamination. ', '')

    if o_missing and h1_missing and h2_missing: # -:-:-
        return ('No serotype antigens were detected. This is an atypical result that should be further investigated. ', '')

    # Remaining combinations (O:H1:X and -:-:H2) get no antigen note.
    return ('', NA_note)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1307
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1308 def main():
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1309 #combine SeqSeroK and SeqSero2, also with SalmID
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1310 args = parse_args()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1311 input_file = args.i
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1312 data_type = args.t
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1313 analysis_mode = args.m
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1314 mapping_mode=args.b
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1315 threads=args.p
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1316 make_dir=args.d
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1317 clean_mode=args.c
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1318 sample_name=args.n
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1319 ingore_header=args.s
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1320 k_size=27 #will change for bug fixing
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1321 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1322 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: add ex_dir for packaging
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1323 seqsero2_db=ex_dir+"/H_and_O_and_specific_genes.fasta" # ed_SL_11092019: change path to database for packaging
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1324 database="H_and_O_and_specific_genes.fasta"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1325 note="Note: "
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1326 NA_note="This predicted serotype is not in the Kauffman-White scheme. " # ed_SL_09272019: add for new output format
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1327 if len(sys.argv)==1:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1328 subprocess.check_call(dirpath+"/SeqSero2_package.py -h",shell=True)#change name of python file
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1329 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1330 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1331 request_id += str(random.randint(1, 10000000))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1332 if make_dir is None:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1333 make_dir="SeqSero_result_"+request_id
8
357e38526e2a planemo upload commit c50df40caef2fb97c178d6890961e0e527992324-dirty
cstrittmatter
parents: 7
diff changeset
1334 make_dir=os.path.abspath(make_dir)
0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1335 if os.path.isdir(make_dir):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1336 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1337 else:
8
357e38526e2a planemo upload commit c50df40caef2fb97c178d6890961e0e527992324-dirty
cstrittmatter
parents: 7
diff changeset
1338 subprocess.check_call("mkdir -p "+make_dir,shell=True)
0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1339 #subprocess.check_call("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1340 #subprocess.check_call("ln -sr "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1341 subprocess.check_call("ln -f -s "+seqsero2_db+" "+" ".join(input_file)+" "+make_dir,shell=True) # ed_SL_11092019: change path to database for packaging
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1342 #subprocess.check_call("ln -f -s "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument).
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1343 ############################begin the real analysis
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1344 if analysis_mode=="a":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1345 if data_type in ["1","2","3"]:#use allele mode
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1346 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1347 os.chdir(make_dir)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1348 ###add a function to tell input files
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1349 fnameA=for_fq.split("/")[-1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1350 fnameB=rev_fq.split("/")[-1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1351 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1352 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1353 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1354 ### avoid error out when micro assembly fails. ed_SL_03172020
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1355 try:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1356 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1357 except (UnboundLocalError, subprocess.CalledProcessError):
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1358 xmlfile="NA"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1359 H1_cont_stat_list=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1360 H2_cont_stat_list=[]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1361 ###
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1362 if xmlfile=="NA":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1363 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1364 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1365 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1366 file=open("data_log.txt","a")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1367 for x in Final_list:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1368 file.write("\t".join(str(y) for y in x)+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1369 file.close()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1370 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1371 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
7
aa54a94b9aeb planemo upload commit c50df40caef2fb97c178d6890961e0e527992324-dirty
cstrittmatter
parents: 0
diff changeset
1372 subspecies=judge_subspecies(fnameA,dirpath) #predict subspecies
0
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1373 ###output
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1374 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1375 claim="" #04132019, disable claim for new report requirement
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1376 contamination_report=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1377 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1378 if contamination_O!="" and contamination_H=="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1379 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1380 elif contamination_O=="" and contamination_H!="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1381 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1382 elif contamination_O!="" and contamination_H!="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1383 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1384 if contamination_report!="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1385 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1386 contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles. "
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1387 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequences #04132019, need to change if using web-version
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1388 #if contamination_report+star_line+claim=="": #0413, new output style
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1389 # note=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1390 #else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1391 # note="Note:"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1392
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1393 ### ed_SL_11232019: add notes for missing antigen
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1394 if O_choice=="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1395 O_choice="-"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1396 antigen_note,NA_note=check_antigens(subspecies,O_choice,fliC_choice,fljB_choice,NA_note)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1397 if sample_name:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1398 print ("Sample name:\t"+sample_name)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1399 ###
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1400
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1401 if clean_mode:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1402 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1403 make_dir="none-output-directory due to '-c' flag"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1404 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1405 new_file=open("SeqSero_result.txt","w")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1406 ### ed_SL_01152020: add new output
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1407 conta_note="yes" if "inter-serotype contamination" in contamination_report else "no"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1408 tsv_file=open("SeqSero_result.tsv","w")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1409 if ingore_header:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1410 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1411 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1412 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted subspecies\tPredicted antigenic profile\tPredicted serotype\tPotential inter-serotype contamination\tNote\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1413 if sample_name:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1414 new_file.write("Sample name:\t"+sample_name+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1415 tsv_file.write(sample_name+'\t')
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1416 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1417 tsv_file.write(input_file[0].split('/')[-1]+'\t')
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1418 ###
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1419 if "N/A" not in predict_sero:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1420 new_file.write("Output directory:\t"+make_dir+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1421 "Input files:\t"+"\t".join(input_file)+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1422 "O antigen prediction:\t"+O_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1423 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1424 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1425 "Predicted subspecies:\t"+subspecies+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1426 "Predicted antigenic profile:\t"+predict_form+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1427 "Predicted serotype:\t"+predict_sero+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1428 note+contamination_report+star_line+claim+antigen_note+"\n")#+##
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1429 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies+"\t"+predict_form+"\t"+predict_sero+"\t"+conta_note+"\t"+contamination_report+star_line+claim+antigen_note+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1430 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1431 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1432 star_line="" #04132019, for new output requirement, diable star_line if "NA" in output
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1433 new_file.write("Output directory:\t"+make_dir+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1434 "Input files:\t"+"\t".join(input_file)+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1435 "O antigen prediction:\t"+O_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1436 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1437 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1438 "Predicted subspecies:\t"+subspecies+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1439 "Predicted antigenic profile:\t"+predict_form+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1440 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, add subspecies
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1441 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")#+##
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1442 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+conta_note+"\t"+NA_note+contamination_report+star_line+claim+antigen_note+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1443 new_file.close()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1444 tsv_file.close()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1445 #subprocess.check_call("cat Seqsero_result.txt",shell=True)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1446 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1447 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1448 if "N/A" not in predict_sero:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1449 #print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\nNote:"+contamination_report+star+star_line+claim+"\n")#+##
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1450 print("Output directory:\t"+make_dir+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1451 "Input files:\t"+"\t".join(input_file)+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1452 "O antigen prediction:\t"+O_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1453 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1454 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1455 "Predicted subspecies:\t"+subspecies+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1456 "Predicted antigenic profile:\t"+predict_form+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1457 "Predicted serotype:\t"+predict_sero+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1458 note+contamination_report+star_line+claim+antigen_note+"\n")#+##
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1459 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1460 print("Output directory:\t"+make_dir+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1461 "Input files:\t"+"\t".join(input_file)+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1462 "O antigen prediction:\t"+O_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1463 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1464 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1465 "Predicted subspecies:\t"+subspecies+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1466 "Predicted antigenic profile:\t"+predict_form+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1467 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1468 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1469 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1470 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1471 elif analysis_mode=="k":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1472 #ex_dir = os.path.dirname(os.path.realpath(__file__))
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1473 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: change ex_dir for packaging
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1474 #output_mode = args.mode
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1475 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1476 input_file = for_fq #-k will just use forward because not all reads were used
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1477 os.chdir(make_dir)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1478 f = open(ex_dir + '/antigens.pickle', 'rb')
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1479 lib_dict = pickle.load(f)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1480 f.close
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1481 input_Ks=get_input_K(input_file,lib_dict,data_type,k_size)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1482 O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1483 highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1484 subspecies=judge_subspecies_Kmer(Special_dict)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1485 if subspecies=="IIb" or subspecies=="IIa":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1486 subspecies="II"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1487 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1488 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1489 claim="" #no claim any more based on new output requirement
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1490 #if star_line+claim=="": #0413, new output style
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1491 # note=""
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1492 #else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1493 # note="Note:"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1494
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1495 ### ed_SL_11232019: add notes for missing antigen
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1496 if highest_O.split('-')[-1]=="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1497 O_choice="-"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1498 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1499 O_choice=highest_O.split('-')[-1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1500 antigen_note,NA_note=check_antigens(subspecies,O_choice,highest_fliC,highest_fljB,NA_note)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1501 if sample_name:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1502 print ("Sample name:\t"+sample_name)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1503 ###
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1504
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1505 if clean_mode:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1506 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1507 make_dir="none-output-directory due to '-c' flag"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1508 # ### ed_SL_05282019, fix the assignment issue of variable 'O_choice' using "-m k -c"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1509 # if highest_O.split('-')[-1]=="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1510 # O_choice="-"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1511 # else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1512 # O_choice=highest_O.split('-')[-1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1513 # ###
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1514 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1515 # if highest_O.split('-')[-1]=="":
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1516 # O_choice="-"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1517 # else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1518 # O_choice=highest_O.split('-')[-1]
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1519 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1520 new_file=open("SeqSero_result.txt","w")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1521 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1522 ### ed_SL_01152020: add new output
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1523 tsv_file=open("SeqSero_result.tsv","w")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1524 if ingore_header:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1525 pass
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1526 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1527 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted subspecies\tPredicted antigenic profile\tPredicted serotype\tNote\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1528 if sample_name:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1529 new_file.write("Sample name:\t"+sample_name+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1530 tsv_file.write(sample_name+'\t')
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1531 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1532 tsv_file.write(input_file.split('/')[-1]+'\t')
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1533 ###
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1534 if "N/A" not in predict_sero:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1535 new_file.write("Output directory:\t"+make_dir+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1536 "Input files:\t"+input_file+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1537 "O antigen prediction:\t"+O_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1538 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1539 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1540 "Predicted subspecies:\t"+subspecies+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1541 "Predicted antigenic profile:\t"+predict_form+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1542 "Predicted serotype:\t"+predict_sero+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1543 note+star_line+claim+antigen_note+"\n")#+##
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1544 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies+"\t"+predict_form+"\t"+predict_sero+"\t"+star_line+claim+antigen_note+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1545 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1546 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1547 star_line = "" #changed for new output requirement, 04132019
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1548 new_file.write("Output directory:\t"+make_dir+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1549 "Input files:\t"+input_file+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1550 "O antigen prediction:\t"+O_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1551 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1552 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1553 "Predicted subspecies:\t"+subspecies+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1554 "Predicted antigenic profile:\t"+predict_form+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1555 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1556 note+NA_note+star_line+claim+antigen_note+"\n")#+##
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1557 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+NA_note+star_line+claim+antigen_note+"\n")
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1558 new_file.close()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1559 tsv_file.close()
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1560 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True)
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1561 if "N/A" not in predict_sero:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1562 print("Output directory:\t"+make_dir+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1563 "Input files:\t"+input_file+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1564 "O antigen prediction:\t"+O_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1565 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1566 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1567 "Predicted subspecies:\t"+subspecies+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1568 "Predicted antigenic profile:\t"+predict_form+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1569 "Predicted serotype:\t"+predict_sero+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1570 note+star_line+claim+antigen_note+"\n")#+##
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1571 else:
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1572 print("Output directory:\t"+make_dir+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1573 "Input files:\t"+input_file+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1574 "O antigen prediction:\t"+O_choice+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1575 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1576 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1577 "Predicted subspecies:\t"+subspecies+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1578 "Predicted antigenic profile:\t"+predict_form+"\n"+
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1579 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1580 note+NA_note+star_line+claim+antigen_note+"\n")#+##
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1581
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
# Script entry point: main() is called only when this file is executed directly; importing it as a module does not trigger the call.
1582 if __name__ == '__main__':
fc22ec8e924e planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
cstrittmatter
parents:
diff changeset
1583 main()