annotate build/lib/bin/SeqSero2_package.py @ 10:e6437d423693 draft

planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
author cstrittmatter
date Fri, 01 May 2020 13:30:43 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1 #!/usr/bin/env python3
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
2
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
3 import sys
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
4 import time
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
5 import random
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
6 import os
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
7 import subprocess
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
8 import gzip
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
9 import io
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
10 import pickle
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
11 import argparse
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
12 import itertools
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
13 from distutils.version import LooseVersion
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
14 from distutils.spawn import find_executable
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
15 sys.path.insert(1,sys.path[0]+'/..')
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
16
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
17 try:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
18 from .version import SeqSero2_version
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
19 except Exception: #ImportError
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
20 from version import SeqSero2_version
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
21
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
22 ### SeqSero Kmer
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def parse_args():
    """Build the SeqSero2 command-line parser and return the parsed arguments.

    The arguments are read from ``sys.argv`` by ``argparse``; run the script
    with ``-h`` to print the generated help.

    Returns:
        argparse.Namespace: one attribute per option declared below.
    """
    usage_text = 'SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com\n\nVersion: v1.1.1'
    cli = argparse.ArgumentParser(usage=usage_text)

    # --- input data and workflow selection ---
    cli.add_argument(
        "-i",
        nargs="+",
        type=os.path.abspath,  # input paths are stored as absolute paths
        help="<string>: path/to/input_data",
    )
    cli.add_argument(
        "-t",
        choices=['1', '2', '3', '4', '5', '6'],
        help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore fasta, '6' for nanopore fastq",
    )
    cli.add_argument(
        "-b",
        choices=['sam', 'mem'],
        default="mem",
        help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode",
    )
    cli.add_argument(
        "-p",
        default="1",  # kept as a string, exactly as typed on the command line
        help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1",
    )
    cli.add_argument(
        "-m",
        choices=['k', 'a'],
        default="a",
        help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a",
    )

    # --- output naming ---
    cli.add_argument(
        "-n",
        help="<string>: optional, to specify a sample name in the report output",
    )
    cli.add_argument(
        "-d",
        help="<string>: optional, to specify an output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number",
    )

    # --- boolean switches ---
    cli.add_argument(
        "-c",
        action="store_true",
        help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files",
    )
    cli.add_argument(
        "-s",
        action="store_true",
        help="<flag>: if '-s' was flagged, SeqSero2 will not output header in SeqSero_result.tsv",
    )
    cli.add_argument(
        "--check",
        action="store_true",
        help="<flag>: use '--check' flag to check the required dependencies",
    )
    cli.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s ' + SeqSero2_version,
    )
    return cli.parse_args()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
38
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
### check paths of dependencies
# NOTE: this runs at import time and parses the command line immediately.
check_dependencies = parse_args().check
dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py']
if check_dependencies:
    # Look up every dependency before exiting so that all missing tools are
    # reported in a single run.
    missing_dependencies = []
    for item in dependencies:
        ext_path = find_executable(item)
        if ext_path is not None:
            print("Using " + item + " - " + ext_path)
        else:
            print("ERROR: can not find " + item + " in PATH")
            missing_dependencies.append(item)
    # Exit non-zero when something is missing so that calling scripts can
    # detect a failed check (previously the exit status was always 0).
    sys.exit(1 if missing_dependencies else 0)
### end of --check
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
51
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
# Complement of every supported upper-case IUPAC nucleotide code.
# Built once at import time instead of on every reverse_complement() call.
_COMPLEMENT = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A',
    'N': 'N',
    'M': 'K',
    'R': 'Y',
    'W': 'W',
    'S': 'S',
    'Y': 'R',
    'K': 'M',
    'V': 'B',
    'H': 'D',
    'D': 'H',
    'B': 'V',
}


def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    Args:
        sequence: iterable of single-character upper-case IUPAC codes
            (normally a ``str``).

    Returns:
        str: the complemented bases in reverse order.

    Raises:
        KeyError: if a character is not an upper-case code listed in the
            complement table (lower-case input is not accepted).
    """
    return "".join(_COMPLEMENT[base] for base in reversed(sequence))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
71
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
72
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def createKmerDict_reads(list_of_strings, kmer):
    """Count every k-mer, and its reverse complement, found in the sequences.

    Args:
        list_of_strings: iterable of sequence strings; newline characters at
            either end of each string are removed before counting.
        kmer: k-mer length (int).

    Returns:
        dict: upper-case k-mer -> number of times it was counted. Each window
        adds one to the forward k-mer and one to its reverse complement, so a
        k-mer that equals its own reverse complement is counted twice.
    """
    counts = {}
    for entry in list_of_strings:
        bases = entry.strip('\n')
        window_count = len(bases) - kmer + 1
        for offset in range(window_count):
            forward = bases[offset:offset + kmer].upper()
            backward = reverse_complement(forward)
            # forward strand first, then the opposite strand
            for key in (forward, backward):
                counts[key] = counts.get(key, 0) + 1
    return counts
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
89
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
90
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def multifasta_dict(multifasta):
    """Read a (multi-)FASTA file into a ``{header: sequence}`` dictionary.

    The file is read in a single pass and closed afterwards. Blank lines are
    ignored and surrounding whitespace is stripped from every line.

    Args:
        multifasta: path of the FASTA file.

    Returns:
        dict: header text without the leading '>' -> concatenated sequence
        lines. Headers that are not followed by any sequence line are omitted.
        Sequence lines that appear before the first header are ignored. When
        the same header occurs more than once, the sequence lines of all its
        records are concatenated in file order.
    """
    sequence_parts = {}
    current_name = None
    with open(multifasta, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line[0] == '>':
                current_name = line[1:]
            elif current_name is not None:
                # collect pieces in a list; joined once at the end
                sequence_parts.setdefault(current_name, []).append(line)
    return {name: "".join(parts) for name, parts in sequence_parts.items()}
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
108
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
109
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def multifasta_single_string(multifasta):
    """Concatenate all sequence lines of a FASTA file into one string.

    Header lines (first non-blank character is '>') and blank lines are
    skipped; every remaining line is stripped of surrounding whitespace.
    The file is closed before returning.

    Args:
        multifasta: path of the FASTA file.

    Returns:
        str: the joined sequence lines of every record, in file order.
    """
    with open(multifasta, 'r') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return ''.join(
            line for line in stripped_lines if line and line[0] != '>'
        )
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
116
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
117
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def chunk_a_long_sequence(long_sequence, chunk_size=60):
    """Split a sequence into consecutive pieces of ``chunk_size`` characters.

    Args:
        long_sequence: the sequence (``str``) to split.
        chunk_size: length of every full chunk; must be at least 1.

    Returns:
        list: the full-size chunks followed by the remainder. The remainder is
        always appended, so the last element is an empty string when the
        sequence length is an exact multiple of ``chunk_size`` (including an
        empty sequence, which yields ``['']``).

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    # The number of full chunks follows chunk_size (it used to be fixed at 60).
    full_chunks = len(long_sequence) // chunk_size
    chunk_list = [
        long_sequence[i * chunk_size:(i + 1) * chunk_size]
        for i in range(full_chunks)
    ]
    chunk_list.append(long_sequence[full_chunks * chunk_size:])
    return chunk_list
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
125
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
126
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def target_multifasta_kmerizer(multifasta, k, kmerDict):
    """Collect k-mers from the regions of an assembly that hit ``kmerDict``.

    All sequence lines of the FASTA file are joined and cut into 60-character
    chunks. For every chunk whose middle ``k`` characters are found in
    ``kmerDict``, a window of neighbouring chunks is k-merized with
    ``createKmerDict_reads`` and the k-mers are added to the result. Chunks
    inside the most recently processed window are not tested again.

    Args:
        multifasta: path of the FASTA file.
        k: k-mer length (int).
        kmerDict: container of target k-mers, used with the ``in`` operator.

    Returns:
        set: the k-mers (and their reverse complements) of all selected
        windows; empty when the file contains no sequence.
    """
    forward_length = 300  #if find the target, put forward 300 bases
    chunk_size = 60  #size of the pieces the single long sequence is cut into
    # NOTE(review): the original also declared reverse_length = 2200 ("put
    # backward 2200 bases") but computed the downstream window from
    # forward_length. That numeric behaviour is preserved here because changing
    # it alters which k-mers are collected -- confirm the intent first.
    target_mers = set()
    long_single_string = multifasta_single_string(multifasta)
    multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
    unit_length = len(multifasta_list[0])
    if unit_length == 0:
        # no sequence at all: nothing to scan (avoids a division by zero)
        return target_mers
    forward_lines = int(forward_length / unit_length) + 1
    reverse_lines = int(forward_length / unit_length) + 1
    last_index = len(multifasta_list) - 1
    start_num = 0
    end_num = 0
    for i, line in enumerate(multifasta_list):
        if start_num <= i < end_num:  #avoid computational repetition
            continue
        start = (len(line) - k) // 2
        s1 = line[start:k + start]
        if s1 not in kmerDict:  #detect it is a potential read or not (use the middle part)
            continue
        # clamp the window to the chunk list; end_num is exclusive in the slice
        start_num = max(i - forward_lines, 0)
        end_num = min(i + reverse_lines, last_index)
        target_line = "".join(
            x.strip() for x in multifasta_list[start_num:end_num]
        )
        target_mers.update(createKmerDict_reads([str(target_line)], k))
    return target_mers
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
163
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
164
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def target_read_kmerizer(file, k, kmerDict):
    """Collect k-mers from the reads whose middle k-mer is in ``kmerDict``.

    The second line of every group of four lines is treated as the read
    sequence (FASTQ layout is presumed). A read is selected when the ``k``
    characters around its middle are found in ``kmerDict``; selected reads are
    k-merized with ``createKmerDict_reads``. Scanning stops once the selected
    lines add up to at least 4,000,000 characters.

    Files whose name ends with ".gz" are read through ``gzip``; everything
    else is opened as plain text. The file is streamed line by line and closed
    on return.

    Args:
        file: path of the reads file.
        k: k-mer length (int).
        kmerDict: container of target k-mers, used with the ``in`` operator.

    Returns:
        set: the k-mers (and their reverse complements) of all selected reads.
    """
    max_total_coverage = 4000000  #stop after this many selected characters
    is_gzipped = file.endswith(".gz")
    total_coverage = 0
    target_mers = set()
    if is_gzipped:
        handle = gzip.open(file, "rb")
    else:
        handle = open(file, "r")
    with handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number % 4 != 2:
                continue  # only the sequence line of each 4-line record
            # The length still includes the line terminator, as before.
            start = (len(line) - k) // 2
            if is_gzipped:
                s1 = line[start:k + start].decode()
                line = line.decode()
            else:
                s1 = line[start:k + start]
            if s1 in kmerDict:  #detect it is a potential read or not (use the middle part)
                total_coverage += len(line)
                target_mers.update(createKmerDict_reads([str(line)], k))
                if total_coverage >= max_total_coverage:
                    break
    return target_mers
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
192
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
193
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def minion_fasta_kmerizer(file, k, kmerDict):
    """Collect the k-mers of a FASTA file that hit ``kmerDict`` on either strand.

    Every even-numbered line is treated as a sequence line (a two-line-per-
    record FASTA layout is presumed). Each sequence is stripped, upper-cased
    and passed to ``kmers``, which is defined elsewhere in this file and is
    expected to yield ``(kmer, reverse_complement_kmer)`` pairs. When either
    member of a pair is in ``kmerDict``, both members are kept.

    Args:
        file: path of the FASTA file; it is closed on return.
        k: k-mer length (int).
        kmerDict: container of target k-mers, used with the ``in`` operator.

    Returns:
        set: all kept k-mers.
    """
    target_mers = set()
    with open(file) as handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number % 2 != 0:
                continue  # odd lines are taken to be headers
            for kmer, rc_kmer in kmers(line.strip().upper(), k):
                if (kmer in kmerDict) or (rc_kmer in kmerDict):
                    # only membership is returned, so no counts are kept
                    target_mers.add(kmer)
                    target_mers.add(rc_kmer)
    return target_mers
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
213
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
214
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def minion_fastq_kmerizer(file, k, kmerDict):
    """Collect the k-mers of a FASTQ file that match a reference k-mer set.

    The second line of every group of four lines is stripped, upper-cased
    and split into k-mers with ``kmers``. Whenever a k-mer or its reverse
    complement is present in ``kmerDict``, both are kept.

    Args:
        file: path of the FASTQ file to read.
        k: k-mer length handed to ``kmers``.
        kmerDict: container supporting ``in`` that holds the reference k-mers.

    Returns:
        A set with every matching k-mer and its reverse complement.
    """
    target_mers = set()
    # The context manager closes the handle; a bare open() in the loop
    # header would leave it open until garbage collection.
    with open(file) as handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number % 4 != 2:
                # Only line 2 of each 4-line group is processed (presumably
                # the sequence line of a FASTQ record -- confirm with callers).
                continue
            for kmer, rc_kmer in kmers(line.strip().upper(), k):
                if kmer in kmerDict or rc_kmer in kmerDict:
                    # Only membership is returned, so a set replaces the
                    # per-k-mer counters that were never read.
                    target_mers.add(kmer)
                    target_mers.add(rc_kmer)
    return target_mers
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
234
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
235
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def multifasta_single_string2(multifasta):
    """Concatenate all sequence lines of a multi-FASTA file into one string.

    Lines whose first non-blank character is ``>`` are ignored; every other
    non-empty line is stripped of surrounding whitespace and appended in
    file order.

    Args:
        multifasta: path of the FASTA file to read.

    Returns:
        The concatenated sequence text ('' when the file has no sequence
        lines).
    """
    pieces = []
    with open(multifasta, 'r') as f:
        for line in f:
            text = line.strip()
            # Blank lines previously raised IndexError on text[0]; they
            # carry no sequence data, so they are skipped like headers.
            if not text or text[0] == '>':
                continue
            pieces.append(text)
    # A single join avoids the quadratic cost of repeated string +=.
    return ''.join(pieces)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
245
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
246
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def kmers(seq, k):
    """Yield each k-mer of ``seq`` together with its reverse complement.

    Args:
        seq: sequence string to split.
        k: k-mer length.

    Yields:
        ``(kmer, rc_kmer)`` tuples for every start offset from 0 to
        ``len(seq) - k``; nothing when ``seq`` is shorter than ``k``.
    """
    rev_comp = reverse_complement(seq)
    # NOTE(review): assumes reverse_complement returns a string of the same
    # length as seq -- confirm against its definition.
    rc_len = len(rev_comp)
    # Start at offset 0. The loop used to begin at 1 because the negative
    # slice rev_comp[-(start + k):-start] is empty for start == 0, which
    # silently dropped the first k-mer. Positive bounds select the same
    # window for every other offset and also work for offset 0.
    for start in range(len(seq) - k + 1):
        rc_end = rc_len - start
        yield seq[start:start + k], rev_comp[rc_end - k:rc_end]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
251
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
252
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def multifasta_to_kmers_dict(multifasta, k_size):  # used to create database kmer set
    """Map every record of a multi-FASTA file to the set of its k-mers.

    Args:
        multifasta: passed unchanged to ``multifasta_dict``.
        k_size: k-mer length handed to ``createKmerDict_reads``.

    Returns:
        dict keyed like the result of ``multifasta_dict``; each value is the
        set of items obtained by iterating
        ``createKmerDict_reads([sequence], k_size)``.
    """
    sequences = multifasta_dict(multifasta)
    # Iterating the createKmerDict_reads result yields the entries that are
    # collected (presumably the k-mer keys of a dict -- confirm).
    return {
        header: set(createKmerDict_reads([sequences[header]], k_size))
        for header in sequences
    }
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
260
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
261
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def Combine(b, c):
    """List the formulas formed by ``c`` plus every non-empty subset of ``b``.

    Args:
        b: sequence of factor strings that may or may not be present.
        c: sequence of factor strings that are always present.

    Returns:
        A list of comma-joined strings. The first entry is ``c`` joined in
        its given order (not sorted). It is followed by one entry per
        non-empty combination of ``b`` (smaller combinations first, in
        ``itertools.combinations`` order): the factors of ``c`` and of the
        combination are split on commas, sorted and joined again.
    """
    formulas = [",".join(c)]
    for size in range(1, len(b) + 1):
        for subset in itertools.combinations(b, size):
            # Join then split so that factors which themselves contain
            # commas are broken up before sorting.
            factors = ",".join([*c, *subset]).split(",")
            formulas.append(",".join(sorted(factors)))
    return formulas
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
280
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
281
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
282 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
283 #like test_output_06012017.txt
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
284 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
285 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
286 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
287 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
288 seronames = []
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
289 seronames_none_subspecies=[]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
290 for i in range(len(phase1)):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
291 fliC_combine = []
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
292 fljB_combine = []
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
293 if phaseO[i] == Otype: # no VII in KW, but it's there
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
294 ### for fliC, detect every possible combinations to avoid the effect of "["
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
295 if phase1[i].count("[") == 0:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
296 fliC_combine.append(phase1[i])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
297 elif phase1[i].count("[") >= 1:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
298 c = []
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
299 b = []
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
300 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
301 "[") == 1:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
302 content = phase1[i].replace("[", "").replace("]", "")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
303 fliC_combine.append(content)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
304 fliC_combine.append("-")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
305 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
306 for x in phase1[i].split(","):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
307 if "[" in x:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
308 b.append(x.replace("[", "").replace("]", ""))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
309 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
310 c.append(x)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
311 fliC_combine = Combine(
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
312 b, c
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
313 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
314 ### end of fliC "[" detect
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
315 ### for fljB, detect every possible combinations to avoid the effect of "["
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
316 if phase2[i].count("[") == 0:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
317 fljB_combine.append(phase2[i])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
318 elif phase2[i].count("[") >= 1:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
319 d = []
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
320 e = []
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
321 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
322 "[") == 1:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
323 content = phase2[i].replace("[", "").replace("]", "")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
324 fljB_combine.append(content)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
325 fljB_combine.append("-")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
326 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
327 for x in phase2[i].split(","):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
328 if "[" in x:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
329 d.append(x.replace("[", "").replace("]", ""))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
330 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
331 e.append(x)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
332 fljB_combine = Combine(d, e)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
333 ### end of fljB "[" detect
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
334 new_fliC = fliC.split(
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
335 ","
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
336 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
337 new_fliC.sort()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
338 new_fliC = ",".join(new_fliC)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
339 new_fljB = fljB.split(",")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
340 new_fljB.sort()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
341 new_fljB = ",".join(new_fljB)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
342 if (new_fliC in fliC_combine
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
343 or fliC in fliC_combine) and (new_fljB in fljB_combine
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
344 or fljB in fljB_combine):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
345 ######start, remove_list,rename_dict, added on 11/11/2018
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
346 if sero[i] not in remove_list:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
347 temp_sero=sero[i]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
348 if temp_sero in rename_dict:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
349 temp_sero=rename_dict[temp_sero] #rename if in the rename list
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
350 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
351 if subs[i] == subspecies:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
352 seronames.append(temp_sero)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
353 seronames_none_subspecies.append(temp_sero)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
354 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
355 pass
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
356 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
357 pass
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
358 ######end, added on 11/11/2018
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
359 #analyze seronames
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
360 subspecies_pointer=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
361 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
362 # ed_SL_12182019: modified to fix the subspecies output problem
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
363 #seronames=seronames_none_subspecies
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
364 seronames=["N/A"]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
365 #subspecies_pointer="1"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
366 subspecies_pointer="0"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
367 if len(seronames) == 0:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
368 seronames = [
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
369 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
370 ]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
371 star = ""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
372 star_line = ""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
373 if len(seronames) > 1: #there are two possible predictions for serotypes
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
374 star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
375 #changed 04072019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
376 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
377 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
378 star="*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
379 star_line="The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme. " + star_line
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
380 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
381 if Otype=="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
382 Otype="-"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
383 predict_form = Otype + ":" + fliC + ":" + fljB
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
384 predict_sero = (" or ").join(seronames)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
385 ###special test for Enteritidis
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
386 if predict_form == "9:g,m:-":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
387 sdf = "-"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
388 for x in special_gene_list:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
389 if x.startswith("sdf"):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
390 sdf = "+"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
391 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
392 star_line="sdf gene detected. "
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
393 #predict_form = predict_form + " Sdf prediction:" + sdf
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
394 predict_form = predict_form #changed 04072019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
395 if sdf == "-":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
396 star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
397 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
398 star_line="sdf gene not detected. "
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
399 #changed in 04072019, for new output
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
400 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
401 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
402 predict_sero = "Gallinarum or Enteritidis"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
403 ###end of special test for Enteritidis
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
404 elif predict_form == "4:i:-":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
405 predict_sero = "I 4,[5],12:i:-" # change serotype name
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
406 elif predict_form == "4:r:-":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
407 predict_sero = "N/A (4:r:-)"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
408 elif predict_form == "4:b:-":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
409 predict_sero = "N/A (4:b:-)"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
410 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
411 #predict_sero = "Newport"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
412 #star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
413 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
414 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
415 if "N/A" in predict_sero:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
416 claim = ""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
417 #special test for Typhimurium
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
418 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
419 normal = 0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
420 mutation = 0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
421 for x in special_gene_list:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
422 if "oafA-O-4_full" in x:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
423 normal = float(special_gene_list[x])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
424 elif "oafA-O-4_5-" in x:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
425 mutation = float(special_gene_list[x])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
426 if normal > mutation:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
427 pass
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
428 elif normal < mutation:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
429 #predict_sero = predict_sero.strip() + "(O5-)"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
430 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
431 star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
432 #star_line = "Detected the deletion of O5-."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
433 star_line = "Detected a deletion that causes O5- variant of Typhimurium. "
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
434 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
435 pass
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
436 #special test for Paratyphi B
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
437 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
438 normal = 0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
439 mutation = 0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
440 for x in special_gene_list:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
441 if "gntR-family-regulatory-protein_dt-positive" in x:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
442 normal = float(special_gene_list[x])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
443 elif "gntR-family-regulatory-protein_dt-negative" in x:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
444 mutation = float(special_gene_list[x])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
445 #print(normal,mutation)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
446 if normal > mutation:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
447 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
448 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
449 star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
450 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
451 star_line = "The SNP that causes d-Tartrate nonfermentating phenotype of Paratyphi B was not detected. "
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
452 elif normal < mutation:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
453 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
454 predict_sero = predict_sero.strip()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
455 star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
456 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
457 star_line = "Detected the SNP for d-Tartrate nonfermenting phenotype of Paratyphi B. "
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
458 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
459 star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
460 #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
461 star_line = " " ## ed_SL_05152019: do not report this situation.
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
462 #special test for O13,22 and O13,23
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
463 if Otype=="13":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
464 #ex_dir = os.path.dirname(os.path.realpath(__file__))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
465 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
466 f = open(ex_dir + '/special.pickle', 'rb')
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
467 special = pickle.load(f)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
468 O22_O23=special['O22_O23']
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
469 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
470 O22_score=0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
471 O23_score=0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
472 for x in special_gene_list:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
473 if "O:22" in x:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
474 O22_score = O22_score+float(special_gene_list[x])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
475 elif "O:23" in x:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
476 O23_score = O23_score+float(special_gene_list[x])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
477 #print(O22_score,O23_score)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
478 for z in O22_O23[0]:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
479 if predict_sero.split(" or ")[0] in z:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
480 if O22_score > O23_score:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
481 star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
482 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
483 predict_sero = z[0]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
484 elif O22_score < O23_score:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
485 star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
486 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
487 predict_sero = z[1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
488 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
489 star = "*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
490 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
491 if " or " in predict_sero:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
492 star_line = star_line + "The predicted serotypes share the same general formula: " + Otype + ":" + fliC + ":" + fljB + "."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
493 #special test for O6,8
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
494 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
495 #for x in merge_O68_list:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
496 # if x in predict_sero:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
497 # predict_sero=x
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
498 # star=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
499 # star_line=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
500 #special test for Montevideo; most of them are monophasic
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
501 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
502 #star="*"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
503 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
504 return predict_form, predict_sero, star, star_line, claim
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
505 ### End of SeqSero Kmer part
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
506
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
507 ### Begin of SeqSero2 allele prediction and output
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def xml_parse_score_comparision_seqsero(xmlfile):
    """Parse a BLAST XML report and rank every query/hit pair by bit score.

    For each alignment of each query record, the bit scores and identity
    fractions of its HSPs are accumulated. Each HSP after the first is
    weighted by the fraction of its query positions that earlier HSPs of the
    same alignment have not already covered, so overlapping HSPs are not
    counted twice.

    Args:
        xmlfile: path of the BLAST XML output file.

    Returns:
        A list of ``(name, score, ids, cover_region)`` tuples sorted by
        ``score`` in descending order, where ``name`` is
        ``"<query>___<hit_def>"``, ``ids`` is the accumulated
        identities / query_length value and ``cover_region`` is the set of
        query positions covered by the alignment's HSPs.
    """
    #used to do seqsero xml analysis
    from Bio.Blast import NCBIXML
    # NCBIXML.parse is consumed completely inside the with-block, so the file
    # can be closed right away (the original never closed it).
    with open(xmlfile) as xml_handle:
        records=list(NCBIXML.parse(xml_handle))
    results=[]
    for record in records:
        for alignment in record.alignments:
            score=0
            ids=0
            cover_region=set() #fixed problem that repeated calculation leading percentage > 1
            for hsp in alignment.hsps:
                temp=set(range(hsp.query_start,hsp.query_end))
                if len(cover_region)==0:
                    fraction=1
                else:
                    # share of this HSP that is not yet covered by earlier HSPs
                    fraction=1-len(cover_region&temp)/float(len(temp))
                cover_region=cover_region|temp
                # The original branched on "last"/"first" in the query name but
                # both branches were identical, so a single update is equivalent.
                score+=hsp.bits*fraction
                ids+=float(hsp.identities)/record.query_length*fraction
            results.append((record.query.strip()+"___"+alignment.hit_def,score,ids,cover_region))
    Final_list=sorted(results, key=lambda d:d[1], reverse = True)
    return Final_list
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
546
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
547
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
    """Return the distinct elements of ``L`` together with their counts.

    Args:
        L: list of mutually orderable elements. It is sorted IN PLACE, as in
            the original implementation, so callers see their list sorted
            after the call.
        sort_on_fre: when left at ``"none"`` the unique elements are returned
            in ascending element order. Any other value orders them by
            ascending count, with ties broken by element order.

    Returns:
        ``(unique_elements, counts)``: two parallel lists.
    """
    L.sort()
    uniq=[]
    count=[]
    # After sorting, equal elements are adjacent, so one grouping pass replaces
    # the original quadratic "not in L[:i]" scan and the nested counting loop.
    for value,group in itertools.groupby(L):
        uniq.append(value)
        count.append(sum(1 for _ in group))
    if sort_on_fre!="none" and uniq:
        # The original subscripted the result of zip(), which is an iterator on
        # Python 3 and raised TypeError; unpack the sorted pairs explicitly.
        pairs=sorted(zip(count,uniq))
        count=[c for c,_ in pairs]
        uniq=[v for _,v in pairs]
    return (uniq,count)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
564
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
    """Decide whether one contig looks like fliC or fljB from head/tail scores.

    Each entry of ``nodes_vs_score_list`` is indexed as ``entry[0]`` (a name)
    and ``entry[1]`` (a score). Scores are summed per gene according to which
    of "fliC" / "fljB" occurs in the name; "fliC" is checked first, so a name
    containing both counts only towards fliC.

    Returns:
        ``(role, difference)`` where ``role`` is "fliC" when its total is
        greater than or equal to the fljB total (ties go to fliC) and "fljB"
        otherwise, and ``difference`` is the absolute gap between the totals.
        A small gap means the call is not reliable and the whole-contig blast
        score should be used instead.
    """
    totals={"fliC":0,"fljB":0}
    for entry in nodes_vs_score_list:
        label=entry[0]
        for gene in ("fliC","fljB"):
            if gene in label:
                totals[gene]+=entry[1]
                break
    role="fliC" if totals["fliC"]>=totals["fljB"] else "fljB"
    gap=abs(totals["fliC"]-totals["fljB"])
    return (role,gap)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
581
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
    """Pick the role of a contig from the ranked whole-contig blast hits.

    Fallback used when the head/tail score difference is too small to trust,
    or when no head or tail of the contig got a blast score.

    Args:
        node_name: contig name; matched as a substring of each hit name.
        Final_list: not used by this function (kept for the call signature).
        Final_list_passed: iterable of entries whose first item is the hit
            name; it is scanned in order.

    Returns:
        The text before the first "_" of the first hit name that contains
        ``node_name``, or "" when no hit name contains it.
    """
    matching_roles=(hit[0].split("_")[0] for hit in Final_list_passed if node_name in hit[0])
    return next(matching_roles,"")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
591
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
    """Assign a role label to every candidate contig in ``nodes_list``.

    For each contig name the head/tail marker hits whose name contains that
    contig name are collected from ``tail_head_list``:

    * 1 to 4 hits: ``judge_fliC_or_fljB_from_head_tail_for_one_contig`` is
      asked for ``(role, diff)``.  When ``diff`` is below 20 the role is
      re-decided by
      ``judge_fliC_or_fljB_from_whole_contig_blast_score_ranking``.
    * any other number of hits (none, or more than four): the role is the
      text before the first ``_`` in the name of the first
      ``Final_list_passed`` entry that mentions the contig; it stays ``""``
      when no entry mentions it.

    If exactly two contigs were judged and both received the same role, all
    contigs are re-judged with the whole-contig ranking only.

    Parameters:
        nodes_list: contig names (the unique list built by the caller).
        tail_head_list: hit records whose first element is the hit name.
        Final_list: all hit records, forwarded to the ranking helper.
        Final_list_passed: filtered hit records; first element is the name.

    Returns:
        A list of ``(role, contig_name)`` tuples in ``nodes_list`` order.
    """
    role_list = []
    for node in nodes_list:
        # Head/tail marker hits that belong to this contig (substring match).
        hits = [hit for hit in tail_head_list if node in hit[0]]
        role = ""
        if 1 <= len(hits) <= 4:
            role, diff = judge_fliC_or_fljB_from_head_tail_for_one_contig(hits)
            if diff < 20:
                # Head/tail evidence is too close to call; fall back to the
                # ranking over the whole contig.
                role = judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(
                    node, Final_list, Final_list_passed)
        else:
            # No usable head/tail evidence: take the first passed hit that
            # mentions this contig.
            for entry in Final_list_passed:
                if node in entry[0]:
                    role = entry[0].split("_")[0]
                    break
        role_list.append((role, node))

    # Most common error: both antigens assigned to the same phase.  Use the
    # whole-contig ranking as a final test for every contig.
    if len(role_list) == 2 and role_list[0][0] == role_list[1][0]:
        role_list = [
            (judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(
                node, Final_list, Final_list_passed), node)
            for node in nodes_list
        ]
    return role_list
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
648
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
    """Decide which contig is fliC and which one is fljB.

    Contig names are taken from the passed hits whose name starts with
    ``"fl"`` and contains neither ``"last"`` nor ``"first"``; the contig name
    is the part after ``"___"``.  The unique contig names, together with all
    ``Final_list`` hits whose name contains ``"last"`` or ``"first"``, are
    handed to ``fliC_or_fljB_judge_from_head_tail_sequence`` and its result
    is returned unchanged.
    """
    flagellin_nodes = [
        hit[0].split("___")[1].strip()
        for hit in Final_list_passed
        if hit[0].startswith("fl") and not ("last" in hit[0] or "first" in hit[0])
    ]
    # Uniq returns a pair; only its first element (the node list) is used here.
    unique_nodes, _counts = Uniq(flagellin_nodes)

    head_tail_hits = []
    for hit in Final_list:
        if "last" in hit[0] or "first" in hit[0]:
            head_tail_hits.append(hit)

    return fliC_or_fljB_judge_from_head_tail_sequence(
        unique_nodes, head_tail_hits, Final_list, Final_list_passed)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
661
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def _o_hit_gene_length(hit_name):
    """Return the integer between ``__`` and ``___`` in a hit name."""
    return int(hit_name.split("__")[1].split("___")[0])


def _o_hit_coverage(hit_name):
    """Return the float that follows the last ``_cov_`` in a hit name."""
    return float(hit_name.split("_cov_")[-1].split("_")[0])


def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
    """Decide the O-antigen type and collect the non-antigen marker genes.

    Hit records are indexed positionally: ``hit[0]`` is the hit name,
    ``hit[1]`` and ``hit[2]`` are numeric scores.  Names are parsed as
    ``<gene>__<gene_length>___<contig>`` where the contig part carries
    ``length_<n>`` and ``_cov_<x>`` fields.
    NOTE(review): an existing comment in this function describes ``hit[2]``
    as "coverage identity"; the exact meaning of ``hit[1]``/``hit[2]`` is
    set by the caller - confirm there.

    Parameters:
        Final_list: all hit records (used for the merged-gene test).
        Final_list_passed: filtered hit records.

    Returns:
        ``(O_choice, O_nodes_list, special_genes, final_O, contamination_O,
        Otypes_uniq)`` where ``O_choice`` is the chosen O type (``"-"`` when
        there is no informative hit, ``"?"`` when nothing could be chosen),
        ``O_nodes_list`` the contigs of the considered hits,
        ``special_genes`` a ``{hit_name: hit[2]}`` dict of hits that are
        neither ``O-`` nor ``fl`` hits, ``final_O`` the considered O hits
        sorted by ``hit[2]`` descending, ``contamination_O`` a warning text
        or ``""``, and ``Otypes_uniq`` the unique O types seen.
    """
    short_marker = "O-1,3,19_not_in_3,10"  # too short compared with others
    contamination_warning = "potential contamination from O antigen signals"

    O_choice = "?"
    O_list = []
    O_nodes_list = []
    special_genes = {}
    nodes = []
    for hit in Final_list_passed:
        if hit[0].startswith("O-"):
            nodes.append(hit[0].split("___")[1].strip())
        elif not hit[0].startswith("fl"):
            special_genes[hit[0]] = hit[2]
    contig_names, _ = Uniq(nodes)

    # One O hit per contig: the first passed O hit that mentions the contig.
    final_O = []
    for contig in contig_names:
        for hit in Final_list_passed:
            if contig in hit[0] and hit[0].startswith("O-"):
                final_O.append(hit)
                break

    ### O contig has the problem of two genes on same contig, so do additional test
    potential_new_gene = ""
    for hit in final_O:
        pointer = 0  # 0 means "no merge suspected", otherwise the contig name
        # gene length << contig length; 850 is the allowance for flank regions
        if short_marker not in hit[0] and \
                _o_hit_gene_length(hit[0]) * hit[2] + 850 <= int(hit[0].split("length_")[1].split("_")[0]):
            pointer = hit[0].split("___")[1].strip()
            print(pointer)
        if pointer != 0:
            for other in Final_list:
                # relatively strict filter; if passed, there is a merge event
                if pointer in other[0] and other not in final_O and (
                        other[1] >= _o_hit_gene_length(other[0]) * 1.5
                        or (other[1] >= _o_hit_gene_length(other[0]) * other[2]
                            and other[1] >= 400)):
                    potential_new_gene = other
                    break
    if potential_new_gene != "":
        print("two differnt genes in same contig, fix it for O antigen")
        print(potential_new_gene[:3])
        new_contig = potential_new_gene[0].split("___")[-1]
        # only add it when its contig already carries a considered O hit
        if any(hit[0].split("___")[-1] == new_contig for hit in final_O):
            final_O.append(potential_new_gene)
    ### end of the two genes on same contig test

    final_O = sorted(final_O, key=lambda hit: hit[2], reverse=True)

    # Hits other than the short O-1,3,19_not_in_3,10 marker.  With none of
    # them there is no O type; this also covers several marker-only hits,
    # which previously crashed on max() of an empty list.
    informative = [hit for hit in final_O if short_marker not in hit[0]]
    if not informative:
        O_choice = "-"
    else:
        highest_O_coverage = max(_o_hit_coverage(hit[0]) for hit in informative)
        O_list = []
        O_list_less_contamination = []
        for hit in final_O:
            # the short marker may affect further analysis; to limit the
            # effect of contamination use 0.15 of highest coverage as cut-off
            if "O-1,3,19_not_in_3,10__130" not in hit[0]:
                O_list.append(hit[0].split("__")[0])
                O_nodes_list.append(hit[0].split("___")[1])
                if _o_hit_coverage(hit[0]) > highest_O_coverage * 0.15:
                    O_list_less_contamination.append(hit[0].split("__")[0])
        # Best-scoring gene above the coverage cut-off; "" when none passed,
        # so the startswith() tests below are simply False instead of raising.
        top_clean = O_list_less_contamination[0] if O_list_less_contamination else ""

        ### special test for O9,46 and O3,10 family
        if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) \
                and top_clean.startswith("O-9,"):
            if "O-9,46_wzy" in O_list or "O-9,46_wzy_partial" in O_list:
                O_choice = "O-9,46"
            elif "O-9,46,27_partial_wzy" in O_list:
                O_choice = "O-9,46,27"
            else:
                O_choice = "O-9"  # next, detect O9 vs O2
                O2 = 0
                O9 = 0
                for gene in special_genes:
                    if "tyr-O-9" in gene:
                        O9 = special_genes[gene]
                    elif "tyr-O-2" in gene:
                        O2 = special_genes[gene]
                # on a tie O-9 is kept
                if O2 > O9:
                    O_choice = "O-2"
        elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) \
                and (O_list[0].startswith("O-3,10") or top_clean.startswith("O-9,46_wzy")):
            if "O-3,10_not_in_1,3,19" in O_list:
                O_choice = "O-3,10"
            else:
                O_choice = "O-1,3,19"
        ### end of special test for O9,46 and O3,10 family
        else:
            try:
                max_score = 0
                for hit in final_O:
                    # highest hit[2] that also meets the coverage threshold
                    if hit[2] >= max_score and _o_hit_coverage(hit[0]) > highest_O_coverage * 0.15:
                        max_score = hit[2]
                        O_choice = hit[0].split("_")[0]
                if O_choice == "O-1,3,19":
                    O_choice = final_O[1][0].split("_")[0]
            except Exception:
                # Best effort: keep whatever O_choice was reached so far
                # (e.g. no second hit, or an unparsable name).  Unlike a bare
                # except this does not swallow KeyboardInterrupt/SystemExit.
                pass

    # special for the very low chance situation between O4 and O9,27,46
    # (serotypes like Bredeney and Schwarzengrund): normally O-4 has the
    # higher score, but sequencing quality may affect the prediction
    if O_choice == "O-9,46,27" and len(O_list) == 2 and "O-4_wzx" in O_list:
        O_choice = "O-4"

    Otypes = []
    for name in O_list:
        if name != short_marker:
            if "O-9,46_" not in name:
                Otypes.append(name.split("_")[0])
            else:
                # e.g. O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254 -> O-9,46_wbaV
                Otypes.append(name.split("-from")[0])
    Otypes_uniq, _ = Uniq(Otypes)

    contamination_O = ""
    if O_choice in ("O-9,46,27", "O-3,10", "O-1,3,19"):
        if len(Otypes_uniq) > 2:
            contamination_O = contamination_warning
    elif len(Otypes_uniq) > 1:
        if O_choice == "O-4" and len(Otypes_uniq) == 2 and "O-9,46,27" in Otypes_uniq:
            # special 4,12,27 case such as Bredeney and Schwarzengrund
            contamination_O = ""
        elif O_choice == "O-9,46" and len(Otypes_uniq) == 2 \
                and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq:
            # O-9,46 was chosen from exactly these two genes; not contamination
            contamination_O = ""
        else:
            contamination_O = contamination_warning
    return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
792 ### End of SeqSero2 allele prediction and output
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
793
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def get_input_files(make_dir,input_file,data_type,dirpath):
    """Derive the forward/reverse input file names for a run, converting inputs if needed.

    The work is done from inside ``make_dir`` (the input files are expected to
    be reachable there under their base names); the caller's working directory
    is restored before returning, also when a conversion command fails.

    Args:
        make_dir: directory to work in while resolving/converting the inputs.
        input_file: sequence of input paths; only the base name (text after the
            last "/") of each entry is used.
        data_type: one of the strings
            "1" (paired-end reads, interleaved), "2" (paired-end reads, separated),
            "3" (single-end reads), "4" (assembly), "5" (nanopore fasta),
            "6" (nanopore fastq).
        dirpath: directory that contains ``deinterleave_fastq.sh``.

    Returns:
        Tuple ``(for_fq, rev_fq)`` of file names relative to ``make_dir``.
        ``rev_fq`` is "" when there is no reverse file; both are "" for an
        unrecognised ``data_type``.

    Raises:
        subprocess.CalledProcessError: if ``fastq-dump`` or the de-interleaving
            pipeline exits with a non-zero status.
        IndexError: if ``input_file`` has fewer entries than ``data_type`` needs.
    """
    # Function-scope import so this block does not depend on the module's
    # top-of-file import list.
    import shlex

    sra_suffix=".sra"
    for_fq=""
    rev_fq=""
    # Remember where we started: os.chdir("..") only gets back here when
    # make_dir is a single-level relative path, and was skipped on errors.
    start_dir=os.getcwd()
    os.chdir(make_dir)
    try:
        if data_type=="1":
            input_file=input_file[0].split("/")[-1]
            if input_file.endswith(sra_suffix):
                # Argument list instead of a shell string: the file name is
                # passed verbatim, whatever characters it contains.
                subprocess.check_call(["fastq-dump","--split-files",input_file])
                # Strip only the trailing ".sra"; str.replace would also rewrite
                # any earlier ".sra" occurring inside the name.
                stem=input_file[:-len(sra_suffix)]
                for_fq=stem+"_1.fastq"
                rev_fq=stem+"_2.fastq"
            else:
                core_id=input_file.split(".fastq")[0].split(".fq")[0]
                for_fq=core_id+"_1.fastq"
                rev_fq=core_id+"_2.fastq"
                # The interleaved stream is piped into deinterleave_fastq.sh,
                # decompressing first when the input is gzipped.
                reader="gzip -dc " if input_file.endswith(".gz") else "cat "
                # A shell is still needed for the pipe, so every interpolated
                # value is quoted to keep spaces/metacharacters literal.
                command=(reader+shlex.quote(input_file)
                         +" | "+shlex.quote(dirpath+"/deinterleave_fastq.sh")
                         +" "+shlex.quote(for_fq)+" "+shlex.quote(rev_fq))
                subprocess.check_call(command,shell=True)
        elif data_type=="2":
            for_fq=input_file[0].split("/")[-1]
            rev_fq=input_file[1].split("/")[-1]
        elif data_type=="3":
            input_file=input_file[0].split("/")[-1]
            if input_file.endswith(sra_suffix):
                subprocess.check_call(["fastq-dump","--split-files",input_file])
                for_fq=input_file[:-len(sra_suffix)]+"_1.fastq"
            else:
                for_fq=input_file
        elif data_type in ["4","5","6"]:
            for_fq=input_file[0].split("/")[-1]
    finally:
        os.chdir(start_dir)
    return for_fq,rev_fq
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
828
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
829 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
830 #get O and H types from Final_list from blast parsing; allele mode
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
831 from Bio import SeqIO
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
832 fliC_choice="-"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
833 fljB_choice="-"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
834 fliC_contig="NA"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
835 fljB_contig="NA"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
836 fliC_region=set([0])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
837 fljB_region=set([0,])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
838 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
839 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
840 O_choice="-"#no need to decide O contig for now, should be only one
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
841 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
842 O_choice=O_choice.split("-")[-1].strip()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
843 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
844 O_choice="-"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
845 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
846 #add alignment locations, used for further selection, 03312019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
847 for i in range(len(H_contig_roles)):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
848 x=H_contig_roles[i]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
849 for y in Final_list_passed:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
850 if x[1] in y[0] and y[0].startswith(x[0]):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
851 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
852 break
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
853 log_file=open("SeqSero_log.txt","a")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
854 extract_file=open("Extracted_antigen_alleles.fasta","a")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
855 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
856
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
857 #print("O_contigs:")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
858 log_file.write("O_contigs:\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
859 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
860 extract_file.write("#O_contigs:\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
861 for x in O_nodes_roles:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
862 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
863 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
864 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
865 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
866 seqs=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
867 for z in handle_fasta:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
868 if x[0].split("___")[-1]==z.description:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
869 seqs=str(z.seq)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
870 extract_file.write(title+seqs+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
871 if len(H_contig_roles)!=0:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
872 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
873 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
874 highest_H_coverage=0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
875 for x in H_contig_roles:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
876 #if multiple choices, temporately select the one with longest length for now, will revise in further change
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
877 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
878 fliC_contig=x[1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
879 fliC_length=len(x[-1])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
880 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
881 fljB_contig=x[1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
882 fljB_length=len(x[-1])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
883 for x in Final_list_passed:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
884 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
885 fliC_choice=x[0].split("_")[1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
886 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
887 fljB_choice=x[0].split("_")[1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
888 elif fliC_choice!="-" and fljB_choice!="-":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
889 break
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
890 #now remove contigs not in middle core part
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
891 first_allele="NA"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
892 first_allele_percentage=0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
893 for x in Final_list:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
894 if x[0].startswith("fliC") or x[0].startswith("fljB"):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
895 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
896 first_allele_percentage=x[2]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
897 break
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
898 additional_contigs=[]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
899 for x in Final_list:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
900 if first_allele in x[0]:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
901 if (fliC_contig == x[0].split("___")[-1]):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
902 fliC_region=x[3]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
903 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
904 fljB_region=x[3]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
905 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
906 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
907 additional_contigs.append(x)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
908 #else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
909 #print x[:3]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
910 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
911 if first_allele_percentage>0.9:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
912 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
913 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
914 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
915 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
916 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
917 target_region=set()#doesn't do anything
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
918 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
919 target_region=set()#doesn't do anything
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
920 #print(target_region)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
921 #print(additional_contigs)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
922 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
923 target_region=target_region2|target_region
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
924 for x in additional_contigs:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
925 removal=0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
926 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
927 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
928 removal=1
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
929 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
930 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
931 removal=1
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
932 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
933 pass
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
934 if removal==1:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
935 for y in H_contig_roles:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
936 if y[1] in x[0]:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
937 H_contig_roles.remove(y)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
938 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
939 pass
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
940 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
941 #end of removing none-middle contigs
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
942 #print("H_contigs:")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
943 log_file.write("H_contigs:\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
944 extract_file.write("#H_contigs:\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
945 H_contig_stat=[]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
946 H1_cont_stat={}
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
947 H2_cont_stat={}
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
948 for i in range(len(H_contig_roles)):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
949 x=H_contig_roles[i]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
950 a=0
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
951 for y in Final_list_passed:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
952 if x[1] in y[0] and y[0].startswith(x[0]):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
953 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
954 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
955 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
956 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
957 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
958 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
959 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
960 seqs=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
961 for z in handle_fasta:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
962 if x[1]==z.description:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
963 seqs=str(z.seq)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
964 extract_file.write(title+seqs+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
965 break
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
966 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
967 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
968 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
969 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
970 seqs=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
971 for z in handle_fasta:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
972 if x[1]==z.description:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
973 seqs=str(z.seq)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
974 extract_file.write(title+seqs+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
975 if x[0]=="fliC":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
976 if y[0].split("_")[1] not in H1_cont_stat:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
977 H1_cont_stat[y[0].split("_")[1]]=y[2]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
978 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
979 H1_cont_stat[y[0].split("_")[1]]+=y[2]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
980 if x[0]=="fljB":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
981 if y[0].split("_")[1] not in H2_cont_stat:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
982 H2_cont_stat[y[0].split("_")[1]]=y[2]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
983 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
984 H2_cont_stat[y[0].split("_")[1]]+=y[2]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
985 break
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
986 #detect contaminations
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
987 #print(H1_cont_stat)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
988 #print(H2_cont_stat)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
989 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
990 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
991 contamination_H=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
992 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
993 contamination_H="potential contamination from H antigen signals"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
994 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
995 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
996 #get additional antigens
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
997 """
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
998 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
999 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1000 O_choice="O-9,46"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1001 #print "$$$Most possilble Otype: O-9,46"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1002 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1003 O_choice="O-9,46,27"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1004 #print "$$$Most possilble Otype: O-9,46,27"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1005 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1006 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1007 O_choice="O-3,10"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1008 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1009 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1010 O_choice="O-1,3,19"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1011 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1012 ### end of special test for O9,46 and O3,10 family
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1013
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1014 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1015 if len(Otypes_uniq)>2:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1016 contamination_O="potential contamination from O antigen signals"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1017 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1018 if len(Otypes_uniq)>1:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1019 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1020 contamination_O=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1021 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1022 contamination_O=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1023 """
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1024 additonal_antigents=[]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1025 #print(contamination_O)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1026 #print(contamination_H)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1027 log_file.write(contamination_O+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1028 log_file.write(contamination_H+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1029 log_file.close()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1030 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1031
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def get_input_K(input_file,lib_dict,data_type,k_size):
    """kmer mode: k-merize the input, restricted to the library's k-mers.

    Parameters
    ----------
    input_file : passed through unchanged to the selected kmerizer.
    lib_dict   : mapping whose values are iterables of k-mers; their union is
                 the target set handed to the kmerizer.
    data_type  : one of the strings '1'..'6' selecting the kmerizer:
                 '1', '2', '3' -> target_read_kmerizer
                 '4'           -> target_multifasta_kmerizer
                 '5'           -> minion_fasta_kmerizer (minion_2d_fasta)
                 '6'           -> minion_fastq_kmerizer (minion_2d_fastq)
    k_size     : passed through unchanged to the selected kmerizer.

    Returns
    -------
    Whatever the selected kmerizer returns.

    Raises
    ------
    ValueError
        If data_type is not one of '1'..'6'. (Previously this fell through
        every branch and failed with an UnboundLocalError on return.)
    """
    if data_type not in ('1', '2', '3', '4', '5', '6'):
        raise ValueError(
            "unsupported data_type for k-mer mode: %r (expected '1'-'6')" % (data_type,))
    # Build the union of all library k-mers once, directly as a set.
    target_kmers = set()
    for h in lib_dict:
        target_kmers.update(lib_dict[h])
    if data_type == '4':
        return target_multifasta_kmerizer(input_file, k_size, target_kmers)
    if data_type == '5':  # minion_2d_fasta
        return minion_fasta_kmerizer(input_file, k_size, target_kmers)
    if data_type == '6':  # minion_2d_fastq
        return minion_fastq_kmerizer(input_file, k_size, target_kmers)
    # '1', '2' and '3' share the read kmerizer (set it for now, will change later)
    return target_read_kmerizer(input_file, k_size, target_kmers)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1046
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def get_kmer_dict(lib_dict,input_Ks):
    """kmer mode: score every library allele against the input k-mers.

    The score of an allele is the percentage of its k-mers that also occur
    in input_Ks. Alleles are sorted into three dicts by name prefix:

    * names starting with 'O-'  -> O_dict, kept only when score > 25
    * names starting with 'fl'  -> H_dict, kept only when score > 40
    * every other name          -> Special_dict, kept when score > 1

    Parameters
    ----------
    lib_dict : mapping of allele name -> set of k-mers.
    input_Ks : set of k-mers found in the input.

    Returns
    -------
    (O_dict, H_dict, Special_dict), each mapping allele name -> score.

    Library entries with an empty k-mer set are skipped; they cannot be
    scored and previously caused a ZeroDivisionError.
    """
    O_dict = {}
    H_dict = {}
    Special_dict = {}
    for h in lib_dict:
        allele_kmers = lib_dict[h]
        if not allele_kmers:
            continue
        score = (len(allele_kmers & input_Ks) / len(allele_kmers)) * 100
        # Arbitrary cut-off for similarity score; very low, but seems
        # necessary to detect O-3,10 in some cases.
        if score <= 1:
            continue
        if h.startswith('O-'):
            if score > 25:
                O_dict[h] = score
        elif h.startswith('fl'):
            if score > 40:
                H_dict[h] = score
        else:
            Special_dict[h] = score
    return O_dict,H_dict,Special_dict
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1062
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def _call_O_from_top_score(O_dict, skipped_allele=None):
    """Return the O call implied by the highest-scoring allele of O_dict.

    The call is the allele name up to its first '_', except that a wbaV
    allele winning without any O-9,46 wzy allele in O_dict is called "O-9"
    (ed_SL_12182019: fix for the O-9,46 miscall). skipped_allele, if given,
    is ignored entirely. Returns '-' when no allele qualifies.
    """
    wbaV_alleles = ('O-9,46_wbaV__1002',
                    'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002')
    wzy_detected = ('O-9,46_wzy__1191' in O_dict
                    or 'O-9,46_wzy_partial__216' in O_dict)
    call = '-'
    max_score = 0
    for allele in O_dict:
        if allele == skipped_allele:
            continue
        score = float(O_dict[allele])
        if score >= max_score:  # ">=": on a tie the later allele wins
            max_score = score
            if allele in wbaV_alleles and not wzy_detected:
                call = "O-9"
            else:
                call = allele.split("_")[0]
    return call


def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
    """kmer mode: call the O, fliC and fljB types from the allele scores.

    All scores are appended to "SeqSero_log.txt" in the current working
    directory. The file is opened with a context manager so the handle is
    released even if calling fails part-way.

    Parameters
    ----------
    O_dict, H_dict, Special_dict : allele name -> score mappings.
    make_dir : unused here; kept so existing callers keep working.

    Returns
    -------
    (highest_O, highest_fliC, highest_fljB); each is '-' when no call
    could be made.
    """
    highest_O = '-'
    with open("SeqSero_log.txt","a") as log_file:
        log_file.write("O_scores:\n")
        for x in O_dict:
            log_file.write(x+"\t"+str(O_dict[x])+"\n")
        #call O:
        if O_dict:
            wbaV_hit = (O_dict.get('O-9,46_wbaV__1002', 0) > 70
                        or O_dict.get('O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002', 0) > 70)
            if wbaV_hit:
                # modified to fix miscall of O-9,46: wzy must score above 40
                wzy_hit = (O_dict.get('O-9,46_wzy__1191', 0) > 40
                           or O_dict.get("O-9,46_wzy_partial__216", 0) > 40)
                if wzy_hit:
                    highest_O = "O-9,46"
                elif "O-9,46,27_partial_wzy__1019" in O_dict:
                    highest_O = "O-9,46,27"
                else:
                    # O-9 unless the tyr-O-2 marker outscores tyr-O-9
                    highest_O = "O-9"
                    O2 = 0
                    O9 = 0
                    for z in Special_dict:
                        if "tyr-O-9" in z:
                            O9 = float(Special_dict[z])
                        if "tyr-O-2" in z:
                            O2 = float(Special_dict[z])
                    if O2 > O9:
                        highest_O = "O-2"
            elif ("O-3,10_wzx__1539" in O_dict) and ("O-9,46_wzy__1191" in O_dict):
                if "O-3,10_not_in_1,3,19__1519" in O_dict:
                    highest_O = "O-3,10"
                else:
                    highest_O = "O-1,3,19"
            ### end of special test for O9,46 and O3,10 family
            else:
                try:
                    highest_O = _call_O_from_top_score(O_dict)
                    if highest_O == "O-1,3,19":
                        # Redo the call without the O-1,3,19-specific marker.
                        highest_O = _call_O_from_top_score(
                            O_dict, 'O-1,3,19_not_in_3,10__130')
                except (TypeError, ValueError):
                    # Best effort: a non-numeric score leaves O uncalled.
                    highest_O = '-'
        #call_fliC:
        # Best H score over fliC and fljB together; used to detect whether
        # fljB existed or not.
        highest_H_score_both_BC = max(H_dict.values()) if H_dict else 0
        highest_fliC = '-'
        highest_Score = 0
        log_file.write("\nH_scores:\n")
        for s in H_dict:
            log_file.write(s+"\t"+str(H_dict[s])+"\n")
            if s.startswith('fliC') and float(H_dict[s]) > highest_Score:
                highest_fliC = s.split('_')[1]
                highest_Score = float(H_dict[s])
        #call_fljB
        highest_fljB = '-'
        highest_Score = 0
        for s in H_dict:
            if not s.startswith('fljB'):
                continue
            # fljB is special, so highest_H_score_both_BC gives a general
            # estimate of coverage; 0.65 is high because some fliC and fljB
            # alleles are shared with each other.
            if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65:
                # An fljB allele naming the same antigen as the fliC call is ignored.
                if s.split('_')[1] != highest_fliC:
                    highest_fljB = s.split('_')[1]
                    highest_Score = float(H_dict[s])
        log_file.write("\nSpecial_scores:\n")
        for s in Special_dict:
            log_file.write(s+"\t"+str(Special_dict[s])+"\n")
    return highest_O,highest_fliC,highest_fljB
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1163
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def get_temp_file_names(for_fq,rev_fq):
    """seqsero2 -a: derive the temporary file names from the read file names.

    Every name is an input name with a fixed suffix appended. Returns the
    8-tuple (sam, bam, sorted_bam, mapped_fq1, mapped_fq2, combined_fq,
    for_sai, rev_sai); only mapped_fq2 and rev_sai are based on rev_fq,
    all the others on for_fq.
    """
    forward_names = [for_fq + suffix for suffix in
                     (".sam", ".bam", "_sorted.bam", "_mapped.fq", "_combined.fq", ".sai")]
    reverse_names = [rev_fq + suffix for suffix in ("_mapped.fq", ".sai")]
    sam, bam, sorted_bam, mapped_fq1, combined_fq, for_sai = forward_names
    mapped_fq2, rev_sai = reverse_names
    return (sam, bam, sorted_bam, mapped_fq1, mapped_fq2,
            combined_fq, for_sai, rev_sai)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1175
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
    """Index the database, map the reads with bwa and name-sort the mapped reads.

    Part of the "seqsero2 -a" workflow. Every step is an external command run
    through the shell in the current working directory; stderr of the bwa
    commands is appended to data_log.txt.

    Args:
        threads: thread count as a string (it is concatenated into the commands).
        database: FASTA file that is indexed and mapped against.
        fnameA: forward (or single-end) reads file.
        fnameB: reverse reads file, or "" for single-end input.
        sam: output path for the bwa alignment.
        bam: output path for the mapped-only reads written by ``samtools view -F 4``.
        for_sai: intermediate ``bwa aln`` output for fnameA ("sam" mode only).
        rev_sai: intermediate ``bwa aln`` output for fnameB ("sam" mode only).
        sorted_bam: output path for the name-sorted BAM (samtools newer than 1.2).
        mapping_mode: "mem" for ``bwa mem``, "sam" for ``bwa aln`` + sampe/samse.

    Raises:
        subprocess.CalledProcessError: if any external command exits non-zero.
        IndexError: if the samtools usage text contains no "Version:" field.
    """
    bwa_log = " 2>> data_log.txt"
    print("building database...")
    subprocess.check_call("bwa index "+database+bwa_log,shell=True)
    print("mapping...")
    if mapping_mode=="mem":
        subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+bwa_log,shell=True)
    elif mapping_mode=="sam":
        # Both the paired and the single-end path start by aligning fnameA.
        subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+bwa_log,shell=True)
        if fnameB!="":
            subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+bwa_log,shell=True)
            subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+rev_sai+" "+fnameA+" "+fnameB+" > "+sam+bwa_log,shell=True)
        else:
            # Fixed: this call used the undefined name `for_fq` (NameError) and
            # omitted shell=True although the command relies on ">" redirection.
            subprocess.check_call("bwa samse "+database+" "+for_sai+" "+fnameA+" > "+sam+bwa_log,shell=True)
    # -F 4 drops unmapped reads; only mapped reads go on to the assembly step.
    subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True)
    # Running samtools without arguments prints its usage text, including a
    # "Version: x.y" line, on stderr; the sort syntax depends on that version.
    samtools_probe=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    _out, err = samtools_probe.communicate()
    # Decode before parsing so the token is cut at any whitespace (str(bytes)
    # would leave an escaped "\n" glued to the version when no space follows it).
    version = err.decode("utf-8", errors="replace").split("ersion:")[1].split()[0]
    print("check samtools version:",version)
    # NOTE(review): LooseVersion is not defined in this block; it is assumed to
    # be imported at module level elsewhere in the file.
    if LooseVersion(version)<=LooseVersion("1.2"):
        # Old samtools takes an output *prefix* and appends ".bam" itself.
        subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True)
    else:
        subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1202
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
    """Extract the mapped reads, micro-assemble them and BLAST the database against the contigs.

    Part of the "seqsero2 -a" workflow. All steps are external commands run
    through the shell in the current working directory.

    Args:
        current_time: string used as prefix of the temporary SPAdes output directory.
        sorted_bam: name-sorted BAM holding the mapped reads.
        combined_fq: FASTQ written from sorted_bam with all mapped reads.
        mapped_fq1: FASTQ for first mates (written only when fnameB is not "").
        mapped_fq2: FASTQ for second mates (written only when fnameB is not "").
        threads: thread count as a string; SPAdes is given at most "4".
        fnameA: forward reads file name, used as prefix of the contig FASTA.
        fnameB: reverse reads file name, or "" for single-end input.
        database: FASTA used as the blastn query and in the contig FASTA name.
        mapping_mode: mapping mode string, used only in the contig FASTA name.

    Returns:
        ``(xmlfile, new_fasta)``: the BLAST XML output ("blasted_output.xml")
        and the contig FASTA name.

    Raises:
        subprocess.CalledProcessError: if an external command exits non-zero.
        UnboundLocalError: when the extracted FASTQ files are too small to
            assemble. ``new_fasta`` is then never assigned and the final
            ``return`` fails; ``main()`` catches this exception type, so the
            behaviour is kept on purpose.
    """
    is_paired = fnameB != ""
    subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True)
    if is_paired:
        # Add "2> /dev/null" instead of the log redirect to silence bamToFastq.
        subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2+" 2>> data_log.txt",shell=True)
    outdir = current_time+"_temp"
    print("assembling...")
    spades_threads = "4" if int(threads)>4 else threads
    # Files of 100 bytes or less are treated as "no mapped reads" (a "-:-:-" result).
    if os.path.getsize(combined_fq)>100 and (not is_paired or os.path.getsize(mapped_fq1)>100):
        mate_options = " --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2 if is_paired else ""
        spades_cmd = "spades.py --careful --pe1-s "+combined_fq+mate_options+" -t "+spades_threads+" -o "+outdir+" >> data_log.txt 2>&1"
        subprocess.check_call(spades_cmd,shell=True)
        new_fasta = fnameA+"_"+database+"_"+mapping_mode+".fasta"
        # Keep only contigs.fasta from the SPAdes run, then drop its directory.
        subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+" 2> /dev/null",shell=True)
        subprocess.check_call("rm -rf "+outdir+" 2> /dev/null",shell=True)
        print("blasting...","\n")
        xmlfile = "blasted_output.xml"
        blast_db = new_fasta+"_db"
        # Tool chatter goes to data_log.txt so it does not interleave with our own output.
        subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+blast_db+' -dbtype nucl >> data_log.txt 2>&1',shell=True)
        # The database is the query and the contigs are the subject; -outfmt 5 is XML.
        subprocess.check_call("blastn -query "+database+" -db "+blast_db+" -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)
    else:
        xmlfile = "NA"
    # On the "NA" path new_fasta is unbound, so this line raises UnboundLocalError.
    return xmlfile,new_fasta
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1235
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def judge_subspecies(fnameA):
    """Predict the subspecies of a raw-reads FASTQ by running SalmID ("seqsero2 -a").

    Runs ``SalmID.py -i <fnameA>`` through the shell, appends its stdout to
    data_log.txt and parses the first two lines of that tab-separated output:
    line 0 holds the column labels and line 1 the scores.

    Args:
        fnameA: forward raw-reads FASTQ file passed to SalmID.

    Returns:
        The predicted subspecies label, "bongori", or "-" when the best
        subspecies score is below 60.

    Raises:
        IndexError, ValueError: if the SalmID output does not have the
            expected two-line, tab-separated numeric layout.
    """
    salmid=subprocess.Popen("SalmID.py -i "+fnameA,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    out, _err = salmid.communicate()
    out=out.decode("utf-8")
    # Context manager closes the log even if the write fails.
    with open("data_log.txt","a") as log_handle:
        log_handle.write(out)
    # Split the report once instead of re-splitting it for every field.
    rows=out.split("\n")
    score_fields=rows[1].split("\t")
    label_fields=rows[0].split("\t")
    # Columns 6 onwards are the per-subspecies scores and their labels.
    salm_species_scores=[float(field) for field in score_fields[6:]]
    salm_species_results=label_fields[6:]
    max_score=0
    max_score_index=1 # default is 1, means "I"
    for index, score in enumerate(salm_species_scores):
        if max_score<score:
            max_score=score
            max_score_index=index
    prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
    # Columns 4 and 5 are presumably the bongori and enterica species scores
    # (per the original "bongori and enterica compare" note) - TODO confirm
    # against the SalmID output format. Without this check the prediction would
    # always be an enterica subspecies, since only columns 6+ are scanned above.
    bongori_score=float(score_fields[4])
    if bongori_score > 10 and bongori_score > float(score_fields[5]):
        prediction="bongori"
    # Subspecies threshold (was 10 before the ed_SL_0318 change).
    if max_score<60:
        prediction="-"
    return prediction
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1260
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def judge_subspecies_Kmer(Special_dict):
    """Predict the subspecies from the special-marker scores ("seqsero2 -k").

    Only entries whose key contains "mer" and whose score is above 60 are
    considered; the label is the text after the last "_" of the key.

    Args:
        Special_dict: mapping of marker name to score (anything ``float()`` accepts).

    Returns:
        The label of the best-scoring qualifying marker, "bongori" as soon as
        a bongori marker scores above 95, or "-" when nothing qualifies.
    """
    best_score = 0
    prediction = "-"
    for marker, raw_score in Special_dict.items():
        if "mer" not in marker:
            continue
        score = float(raw_score)
        if not score > 60:
            continue
        label = marker.split("_")[-1].strip()
        if best_score < score:
            best_score = score
            prediction = label
        # A confident bongori hit settles it; no need to look at enterica markers.
        if label == "bongori" and score > 95:
            prediction = "bongori"
            break
    return prediction
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1275
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1276 ## ed_SL_11232019: add notes for missing antigen
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def check_antigens(ssp,O_antigen,H1_antigen,H2_antigen,NA_note):
    """Build the explanatory note for a prediction with missing antigens.

    An antigen (or the subspecies) counts as missing when its value is '-'.

    Args:
        ssp: predicted subspecies, '-' when none was identified.
        O_antigen: predicted O antigen.
        H1_antigen: predicted phase 1 H antigen.
        H2_antigen: predicted phase 2 H antigen.
        NA_note: note passed in by the caller; returned unchanged when no
            missing-antigen note applies, otherwise replaced by ''.

    Returns:
        ``(antigen_note, NA_note)``.
    """
    if ssp == '-':
        return ('The input genome cannot be identified as Salmonella. Check the input for taxonomic ID, contamination, or sequencing quality. ', '')

    no_O = O_antigen == '-'
    no_H1 = H1_antigen == '-'
    no_H2 = H2_antigen == '-'

    if not no_O and no_H1:
        if no_H2:  # O:-:-
            return ('H antigens were not detected. This is an atypical result that should be further investigated. Most Salmonella strains have at least fliC, encoding the Phase 1 H antigen, even if it is not expressed. ', '')
        # O:-:H2
        return ('fliC was not detected. This is an atypical result that should be further investigated. Most Salmonella strains have fliC, encoding the Phase 1 H antigen, even if it is not expressed. ', '')
    if no_O and not no_H1:  # -:H1:X
        return ('O antigen was not detected. This result may be due to a rough strain that has deleted the rfb region. For raw reads input, the k-mer workflow is sometimes more sensitive than the microassembly workflow in detecting O antigen. Caution should be used with this approach because the k-mer result may be due to low levels of contamination. ', '')
    if no_O and no_H1 and no_H2:  # -:-:-
        return ('No serotype antigens were detected. This is an atypical result that should be further investigated. ', '')
    # Any other combination gets no note and keeps the caller's NA_note.
    return ('', NA_note)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1300
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1301 def main():
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1302 #combine SeqSeroK and SeqSero2, also with SalmID
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1303 args = parse_args()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1304 input_file = args.i
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1305 data_type = args.t
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1306 analysis_mode = args.m
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1307 mapping_mode=args.b
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1308 threads=args.p
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1309 make_dir=args.d
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1310 clean_mode=args.c
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1311 sample_name=args.n
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1312 ingore_header=args.s
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1313 k_size=27 #will change for bug fixing
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1314 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1315 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: add ex_dir for packaging
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1316 seqsero2_db=ex_dir+"/H_and_O_and_specific_genes.fasta" # ed_SL_11092019: change path to database for packaging
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1317 database="H_and_O_and_specific_genes.fasta"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1318 note="Note: "
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1319 NA_note="This predicted serotype is not in the Kauffman-White scheme. " # ed_SL_09272019: add for new output format
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1320 if len(sys.argv)==1:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1321 subprocess.check_call(dirpath+"/SeqSero2_package.py -h",shell=True)#change name of python file
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1322 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1323 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1324 request_id += str(random.randint(1, 10000000))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1325 if make_dir is None:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1326 make_dir="SeqSero_result_"+request_id
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1327 make_dir=os.path.abspath(make_dir)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1328 if os.path.isdir(make_dir):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1329 pass
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1330 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1331 subprocess.check_call("mkdir -p "+make_dir,shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1332 #subprocess.check_call("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1333 #subprocess.check_call("ln -sr "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1334 subprocess.check_call("ln -f -s "+seqsero2_db+" "+" ".join(input_file)+" "+make_dir,shell=True) # ed_SL_11092019: change path to database for packaging
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1335 #subprocess.check_call("ln -f -s "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument).
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1336 ############################begin the real analysis
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1337 if analysis_mode=="a":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1338 if data_type in ["1","2","3"]:#use allele mode
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1339 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1340 os.chdir(make_dir)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1341 ###add a function to tell input files
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1342 fnameA=for_fq.split("/")[-1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1343 fnameB=rev_fq.split("/")[-1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1344 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1345 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1346 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1347 ### avoid error out when micro assembly fails. ed_SL_03172020
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1348 try:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1349 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1350 except (UnboundLocalError, subprocess.CalledProcessError):
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1351 xmlfile="NA"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1352 H1_cont_stat_list=[]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1353 H2_cont_stat_list=[]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1354 ###
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1355 if xmlfile=="NA":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1356 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1357 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1358 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1359 file=open("data_log.txt","a")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1360 for x in Final_list:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1361 file.write("\t".join(str(y) for y in x)+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1362 file.close()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1363 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1364 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1365 subspecies=judge_subspecies(fnameA) #predict subspecies
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1366 ###output
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1367 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1368 claim="" #04132019, disable claim for new report requirement
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1369 contamination_report=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1370 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1371 if contamination_O!="" and contamination_H=="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1372 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1373 elif contamination_O=="" and contamination_H!="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1374 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1375 elif contamination_O!="" and contamination_H!="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1376 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1377 if contamination_report!="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1378 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1379 contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles. "
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1380 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequences #04132019, need to change if using web-version
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1381 #if contamination_report+star_line+claim=="": #0413, new output style
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1382 # note=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1383 #else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1384 # note="Note:"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1385
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1386 ### ed_SL_11232019: add notes for missing antigen
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1387 if O_choice=="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1388 O_choice="-"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1389 antigen_note,NA_note=check_antigens(subspecies,O_choice,fliC_choice,fljB_choice,NA_note)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1390 if sample_name:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1391 print ("Sample name:\t"+sample_name)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1392 ###
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1393
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1394 if clean_mode:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1395 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1396 make_dir="none-output-directory due to '-c' flag"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1397 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1398 new_file=open("SeqSero_result.txt","w")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1399 ### ed_SL_01152020: add new output
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1400 conta_note="yes" if "inter-serotype contamination" in contamination_report else "no"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1401 tsv_file=open("SeqSero_result.tsv","w")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1402 if ingore_header:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1403 pass
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1404 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1405 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted subspecies\tPredicted antigenic profile\tPredicted serotype\tPotential inter-serotype contamination\tNote\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1406 if sample_name:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1407 new_file.write("Sample name:\t"+sample_name+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1408 tsv_file.write(sample_name+'\t')
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1409 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1410 tsv_file.write(input_file[0].split('/')[-1]+'\t')
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1411 ###
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1412 if "N/A" not in predict_sero:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1413 new_file.write("Output directory:\t"+make_dir+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1414 "Input files:\t"+"\t".join(input_file)+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1415 "O antigen prediction:\t"+O_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1416 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1417 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1418 "Predicted subspecies:\t"+subspecies+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1419 "Predicted antigenic profile:\t"+predict_form+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1420 "Predicted serotype:\t"+predict_sero+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1421 note+contamination_report+star_line+claim+antigen_note+"\n")#+##
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1422 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies+"\t"+predict_form+"\t"+predict_sero+"\t"+conta_note+"\t"+contamination_report+star_line+claim+antigen_note+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1423 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1424 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1425 star_line="" #04132019, for new output requirement, diable star_line if "NA" in output
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1426 new_file.write("Output directory:\t"+make_dir+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1427 "Input files:\t"+"\t".join(input_file)+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1428 "O antigen prediction:\t"+O_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1429 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1430 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1431 "Predicted subspecies:\t"+subspecies+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1432 "Predicted antigenic profile:\t"+predict_form+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1433 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, add subspecies
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1434 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")#+##
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1435 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+conta_note+"\t"+NA_note+contamination_report+star_line+claim+antigen_note+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1436 new_file.close()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1437 tsv_file.close()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1438 #subprocess.check_call("cat Seqsero_result.txt",shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1439 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1440 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1441 if "N/A" not in predict_sero:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1442 #print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\nNote:"+contamination_report+star+star_line+claim+"\n")#+##
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1443 print("Output directory:\t"+make_dir+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1444 "Input files:\t"+"\t".join(input_file)+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1445 "O antigen prediction:\t"+O_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1446 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1447 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1448 "Predicted subspecies:\t"+subspecies+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1449 "Predicted antigenic profile:\t"+predict_form+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1450 "Predicted serotype:\t"+predict_sero+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1451 note+contamination_report+star_line+claim+antigen_note+"\n")#+##
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1452 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1453 print("Output directory:\t"+make_dir+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1454 "Input files:\t"+"\t".join(input_file)+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1455 "O antigen prediction:\t"+O_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1456 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1457 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1458 "Predicted subspecies:\t"+subspecies+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1459 "Predicted antigenic profile:\t"+predict_form+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1460 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1461 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1462 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1463 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1464 elif analysis_mode=="k":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1465 #ex_dir = os.path.dirname(os.path.realpath(__file__))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1466 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: change ex_dir for packaging
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1467 #output_mode = args.mode
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1468 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1469 input_file = for_fq #-k will just use forward because not all reads were used
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1470 os.chdir(make_dir)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1471 f = open(ex_dir + '/antigens.pickle', 'rb')
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1472 lib_dict = pickle.load(f)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1473 f.close
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1474 input_Ks=get_input_K(input_file,lib_dict,data_type,k_size)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1475 O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1476 highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1477 subspecies=judge_subspecies_Kmer(Special_dict)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1478 if subspecies=="IIb" or subspecies=="IIa":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1479 subspecies="II"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1480 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1481 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1482 claim="" #no claim any more based on new output requirement
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1483 #if star_line+claim=="": #0413, new output style
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1484 # note=""
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1485 #else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1486 # note="Note:"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1487
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1488 ### ed_SL_11232019: add notes for missing antigen
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1489 if highest_O.split('-')[-1]=="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1490 O_choice="-"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1491 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1492 O_choice=highest_O.split('-')[-1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1493 antigen_note,NA_note=check_antigens(subspecies,O_choice,highest_fliC,highest_fljB,NA_note)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1494 if sample_name:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1495 print ("Sample name:\t"+sample_name)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1496 ###
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1497
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1498 if clean_mode:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1499 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1500 make_dir="none-output-directory due to '-c' flag"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1501 # ### ed_SL_05282019, fix the assignment issue of variable 'O_choice' using "-m k -c"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1502 # if highest_O.split('-')[-1]=="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1503 # O_choice="-"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1504 # else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1505 # O_choice=highest_O.split('-')[-1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1506 # ###
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1507 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1508 # if highest_O.split('-')[-1]=="":
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1509 # O_choice="-"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1510 # else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1511 # O_choice=highest_O.split('-')[-1]
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1512 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1513 new_file=open("SeqSero_result.txt","w")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1514 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1515 ### ed_SL_01152020: add new output
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1516 tsv_file=open("SeqSero_result.tsv","w")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1517 if ingore_header:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1518 pass
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1519 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1520 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted subspecies\tPredicted antigenic profile\tPredicted serotype\tNote\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1521 if sample_name:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1522 new_file.write("Sample name:\t"+sample_name+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1523 tsv_file.write(sample_name+'\t')
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1524 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1525 tsv_file.write(input_file.split('/')[-1]+'\t')
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1526 ###
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1527 if "N/A" not in predict_sero:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1528 new_file.write("Output directory:\t"+make_dir+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1529 "Input files:\t"+input_file+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1530 "O antigen prediction:\t"+O_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1531 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1532 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1533 "Predicted subspecies:\t"+subspecies+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1534 "Predicted antigenic profile:\t"+predict_form+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1535 "Predicted serotype:\t"+predict_sero+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1536 note+star_line+claim+antigen_note+"\n")#+##
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1537 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies+"\t"+predict_form+"\t"+predict_sero+"\t"+star_line+claim+antigen_note+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1538 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1539 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1540 star_line = "" #changed for new output requirement, 04132019
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1541 new_file.write("Output directory:\t"+make_dir+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1542 "Input files:\t"+input_file+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1543 "O antigen prediction:\t"+O_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1544 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1545 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1546 "Predicted subspecies:\t"+subspecies+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1547 "Predicted antigenic profile:\t"+predict_form+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1548 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1549 note+NA_note+star_line+claim+antigen_note+"\n")#+##
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1550 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+NA_note+star_line+claim+antigen_note+"\n")
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1551 new_file.close()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1552 tsv_file.close()
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1553 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1554 if "N/A" not in predict_sero:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1555 print("Output directory:\t"+make_dir+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1556 "Input files:\t"+input_file+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1557 "O antigen prediction:\t"+O_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1558 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1559 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1560 "Predicted subspecies:\t"+subspecies+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1561 "Predicted antigenic profile:\t"+predict_form+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1562 "Predicted serotype:\t"+predict_sero+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1563 note+star_line+claim+antigen_note+"\n")#+##
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1564 else:
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1565 print("Output directory:\t"+make_dir+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1566 "Input files:\t"+input_file+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1567 "O antigen prediction:\t"+O_choice+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1568 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1569 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1570 "Predicted subspecies:\t"+subspecies+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1571 "Predicted antigenic profile:\t"+predict_form+"\n"+
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1572 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1573 note+NA_note+star_line+claim+antigen_note+"\n")#+##
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1574
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
# Script entry point: main() is called only when this file is executed
# directly, not when it is imported as a module.
1575 if __name__ == '__main__':
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1576 main()