annotate run_galaxy.py @ 0:e4b3fc88efe0 draft

Uploaded
author pedro_araujo
date Wed, 27 Jan 2021 13:50:11 +0000
parents
children f8dee15a72a4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
1
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
2
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
3 class GalaxyPrediction:
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
4
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def __init__(self, phage_input_type='ID', bact_input_type='ID', phage='', bacteria='', ml_model='RandomForests', run_interpro=False):
    """Run the whole pipeline: load inputs, annotate, predict, write output.

    Parameters:
        phage_input_type: 'ID' (``phage`` is a string of identifiers) or
            'seq_file' (``phage`` is the path of a protein FASTA file).
        bact_input_type: same convention, applied to ``bacteria``.
        phage: identifiers or FASTA path, see ``phage_input_type``.
        bacteria: identifiers or FASTA path, see ``bact_input_type``.
        ml_model: forwarded unchanged to ``run_prediction``.
        run_interpro: forwarded unchanged to ``_find_phage_functions``.

    Raises:
        ValueError: for an unsupported ``phage_input_type``, or when a
            prediction is requested but no bacterial sequences were loaded
            (unsupported ``bact_input_type``, or several bacteria IDs
            combined with several phage IDs).
    """
    import pickle
    import os
    import re

    def read_fasta_proteins(path):
        """Return {'ProteinN': ['Unknown', sequence]} for every FASTA record."""
        proteins = {}
        chunks = []

        def flush():
            sequence = ''.join(chunks)
            if sequence:
                proteins['Protein' + str(len(proteins))] = ['Unknown', sequence]
            del chunks[:]

        with open(path, 'r') as handle:
            for line in handle:
                if '>' in line:
                    flush()
                else:
                    chunks.append(line.strip())
        # The last record has no following header line, so it must be
        # stored explicitly (it used to be dropped silently).
        flush()
        return proteins

    # NOTE(review): pickle.load executes code embedded in the file; only
    # safe as long as 'files/FeatureDataset' is a trusted, bundled resource.
    with open('files/FeatureDataset', 'rb') as f:
        dataset = pickle.load(f)

    # Index entries are split on the first '--' into a phage part and a
    # bacterium part. Dict keys keep first-seen order and make
    # de-duplication linear.
    phages = {}
    bacts = {}
    for ID in dataset.index:
        sep = ID.find('--')
        phages[ID[:sep]] = None
        bacts[ID[sep + 2:]] = None
    self.all_phages = list(phages)
    self.all_bacteria = list(bacts)

    if phage_input_type == 'ID':
        # Any non-word character separates identifiers (this includes '.').
        phage = re.split(r'\W', phage.replace(' ', ''))
        len_phage_id = len(phage)
        phage_seqs = self._retrieve_from_phage_id(phage)
    elif phage_input_type == 'seq_file':
        # A FASTA file is handled as one single phage.
        len_phage_id = 1
        phage_seqs = {'PhageFasta': read_fasta_proteins(phage)}
    else:
        raise ValueError("phage_input_type must be 'ID' or 'seq_file', got %r" % (phage_input_type,))

    bact_seqs = None
    if bact_input_type == 'ID':
        bacteria = re.split(r'\W', bacteria.replace(' ', ''))
        # Several bacteria are only accepted together with a single phage.
        if len(bacteria) == 1 or len_phage_id == 1:
            bact_seqs = self._retrieve_from_bact_id(bacteria)
    elif bact_input_type == 'seq_file':
        bact_seqs = {'BacteriaFasta': read_fasta_proteins(bacteria)}

    phage_seqs = self._find_phage_functions(phage_seqs, run_interpro)
    phage_seqs = self._find_phage_tails(phage_seqs)

    # Phages left without any protein after tail filtering are dropped.
    for org in [name for name in phage_seqs if not phage_seqs[name]]:
        print('Could not find tails for phage ' + org + '. Deleting entry...')
        del phage_seqs[org]

    if phage_seqs:
        if bact_seqs is None:
            raise ValueError(
                "No bacterial sequences were loaded: bact_input_type must be 'ID' or "
                "'seq_file', and several bacteria IDs require a single phage"
            )
        output = self.run_prediction(phage_seqs, bact_seqs, ml_model)
        self.create_output(output, phage_seqs, bact_seqs)
    else:
        # NOTE(review): or_location is not defined in the visible part of
        # this file; presumably a module-level global - TODO confirm.
        with open(or_location + '/output.tsv', 'w') as f:
            f.write('No phage tails found in query')

    # Remove the scratch files created by the BLAST / InterProScan steps.
    for file in os.listdir('files'):
        if file.startswith('temp'):
            os.remove('files/' + file)
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
84
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def _retrieve_from_phage_id(self, phage):
    """Collect the proteins of every phage identifier in ``phage``.

    Identifiers listed in ``self.all_phages`` are read from
    'files/phageTails.json'; any other identifier is downloaded from the
    NCBI nucleotide database and its CDS features are used.

    Parameters:
        phage: iterable of phage identifiers.

    Returns:
        dict mapping each identifier to {protein_id: [product, translation]}
        (for identifiers taken from the JSON file, whatever that file stores
        under the identifier).
    """
    import json

    temp_phage = {}
    phage_tails = None  # parsed once, on first use, instead of once per ID
    for ID in phage:
        temp_phage[ID] = {}
        if ID in self.all_phages:
            if phage_tails is None:
                with open('files/phageTails.json', encoding='utf-8') as f:
                    phage_tails = json.load(f)
            temp_phage[ID] = phage_tails[ID]
        else:
            from Bio import Entrez
            from Bio import SeqIO
            Entrez.email = 'insert@email.com'
            with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID) as handle:
                genome = SeqIO.read(handle, "gb")
            for feat in genome.features:
                if feat.type != 'CDS':
                    continue
                qualifiers = feat.qualifiers
                try:
                    entry = [qualifiers['product'][0], qualifiers['translation'][0]]
                    temp_phage[ID][qualifiers['protein_id'][0]] = entry
                except (KeyError, IndexError):
                    # CDS without product / translation / protein_id: skip
                    # it, but let any other error surface.
                    continue
    return temp_phage
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
106
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def _retrieve_from_bact_id(self, bacteria):
    """Collect the proteins of every bacterial identifier in ``bacteria``.

    A trailing version ('.N') is removed from each identifier first.
    Identifiers listed in ``self.all_bacteria`` are read from
    'files/bacteria/<ID>.json'; any other identifier is downloaded from the
    NCBI nucleotide database.

    Parameters:
        bacteria: iterable of bacterial identifiers.

    Returns:
        dict keyed by the version-less identifier. Downloaded entries map
        protein_id -> [product, translation]; cached entries hold whatever
        the JSON file contains.
    """
    import json

    temp_bacteria = {}
    for ID in bacteria:
        # Strip the version BEFORE creating the result slot: otherwise the
        # slot is created under 'X.1' while everything below writes to 'X'.
        if '.' in ID:
            ID = ID[:ID.find('.')]
        temp_bacteria[ID] = {}
        if ID in self.all_bacteria:
            with open('files/bacteria/' + ID + '.json', encoding='utf-8') as f:
                temp_bacteria[ID] = json.load(f)
            continue

        from Bio import Entrez
        from Bio import SeqIO
        Entrez.email = 'insert@email.com'
        with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID + '.1') as handle:
            genome = SeqIO.read(handle, "gb")
        for feat in genome.features:
            if feat.type != 'CDS':
                continue
            qualifiers = feat.qualifiers
            try:
                entry = [qualifiers['product'][0], qualifiers['translation'][0]]
                temp_bacteria[ID][qualifiers['protein_id'][0]] = entry
            except (KeyError, IndexError):
                # CDS lacking one of the three qualifiers: skip it.
                continue

        if len(genome.features) <= 5:
            # Record came back with (almost) no features: fetch the
            # 'gbwithparts' text and scan it line by line instead.
            with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=ID) as handle:
                lines = handle.readlines()
            for i in range(len(lines)):
                if ' CDS ' not in lines[i]:
                    continue
                # Reset per CDS so values of a previous CDS are never reused.
                product = 'Unknown'
                protKey = None
                protSeq = None
                j = i
                protDone = False
                while j < len(lines) and not protDone:
                    if '/product=' in lines[j]:
                        product = lines[j].strip()[10:]
                        j += 1
                    elif '_id=' in lines[j]:
                        protKey = lines[j].strip()[13:-1]
                        j += 1
                    elif '/translation=' in lines[j]:
                        protSeq = lines[j].strip()[14:]
                        j += 1
                        # Continuation lines are appended until the first
                        # all-lowercase line is reached.
                        for k in range(j, len(lines)):
                            if lines[k].islower():
                                j = k
                                protDone = True
                                break
                            else:
                                protSeq += lines[k].strip()
                    else:
                        j += 1
                if protKey is not None and protSeq is not None:
                    # Everything from the first double quote on is cut off.
                    temp_bacteria[ID][protKey] = [product, protSeq[:protSeq.find('"')]]
    return temp_bacteria
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
158
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def _find_phage_functions(self, phage_dict, run_interpro):
    """Annotate phage proteins by BLAST against the known-function set.

    The proteins in 'files/known_function.json' are written to a FASTA file
    and turned into a BLAST protein database. For every organism in
    ``phage_dict`` its proteins are searched against that database and the
    hits are handed to ``process_blast``; when ``run_interpro`` is true the
    result is additionally passed through ``interpro``.

    Parameters:
        phage_dict: {organism: {protein_id: [function, sequence]}}.
        run_interpro: whether to also run ``interpro`` per organism.

    Returns:
        ``phage_dict`` with each organism entry replaced by the annotated one.

    Raises:
        subprocess.CalledProcessError: if makeblastdb or blastp exits with a
            non-zero status. Previously the status was ignored, so a failed
            search could silently reuse the previous 'files/temp_blast'.
    """
    import json
    import subprocess

    with open('files/known_function.json', encoding='utf-8') as F:
        known_function = json.load(F)

    with open('files/temp_database.fasta', 'w') as F:
        for phage, proteins in known_function.items():
            for prot, info in proteins.items():
                # Header is '<phage>-<protein>'; process_blast splits it again.
                F.write('>' + phage + '-' + prot + '\n' + info[1] + '\n')

    subprocess.run(
        ['makeblastdb', '-in', 'files/temp_database.fasta', '-dbtype', 'prot',
         '-title', 'PhageProts', '-parse_seqids', '-out', 'files/temp_database',
         '-logfile', 'files/temp_log'],
        check=True,
    )

    for org in phage_dict:
        if not phage_dict[org]:
            # Nothing to search or annotate for an organism without proteins.
            continue
        with open('files/temp.fasta', 'w') as F:
            for prot, info in phage_dict[org].items():
                F.write('>' + prot + '\n' + info[1] + '\n')
        subprocess.run(
            ['blastp', '-db', 'files/temp_database', '-query', 'files/temp.fasta',
             '-out', 'files/temp_blast', '-num_threads', '2', '-outfmt', '6'],
            check=True,
        )
        phage_dict[org] = self.process_blast(phage_dict[org], known_function)
        if run_interpro:
            phage_dict[org] = self.interpro(phage_dict[org])
    return phage_dict
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
177
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def process_blast(self, phage_dict, known_function):
    """Replace uninformative protein functions with their best BLAST hit.

    Reads the tabular BLAST output 'files/temp_blast' (no header; column 0
    query, column 1 subject, column 10 e-value, column 11 bit score as used
    below). For every protein whose current function looks uninformative,
    the hit with the highest bit score is looked up in ``known_function``
    and its function is adopted if it passes the quality filters.

    Parameters:
        phage_dict: {protein_id: [function, sequence]}; modified in place.
        known_function: {phage: {protein: [function, sequence]}}.

    Returns:
        The same ``phage_dict`` object.
    """
    import re
    import pandas as pd

    try:
        blast_domains = pd.read_csv('files/temp_blast', sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # BLAST found no hit at all: nothing can be improved.
        return phage_dict

    uninformative = ('hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical')
    rejected = ('unknown', 'ucp', 'uncharacterized', 'consensus')
    gp_pattern = re.compile(r'(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])')

    for prot in phage_dict:
        func = phage_dict[prot][0]
        lowered = func.lower()
        words = func.split(' ')
        known = (
            not any(tag in lowered for tag in uninformative)
            and len(func) > 3
            and not ('gp' in lowered and len(words) < 2)
            and not (len(words) == 1 and len(func) < 5)
        )
        if known:
            continue

        pred = blast_domains[blast_domains[0] == prot]
        if pred.shape[0] == 0:
            # No hit for THIS protein: go on with the next one. (This used
            # to be 'break', which skipped every remaining protein.)
            continue

        evalue = [float(value) for value in pred[10]]
        bitscore = [float(value) for value in pred[11]]
        if not (min(evalue) < 1.0 and max(bitscore) > 30.0):
            continue

        # The hit with the highest bit score is always the one selected.
        ind = bitscore.index(max(bitscore))
        temp = pred.iloc[ind, 1]
        known_phage = temp[:temp.find('-')]
        known_prot = temp[temp.find('-') + 1:]
        hit = known_function[known_phage][known_prot]
        if not hit:
            # No stored annotation for the hit; never fall back to the
            # function found for a previous protein.
            continue
        new_func = hit[0]

        # If the pattern matches, the hit is dropped.
        # NOTE(review): the pattern is applied to the hit identifier
        # ('<phage>-<protein>'), not to new_func - confirm this is intended.
        x = gp_pattern.findall(temp)
        if not any(z in new_func.lower() for z in rejected) and len(new_func) > 3 and not x:
            phage_dict[prot][0] = new_func
    return phage_dict
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
213
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def interpro(self, phage_dict):
    """Replace uninformative protein functions with InterProScan results.

    Runs interproscan.sh on 'files/temp.fasta', reads
    'files/temp_interpro.tsv' as 13 header-less columns indexed by column 0,
    discards 'Coils' and 'MobiDBLite' rows, and for every protein whose
    function looks uninformative tries column 12 first (only rows whose
    column 8 is below 1.0) and column 5 second.

    Parameters:
        phage_dict: {protein_id: [function, sequence]}; modified in place.

    Returns:
        The same ``phage_dict`` object.
    """
    import os
    import re
    import pandas as pd

    # NOTE(review): the exit status is not checked; a failed run surfaces
    # only when the TSV below is missing.
    os.system('interproscan.sh -b ' + 'files/temp_interpro -i ' + 'files/temp.fasta -f tsv > files/temp_interpro_log')
    domains = pd.read_csv('files/temp_interpro.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
    domains = domains.fillna('-')
    domains = domains[domains.loc[:, 3] != 'Coils']
    domains = domains[domains.loc[:, 3] != 'MobiDBLite']

    uninformative = ('hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical')
    rejected = ('unknown', 'ucp', 'uncharacterized', 'consensus')
    gp_pattern = re.compile(r'(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])')

    def score_ok(value):
        """True when ``value`` is numeric and below 1.0 ('-' counts as no)."""
        try:
            return float(value) < 1.0
        except (TypeError, ValueError):
            return False

    def acceptable(text):
        """Quality filter for a candidate description; a pattern hit rejects it."""
        return (
            text != '-'
            and not any(z in text.lower() for z in rejected)
            and len(text) > 3
            and not gp_pattern.findall(text)
        )

    for prot in phage_dict:
        func = phage_dict[prot][0]
        lowered = func.lower()
        words = func.split(' ')
        known = (
            not any(tag in lowered for tag in uninformative)
            and len(func) > 3
            and not ('gp' in lowered and len(words) < 2)
            and not (len(words) == 1 and len(func) < 5)
        )
        if known or prot not in domains.index:
            continue

        # A list indexer always yields a DataFrame, so one-row and multi-row
        # results are told apart explicitly instead of through a bare except.
        rows = domains.loc[[prot]]
        single = len(rows) == 1

        # NOTE(review): with several rows, descriptions containing '-' are
        # skipped; with one row they are not. Kept as in the original -
        # confirm which rule is intended.
        temp = '-'
        if single:
            if score_ok(rows[8].iloc[0]):
                temp = rows[12].iloc[0]
        else:
            for description, score in zip(rows[12], rows[8]):
                if '-' not in description.lower() and score_ok(score):
                    temp = description
                    break
        if acceptable(temp):
            phage_dict[prot][0] = temp
            continue

        # Second attempt: column 5, without any score threshold.
        if single:
            temp = rows[5].iloc[0]
        else:
            for description in rows[5]:
                if '-' not in description.lower():
                    temp = description
                    break
        if acceptable(temp):
            phage_dict[prot][0] = temp
    return phage_dict
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
254
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def _find_phage_tails(self, phage_dict):
    """Keep only the proteins whose function text looks tail-related.

    A protein is kept when its lower-cased function contains at least one
    term from ``tail_terms`` and none from ``non_tail_terms`` (plain
    substring matching). All other proteins are deleted in place.

    Parameters:
        phage_dict: {organism: {protein_id: [function, sequence]}}.

    Returns:
        The same ``phage_dict`` object; organisms may end up empty.
    """
    tail_terms = (
        'fiber', 'fibre', 'spike', 'hydrolase', 'bind', 'depolymerase', 'peptidase',
        'lyase', 'sialidase', 'dextranase', 'lipase', 'adhesin', 'baseplate',
        'protein h', 'recognizing', 'protein j', 'protein g', 'gpe', 'duf4035',
        'host specifity', 'cor protein', 'specificity', 'baseplate component',
        'gp38', 'gp12 tail', 'receptor', 'recognition', 'tail',
    )
    non_tail_terms = (
        'nucle', 'dna', 'rna', 'ligase', 'transferase', 'inhibitor', 'assembly',
        'connect', 'nudix', 'atp', 'nad', 'transpos', 'ntp', 'molybdenum', 'hns',
        'gtp', 'riib', 'replicat', 'codon', 'pyruvate', 'catalyst', 'hinge',
        'sheath completion', 'head', 'capsid', 'tape', 'tip', 'strand', 'matur',
        'portal', 'terminase', 'nucl', 'promot', 'block', 'olfact', 'wedge',
        'lysozyme', 'mur', 'sheat',
    )
    for org in phage_dict:
        proteins = phage_dict[org]
        # Collect first, delete afterwards: a dict must not change size
        # while it is being iterated.
        discard = []
        for protein in proteins:
            description = proteins[protein][0].lower()
            is_tail = (
                any(term in description for term in tail_terms)
                and not any(term in description for term in non_tail_terms)
            )
            if not is_tail:
                discard.append(protein)
        for protein in discard:
            del proteins[protein]
    return phage_dict
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
267
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def run_prediction(self, phage_dict, bact_dict, ml_model):
    """Train the chosen classifier and predict every phage/bacterium pair.

    The training table is unpickled from 'files/dataset_reduced'
    ('RandomForests') or 'files/feature_dataset' ('SVM'), rows with missing
    values are dropped, the 'Infects' column ('Yes'/'No') becomes the label
    and the remaining columns are standardised. One feature vector is built
    per (phage, bacterium) pair with ``FeatureConstruction``, scaled with
    the same scaler and classified.

    Parameters:
        phage_dict: {phage: proteins} as passed to FeatureConstruction.
        bact_dict: {bacterium: proteins} as passed to FeatureConstruction.
        ml_model: 'RandomForests' or 'SVM'.

    Returns:
        list of 'Yes'/'No' labels, ordered phage-major, bacterium-minor.

    Raises:
        ValueError: if ``ml_model`` is not one of the two supported names.
    """
    from feature_construction import FeatureConstruction
    import pickle
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import StandardScaler
    import numpy as np

    # NOTE(review): pickle.load is only safe for trusted, bundled files.
    if ml_model == 'RandomForests':
        with open('files/dataset_reduced', 'rb') as f:
            dataset = pickle.load(f)
        # Positions removed from each freshly built feature vector before
        # it is used with the reduced dataset.
        columns_remove = [3, 7, 9, 11, 24, 28, 32, 34, 38, 42, 45, 52, 53, 61, 65, 73, 75, 79, 104, 122, 141, 151, 154, 155, 157, 159, 160, 161, 163, 165, 169, 170, 173, 176, 178, 180, 182, 183, 185, 186, 187, 190, 193, 194, 196, 197, 201, 202, 203, 206, 207, 209, 210, 212, 216, 217, 221, 223, 225, 226, 230, 233, 235, 236, 245, 251]
    elif ml_model == 'SVM':
        with open('files/feature_dataset', 'rb') as f:
            dataset = pickle.load(f)
        columns_remove = []
    else:
        raise ValueError("ml_model must be 'RandomForests' or 'SVM', got %r" % (ml_model,))

    dataset = dataset.dropna()
    le = LabelEncoder()
    le.fit(['Yes', 'No'])
    output = le.transform(dataset['Infects'].values)
    # Keyword form: the positional axis argument of DataFrame.drop no
    # longer exists in pandas 2.x.
    dataset = dataset.drop(columns='Infects')
    scaler = StandardScaler()
    scaler.fit(dataset)
    data_z = scaler.transform(dataset)

    def reuse_known_features(vector, organism, marker):
        """Copy stored feature values for ``organism`` into ``vector``.

        Uses the first dataset row whose index contains ``organism`` and
        overwrites the positions whose column name contains ``marker``.
        """
        # NOTE(review): substring matching on the row label may select the
        # row of another organism whose identifier contains this one.
        for ID in dataset.index:
            if organism in ID:
                for i, column in enumerate(dataset.loc[ID].index):
                    if marker in column:
                        vector[0][i] = dataset.loc[ID, column]
                break

    fc = FeatureConstruction()
    vectors = []
    for phage in phage_dict:
        for bacteria in bact_dict:
            temp_solution = np.array([])
            temp_solution = np.append(temp_solution, fc.get_grouping(phage_dict[phage], bact_dict[bacteria]))
            temp_solution = np.append(temp_solution, fc.get_composition(phage_dict[phage], bact_dict[bacteria]))
            temp_solution = np.append(temp_solution, fc.get_kmers(phage_dict[phage], bact_dict[bacteria]))
            temp_solution = temp_solution.reshape(1, -1)
            if columns_remove:
                temp_solution = np.delete(temp_solution, columns_remove, axis=1)
            if phage in self.all_phages:
                reuse_known_features(temp_solution, phage, 'phage')
            if bacteria in self.all_bacteria:
                reuse_known_features(temp_solution, bacteria, 'bact')
            vectors.append(temp_solution)
    solution = scaler.transform(np.concatenate(vectors, axis=0))

    if ml_model == 'RandomForests':
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(n_estimators=200, bootstrap=False, criterion='gini', min_samples_leaf=2, min_samples_split=4, oob_score=False)
    else:
        from sklearn.svm import SVC
        clf = SVC(C=10, degree=2, gamma='auto', kernel='rbf')
    clf = clf.fit(data_z, output)
    pred = clf.predict(solution)
    return list(le.inverse_transform(pred))
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
336
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def create_output(self, output, phage_seqs, bact_seqs):
    """Write the phage/bacteria prediction table as tab-separated files.

    Method of ``GalaxyPrediction``. One row is produced for every
    (phage, bacterium) pair, phages in the outer loop and bacteria in the
    inner loop, labelled ``'<phage> - <bacterium>'``.

    Args:
        output: Values for the ``'Infects'`` column, one per pair and in the
            same phage-major order. Presumably the list returned by the
            prediction step -- confirm against the caller.
        phage_seqs: Iterable yielding phage identifiers as strings
            (iterating a dict yields its keys).
        bact_seqs: Iterable yielding bacteria identifiers as strings. It is
            iterated once per phage, so it must be re-iterable.

    Side effects:
        Writes ``files/output.tsv`` relative to the current working
        directory, and ``output.tsv`` inside ``or_location``.
    """
    import os
    import pandas as pd

    pair_labels = [phage + ' - ' + bact for phage in phage_seqs for bact in bact_seqs]
    table = pd.DataFrame({'Phage - Bacteria': pair_labels, 'Infects': output})

    # ``or_location`` is a module global that is only assigned when the file
    # runs as a script; fall back to the current directory so the class also
    # works when imported instead of raising NameError.
    try:
        launch_dir = or_location
    except NameError:
        launch_dir = os.getcwd()

    table.to_csv('files/output.tsv', sep='\t', index=False, header=True)
    table.to_csv(os.path.join(launch_dir, 'output.tsv'), sep='\t', index=False, header=True)
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
346
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
347
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
if __name__ == '__main__':
    # Command-line entry point. Positional arguments:
    #   1: phage input type      2: phage input
    #   3: bacteria input type   4: bacteria input
    #   5: 'True' to run InterPro (any other value disables it)
    #   6: machine-learning model name (e.g. 'RandomForests' or 'SVM')
    import sys
    import os

    # Fail with a readable message instead of a bare IndexError.
    if len(sys.argv) < 7:
        sys.exit('usage: run_galaxy.py <phage_input_type> <phages> '
                 '<bact_input_type> <bacteria> <run_interpro> <ml_model>')

    # Directory the script was launched from; create_output reads this module
    # global to drop a copy of the results there. (A ``global`` statement is
    # a no-op at module level, so a plain assignment is enough.)
    or_location = os.getcwd()
    # Work from the script's own directory so relative 'files/...' paths
    # resolve. abspath() is required: when __file__ is a bare file name,
    # dirname() returns '' and os.chdir('') raises FileNotFoundError.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    phage_input_type = sys.argv[1]
    Phages = sys.argv[2]
    bact_input_type = sys.argv[3]
    Bacts = sys.argv[4]
    # Only the exact string 'True' enables InterPro.
    run_interpro = sys.argv[5] == 'True'
    model = sys.argv[6]
    GalaxyPrediction(phage_input_type=phage_input_type, bact_input_type=bact_input_type, phage=Phages, bacteria=Bacts, ml_model=model, run_interpro=run_interpro)
    # Example direct invocations:
    # rg = GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_050154', bacteria='NC_007414,NZ_MK033499,NZ_CP031214')
    # GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_031087,NC_049833,NC_049838,NC_049444', bacteria='LR133964', ml_model='SVM')