| 
0
 | 
     1 
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 class GalaxyPrediction:
 | 
| 
 | 
     4 
 | 
| 
 | 
     5 	def __init__(self, phage_input_type='ID', bact_input_type='ID', phage='', bacteria='', ml_model='RandomForests', run_interpro=False):
 | 
| 
 | 
     6 		import pickle
 | 
| 
 | 
     7 		import os
 | 
| 
 | 
     8 		import re
 | 
| 
 | 
     9 		with open('files/FeatureDataset', 'rb') as f:
 | 
| 
 | 
    10 			dataset = pickle.load(f)
 | 
| 
 | 
    11 		self.all_phages = []
 | 
| 
 | 
    12 		self.all_bacteria = []
 | 
| 
 | 
    13 		for ID in dataset.index:
 | 
| 
 | 
    14 			temp_phage = ID[:ID.find('--')]
 | 
| 
 | 
    15 			temp_bacteria = ID[ID.find('--')+2:]
 | 
| 
 | 
    16 			if temp_phage not in self.all_phages:
 | 
| 
 | 
    17 				self.all_phages.append(temp_phage)
 | 
| 
 | 
    18 			if temp_bacteria not in self.all_bacteria:
 | 
| 
 | 
    19 				self.all_bacteria.append(temp_bacteria)
 | 
| 
 | 
    20 		if phage_input_type == 'ID':
 | 
| 
 | 
    21 			phage = re.split('\W', phage.replace(' ', ''))
 | 
| 
 | 
    22 			len_phage_id = len(phage)
 | 
| 
 | 
    23 			phage_seqs = self._retrieve_from_phage_id(phage)
 | 
| 
 | 
    24 		elif phage_input_type == 'seq_file':
 | 
| 
 | 
    25 			phage_seqs = {}
 | 
| 
 | 
    26 			phage_seqs['PhageFasta'] = {}
 | 
| 
 | 
    27 			with open(phage, 'r') as f:
 | 
| 
 | 
    28 				temp = f.readlines()
 | 
| 
 | 
    29 			count_prot = 0
 | 
| 
 | 
    30 			prot = ''
 | 
| 
 | 
    31 			i=0
 | 
| 
 | 
    32 			while i < len(temp):
 | 
| 
 | 
    33 				if '>' in temp[i]:
 | 
| 
 | 
    34 					if prot:
 | 
| 
 | 
    35 						phage_seqs['PhageFasta']['Protein' + str(count_prot)] = ['Unknown', prot]
 | 
| 
 | 
    36 					count_prot += 1
 | 
| 
 | 
    37 					prot = ''
 | 
| 
 | 
    38 					i+=1
 | 
| 
 | 
    39 				else:
 | 
| 
 | 
    40 					prot += temp[i].strip()
 | 
| 
 | 
    41 					i+=1
 | 
| 
 | 
    42 		if bact_input_type == 'ID':
 | 
| 
 | 
    43 			bacteria = re.split('\W', bacteria.replace(' ', ''))
 | 
| 
 | 
    44 			if len(bacteria) > 1 and len_phage_id == 1 or len(bacteria) == 1:
 | 
| 
 | 
    45 				bact_seqs = self._retrieve_from_bact_id(bacteria)
 | 
| 
 | 
    46 		elif bact_input_type == 'seq_file':
 | 
| 
 | 
    47 			bact_seqs = {}
 | 
| 
 | 
    48 			bact_seqs['BacteriaFasta'] = {}
 | 
| 
 | 
    49 			with open(bacteria, 'r') as f:
 | 
| 
 | 
    50 				temp = f.readlines()
 | 
| 
 | 
    51 			count_prot = 0
 | 
| 
 | 
    52 			prot = ''
 | 
| 
 | 
    53 			i=0
 | 
| 
 | 
    54 			while i < len(temp):
 | 
| 
 | 
    55 				if '>' in temp[i]:
 | 
| 
 | 
    56 					if prot:
 | 
| 
 | 
    57 						bact_seqs['BacteriaFasta']['Protein' + str(count_prot)] = ['Unknown', prot]
 | 
| 
 | 
    58 					count_prot += 1
 | 
| 
 | 
    59 					prot = ''
 | 
| 
 | 
    60 					i+=1
 | 
| 
 | 
    61 				else:
 | 
| 
 | 
    62 					prot += temp[i].strip()
 | 
| 
 | 
    63 					i+=1
 | 
| 
 | 
    64 		phage_seqs = self._find_phage_functions(phage_seqs, run_interpro)
 | 
| 
 | 
    65 		phage_seqs = self._find_phage_tails(phage_seqs)
 | 
| 
 | 
    66 		
 | 
| 
 | 
    67 		list_remove = []
 | 
| 
 | 
    68 		for org in phage_seqs:
 | 
| 
 | 
    69 			if not phage_seqs[org]:
 | 
| 
 | 
    70 				print('Could not find tails for phage ' + org + '. Deleting entry...')
 | 
| 
 | 
    71 				list_remove.append(org)
 | 
| 
 | 
    72 		for org in list_remove:
 | 
| 
 | 
    73 			del phage_seqs[org]
 | 
| 
 | 
    74 		
 | 
| 
 | 
    75 		if phage_seqs:
 | 
| 
 | 
    76 			output = self.run_prediction(phage_seqs, bact_seqs, ml_model)
 | 
| 
 | 
    77 			self.create_output(output, phage_seqs, bact_seqs)
 | 
| 
 | 
    78 		else:
 | 
| 
 | 
    79 			with open(or_location + '/output.tsv', 'w') as f:
 | 
| 
 | 
    80 				f.write('No phage tails found in query')
 | 
| 
 | 
    81 		for file in os.listdir('files'):
 | 
| 
 | 
    82 			if file.startswith('temp'):
 | 
| 
 | 
    83 				os.remove('files/' + file)
 | 
| 
 | 
    84 
 | 
| 
 | 
    85 	def _retrieve_from_phage_id(self, phage):
 | 
| 
 | 
    86 		temp_phage = {}
 | 
| 
 | 
    87 		for ID in phage:
 | 
| 
 | 
    88 			temp_phage[ID] = {}
 | 
| 
 | 
    89 			if ID in self.all_phages:
 | 
| 
 | 
    90 				import json
 | 
| 
 | 
    91 				with open('files/phageTails.json', encoding='utf-8') as f:
 | 
| 
 | 
    92 					phage_tails = json.loads(f.read())
 | 
| 
 | 
    93 				temp_phage[ID] = phage_tails[ID]
 | 
| 
 | 
    94 			else:
 | 
| 
 | 
    95 				from Bio import Entrez
 | 
| 
 | 
    96 				from Bio import SeqIO
 | 
| 
 | 
    97 				phage = {}
 | 
| 
 | 
    98 				Entrez.email = 'insert@email.com'
 | 
| 
 | 
    99 				with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID) as handle:
 | 
| 
 | 
   100 					genome = SeqIO.read(handle, "gb")
 | 
| 
 | 
   101 				for feat in genome.features:
 | 
| 
 | 
   102 					if feat.type == 'CDS':
 | 
| 
 | 
   103 						try: temp_phage[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]]
 | 
| 
 | 
   104 						except: pass
 | 
| 
 | 
   105 		return temp_phage
 | 
| 
 | 
   106 
 | 
| 
 | 
   107 	def _retrieve_from_bact_id(self, bacteria):
 | 
| 
 | 
   108 		temp_bacteria = {}
 | 
| 
 | 
   109 		for ID in bacteria:
 | 
| 
 | 
   110 			temp_bacteria[ID] = {}
 | 
| 
 | 
   111 			if '.' in ID:
 | 
| 
 | 
   112 				ID = ID[:ID.find('.')]
 | 
| 
 | 
   113 			if ID in self.all_bacteria:
 | 
| 
 | 
   114 				import json
 | 
| 
 | 
   115 				with open('files/bacteria/' + ID + '.json', encoding='utf-8') as f:
 | 
| 
 | 
   116 					temp_bacteria[ID] = json.loads(f.read())
 | 
| 
 | 
   117 			else:
 | 
| 
 | 
   118 				from Bio import Entrez
 | 
| 
 | 
   119 				from Bio import SeqIO
 | 
| 
 | 
   120 				bacteria = {}
 | 
| 
 | 
   121 				Entrez.email = 'insert@email.com'
 | 
| 
 | 
   122 				with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=ID+'.1') as handle:
 | 
| 
 | 
   123 					genome = SeqIO.read(handle, "gb")
 | 
| 
 | 
   124 				for feat in genome.features:
 | 
| 
 | 
   125 					if feat.type == 'CDS':
 | 
| 
 | 
   126 						try: temp_bacteria[ID][feat.qualifiers['protein_id'][0]] = [feat.qualifiers['product'][0], feat.qualifiers['translation'][0]]
 | 
| 
 | 
   127 						except: pass
 | 
| 
 | 
   128 				if len(genome.features) <= 5:
 | 
| 
 | 
   129 					with Entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=ID) as handle:
 | 
| 
 | 
   130 						genome = handle.readlines()
 | 
| 
 | 
   131 					for i in range(len(genome)):
 | 
| 
 | 
   132 						if ' CDS ' in genome[i]:
 | 
| 
 | 
   133 							j = i
 | 
| 
 | 
   134 							protDone = False
 | 
| 
 | 
   135 							while j < len(genome):
 | 
| 
 | 
   136 								if protDone:
 | 
| 
 | 
   137 									break
 | 
| 
 | 
   138 								if '/product=' in genome[j]:
 | 
| 
 | 
   139 									product = genome[j].strip()[10:]
 | 
| 
 | 
   140 									j += 1
 | 
| 
 | 
   141 								elif '_id=' in genome[j]:
 | 
| 
 | 
   142 									protKey = genome[j].strip()[13:-1]
 | 
| 
 | 
   143 									j += 1
 | 
| 
 | 
   144 								elif '/translation=' in genome[j]:
 | 
| 
 | 
   145 									protSeq = genome[j].strip()[14:]
 | 
| 
 | 
   146 									j += 1
 | 
| 
 | 
   147 									for k in range(j, len(genome)):
 | 
| 
 | 
   148 										if genome[k].islower():
 | 
| 
 | 
   149 											j = k
 | 
| 
 | 
   150 											protDone = True
 | 
| 
 | 
   151 											break
 | 
| 
 | 
   152 										else:
 | 
| 
 | 
   153 											protSeq += genome[k].strip()
 | 
| 
 | 
   154 								else:
 | 
| 
 | 
   155 									j += 1
 | 
| 
 | 
   156 							temp_bacteria[ID][protKey] = [product, protSeq[:protSeq.find('"')]]
 | 
| 
 | 
   157 		return temp_bacteria
 | 
| 
 | 
   158 
 | 
| 
 | 
   159 	def _find_phage_functions(self, phage_dict, run_interpro):
 | 
| 
 | 
   160 		import os
 | 
| 
 | 
   161 		import json
 | 
| 
 | 
   162 		with open('files/known_function.json', encoding='utf-8') as F:
 | 
| 
 | 
   163 			known_function = json.loads(F.read())
 | 
| 
 | 
   164 		with open('files/temp_database.fasta', 'w') as F:
 | 
| 
 | 
   165 			for phage in known_function:
 | 
| 
 | 
   166 				for prot in known_function[phage]:
 | 
| 
 | 
   167 					F.write('>' + phage + '-' + prot + '\n' + known_function[phage][prot][1] + '\n')
 | 
| 
 | 
   168 		os.system('makeblastdb -in files/temp_database.fasta -dbtype prot -title PhageProts -parse_seqids -out files/temp_database -logfile files/temp_log')
 | 
| 
 | 
   169 		for org in phage_dict:
 | 
| 
 | 
   170 			with open('files/temp.fasta', 'w') as F:
 | 
| 
 | 
   171 				for prot in phage_dict[org]:
 | 
| 
 | 
   172 					F.write('>' + prot + '\n' + phage_dict[org][prot][1] + '\n')
 | 
| 
 | 
   173 			os.system('blastp -db files/temp_database -query files/temp.fasta -out files/temp_blast -num_threads 2 -outfmt 6')
 | 
| 
 | 
   174 			phage_dict[org] = self.process_blast(phage_dict[org], known_function)
 | 
| 
 | 
   175 			if run_interpro: phage_dict[org] = self.interpro(phage_dict[org])
 | 
| 
 | 
   176 		return phage_dict
 | 
| 
 | 
   177 
 | 
| 
 | 
   178 	def process_blast(self, phage_dict, known_function):
 | 
| 
 | 
   179 		import pandas as pd
 | 
| 
 | 
   180 		import re
 | 
| 
 | 
   181 		blast_domains = pd.read_csv('files/temp_blast', sep='\t', header=None)
 | 
| 
 | 
   182 		for prot in phage_dict:
 | 
| 
 | 
   183 			func = phage_dict[prot][0]
 | 
| 
 | 
   184 			known = False
 | 
| 
 | 
   185 			if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5):
 | 
| 
 | 
   186 				known = True
 | 
| 
 | 
   187 			if not known:
 | 
| 
 | 
   188 				evalue = []
 | 
| 
 | 
   189 				bitscore = []
 | 
| 
 | 
   190 				pred = blast_domains[blast_domains[0] == prot]
 | 
| 
 | 
   191 				if pred.shape[0] == 0: break
 | 
| 
 | 
   192 				for i in pred[10]:
 | 
| 
 | 
   193 					evalue.append(float(i))
 | 
| 
 | 
   194 				for i in pred[11]:
 | 
| 
 | 
   195 					bitscore.append(float(i))
 | 
| 
 | 
   196 				if min(evalue) < 1.0 and max(bitscore) > 30.0:
 | 
| 
 | 
   197 					ind = evalue.index(min(evalue))
 | 
| 
 | 
   198 					if ind != bitscore.index(max(bitscore)):
 | 
| 
 | 
   199 						ind = bitscore.index(max(bitscore))
 | 
| 
 | 
   200 					temp = pred.iloc[ind, 1]
 | 
| 
 | 
   201 					known_phage = temp[:temp.find('-')]
 | 
| 
 | 
   202 					known_prot = temp[temp.find('-') + 1:]
 | 
| 
 | 
   203 					if known_function[known_phage][known_prot]:
 | 
| 
 | 
   204 						new_func = known_function[known_phage][known_prot][0]
 | 
| 
 | 
   205 					# for j in known_function.keys():
 | 
| 
 | 
   206 					# 	if pred.iloc[ind, 1] in known_function[j].keys():
 | 
| 
 | 
   207 					# 		new_func = known_function[j][pred.iloc[ind, 1]][0]
 | 
| 
 | 
   208 					# 		break
 | 
| 
 | 
   209 					x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp)  # se tiver hits, remover
 | 
| 
 | 
   210 					if not any(z in new_func.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(new_func) > 3 and not x:
 | 
| 
 | 
   211 						phage_dict[prot][0] = new_func
 | 
| 
 | 
   212 		return phage_dict
 | 
| 
 | 
   213 
 | 
| 
 | 
   214 	def interpro(self, phage_dict):
 | 
| 
 | 
   215 		import os
 | 
| 
 | 
   216 		import pandas as pd
 | 
| 
 | 
   217 		import re
 | 
| 
 | 
   218 		os.system('interproscan.sh -b ' + 'files/temp_interpro -i ' + 'files/temp.fasta -f tsv > files/temp_interpro_log')
 | 
| 
 | 
   219 		domains = pd.read_csv('files/temp_interpro.tsv', sep='\t', index_col=0, header=None, names=list(range(13)))
 | 
| 
 | 
   220 		domains = domains.fillna('-')
 | 
| 
 | 
   221 		domains = domains[domains.loc[:, 3] != 'Coils']
 | 
| 
 | 
   222 		domains = domains[domains.loc[:, 3] != 'MobiDBLite']
 | 
| 
 | 
   223 		for prot in phage_dict:
 | 
| 
 | 
   224 			func = phage_dict[prot][0]
 | 
| 
 | 
   225 			known = False
 | 
| 
 | 
   226 			if (not any(i in func.lower() for i in ['hypothetical', 'unknown', 'kda', 'uncharacterized', 'hyphothetical']) and len(func) > 3) and not ('gp' in func.lower() and len(func.split(' ')) < 2) and not (len(func.split(' ')) == 1 and len(func) < 5):
 | 
| 
 | 
   227 				known = True
 | 
| 
 | 
   228 			if prot in domains.index and not known:
 | 
| 
 | 
   229 				temp = '-'
 | 
| 
 | 
   230 				try:
 | 
| 
 | 
   231 					for i in range(domains.loc[prot, :].shape[0]):
 | 
| 
 | 
   232 						if '-' not in domains.loc[prot, 12].iloc[i].lower():
 | 
| 
 | 
   233 							if float(domains.loc[prot, 8].iloc[i]) < 1.0:
 | 
| 
 | 
   234 								temp = domains.loc[prot, 12].iloc[i]
 | 
| 
 | 
   235 							break
 | 
| 
 | 
   236 				except:
 | 
| 
 | 
   237 					if float(domains.loc[prot, 8]) < 1.0:
 | 
| 
 | 
   238 						temp = domains.loc[prot, 12]
 | 
| 
 | 
   239 				x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp)  # se tiver hits, remover
 | 
| 
 | 
   240 				if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x:
 | 
| 
 | 
   241 					phage_dict[prot][0] = temp
 | 
| 
 | 
   242 				else:
 | 
| 
 | 
   243 					try:
 | 
| 
 | 
   244 						for i in range(domains.loc[prot, :].shape[0]):
 | 
| 
 | 
   245 							if '-' not in domains.loc[prot, 5].iloc[i].lower():
 | 
| 
 | 
   246 								temp = domains.loc[prot, 5].iloc[i]
 | 
| 
 | 
   247 								break
 | 
| 
 | 
   248 					except:
 | 
| 
 | 
   249 						temp = domains.loc[prot, 5]
 | 
| 
 | 
   250 					x = re.findall('(Gp\d{2,}[^,\d -]|Gp\d{1}[^,\d -])', temp)
 | 
| 
 | 
   251 					if temp != '-' and not any(z in temp.lower() for z in ['unknown', 'ucp', 'uncharacterized', 'consensus']) and len(temp) > 3 and not x:
 | 
| 
 | 
   252 						phage_dict[prot][0] = temp
 | 
| 
 | 
   253 		return phage_dict
 | 
| 
 | 
   254 
 | 
| 
 | 
   255 	def _find_phage_tails(self, phage_dict):
 | 
| 
 | 
   256 		for org in phage_dict:
 | 
| 
 | 
   257 			list_remove = []
 | 
| 
 | 
   258 			for protein in phage_dict[org]:
 | 
| 
 | 
   259 				if any(z in phage_dict[org][protein][0].lower() for z in ['fiber', 'fibre', 'spike', 'hydrolase', 'bind', 'depolymerase', 'peptidase', 'lyase', 'sialidase', 'dextranase', 'lipase', 'adhesin', 'baseplate', 'protein h', 'recognizing', 'protein j', 'protein g', 'gpe', 'duf4035', 'host specifity', 'cor protein', 'specificity', 'baseplate component', 'gp38', 'gp12 tail', 'receptor', 'recognition', 'tail']) \
 | 
| 
 | 
   260 						and not any(z in phage_dict[org][protein][0].lower() for z in ['nucle', 'dna', 'rna', 'ligase', 'transferase', 'inhibitor', 'assembly', 'connect', 'nudix', 'atp', 'nad', 'transpos', 'ntp', 'molybdenum', 'hns', 'gtp', 'riib', 'inhibitor', 'replicat', 'codon', 'pyruvate', 'catalyst', 'hinge', 'sheath completion', 'head', 'capsid', 'tape', 'tip', 'strand', 'matur', 'portal', 'terminase', 'nucl', 'promot', 'block', 'olfact', 'wedge', 'lysozyme', 'mur', 'sheat']):
 | 
| 
 | 
   261 					pass
 | 
| 
 | 
   262 				else:
 | 
| 
 | 
   263 					list_remove.append(protein)
 | 
| 
 | 
   264 			for protein in list_remove:
 | 
| 
 | 
   265 				del phage_dict[org][protein]
 | 
| 
 | 
   266 		return phage_dict
 | 
| 
 | 
   267 
 | 
| 
 | 
   268 	def run_prediction(self, phage_dict, bact_dict, ml_model):
 | 
| 
 | 
   269 		from feature_construction import FeatureConstruction
 | 
| 
 | 
   270 		import pickle
 | 
| 
 | 
   271 		from sklearn.preprocessing import LabelEncoder
 | 
| 
 | 
   272 		from sklearn.preprocessing import StandardScaler
 | 
| 
 | 
   273 		import numpy as np
 | 
| 
 | 
   274 
 | 
| 
 | 
   275 		if ml_model == 'RandomForests':
 | 
| 
 | 
   276 			with open('files/dataset_reduced', 'rb') as f:
 | 
| 
 | 
   277 				dataset = pickle.load(f)
 | 
| 
 | 
   278 			columns_remove = [3, 7, 9, 11, 24, 28, 32, 34, 38, 42, 45, 52, 53, 61, 65, 73, 75, 79, 104, 122, 141, 151, 154, 155, 157, 159, 160, 161, 163, 165, 169, 170, 173, 176, 178, 180, 182, 183, 185, 186, 187, 190, 193, 194, 196, 197, 201, 202, 203, 206, 207, 209, 210, 212, 216, 217, 221, 223, 225, 226, 230, 233, 235, 236, 245, 251]
 | 
| 
 | 
   279 		elif ml_model == 'SVM':
 | 
| 
 | 
   280 			with open('files/feature_dataset', 'rb') as f:
 | 
| 
 | 
   281 				dataset = pickle.load(f)
 | 
| 
 | 
   282 			columns_remove = []
 | 
| 
 | 
   283 
 | 
| 
 | 
   284 		dataset = dataset.dropna()
 | 
| 
 | 
   285 		le = LabelEncoder()
 | 
| 
 | 
   286 		le.fit(['Yes', 'No'])
 | 
| 
 | 
   287 		output = le.transform(dataset['Infects'].values)
 | 
| 
 | 
   288 		dataset = dataset.drop('Infects', 1)
 | 
| 
 | 
   289 		scaler = StandardScaler()
 | 
| 
 | 
   290 		scaler.fit(dataset)
 | 
| 
 | 
   291 		data_z = scaler.transform(dataset)
 | 
| 
 | 
   292 
 | 
| 
 | 
   293 		fc = FeatureConstruction()
 | 
| 
 | 
   294 		solution = []
 | 
| 
 | 
   295 		for phage in phage_dict:
 | 
| 
 | 
   296 			for bacteria in bact_dict:
 | 
| 
 | 
   297 				temp_solution = np.array([])
 | 
| 
 | 
   298 				temp_solution = np.append(temp_solution, fc.get_grouping(phage_dict[phage], bact_dict[bacteria]))
 | 
| 
 | 
   299 				temp_solution = np.append(temp_solution, fc.get_composition(phage_dict[phage], bact_dict[bacteria]))
 | 
| 
 | 
   300 				temp_solution = np.append(temp_solution, fc.get_kmers(phage_dict[phage], bact_dict[bacteria]))
 | 
| 
 | 
   301 				temp_solution = temp_solution.reshape(1, -1)
 | 
| 
 | 
   302 				if columns_remove:
 | 
| 
 | 
   303 					temp_solution = np.delete(temp_solution, columns_remove, 1)
 | 
| 
 | 
   304 				if phage in self.all_phages:
 | 
| 
 | 
   305 					for ID in dataset.index:
 | 
| 
 | 
   306 						if phage in ID:
 | 
| 
 | 
   307 							for i in range(len(dataset.loc[ID].index)):
 | 
| 
 | 
   308 								if 'phage' in dataset.loc[ID].index[i]:
 | 
| 
 | 
   309 									temp_solution[0][i] = dataset.loc[ID, dataset.loc[ID].index[i]]
 | 
| 
 | 
   310 							break
 | 
| 
 | 
   311 				if bacteria in self.all_bacteria:
 | 
| 
 | 
   312 					for ID in dataset.index:
 | 
| 
 | 
   313 						if bacteria in ID:
 | 
| 
 | 
   314 							for i in range(len(dataset.loc[ID].index)):
 | 
| 
 | 
   315 								if 'bact' in dataset.loc[ID].index[i]:
 | 
| 
 | 
   316 									temp_solution[0][i] = dataset.loc[ID, dataset.loc[ID].index[i]]
 | 
| 
 | 
   317 							break
 | 
| 
 | 
   318 				if type(solution) != np.ndarray:
 | 
| 
 | 
   319 					solution = temp_solution
 | 
| 
 | 
   320 				else:
 | 
| 
 | 
   321 					solution = np.append(solution, temp_solution, axis=0)
 | 
| 
 | 
   322 		# solution = solution.reshape(1, -1)
 | 
| 
 | 
   323 		solution = scaler.transform(solution)
 | 
| 
 | 
   324 
 | 
| 
 | 
   325 		if ml_model == 'RandomForests':
 | 
| 
 | 
   326 			from sklearn.ensemble import RandomForestClassifier
 | 
| 
 | 
   327 			clf = RandomForestClassifier(n_estimators=200, bootstrap=False, criterion='gini', min_samples_leaf=2, min_samples_split=4, oob_score=False)
 | 
| 
 | 
   328 			clf = clf.fit(data_z, output)
 | 
| 
 | 
   329 		elif ml_model == 'SVM':
 | 
| 
 | 
   330 			from sklearn.svm import SVC
 | 
| 
 | 
   331 			clf = SVC(C=10, degree=2, gamma='auto', kernel='rbf')
 | 
| 
 | 
   332 			clf = clf.fit(data_z, output)
 | 
| 
 | 
   333 		pred = clf.predict(solution)
 | 
| 
 | 
   334 		pred = list(le.inverse_transform(pred))
 | 
| 
 | 
   335 		return pred
 | 
| 
 | 
   336 
 | 
| 
 | 
   337 	def create_output(self, output, phage_seqs, bact_seqs):
 | 
| 
 | 
   338 		import pandas as pd
 | 
| 
 | 
   339 		list_orgs = []
 | 
| 
 | 
   340 		for phage in phage_seqs:
 | 
| 
 | 
   341 			for bact in bact_seqs:
 | 
| 
 | 
   342 				list_orgs.append(phage + ' - ' + bact)
 | 
| 
 | 
   343 		file = pd.DataFrame({'Phage - Bacteria': list_orgs, 'Infects': output})
 | 
| 
 | 
   344 		file.to_csv('files/output.tsv', sep='\t', index=False, header=True)
 | 
| 
 | 
   345 		file.to_csv(or_location + '/output.tsv', sep='\t', index=False, header=True)
 | 
| 
 | 
   346 
 | 
| 
 | 
   347 
 | 
| 
 | 
   348 if __name__ == '__main__':
 | 
| 
 | 
   349 	import sys
 | 
| 
 | 
   350 	import os
 | 
| 
 | 
   351 	global or_location
 | 
| 
 | 
   352 	or_location = os.getcwd()
 | 
| 
 | 
   353 	os.chdir(os.path.dirname(__file__))
 | 
| 
 | 
   354 
 | 
| 
 | 
   355 	phage_input_type = sys.argv[1]
 | 
| 
 | 
   356 	Phages = sys.argv[2]
 | 
| 
 | 
   357 	bact_input_type = sys.argv[3]
 | 
| 
 | 
   358 	Bacts = sys.argv[4]
 | 
| 
 | 
   359 	run_interpro = sys.argv[5]
 | 
| 
 | 
   360 	if run_interpro == 'True':
 | 
| 
 | 
   361 		run_interpro = True
 | 
| 
 | 
   362 	else:
 | 
| 
 | 
   363 		run_interpro = False
 | 
| 
 | 
   364 	model = sys.argv[6]
 | 
| 
 | 
   365 	GalaxyPrediction(phage_input_type=phage_input_type, bact_input_type=bact_input_type, phage=Phages, bacteria=Bacts, ml_model=model, run_interpro=run_interpro)
 | 
| 
 | 
   366 	# rg = GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_050154', bacteria='NC_007414,NZ_MK033499,NZ_CP031214')
 | 
| 
 | 
   367 	# GalaxyPrediction(phage_input_type='ID', bact_input_type='ID', phage='NC_031087,NC_049833,NC_049838,NC_049444', bacteria='LR133964', ml_model='SVM')
 |