0
|
1 #!/usr/bin/env python3
|
|
2 #
|
|
3 # Program: PointFinder-3.0
|
|
4 # Author: Camilla Hundahl Johnsen
|
|
5 #
|
|
6 # Dependencies: KMA or NCBI-blast together with BioPython.
|
|
7
|
|
8 import os
|
|
9 import re
|
|
10 import sys
|
|
11 import math
|
|
12 import argparse
|
|
13 import subprocess
|
|
14 import random
|
|
15
|
|
16 from cgecore.blaster import Blaster
|
|
17 from cgecore.cgefinder import CGEFinder
|
|
18 from .output.table import TableResults
|
|
19 from .phenotype2genotype.feature import ResMutation
|
|
20 from .phenotype2genotype.res_profile import PhenoDB
|
|
21
|
|
22
|
|
23 def eprint(*args, **kwargs):
|
|
24 print(*args, file=sys.stderr, **kwargs)
|
|
25
|
|
26
|
|
27 class GeneListError(Exception):
|
|
28 """ Raise when a specified gene is not found within the gene list.
|
|
29 """
|
|
30 def __init__(self, message, *args):
|
|
31 self.message = message
|
|
32 # allow users initialize misc. arguments as any other builtin Error
|
|
33 super(PanelNameError, self).__init__(message, *args)
|
|
34
|
|
35
|
|
36 class PointFinder(CGEFinder):
|
|
37
|
|
38 def __init__(self, db_path, species, gene_list=None):
|
|
39 """
|
|
40 """
|
|
41 self.species = species
|
|
42 self.specie_path = db_path
|
|
43 self.RNA_gene_list = []
|
|
44
|
|
45 self.gene_list = PointFinder.get_file_content(
|
|
46 self.specie_path + "/genes.txt")
|
|
47
|
|
48 if os.path.isfile(self.specie_path + "/RNA_genes.txt"):
|
|
49 self.RNA_gene_list = PointFinder.get_file_content(
|
|
50 self.specie_path + "/RNA_genes.txt")
|
|
51
|
|
52 # Creat user defined gene_list if given
|
|
53 if(gene_list is not None):
|
|
54 self.gene_list = get_user_defined_gene_list(gene_list)
|
|
55
|
|
56 self.known_mutations, self.drug_genes, self.known_stop_codon = (
|
|
57 self.get_db_mutations(self.specie_path + "/resistens-overview.txt",
|
|
58 self.gene_list))
|
|
59
|
|
60 def get_user_defined_gene_list(self, gene_list):
|
|
61 genes_specified = []
|
|
62 for gene in gene_list:
|
|
63 # Check that the genes are valid
|
|
64 if gene not in self.gene_list:
|
|
65 raise(GeneListError(
|
|
66 "Input Error: Specified gene not recognised "
|
|
67 "(%s)\nChoose one or more of the following genes:"
|
|
68 "\n%s" % (gene, "\n".join(self.gene_list))))
|
|
69 genes_specified.append(gene)
|
|
70 # Change the gene_list to the user defined gene_list
|
|
71 return genes_specified
|
|
72
|
|
73 #DELETE
|
|
74 def old_results_to_standard_output(self, result, software, version,
|
|
75 run_date, run_cmd, id,
|
|
76 unknown_mut=False, tableresults=None):
|
|
77 """
|
|
78 """
|
|
79 std_results = TableResults(software, version, run_date, run_cmd, id)
|
|
80 headers = [
|
|
81 "template_name",
|
|
82 "template_length",
|
|
83 "aln_length",
|
|
84 "aln_identity",
|
|
85 "aln_gaps",
|
|
86 "aln_template_string",
|
|
87 "aln_query_string",
|
|
88 "aln_homology_string",
|
|
89 "query_id",
|
|
90 "query_start_pos",
|
|
91 "query_end_pos",
|
|
92 "template_variant",
|
|
93 "query_depth",
|
|
94 "blast_eval",
|
|
95 "blast_bitscore",
|
|
96 "pval",
|
|
97 "software_score",
|
|
98 "mutation",
|
|
99 "ref_codon",
|
|
100 "alt_codon",
|
|
101 "ref_aa",
|
|
102 "alt_aa",
|
|
103 "insertion",
|
|
104 "deletion"
|
|
105 ]
|
|
106 for db_name, db in result.items():
|
|
107 if(db_name == "excluded"):
|
|
108 continue
|
|
109
|
|
110 if(db == "No hit found"):
|
|
111 continue
|
|
112
|
|
113 ####Added to solve PointFinder####
|
|
114 if(isinstance(db, str)):
|
|
115 if db.startswith("Gene found with coverage"):
|
|
116 continue
|
|
117 ####
|
|
118
|
|
119 # Start writing output string (to HTML tab file)
|
|
120 gene_name = PhenoDB.if_promoter_rename(db_name)
|
|
121
|
|
122 # Find and save mis_matches in gene
|
|
123 sbjct_start = db["sbjct_start"]
|
|
124 sbjct_seq = db["sbjct_string"]
|
|
125 qry_seq = db["query_string"]
|
|
126 db["mis_matches"] = self.find_mismatches(gene=db_name,
|
|
127 sbjct_start=sbjct_start,
|
|
128 sbjct_seq=sbjct_seq,
|
|
129 qry_seq=qry_seq)
|
|
130
|
|
131 known_muts = []
|
|
132 unknown_muts = []
|
|
133 if(len(db["mis_matches"]) > 0):
|
|
134 known_muts, unknown_muts = self.get_mutations(
|
|
135 db_name, gene_name, db['mis_matches'], True, db)
|
|
136
|
|
137 # No mutations found
|
|
138 if(not (unknown_muts or known_muts)):
|
|
139 continue
|
|
140
|
|
141 std_results.add_table(db_name)
|
|
142 std_db = std_results.long[db_name]
|
|
143 std_db.add_headers(headers)
|
|
144 std_db.lock_headers = True
|
|
145
|
|
146 for mut in known_muts:
|
|
147 unique_id = mut.unique_id
|
|
148 std_db[unique_id] = {
|
|
149 "template_name": db_name,
|
|
150 "template_length": db["sbjct_length"],
|
|
151 "aln_length": db["HSP_length"],
|
|
152 "aln_identity": db["perc_ident"],
|
|
153 "aln_gaps": db["gaps"],
|
|
154 "aln_template_string": db["sbjct_string"],
|
|
155 "aln_query_string": db["query_string"],
|
|
156 "aln_homology_string": db["homo_string"],
|
|
157 "query_id": db["contig_name"],
|
|
158 "query_start_pos": mut.pos,
|
|
159 "query_end_pos": mut.end,
|
|
160 "query_depth": db.get("depth", "NA"),
|
|
161 "pval": db.get("p_value", "NA"),
|
|
162 "software_score": db["cal_score"],
|
|
163 "mutation": mut.mut_string_short,
|
|
164 "ref_codon": mut.ref_codon,
|
|
165 "alt_codon": mut.mut_codon,
|
|
166 "ref_aa": mut.ref_aa,
|
|
167 "alt_aa": mut.mut_aa,
|
|
168 "insertion": mut.insertion,
|
|
169 "deletion": mut.deletion
|
|
170 }
|
|
171
|
|
172 return std_results
|
|
173
|
|
174 @staticmethod
|
|
175 def get_db_names(db_root_path):
|
|
176 out_lst = []
|
|
177 for entry in os.scandir(db_root_path):
|
|
178 if not entry.name.startswith('.') and entry.is_dir():
|
|
179 out_lst.append(entry.name)
|
|
180 return tuple(out_lst)
|
|
181
|
|
182 def results_to_str(self, res_type, results, unknown_flag, min_cov,
|
|
183 perc_iden):
|
|
184 # Initiate output stings with headers
|
|
185 gene_list = ', '.join(self.gene_list)
|
|
186 output_strings = [
|
|
187 "Mutation\tNucleotide change\tAmino acid change\tResistance\tPMID",
|
|
188 "Chromosomal point mutations - Results\nSpecies: %s\nGenes: %s\n"
|
|
189 "Mapping methode: %s\n\n\nKnown Mutations\n" % (self.species, gene_list, res_type), ""
|
|
190 ]
|
|
191 # Get all drug names and add header of all drugs to prediction file
|
|
192 drug_lst = [drug for drug in self.drug_genes.keys()]
|
|
193 output_strings[2] = "\t".join(drug_lst) + "\n"
|
|
194
|
|
195 # Define variables to write temporary output into
|
|
196 total_unknown_str = ""
|
|
197 unique_drug_list = []
|
|
198 excluded_hits = {}
|
|
199
|
|
200 if res_type == PointFinder.TYPE_BLAST:
|
|
201 # Patch together partial sequence hits (BLASTER does not do this)
|
|
202 # and removes dummy_hit_id
|
|
203 GENES = self.find_best_seqs(results, min_cov)
|
|
204
|
|
205 else:
|
|
206 # TODO: Some databases are either genes or species,
|
|
207 # depending on the method applied (BLAST/KMA).
|
|
208 GENES = results[self.species]
|
|
209
|
|
210 # If not hits found insert genes not found
|
|
211 if GENES == 'No hit found':
|
|
212 GENES = {}
|
|
213
|
|
214 # KMA only gives the genes found, however all genes should be
|
|
215 # included
|
|
216 for gene in self.gene_list:
|
|
217 if gene not in GENES:
|
|
218 GENES[gene] = 'No hit found'
|
|
219
|
|
220 # Included excluded
|
|
221 GENES['excluded'] = results['excluded']
|
|
222
|
|
223 for gene, hit in GENES.items():
|
|
224 # Start writing output string (to HTML tab file)
|
|
225 gene_name = gene # Not perfeft can differ from specific mutation
|
|
226 regex = r"promoter_size_(\d+)(?:bp)"
|
|
227 promtr_gene_objt = re.search(regex, gene)
|
|
228
|
|
229 if promtr_gene_objt:
|
|
230 gene_name = gene.split("_")[0]
|
|
231
|
|
232 output_strings[1] += "\n%s\n" % (gene_name)
|
|
233
|
|
234 # Ignore excluded genes
|
|
235 # TODO: Should all not found genes be moved to this dict?
|
|
236 if gene == "excluded":
|
|
237 continue
|
|
238
|
|
239 # Write exclusion reason for genes not found
|
|
240 if isinstance(GENES[gene], str):
|
|
241 output_strings[1] += GENES[gene] + "\n"
|
|
242 continue
|
|
243
|
|
244 if hit['perc_ident'] < perc_iden and hit['perc_coverage'] < min_cov:
|
|
245 GENES[gene] = ('Gene found with identity, %s, below minimum '
|
|
246 'identity threshold: %s. And with coverage, %s,'
|
|
247 ' below minimum coverage threshold: %s.'
|
|
248 % (hit['perc_ident'], perc_iden,
|
|
249 hit['perc_coverage'], min_cov))
|
|
250 output_strings[1] += GENES[gene] + "\n"
|
|
251 continue
|
|
252
|
|
253 if hit['perc_ident'] < perc_iden:
|
|
254 GENES[gene] = ('Gene found with identity, %s, below minimum '
|
|
255 'identity threshold: %s' % (hit['perc_ident'],
|
|
256 perc_iden))
|
|
257 output_strings[1] += GENES[gene] + "\n"
|
|
258 continue
|
|
259
|
|
260 if hit['perc_coverage'] < min_cov:
|
|
261 # Gene not found above given coverage
|
|
262 GENES[gene] = ('Gene found with coverage, %s, below minimum '
|
|
263 'coverage threshold: %s' % (hit['perc_coverage'],
|
|
264 min_cov))
|
|
265 output_strings[1] += GENES[gene] + "\n"
|
|
266 continue
|
|
267
|
|
268 sbjct_start = hit['sbjct_start']
|
|
269 sbjct_seq = hit['sbjct_string']
|
|
270 qry_seq = hit['query_string']
|
|
271
|
|
272 # Find and save mis_matches in gene
|
|
273 hit['mis_matches'] = self.find_mismatches(gene, sbjct_start,
|
|
274 sbjct_seq, qry_seq)
|
|
275
|
|
276 # Check if no mutations was found
|
|
277 if len(hit['mis_matches']) < 1:
|
|
278 output_strings[1] += ("No mutations found in {}\n"
|
|
279 .format(gene_name))
|
|
280 else:
|
|
281 # Write mutations found to output file
|
|
282 total_unknown_str += "\n%s\n" % (gene_name)
|
|
283
|
|
284 str_tuple = self.mut2str(gene, gene_name, hit['mis_matches'],
|
|
285 unknown_flag, hit)
|
|
286
|
|
287 all_results = str_tuple[0]
|
|
288 total_known = str_tuple[1]
|
|
289 total_unknown = str_tuple[2]
|
|
290 drug_list = str_tuple[3]
|
|
291
|
|
292 # Add results to output strings
|
|
293 if(all_results):
|
|
294 output_strings[0] += "\n" + all_results
|
|
295 output_strings[1] += total_known + "\n"
|
|
296
|
|
297 # Add unknown mutations the total results of
|
|
298 # unknown mutations
|
|
299 total_unknown_str += total_unknown + "\n"
|
|
300
|
|
301 # Add drugs to druglist
|
|
302 unique_drug_list += [drug.lower() for drug in drug_list]
|
|
303
|
|
304 if unknown_flag is True:
|
|
305 output_strings[1] += ("\n\nUnknown Mutations \n"
|
|
306 + total_unknown_str)
|
|
307
|
|
308 # Make Resistance Prediction output
|
|
309
|
|
310 # Go throug all drugs in the database and see if prediction
|
|
311 # can be called.
|
|
312 pred_output = []
|
|
313 for drug in drug_lst:
|
|
314 drug = drug.lower()
|
|
315 # Check if resistance to drug was found
|
|
316 if drug in unique_drug_list:
|
|
317 pred_output.append("1")
|
|
318 else:
|
|
319 # Check at all genes associated with the drug
|
|
320 # resistance where found
|
|
321 all_genes_found = True
|
|
322 for gene in self.drug_genes[drug]:
|
|
323 if gene not in GENES:
|
|
324 all_genes_found = False
|
|
325
|
|
326 if all_genes_found is False:
|
|
327 pred_output.append("?")
|
|
328 else:
|
|
329 pred_output.append("0")
|
|
330
|
|
331 output_strings[2] += "\t".join(pred_output) + "\n"
|
|
332
|
|
333 return output_strings
|
|
334
|
|
335 def write_results(self, out_path, result, res_type, unknown_flag, min_cov,
|
|
336 perc_iden):
|
|
337 """
|
|
338 """
|
|
339
|
|
340 result_str = self.results_to_str(res_type=res_type,
|
|
341 results=result,
|
|
342 unknown_flag=unknown_flag,
|
|
343 min_cov=min_cov,
|
|
344 perc_iden=perc_iden)
|
|
345
|
|
346 with open(out_path + "/PointFinder_results.txt", "w") as fh:
|
|
347 fh.write(result_str[0])
|
|
348 with open(out_path + "/PointFinder_table.txt", "w") as fh:
|
|
349 fh.write(result_str[1])
|
|
350 with open(out_path + "/PointFinder_prediction.txt", "w") as fh:
|
|
351 fh.write(result_str[2])
|
|
352
|
|
353 @staticmethod
|
|
354 def discard_unwanted_results(results, wanted):
|
|
355 """
|
|
356 Takes a dict and a list of keys.
|
|
357 Returns a dict containing only the keys from the list.
|
|
358 """
|
|
359 cleaned_results = dict()
|
|
360 for key, val in results.items():
|
|
361 if(key in wanted):
|
|
362 cleaned_results[key] = val
|
|
363 return cleaned_results
|
|
364
|
|
365 def blast(self, inputfile, out_path, min_cov=0.6, threshold=0.9,
|
|
366 blast="blastn", cut_off=True):
|
|
367 """
|
|
368 """
|
|
369 blast_run = Blaster(inputfile=inputfile, databases=self.gene_list,
|
|
370 db_path=self.specie_path, out_path=out_path,
|
|
371 min_cov=min_cov, threshold=threshold, blast=blast,
|
|
372 cut_off=cut_off)
|
|
373
|
|
374 self.blast_results = blast_run.results # TODO Is this used?
|
|
375 return blast_run
|
|
376
|
|
377 @staticmethod
|
|
378 def get_db_mutations(mut_db_path, gene_list):
|
|
379 """
|
|
380 This function opens the file resistenss-overview.txt, and reads
|
|
381 the content into a dict of dicts. The dict will contain
|
|
382 information about all known mutations given in the database.
|
|
383 This dict is returned.
|
|
384 """
|
|
385
|
|
386 # Initiate variables
|
|
387 known_mutations = dict()
|
|
388 known_stop_codon = dict()
|
|
389 drug_genes = dict()
|
|
390 indelflag = False
|
|
391 stopcodonflag = False
|
|
392
|
|
393 # Go throug mutation file line by line
|
|
394
|
|
395 with open(mut_db_path, "r") as fh:
|
|
396 mutdb_file = fh.readlines()
|
|
397 mutdb_file = [line.strip() for line in mutdb_file]
|
|
398
|
|
399 for line in mutdb_file:
|
|
400 # Ignore headers and check where the indel section starts
|
|
401 if line.startswith("#"):
|
|
402 if "indel" in line.lower():
|
|
403 indelflag = True
|
|
404 elif "stop codon" in line.lower():
|
|
405 stopcodonflag = True
|
|
406 else:
|
|
407 stopcodonflag = False
|
|
408 continue
|
|
409
|
|
410 mutation = [data.strip() for data in line.split("\t")]
|
|
411
|
|
412 gene_ID = mutation[0]
|
|
413
|
|
414 # Only consider mutations in genes found in the gene list
|
|
415 if gene_ID in gene_list:
|
|
416 gene_name = mutation[1]
|
|
417 mut_pos = int(mutation[2])
|
|
418 ref_codon = mutation[3] # Ref_nuc (1 or 3?)
|
|
419 ref_aa = mutation[4] # Ref_codon
|
|
420 alt_aa = mutation[5].split(",") # Res_codon
|
|
421 res_drug = mutation[6].replace("\t", " ")
|
|
422 pmid = mutation[7].split(",")
|
|
423
|
|
424 # Check if stop codons are predictive for resistance
|
|
425 if stopcodonflag is True:
|
|
426 if gene_ID not in known_stop_codon:
|
|
427 known_stop_codon[gene_ID] = {"pos": [],
|
|
428 "drug": res_drug}
|
|
429 known_stop_codon[gene_ID]["pos"].append(mut_pos)
|
|
430
|
|
431 # Add genes associated with drug resistance to drug_genes dict
|
|
432 drug_lst = res_drug.split(",")
|
|
433 drug_lst = [d.strip().lower() for d in drug_lst]
|
|
434 for drug in drug_lst:
|
|
435 if drug not in drug_genes:
|
|
436 drug_genes[drug] = []
|
|
437 if gene_ID not in drug_genes[drug]:
|
|
438 drug_genes[drug].append(gene_ID)
|
|
439
|
|
440 # Initiate empty dict to store relevant mutation information
|
|
441 mut_info = dict()
|
|
442
|
|
443 # Save need mutation info with pmid cooresponding to the amino
|
|
444 # acid change
|
|
445 for i in range(len(alt_aa)):
|
|
446 try:
|
|
447 mut_info[alt_aa[i]] = {"gene_name": gene_name,
|
|
448 "drug": res_drug,
|
|
449 "pmid": pmid[i]}
|
|
450 except IndexError:
|
|
451 mut_info[alt_aa[i]] = {"gene_name": gene_name,
|
|
452 "drug": res_drug,
|
|
453 "pmid": "-"}
|
|
454
|
|
455 # Add all possible types of mutations to the dict
|
|
456 if gene_ID not in known_mutations:
|
|
457 known_mutations[gene_ID] = {"sub": dict(), "ins": dict(),
|
|
458 "del": dict()}
|
|
459 # Check for the type of mutation
|
|
460 if indelflag is False:
|
|
461 mutation_type = "sub"
|
|
462 else:
|
|
463 mutation_type = ref_aa
|
|
464
|
|
465 # Save mutations positions with required information given in
|
|
466 # mut_info
|
|
467 if mut_pos not in known_mutations[gene_ID][mutation_type]:
|
|
468 known_mutations[gene_ID][mutation_type][mut_pos] = dict()
|
|
469 for amino in alt_aa:
|
|
470 if (amino in known_mutations[gene_ID][mutation_type]
|
|
471 [mut_pos]):
|
|
472 stored_mut_info = (known_mutations[gene_ID]
|
|
473 [mutation_type]
|
|
474 [mut_pos][amino])
|
|
475 if stored_mut_info["drug"] != mut_info[amino]["drug"]:
|
|
476 stored_mut_info["drug"] += "," + (mut_info[amino]
|
|
477 ["drug"])
|
|
478 if stored_mut_info["pmid"] != mut_info[amino]["pmid"]:
|
|
479 stored_mut_info["pmid"] += ";" + (mut_info[amino]
|
|
480 ["pmid"])
|
|
481
|
|
482 (known_mutations[gene_ID][mutation_type]
|
|
483 [mut_pos][amino]) = stored_mut_info
|
|
484 else:
|
|
485 (known_mutations[gene_ID][mutation_type]
|
|
486 [mut_pos][amino]) = mut_info[amino]
|
|
487
|
|
488 # Check that all genes in the gene list has known mutations
|
|
489 for gene in gene_list:
|
|
490 if gene not in known_mutations:
|
|
491 known_mutations[gene] = {"sub": dict(), "ins": dict(),
|
|
492 "del": dict()}
|
|
493 return known_mutations, drug_genes, known_stop_codon
|
|
494
|
|
495 def find_best_seqs(self, blast_results, min_cov):
|
|
496 """
|
|
497 This function finds gene sequences with the largest coverage based on
|
|
498 the blast results. If more hits covering different sequences parts
|
|
499 exists it concatinates partial gene hits into one hit.
|
|
500 If different overlap sequences occurs they are saved in the list
|
|
501 alternative_overlaps. The function returns a new results dict where
|
|
502 each gene has an inner dict with hit information corresponding to
|
|
503 the new concatinated hit. If no gene is found the value is a string
|
|
504 with info of why the gene wasn't found.
|
|
505 """
|
|
506
|
|
507 GENES = dict()
|
|
508 for gene, hits in blast_results.items():
|
|
509
|
|
510 # Save all hits in the list 'hits_found'
|
|
511 hits_found = []
|
|
512 GENES[gene] = {}
|
|
513
|
|
514 # Check for gene has any hits in blast results
|
|
515 if gene == 'excluded':
|
|
516 GENES[gene] = hits
|
|
517 continue
|
|
518 elif isinstance(hits, dict) and len(hits) > 0:
|
|
519 GENES[gene]['found'] = 'partially'
|
|
520 GENES[gene]['hits'] = hits
|
|
521 else:
|
|
522 # Gene not found! go to next gene
|
|
523 GENES[gene] = 'No hit found'
|
|
524 continue
|
|
525
|
|
526 # Check coverage for each hit, patch together partial genes hits
|
|
527 for hit_id, hit in hits.items():
|
|
528
|
|
529 # Save hits start and end positions in subject, total subject
|
|
530 # len, and subject and query sequences of each hit
|
|
531 hits_found += [(hit['sbjct_start'], hit['sbjct_end'],
|
|
532 hit['sbjct_string'], hit['query_string'],
|
|
533 hit['sbjct_length'], hit['homo_string'],
|
|
534 hit_id)]
|
|
535
|
|
536 # If coverage is 100% change found to yes
|
|
537 hit_coverage = hit['perc_coverage']
|
|
538 if hit_coverage == 1.0:
|
|
539 GENES[gene]['found'] = 'yes'
|
|
540
|
|
541 # Sort positions found
|
|
542 hits_found = sorted(hits_found, key=lambda x: x[0])
|
|
543
|
|
544 # Find best hit by concatenating sequences if more hits exist
|
|
545
|
|
546 # Get information from the first hit found
|
|
547 all_start = hits_found[0][0]
|
|
548 current_end = hits_found[0][1]
|
|
549 final_sbjct = hits_found[0][2]
|
|
550 final_qry = hits_found[0][3]
|
|
551 sbjct_len = hits_found[0][4]
|
|
552 final_homol = hits_found[0][5]
|
|
553 first_hit_id = hits_found[0][6]
|
|
554
|
|
555 alternative_overlaps = []
|
|
556 contigs = [hit['contig_name']]
|
|
557 scores = [str(hit['cal_score'])]
|
|
558
|
|
559 # Check if more then one hit was found within the same gene
|
|
560 for i in range(len(hits_found) - 1):
|
|
561
|
|
562 # Save information from previous hit
|
|
563 pre_block_start = hits_found[i][0]
|
|
564 pre_block_end = hits_found[i][1]
|
|
565 pre_sbjct = hits_found[i][2]
|
|
566 pre_qry = hits_found[i][3]
|
|
567 pre_homol = hits_found[i][5]
|
|
568 pre_id = hits_found[i][6]
|
|
569
|
|
570 # Save information from next hit
|
|
571 next_block_start = hits_found[i + 1][0]
|
|
572 next_block_end = hits_found[i + 1][1]
|
|
573 next_sbjct = hits_found[i + 1][2]
|
|
574 next_qry = hits_found[i + 1][3]
|
|
575 next_homol = hits_found[i + 1][5]
|
|
576 next_id = hits_found[i + 1][6]
|
|
577
|
|
578 contigs.append(hits[next_id]['contig_name'])
|
|
579 scores.append(str(hits[next_id]['cal_score']))
|
|
580
|
|
581 # Check for overlapping sequences, collaps them and save
|
|
582 # alternative overlaps if any
|
|
583 if next_block_start <= current_end:
|
|
584
|
|
585 # Find overlap start and take gaps into account
|
|
586 pos_count = 0
|
|
587 overlap_pos = pre_block_start
|
|
588 for i in range(len(pre_sbjct)):
|
|
589
|
|
590 # Stop loop if overlap_start position is reached
|
|
591 if overlap_pos == next_block_start:
|
|
592 overlap_start = pos_count
|
|
593 break
|
|
594 if pre_sbjct[i] != "-":
|
|
595 overlap_pos += 1
|
|
596 pos_count += 1
|
|
597
|
|
598 # Find overlap len and add next sequence to final sequence
|
|
599 if len(pre_sbjct[overlap_start:]) > len(next_sbjct):
|
|
600 # <--------->
|
|
601 # <--->
|
|
602 overlap_len = len(next_sbjct)
|
|
603 overlap_end_pos = next_block_end
|
|
604 else:
|
|
605 # <--------->
|
|
606 # <--------->
|
|
607 overlap_len = len(pre_sbjct[overlap_start:])
|
|
608 overlap_end_pos = pre_block_end
|
|
609
|
|
610 # Update current end
|
|
611 current_end = next_block_end
|
|
612
|
|
613 # Use the entire previous sequence and add the last
|
|
614 # part of the next sequence
|
|
615 final_sbjct += next_sbjct[overlap_len:]
|
|
616 final_qry += next_qry[overlap_len:]
|
|
617 final_homol += next_homol[overlap_len:]
|
|
618
|
|
619 # Find query overlap sequences
|
|
620 pre_qry_overlap = pre_qry[overlap_start: (overlap_start
|
|
621 + overlap_len)]
|
|
622 next_qry_overlap = next_qry[:overlap_len]
|
|
623 sbjct_overlap = next_sbjct[:overlap_len]
|
|
624
|
|
625 # If alternative query overlap excist save it
|
|
626 if pre_qry_overlap != next_qry_overlap:
|
|
627 eprint("OVERLAP WARNING:")
|
|
628 eprint("{}\n{}"
|
|
629 .format(pre_qry_overlap, next_qry_overlap))
|
|
630
|
|
631 # Save alternative overlaps
|
|
632 alternative_overlaps += [(next_block_start,
|
|
633 overlap_end_pos,
|
|
634 sbjct_overlap,
|
|
635 next_qry_overlap)]
|
|
636
|
|
637 elif next_block_start > current_end:
|
|
638 # <------->
|
|
639 # <------->
|
|
640 gap_size = next_block_start - current_end - 1
|
|
641 final_qry += "N" * gap_size
|
|
642 final_sbjct += "N" * gap_size
|
|
643 final_homol += "-" * gap_size
|
|
644 current_end = next_block_end
|
|
645 final_sbjct += next_sbjct
|
|
646 final_qry += next_qry
|
|
647 final_homol += next_homol
|
|
648
|
|
649 # Calculate new coverage
|
|
650 no_call = final_qry.upper().count("N")
|
|
651 coverage = ((current_end - all_start + 1 - no_call)
|
|
652 / float(sbjct_len))
|
|
653
|
|
654 # Calculate identity
|
|
655 equal = 0
|
|
656 not_equal = 0
|
|
657 for i in range(len(final_qry)):
|
|
658 if final_qry[i].upper() != "N":
|
|
659 if final_qry[i].upper() == final_sbjct[i].upper():
|
|
660 equal += 1
|
|
661 else:
|
|
662 not_equal += 1
|
|
663 identity = equal / float(equal + not_equal)
|
|
664 if coverage >= min_cov:
|
|
665 GENES[gene]['perc_coverage'] = coverage
|
|
666 GENES[gene]['perc_ident'] = identity
|
|
667 GENES[gene]['sbjct_string'] = final_sbjct
|
|
668 GENES[gene]['query_string'] = final_qry
|
|
669 GENES[gene]['homo_string'] = final_homol
|
|
670 GENES[gene]['contig_name'] = ", ".join(contigs)
|
|
671 GENES[gene]['HSP_length'] = len(final_qry)
|
|
672 GENES[gene]['sbjct_start'] = all_start
|
|
673 GENES[gene]['sbjct_end'] = current_end
|
|
674 GENES[gene]['sbjct_length'] = sbjct_len
|
|
675 GENES[gene]['cal_score'] = ", ".join(scores)
|
|
676 GENES[gene]['gaps'] = no_call
|
|
677 GENES[gene]['alternative_overlaps'] = alternative_overlaps
|
|
678 GENES[gene]['mis_matches'] = []
|
|
679
|
|
680 else:
|
|
681 # Gene not found above given coverage
|
|
682 GENES[gene] = ('Gene found with coverage, %f, below '
|
|
683 'minimum coverage threshold: %s' % (coverage,
|
|
684 min_cov))
|
|
685 return GENES
|
|
686
|
|
687 def find_mismatches(self, gene, sbjct_start, sbjct_seq, qry_seq):
|
|
688 """
|
|
689 This function finds mis matches between two sequeces. Depending
|
|
690 on the the sequence type either the function
|
|
691 find_codon_mismatches or find_nucleotid_mismatches are called,
|
|
692 if the sequences contains both a promoter and a coding region
|
|
693 both functions are called. The function can also call itself if
|
|
694 alternative overlaps is give. All found mismatches are returned
|
|
695 """
|
|
696
|
|
697 # Initiate the mis_matches list that will store all found mis matcehs
|
|
698 mis_matches = []
|
|
699
|
|
700 # Find mis matches in RNA genes
|
|
701 if gene in self.RNA_gene_list:
|
|
702 mis_matches += PointFinder.find_nucleotid_mismatches(sbjct_start,
|
|
703 sbjct_seq,
|
|
704 qry_seq)
|
|
705 else:
|
|
706 # Check if the gene sequence is with a promoter
|
|
707 regex = r"promoter_size_(\d+)(?:bp)"
|
|
708 promtr_gene_objt = re.search(regex, gene)
|
|
709
|
|
710 # Check for promoter sequences
|
|
711 if promtr_gene_objt:
|
|
712 # Get promoter length
|
|
713 promtr_len = int(promtr_gene_objt.group(1))
|
|
714
|
|
715 # Extract promoter sequence, while considering gaps
|
|
716 # --------agt-->----
|
|
717 # ---->?
|
|
718 if sbjct_start <= promtr_len:
|
|
719 # Find position in sbjct sequence where promoter ends
|
|
720 promtr_end = 0
|
|
721 nuc_count = sbjct_start - 1
|
|
722
|
|
723 for i in range(len(sbjct_seq)):
|
|
724 promtr_end += 1
|
|
725
|
|
726 if sbjct_seq[i] != "-":
|
|
727 nuc_count += 1
|
|
728
|
|
729 if nuc_count == promtr_len:
|
|
730 break
|
|
731
|
|
732 # Check if only a part of the promoter is found
|
|
733 # --------agt-->----
|
|
734 # ----
|
|
735 promtr_sbjct_start = -1
|
|
736 if nuc_count < promtr_len:
|
|
737 promtr_sbjct_start = nuc_count - promtr_len
|
|
738
|
|
739 # Get promoter part of subject and query
|
|
740 sbjct_promtr_seq = sbjct_seq[:promtr_end]
|
|
741 qry_promtr_seq = qry_seq[:promtr_end]
|
|
742
|
|
743 # For promoter part find nucleotide mis matches
|
|
744 mis_matches += PointFinder.find_nucleotid_mismatches(
|
|
745 promtr_sbjct_start, sbjct_promtr_seq, qry_promtr_seq,
|
|
746 promoter=True)
|
|
747
|
|
748 # Check if gene is also found
|
|
749 # --------agt-->----
|
|
750 # -----------
|
|
751 if((sbjct_start + len(sbjct_seq.replace("-", "")))
|
|
752 > promtr_len):
|
|
753 sbjct_gene_seq = sbjct_seq[promtr_end:]
|
|
754 qry_gene_seq = qry_seq[promtr_end:]
|
|
755 sbjct_gene_start = 1
|
|
756
|
|
757 # Find mismatches in gene part
|
|
758 mis_matches += PointFinder.find_codon_mismatches(
|
|
759 sbjct_gene_start, sbjct_gene_seq, qry_gene_seq)
|
|
760
|
|
761 # No promoter, only gene is found
|
|
762 # --------agt-->----
|
|
763 # -----
|
|
764 else:
|
|
765 sbjct_gene_start = sbjct_start - promtr_len
|
|
766
|
|
767 # Find mismatches in gene part
|
|
768 mis_matches += PointFinder.find_codon_mismatches(
|
|
769 sbjct_gene_start, sbjct_seq, qry_seq)
|
|
770
|
|
771 else:
|
|
772 # Find mismatches in gene
|
|
773 mis_matches += PointFinder.find_codon_mismatches(
|
|
774 sbjct_start, sbjct_seq, qry_seq)
|
|
775
|
|
776 return mis_matches
|
|
777
|
|
778 @staticmethod
|
|
779 def find_nucleotid_mismatches(sbjct_start, sbjct_seq, qry_seq,
|
|
780 promoter=False):
|
|
781 """
|
|
782 This function takes two alligned sequence (subject and query),
|
|
783 and the position on the subject where the alignment starts. The
|
|
784 sequences are compared one nucleotide at a time. If mis matches
|
|
785 are found they are saved. If a gap is found the function
|
|
786 find_nuc_indel is called to find the entire indel and it is
|
|
787 also saved into the list mis_matches. If promoter sequences are
|
|
788 given as arguments, these are reversed the and the absolut
|
|
789 value of the sequence position used, but when mutations are
|
|
790 saved the negative value and det reverse sequences are saved in
|
|
791 mis_mathces.
|
|
792 """
|
|
793
|
|
794 # Initiate the mis_matches list that will store all found
|
|
795 # mismatcehs
|
|
796 mis_matches = []
|
|
797
|
|
798 sbjct_start = abs(sbjct_start)
|
|
799 seq_pos = sbjct_start
|
|
800
|
|
801 # Set variables depending on promoter status
|
|
802 factor = 1
|
|
803 mut_prefix = "r."
|
|
804
|
|
805 if promoter is True:
|
|
806 factor = (-1)
|
|
807 mut_prefix = "n."
|
|
808 # Reverse promoter sequences
|
|
809 sbjct_seq = sbjct_seq[::-1]
|
|
810 qry_seq = qry_seq[::-1]
|
|
811
|
|
812 # Go through sequences one nucleotide at a time
|
|
813 shift = 0
|
|
814 for index in range(sbjct_start - 1, len(sbjct_seq)):
|
|
815 mut_name = mut_prefix
|
|
816 mut = ""
|
|
817 # Shift index according to gaps
|
|
818 i = index + shift
|
|
819
|
|
820 # If the end of the sequence is reached, stop
|
|
821 if i == len(sbjct_seq):
|
|
822 break
|
|
823
|
|
824 sbjct_nuc = sbjct_seq[i]
|
|
825 qry_nuc = qry_seq[i]
|
|
826
|
|
827 # Check for mis matches
|
|
828 if sbjct_nuc.upper() != qry_nuc.upper():
|
|
829
|
|
830 # check for insertions and deletions
|
|
831 if sbjct_nuc == "-" or qry_nuc == "-":
|
|
832 if sbjct_nuc == "-":
|
|
833 mut = "ins"
|
|
834 indel_start_pos = (seq_pos - 1) * factor
|
|
835 indel_end_pos = seq_pos * factor
|
|
836 indel = PointFinder.find_nuc_indel(sbjct_seq[i:],
|
|
837 qry_seq[i:])
|
|
838 else:
|
|
839 mut = "del"
|
|
840 indel_start_pos = seq_pos * factor
|
|
841 indel = PointFinder.find_nuc_indel(qry_seq[i:],
|
|
842 sbjct_seq[i:])
|
|
843 indel_end_pos = (seq_pos + len(indel) - 1) * factor
|
|
844 seq_pos += len(indel) - 1
|
|
845
|
|
846 # Shift the index to the end of the indel
|
|
847 shift += len(indel) - 1
|
|
848
|
|
849 # Write mutation name, depending on sequnce
|
|
850 if len(indel) == 1 and mut == "del":
|
|
851 mut_name += str(indel_start_pos) + mut + indel
|
|
852 else:
|
|
853 if promoter is True:
|
|
854 # Reverse the sequence and the start and
|
|
855 # end positions
|
|
856 indel = indel[::-1]
|
|
857 temp = indel_start_pos
|
|
858 indel_start_pos = indel_end_pos
|
|
859 indel_end_pos = temp
|
|
860
|
|
861 mut_name += (str(indel_start_pos) + "_"
|
|
862 + str(indel_end_pos) + mut + indel)
|
|
863
|
|
864 mis_matches += [[mut, seq_pos * factor, seq_pos * factor,
|
|
865 indel, mut_name, mut, indel]]
|
|
866
|
|
867 # Check for substitutions mutations
|
|
868 else:
|
|
869 mut = "sub"
|
|
870 mut_name += (str(seq_pos * factor) + sbjct_nuc + ">"
|
|
871 + qry_nuc)
|
|
872
|
|
873 mis_matches += [[mut, seq_pos * factor, seq_pos * factor,
|
|
874 qry_nuc, mut_name, sbjct_nuc, qry_nuc]]
|
|
875
|
|
876 # Increment sequence position
|
|
877 if mut != "ins":
|
|
878 seq_pos += 1
|
|
879
|
|
880 return mis_matches
|
|
881
|
|
882 @staticmethod
|
|
883 def find_nuc_indel(gapped_seq, indel_seq):
|
|
884 """
|
|
885 This function finds the entire indel missing in from a gapped
|
|
886 sequence compared to the indel_seqeunce. It is assumes that the
|
|
887 sequences start with the first position of the gap.
|
|
888 """
|
|
889 ref_indel = indel_seq[0]
|
|
890 for j in range(1, len(gapped_seq)):
|
|
891 if gapped_seq[j] == "-":
|
|
892 ref_indel += indel_seq[j]
|
|
893 else:
|
|
894 break
|
|
895 return ref_indel
|
|
896
|
|
897 @staticmethod
|
|
898 def aa(codon):
|
|
899 """
|
|
900 This function converts a codon to an amino acid. If the codon
|
|
901 is not valid an error message is given, or else, the amino acid
|
|
902 is returned.
|
|
903
|
|
904 Potential future issue: If species are added that utilizes
|
|
905 alternative translation tables.
|
|
906 """
|
|
907 codon = codon.upper()
|
|
908 aa = {"ATT": "I", "ATC": "I", "ATA": "I",
|
|
909 "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", "TTA": "L",
|
|
910 "TTG": "L",
|
|
911 "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
|
|
912 "TTT": "F", "TTC": "F",
|
|
913 "ATG": "M",
|
|
914 "TGT": "C", "TGC": "C",
|
|
915 "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
|
|
916 "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
|
|
917 "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
|
|
918 "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
|
|
919 "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "AGT": "S",
|
|
920 "AGC": "S",
|
|
921 "TAT": "Y", "TAC": "Y",
|
|
922 "TGG": "W",
|
|
923 "CAA": "Q", "CAG": "Q",
|
|
924 "AAT": "N", "AAC": "N",
|
|
925 "CAT": "H", "CAC": "H",
|
|
926 "GAA": "E", "GAG": "E",
|
|
927 "GAT": "D", "GAC": "D",
|
|
928 "AAA": "K", "AAG": "K",
|
|
929 "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R",
|
|
930 "AGG": "R",
|
|
931 "TAA": "*", "TAG": "*", "TGA": "*"}
|
|
932 # Translate valid codon
|
|
933 try:
|
|
934 amino_a = aa[codon]
|
|
935 except KeyError:
|
|
936 amino_a = "?"
|
|
937 return amino_a
|
|
938
|
|
939 @staticmethod
|
|
940 def get_codon(seq, codon_no, start_offset):
|
|
941 """
|
|
942 This function takes a sequece and a codon number and returns
|
|
943 the codon found in the sequence at that position
|
|
944 """
|
|
945 seq = seq.replace("-", "")
|
|
946 codon_start_pos = int(codon_no - 1) * 3 - start_offset
|
|
947 codon = seq[codon_start_pos:codon_start_pos + 3]
|
|
948 return codon
|
|
949
|
|
950 @staticmethod
|
|
951 def name_insertion(sbjct_seq, codon_no, sbjct_nucs, aa_alt, start_offset):
|
|
952 """
|
|
953 This function is used to name a insertion mutation based on the
|
|
954 HGVS recommendation.
|
|
955 """
|
|
956 start_codon_no = codon_no - 1
|
|
957
|
|
958 if len(sbjct_nucs) == 3:
|
|
959 start_codon_no = codon_no
|
|
960
|
|
961 start_codon = PointFinder.get_codon(sbjct_seq, start_codon_no,
|
|
962 start_offset)
|
|
963 end_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset)
|
|
964 pos_name = "p.%s%d_%s%dins%s" % (PointFinder.aa(start_codon),
|
|
965 start_codon_no,
|
|
966 PointFinder.aa(end_codon),
|
|
967 codon_no, aa_alt)
|
|
968 return pos_name
|
|
969
|
|
970 @staticmethod
|
|
971 def name_deletion(sbjct_seq, sbjct_rf_indel, sbjct_nucs, codon_no, aa_alt,
|
|
972 start_offset, mutation="del"):
|
|
973 """
|
|
974 This function is used to name a deletion mutation based on the
|
|
975 HGVS recommendation. If combination of a deletion and an
|
|
976 insertion is identified the argument 'mutation' is set to
|
|
977 'delins' and the mutation name will indicate that the mutation
|
|
978 is a delins mutation.
|
|
979 """
|
|
980 del_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset)
|
|
981 pos_name = "p.%s%d" % (PointFinder.aa(del_codon), codon_no)
|
|
982
|
|
983 # This has been changed
|
|
984 if len(sbjct_rf_indel) == 3 and mutation == "del":
|
|
985 return pos_name + mutation
|
|
986
|
|
987 end_codon_no = codon_no + math.ceil(len(sbjct_nucs) / 3) - 1
|
|
988 end_codon = PointFinder.get_codon(sbjct_seq, end_codon_no,
|
|
989 start_offset)
|
|
990 pos_name += "_%s%d%s" % (PointFinder.aa(end_codon), end_codon_no,
|
|
991 mutation)
|
|
992 if mutation == "delins":
|
|
993 pos_name += aa_alt
|
|
994 return pos_name
|
|
995
|
|
996 @staticmethod
|
|
997 def name_indel_mutation(sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel,
|
|
998 codon_no, mut, start_offset):
|
|
999 """
|
|
1000 This function serves to name the individual mutations
|
|
1001 dependently on the type of the mutation.
|
|
1002 """
|
|
1003 # Get the subject and query sequences without gaps
|
|
1004 sbjct_nucs = sbjct_rf_indel.replace("-", "")
|
|
1005 qry_nucs = qry_rf_indel.replace("-", "")
|
|
1006
|
|
1007 # Translate nucleotides to amino acids
|
|
1008 aa_ref = ""
|
|
1009 aa_alt = ""
|
|
1010
|
|
1011 for i in range(0, len(sbjct_nucs), 3):
|
|
1012 aa_ref += PointFinder.aa(sbjct_nucs[i:i + 3])
|
|
1013
|
|
1014 for i in range(0, len(qry_nucs), 3):
|
|
1015 aa_alt += PointFinder.aa(qry_nucs[i:i + 3])
|
|
1016
|
|
1017 # Identify the gapped sequence
|
|
1018 if mut == "ins":
|
|
1019 gapped_seq = sbjct_rf_indel
|
|
1020 else:
|
|
1021 gapped_seq = qry_rf_indel
|
|
1022
|
|
1023 gap_size = gapped_seq.count("-")
|
|
1024
|
|
1025 # Write mutation names
|
|
1026 if gap_size < 3 and len(sbjct_nucs) == 3 and len(qry_nucs) == 3:
|
|
1027 # Write mutation name for substitution mutation
|
|
1028 mut_name = "p.%s%d%s" % (PointFinder.aa(sbjct_nucs), codon_no,
|
|
1029 PointFinder.aa(qry_nucs))
|
|
1030 elif len(gapped_seq) == gap_size:
|
|
1031
|
|
1032 if mut == "ins":
|
|
1033 # Write mutation name for insertion mutation
|
|
1034 mut_name = PointFinder.name_insertion(sbjct_seq, codon_no,
|
|
1035 sbjct_nucs, aa_alt,
|
|
1036 start_offset)
|
|
1037 aa_ref = mut
|
|
1038 else:
|
|
1039 # Write mutation name for deletion mutation
|
|
1040 mut_name = PointFinder.name_deletion(sbjct_seq, sbjct_rf_indel,
|
|
1041 sbjct_nucs, codon_no,
|
|
1042 aa_alt, start_offset,
|
|
1043 mutation="del")
|
|
1044 aa_alt = mut
|
|
1045 # Check for delins - mix of insertion and deletion
|
|
1046 else:
|
|
1047 # Write mutation name for a mixed insertion and deletion
|
|
1048 # mutation
|
|
1049 mut_name = PointFinder.name_deletion(sbjct_seq,
|
|
1050 sbjct_rf_indel, sbjct_nucs,
|
|
1051 codon_no, aa_alt,
|
|
1052 start_offset,
|
|
1053 mutation="delins")
|
|
1054
|
|
1055 # Check for frameshift
|
|
1056 if gapped_seq.count("-") % 3 != 0:
|
|
1057 # Add the frameshift tag to mutation name
|
|
1058 mut_name += " - Frameshift"
|
|
1059
|
|
1060 return mut_name, aa_ref, aa_alt
|
|
1061
|
|
1062 @staticmethod
|
|
1063 def get_inframe_gap(seq, nucs_needed=3):
|
|
1064 """
|
|
1065 This funtion takes a sequnece starting with a gap or the
|
|
1066 complementary seqeuence to the gap, and the number of
|
|
1067 nucleotides that the seqeunce should contain in order to
|
|
1068 maintain the correct reading frame. The sequence is gone
|
|
1069 through and the number of non-gap characters are counted. When
|
|
1070 the number has reach the number of needed nucleotides the indel
|
|
1071 is returned. If the indel is a 'clean' insert or deletion that
|
|
1072 starts in the start of a codon and can be divided by 3, then
|
|
1073 only the gap is returned.
|
|
1074 """
|
|
1075 nuc_count = 0
|
|
1076 gap_indel = ""
|
|
1077 nucs = ""
|
|
1078
|
|
1079 for i in range(len(seq)):
|
|
1080 # Check if the character is not a gap
|
|
1081 if seq[i] != "-":
|
|
1082 # Check if the indel is a 'clean'
|
|
1083 # i.e. if the insert or deletion starts at the first
|
|
1084 # nucleotide in the codon and can be divided by 3
|
|
1085
|
|
1086 if(gap_indel.count("-") == len(gap_indel)
|
|
1087 and gap_indel.count("-") >= 3 and len(gap_indel) != 0):
|
|
1088 return gap_indel
|
|
1089
|
|
1090 nuc_count += 1
|
|
1091 gap_indel += seq[i]
|
|
1092
|
|
1093 # if the number of nucleotides in the indel equals the amount
|
|
1094 # needed for the indel, the indel is returned.
|
|
1095 if nuc_count == nucs_needed:
|
|
1096 return gap_indel
|
|
1097
|
|
1098 # This will only happen if the gap is in the very end of a sequence
|
|
1099 return gap_indel
|
|
1100
|
|
1101 @staticmethod
|
|
1102 def get_indels(sbjct_seq, qry_seq, start_pos):
|
|
1103 """
|
|
1104 This function uses regex to find inserts and deletions in
|
|
1105 sequences given as arguments. A list of these indels are
|
|
1106 returned. The list includes, type of mutations(ins/del),
|
|
1107 subject codon no of found mutation, subject sequence position,
|
|
1108 insert/deletions nucleotide sequence, and the affected qry
|
|
1109 codon no.
|
|
1110 """
|
|
1111
|
|
1112 seqs = [sbjct_seq, qry_seq]
|
|
1113 indels = []
|
|
1114 gap_obj = re.compile(r"-+")
|
|
1115 for i in range(len(seqs)):
|
|
1116 for match in gap_obj.finditer(seqs[i]):
|
|
1117 pos = int(match.start())
|
|
1118 gap = match.group()
|
|
1119
|
|
1120 # Find position of the mutation corresponding to the
|
|
1121 # subject sequence
|
|
1122 sbj_pos = len(sbjct_seq[:pos].replace("-", "")) + start_pos
|
|
1123
|
|
1124 # Get indel sequence and the affected sequences in
|
|
1125 # sbjct and qry in the reading frame
|
|
1126 indel = seqs[abs(i - 1)][pos:pos + len(gap)]
|
|
1127
|
|
1128 # Find codon number for mutation
|
|
1129 codon_no = int(math.ceil((sbj_pos) / 3))
|
|
1130 qry_pos = len(qry_seq[:pos].replace("-", "")) + start_pos
|
|
1131 qry_codon = int(math.ceil((qry_pos) / 3))
|
|
1132
|
|
1133 if i == 0:
|
|
1134 mut = "ins"
|
|
1135 else:
|
|
1136 mut = "del"
|
|
1137
|
|
1138 indels.append([mut, codon_no, sbj_pos, indel, qry_codon])
|
|
1139
|
|
1140 # Sort indels based on codon position and sequence position
|
|
1141 indels = sorted(indels, key=lambda x: (x[1], x[2]))
|
|
1142
|
|
1143 return indels
|
|
1144
|
|
1145 @staticmethod
|
|
1146 def find_codon_mismatches(sbjct_start, sbjct_seq, qry_seq):
|
|
1147 """
|
|
1148 This function takes two alligned sequence (subject and query),
|
|
1149 and the position on the subject where the alignment starts. The
|
|
1150 sequences are compared codon by codon. If a mis matches is
|
|
1151 found it is saved in 'mis_matches'. If a gap is found the
|
|
1152 function get_inframe_gap is used to find the indel sequence and
|
|
1153 keep the sequence in the correct reading frame. The function
|
|
1154 translate_indel is used to name indel mutations and translate
|
|
1155 the indels to amino acids The function returns a list of tuples
|
|
1156 containing all needed informations about the mutation in order
|
|
1157 to look it up in the database dict known mutation and the with
|
|
1158 the output files the the user.
|
|
1159 """
|
|
1160 mis_matches = []
|
|
1161
|
|
1162 # Find start pos of first codon in frame, i_start
|
|
1163 codon_offset = (sbjct_start - 1) % 3
|
|
1164 i_start = 0
|
|
1165
|
|
1166 if codon_offset != 0:
|
|
1167 i_start = 3 - codon_offset
|
|
1168
|
|
1169 sbjct_start = sbjct_start + i_start
|
|
1170
|
|
1171 # Set sequences in frame
|
|
1172 sbjct_seq = sbjct_seq[i_start:]
|
|
1173 qry_seq = qry_seq[i_start:]
|
|
1174
|
|
1175 # Find codon number of the first codon in the sequence, start
|
|
1176 # at 0
|
|
1177 codon_no = int((sbjct_start - 1) / 3) # 1,2,3 start on 0
|
|
1178
|
|
1179 # s_shift and q_shift are used when gaps appears
|
|
1180 q_shift = 0
|
|
1181 s_shift = 0
|
|
1182 mut_no = 0
|
|
1183
|
|
1184 # Find inserts and deletions in sequence
|
|
1185 indel_no = 0
|
|
1186 indels = PointFinder.get_indels(sbjct_seq, qry_seq, sbjct_start)
|
|
1187
|
|
1188 # Go through sequence and save mutations when found
|
|
1189 for index in range(0, len(sbjct_seq), 3):
|
|
1190 # Count codon number
|
|
1191 codon_no += 1
|
|
1192
|
|
1193 # Shift index according to gaps
|
|
1194 s_i = index + s_shift
|
|
1195 q_i = index + q_shift
|
|
1196
|
|
1197 # Get codons
|
|
1198 sbjct_codon = sbjct_seq[s_i:s_i + 3]
|
|
1199 qry_codon = qry_seq[q_i:q_i + 3]
|
|
1200
|
|
1201 if(len(sbjct_seq[s_i:].replace("-", ""))
|
|
1202 + len(qry_codon[q_i:].replace("-", "")) < 6):
|
|
1203 break
|
|
1204
|
|
1205 # Check for mutations
|
|
1206 if sbjct_codon.upper() != qry_codon.upper():
|
|
1207
|
|
1208 # Check for codon insertions and deletions and
|
|
1209 # frameshift mutations
|
|
1210 if "-" in sbjct_codon or "-" in qry_codon:
|
|
1211
|
|
1212 # Get indel info
|
|
1213 try:
|
|
1214 indel_data = indels[indel_no]
|
|
1215 except IndexError:
|
|
1216 sys.exit("indel_data list is out of range, bug!")
|
|
1217
|
|
1218 mut = indel_data[0]
|
|
1219 codon_no_indel = indel_data[1]
|
|
1220 seq_pos = indel_data[2] + sbjct_start - 1
|
|
1221 indel = indel_data[3]
|
|
1222 indel_no += 1
|
|
1223
|
|
1224 # Get the affected sequence in frame for both for
|
|
1225 # sbjct and qry
|
|
1226 if mut == "ins":
|
|
1227 sbjct_rf_indel = PointFinder.get_inframe_gap(
|
|
1228 sbjct_seq[s_i:], 3)
|
|
1229 qry_rf_indel = PointFinder.get_inframe_gap(
|
|
1230 qry_seq[q_i:],
|
|
1231 int(math.floor(len(sbjct_rf_indel) / 3) * 3))
|
|
1232 else:
|
|
1233 qry_rf_indel = PointFinder.get_inframe_gap(
|
|
1234 qry_seq[q_i:], 3)
|
|
1235 sbjct_rf_indel = PointFinder.get_inframe_gap(
|
|
1236 sbjct_seq[s_i:],
|
|
1237 int(math.floor(len(qry_rf_indel) / 3) * 3))
|
|
1238
|
|
1239 mut_name, aa_ref, aa_alt = PointFinder.name_indel_mutation(
|
|
1240 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel,
|
|
1241 codon_no, mut, sbjct_start - 1)
|
|
1242
|
|
1243 # Set index to the correct reading frame after the
|
|
1244 # indel gap
|
|
1245 shift_diff_before = abs(s_shift - q_shift)
|
|
1246 s_shift += len(sbjct_rf_indel) - 3
|
|
1247 q_shift += len(qry_rf_indel) - 3
|
|
1248 shift_diff = abs(s_shift - q_shift)
|
|
1249
|
|
1250 if shift_diff_before != 0 and shift_diff % 3 == 0:
|
|
1251
|
|
1252 if s_shift > q_shift:
|
|
1253 nucs_needed = (int((len(sbjct_rf_indel) / 3) * 3)
|
|
1254 + shift_diff)
|
|
1255 pre_qry_indel = qry_rf_indel
|
|
1256 qry_rf_indel = PointFinder.get_inframe_gap(
|
|
1257 qry_seq[q_i:], nucs_needed)
|
|
1258 q_shift += len(qry_rf_indel) - len(pre_qry_indel)
|
|
1259
|
|
1260 elif q_shift > s_shift:
|
|
1261 nucs_needed = (int((len(qry_rf_indel) / 3) * 3)
|
|
1262 + shift_diff)
|
|
1263 pre_sbjct_indel = sbjct_rf_indel
|
|
1264 sbjct_rf_indel = PointFinder.get_inframe_gap(
|
|
1265 sbjct_seq[s_i:], nucs_needed)
|
|
1266 s_shift += (len(sbjct_rf_indel)
|
|
1267 - len(pre_sbjct_indel))
|
|
1268
|
|
1269 mut_name, aa_ref, aa_alt = (
|
|
1270 PointFinder.name_indel_mutation(
|
|
1271 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel,
|
|
1272 codon_no, mut, sbjct_start - 1)
|
|
1273 )
|
|
1274 if "Frameshift" in mut_name:
|
|
1275 mut_name = (mut_name.split("-")[0]
|
|
1276 + "- Frame restored")
|
|
1277 if mut_name is "p.V940delins - Frame restored":
|
|
1278 sys.exit()
|
|
1279 mis_matches += [[mut, codon_no_indel, seq_pos, indel,
|
|
1280 mut_name, sbjct_rf_indel, qry_rf_indel,
|
|
1281 aa_ref, aa_alt]]
|
|
1282
|
|
1283 # Check if the next mutation in the indels list is
|
|
1284 # in the current codon.
|
|
1285 # Find the number of individul gaps in the
|
|
1286 # evaluated sequence.
|
|
1287 no_of_indels = (len(re.findall("\-\w", sbjct_rf_indel))
|
|
1288 + len(re.findall("\-\w", qry_rf_indel)))
|
|
1289
|
|
1290 if no_of_indels > 1:
|
|
1291
|
|
1292 for j in range(indel_no, indel_no + no_of_indels - 1):
|
|
1293 try:
|
|
1294 indel_data = indels[j]
|
|
1295 except IndexError:
|
|
1296 sys.exit("indel_data list is out of range, "
|
|
1297 "bug!")
|
|
1298 mut = indel_data[0]
|
|
1299 codon_no_indel = indel_data[1]
|
|
1300 seq_pos = indel_data[2] + sbjct_start - 1
|
|
1301 indel = indel_data[3]
|
|
1302 indel_no += 1
|
|
1303 mis_matches += [[mut, codon_no_indel, seq_pos,
|
|
1304 indel, mut_name, sbjct_rf_indel,
|
|
1305 qry_rf_indel, aa_ref, aa_alt]]
|
|
1306
|
|
1307 # Set codon number, and save nucleotides from out
|
|
1308 # of frame mutations
|
|
1309 if mut == "del":
|
|
1310 codon_no += int((len(sbjct_rf_indel) - 3) / 3)
|
|
1311 # If evaluated insert is only gaps codon_no should
|
|
1312 # not increment
|
|
1313 elif sbjct_rf_indel.count("-") == len(sbjct_rf_indel):
|
|
1314 codon_no -= 1
|
|
1315
|
|
1316 # Check of point mutations
|
|
1317 else:
|
|
1318 mut = "sub"
|
|
1319 aa_ref = PointFinder.aa(sbjct_codon)
|
|
1320 aa_alt = PointFinder.aa(qry_codon)
|
|
1321
|
|
1322 if aa_ref != aa_alt:
|
|
1323 # End search for mutation if a premature stop
|
|
1324 # codon is found
|
|
1325 mut_name = "p." + aa_ref + str(codon_no) + aa_alt
|
|
1326
|
|
1327 mis_matches += [[mut, codon_no, codon_no, aa_alt,
|
|
1328 mut_name, sbjct_codon, qry_codon,
|
|
1329 aa_ref, aa_alt]]
|
|
1330 # If a Premature stop codon occur report it an stop the
|
|
1331 # loop
|
|
1332
|
|
1333 try:
|
|
1334 if mis_matches[-1][-1] == "*":
|
|
1335 mut_name += " - Premature stop codon"
|
|
1336 mis_matches[-1][4] = (mis_matches[-1][4].split("-")[0]
|
|
1337 + " - Premature stop codon")
|
|
1338 break
|
|
1339 except IndexError:
|
|
1340 pass
|
|
1341
|
|
1342 # Sort mutations on position
|
|
1343 mis_matches = sorted(mis_matches, key=lambda x: x[1])
|
|
1344
|
|
1345 return mis_matches
|
|
1346
|
|
1347 @staticmethod
|
|
1348 def mutstr2mutdict(m):
|
|
1349 out_dict = {}
|
|
1350
|
|
1351 # Protein / Amino acid mutations
|
|
1352 # Ex.: "p.T83I"
|
|
1353 if(m.startswith("p.")):
|
|
1354 out_dict["nucleotide"] = False
|
|
1355
|
|
1356 # Remove frameshift tag
|
|
1357 frameshift_match = re.search(r"(.+) - Frameshift.*$", m)
|
|
1358 if(frameshift_match):
|
|
1359 m = frameshift_match.group(1)
|
|
1360 out_dict["frameshift"] = True
|
|
1361
|
|
1362 # Remove frame restored tag
|
|
1363 framerestored_match = re.search(r"(.+) - Frame restored.*$", m)
|
|
1364 if(framerestored_match):
|
|
1365 m = framerestored_match.group(1)
|
|
1366 out_dict["frame restored"] = True
|
|
1367
|
|
1368 # Remove premature stop tag
|
|
1369 prem_stop_match = re.search(r"(.+) - Premature stop codon.*$", m)
|
|
1370 if(prem_stop_match):
|
|
1371 m = prem_stop_match.group(1)
|
|
1372 out_dict["prem_stop"] = True
|
|
1373
|
|
1374 # TODO: premature or frameshift tag adds too many whitespaces
|
|
1375 m = m.strip()
|
|
1376
|
|
1377 # Delins
|
|
1378 multi_delins_match = re.search(
|
|
1379 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins(\S+)$", m)
|
|
1380 single_delins_match = re.search(
|
|
1381 r"^p.(\D{1})(\d+)delins(\S+)$", m)
|
|
1382 # TODO: is both necessary?
|
|
1383 multi_delins_match2 = re.search(
|
|
1384 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins$", m)
|
|
1385 single_delins_match2 = re.search(
|
|
1386 r"^p.(\D{1})(\d+)delins$", m)
|
|
1387 multi_ins_match = re.search(
|
|
1388 r"^p.(\D{1})(\d+)_(\D{1})(\d+)ins(\D*)$", m)
|
|
1389 if(multi_delins_match or single_delins_match):
|
|
1390 out_dict["deletion"] = True
|
|
1391 out_dict["insertion"] = True
|
|
1392 if(single_delins_match):
|
|
1393 out_dict["ref_aa"] = single_delins_match.group(1)
|
|
1394 out_dict["pos"] = single_delins_match.group(2)
|
|
1395 out_dict["mut_aa"] = single_delins_match.group(3)
|
|
1396 else:
|
|
1397 out_dict["ref_aa"] = multi_delins_match.group(1)
|
|
1398 out_dict["pos"] = multi_delins_match.group(2)
|
|
1399 out_dict["ref_aa_right"] = multi_delins_match.group(3)
|
|
1400 out_dict["mut_end"] = multi_delins_match.group(4)
|
|
1401 out_dict["mut_aa"] = multi_delins_match.group(5)
|
|
1402 # Deletions
|
|
1403 elif(m[-3:] == "del"):
|
|
1404 single_del_match = re.search(
|
|
1405 r"^p.(\D{1})(\d+)del$", m)
|
|
1406 multi_del_match = re.search(
|
|
1407 r"^p.(\D{1})(\d+)_(\D{1})(\d+)del$", m)
|
|
1408 out_dict["deletion"] = True
|
|
1409 if(single_del_match):
|
|
1410 out_dict["ref_aa"] = single_del_match.group(1)
|
|
1411 out_dict["pos"] = single_del_match.group(2)
|
|
1412 else:
|
|
1413 out_dict["ref_aa"] = multi_del_match.group(1)
|
|
1414 out_dict["pos"] = multi_del_match.group(2)
|
|
1415 out_dict["ref_aa_right"] = multi_del_match.group(3)
|
|
1416 out_dict["mut_end"] = multi_del_match.group(4)
|
|
1417 # Insertions
|
|
1418 elif(multi_ins_match):
|
|
1419 out_dict["insertion"] = True
|
|
1420 out_dict["ref_aa"] = multi_ins_match.group(1).lower()
|
|
1421 out_dict["pos"] = multi_ins_match.group(2)
|
|
1422 out_dict["ref_aa_right"] = multi_ins_match.group(3).lower()
|
|
1423 if(multi_ins_match.group(4)):
|
|
1424 out_dict["mut_aa"] = multi_ins_match.group(4).lower()
|
|
1425 # Substitutions
|
|
1426 else:
|
|
1427 sub_match = re.search(
|
|
1428 r"^p.(\D{1})(\d+)(\D{1})$", m)
|
|
1429 out_dict["ref_aa"] = sub_match.group(1).lower()
|
|
1430 out_dict["pos"] = sub_match.group(2)
|
|
1431 out_dict["mut_aa"] = sub_match.group(3).lower()
|
|
1432
|
|
1433 # Nucleotide mutations
|
|
1434 # Ex. sub: n.-42T>C
|
|
1435 # Ex. ins: n.-13_-14insG (TODO: Verify)
|
|
1436 # Ex. del: n.42delT (TODO: Verify)
|
|
1437 # Ex. del: n.42_45del (TODO: Verify)
|
|
1438 elif(m.startswith("n.") or m.startswith("r.")):
|
|
1439 out_dict["nucleotide"] = True
|
|
1440
|
|
1441 sub_match = re.search(
|
|
1442 r"^[nr]{1}\.(-{0,1}\d+)(\D{1})>(\D{1})$", m)
|
|
1443 ins_match = re.search(
|
|
1444 r"^[nr]{1}\.(-{0,1}\d+)_(-{0,1}\d+)ins(\S+)$", m)
|
|
1445 # r.541delA
|
|
1446 del_match = re.search((
|
|
1447 r"^[nr]{1}\.(-{0,1}\d+)_{0,1}(-{0,1}\d*)del(\S*)$"), m)
|
|
1448 if(sub_match):
|
|
1449 out_dict["pos"] = sub_match.group(1)
|
|
1450 out_dict["ref_nuc"] = sub_match.group(2)
|
|
1451 out_dict["mut_nuc"] = sub_match.group(3)
|
|
1452 elif(ins_match):
|
|
1453 out_dict["insertion"] = True
|
|
1454 out_dict["pos"] = ins_match.group(1)
|
|
1455 out_dict["mut_end"] = ins_match.group(2)
|
|
1456 out_dict["mut_nuc"] = ins_match.group(3)
|
|
1457 elif(del_match):
|
|
1458 out_dict["deletion"] = True
|
|
1459 out_dict["pos"] = del_match.group(1)
|
|
1460 if(del_match.group(2)):
|
|
1461 out_dict["mut_end"] = del_match.group(2)
|
|
1462 if(del_match.group(3)):
|
|
1463 out_dict["ref_nuc"] = del_match.group(3)
|
|
1464 else:
|
|
1465 sys.exit("ERROR: Nucleotide deletion did not contain any "
|
|
1466 "reference sequence. mut string: {}".format(m))
|
|
1467
|
|
1468 return out_dict
|
|
1469
|
|
1470 def get_mutations(self, gene, gene_name, mis_matches, unknown_flag, hit):
|
|
1471 RNA = False
|
|
1472 if gene in self.RNA_gene_list:
|
|
1473 RNA = True
|
|
1474
|
|
1475 known_muts = []
|
|
1476 unknown_muts = []
|
|
1477
|
|
1478 # Go through each mutation
|
|
1479 for i in range(len(mis_matches)):
|
|
1480 m_type = mis_matches[i][0]
|
|
1481 pos = mis_matches[i][1] # sort on pos?
|
|
1482 look_up_pos = mis_matches[i][2]
|
|
1483 look_up_mut = mis_matches[i][3]
|
|
1484 mut_name = mis_matches[i][4]
|
|
1485 # nuc_ref = mis_matches[i][5]
|
|
1486 # nuc_alt = mis_matches[i][6]
|
|
1487 ref = mis_matches[i][-2]
|
|
1488 alt = mis_matches[i][-1]
|
|
1489 mut_dict = PointFinder.mutstr2mutdict(mut_name)
|
|
1490
|
|
1491 mut_id = ("{gene}_{pos}_{alt}"
|
|
1492 .format(gene=gene_name, pos=pos, alt=alt.lower()))
|
|
1493
|
|
1494 ref_aa = mut_dict.get("ref_aa", None)
|
|
1495 ref_aa_right = mut_dict.get("ref_aa_right", None)
|
|
1496 mut_aa = mut_dict.get("mut_aa", None)
|
|
1497 ref_nuc = mut_dict.get("ref_nuc", None)
|
|
1498 mut_nuc = mut_dict.get("mut_nuc", None)
|
|
1499 is_nuc = mut_dict.get("nucleotide", None)
|
|
1500 is_ins = mut_dict.get("insertion", None)
|
|
1501 is_del = mut_dict.get("deletion", None)
|
|
1502 mut_end = mut_dict.get("mut_end", None)
|
|
1503 prem_stop = mut_dict.get("prem_stop", False)
|
|
1504
|
|
1505 mut = ResMutation(unique_id=mut_id, seq_region=gene_name, pos=pos,
|
|
1506 hit=hit, ref_codon=ref_nuc, mut_codon=mut_nuc,
|
|
1507 ref_aa=ref_aa, ref_aa_right=ref_aa_right,
|
|
1508 mut_aa=mut_aa, insertion=is_ins, deletion=is_del,
|
|
1509 end=mut_end, nuc=is_nuc,
|
|
1510 premature_stop=prem_stop, ab_class=None)
|
|
1511
|
|
1512 if "Premature stop codon" in mut_name:
|
|
1513 sbjct_len = hit['sbjct_length']
|
|
1514 qry_len = pos * 3
|
|
1515 perc_trunc = round(
|
|
1516 ((float(sbjct_len) - qry_len)
|
|
1517 / float(sbjct_len))
|
|
1518 * 100, 2
|
|
1519 )
|
|
1520 mut.premature_stop = perc_trunc
|
|
1521
|
|
1522 # Check if mutation is known
|
|
1523 gene_mut_name, resistence, pmid = self.look_up_known_muts(
|
|
1524 gene, look_up_pos, look_up_mut, m_type, gene_name)
|
|
1525
|
|
1526 # Collect known mutations
|
|
1527 if resistence != "Unknown":
|
|
1528 known_muts.append(mut)
|
|
1529 # Collect unknown mutations
|
|
1530 else:
|
|
1531 unknown_muts.append(mut)
|
|
1532
|
|
1533 # TODO: Use ResMutation class to make sure identical mutations are
|
|
1534 # not kept.
|
|
1535
|
|
1536 return (known_muts, unknown_muts)
|
|
1537
|
|
1538 def mut2str(self, gene, gene_name, mis_matches, unknown_flag, hit):
|
|
1539 """
|
|
1540 This function takes a gene name a list of mis matches found
|
|
1541 between subject and query of this gene, the dictionary of
|
|
1542 known mutation in the point finder database, and the flag
|
|
1543 telling weather the user wants unknown mutations to be
|
|
1544 reported. All mis matches are looked up in the known
|
|
1545 mutation dict to se if the mutation is known, and in this
|
|
1546 case what drug resistence it causes. The funtions return 3
|
|
1547 strings that are used as output to the users. One string is
|
|
1548 only tab seperated and contains the mutations listed line
|
|
1549 by line. If the unknown flag is set to true it will contain
|
|
1550 both known and unknown mutations. The next string contains
|
|
1551 only known mutation and are given in in a format that is
|
|
1552 easy to convert to HTML. The last string is the HTML tab
|
|
1553 sting from the unknown mutations.
|
|
1554 """
|
|
1555 known_header = ("Mutation\tNucleotide change\tAmino acid change\t"
|
|
1556 "Resistance\tPMID\n")
|
|
1557
|
|
1558 unknown_header = "Mutation\tNucleotide change\tAmino acid change\n"
|
|
1559
|
|
1560 RNA = False
|
|
1561 if gene in self.RNA_gene_list:
|
|
1562 RNA = True
|
|
1563 known_header = "Mutation\tNucleotide change\tResistance\tPMID\n"
|
|
1564 unknown_header = "Mutation\tNucleotide change\n"
|
|
1565
|
|
1566 known_lst = []
|
|
1567 unknown_lst = []
|
|
1568 all_results_lst = []
|
|
1569 output_mut = []
|
|
1570 stop_codons = []
|
|
1571
|
|
1572 # Go through each mutation
|
|
1573 for i in range(len(mis_matches)):
|
|
1574 m_type = mis_matches[i][0]
|
|
1575 pos = mis_matches[i][1] # sort on pos?
|
|
1576 look_up_pos = mis_matches[i][2]
|
|
1577 look_up_mut = mis_matches[i][3]
|
|
1578 mut_name = mis_matches[i][4]
|
|
1579 nuc_ref = mis_matches[i][5]
|
|
1580 nuc_alt = mis_matches[i][6]
|
|
1581 ref = mis_matches[i][-2]
|
|
1582 alt = mis_matches[i][-1]
|
|
1583
|
|
1584 # First index in list indicates if mutation is known
|
|
1585 output_mut += [[]]
|
|
1586
|
|
1587 # Define output vaiables
|
|
1588 codon_change = nuc_ref + " -> " + nuc_alt
|
|
1589 aa_change = ref + " -> " + alt
|
|
1590
|
|
1591 if RNA is True:
|
|
1592 aa_change = "RNA mutations"
|
|
1593 elif pos < 0:
|
|
1594 aa_change = "Promoter mutations"
|
|
1595
|
|
1596 # Check if mutation is known
|
|
1597 gene_mut_name, resistence, pmid = self.look_up_known_muts(
|
|
1598 gene, look_up_pos, look_up_mut, m_type, gene_name)
|
|
1599
|
|
1600 gene_mut_name = gene_mut_name + " " + mut_name
|
|
1601
|
|
1602 output_mut[i] = [gene_mut_name, codon_change, aa_change,
|
|
1603 resistence, pmid]
|
|
1604
|
|
1605 # Add mutation to output strings for known mutations
|
|
1606 if resistence != "Unknown":
|
|
1607 if RNA is True:
|
|
1608 # don't include the amino acid change field for
|
|
1609 # RNA mutations
|
|
1610 known_lst += ["\t".join(output_mut[i][:2]) + "\t"
|
|
1611 + "\t".join(output_mut[i][3:])]
|
|
1612 else:
|
|
1613 known_lst += ["\t".join(output_mut[i])]
|
|
1614
|
|
1615 all_results_lst += ["\t".join(output_mut[i])]
|
|
1616
|
|
1617 # Add mutation to output strings for unknown mutations
|
|
1618 else:
|
|
1619 if RNA is True:
|
|
1620 unknown_lst += ["\t".join(output_mut[i][:2])]
|
|
1621 else:
|
|
1622 unknown_lst += ["\t".join(output_mut[i][:3])]
|
|
1623
|
|
1624 if unknown_flag is True:
|
|
1625 all_results_lst += ["\t".join(output_mut[i])]
|
|
1626
|
|
1627 # Check that you do not print two equal lines (can happen
|
|
1628 # if two indels occure in the same codon)
|
|
1629 if len(output_mut) > 1:
|
|
1630 if output_mut[i] == output_mut[i - 1]:
|
|
1631
|
|
1632 if resistence != "Unknown":
|
|
1633 known_lst = known_lst[:-1]
|
|
1634 all_results_lst = all_results_lst[:-1]
|
|
1635 else:
|
|
1636 unknown_lst = unknown_lst[:-1]
|
|
1637
|
|
1638 if unknown_flag is True:
|
|
1639 all_results_lst = all_results_lst[:-1]
|
|
1640
|
|
1641 if "Premature stop codon" in mut_name:
|
|
1642 sbjct_len = hit['sbjct_length']
|
|
1643 qry_len = pos * 3
|
|
1644 prec_truckat = round(
|
|
1645 ((float(sbjct_len) - qry_len)
|
|
1646 / float(sbjct_len))
|
|
1647 * 100, 2
|
|
1648 )
|
|
1649 perc = "%"
|
|
1650 stop_codons.append("Premature stop codon in %s, %.2f%s lost"
|
|
1651 % (gene, prec_truckat, perc))
|
|
1652
|
|
1653 # Creat final strings
|
|
1654 if(all_results_lst):
|
|
1655 all_results = "\n".join(all_results_lst)
|
|
1656 else:
|
|
1657 all_results = ""
|
|
1658 total_known_str = ""
|
|
1659 total_unknown_str = ""
|
|
1660
|
|
1661 # Check if there are only unknown mutations
|
|
1662 resistence_lst = []
|
|
1663 for mut in output_mut:
|
|
1664 for res in mut[3].split(","):
|
|
1665 resistence_lst.append(res)
|
|
1666
|
|
1667 # Save known mutations
|
|
1668 unknown_no = resistence_lst.count("Unknown")
|
|
1669 if unknown_no < len(resistence_lst):
|
|
1670 total_known_str = known_header + "\n".join(known_lst)
|
|
1671 else:
|
|
1672 total_known_str = "No known mutations found in %s" % gene_name
|
|
1673
|
|
1674 # Save unknown mutations
|
|
1675 if unknown_no > 0:
|
|
1676 total_unknown_str = unknown_header + "\n".join(unknown_lst)
|
|
1677 else:
|
|
1678 total_unknown_str = "No unknown mutations found in %s" % gene_name
|
|
1679
|
|
1680 return (all_results, total_known_str, total_unknown_str,
|
|
1681 resistence_lst + stop_codons)
|
|
1682
|
|
1683 @staticmethod
|
|
1684 def get_file_content(file_path, fst_char_only=False):
|
|
1685 """
|
|
1686 This function opens a file, given as the first argument
|
|
1687 file_path and returns the lines of the file in a list or the
|
|
1688 first character of the file if fst_char_only is set to True.
|
|
1689 """
|
|
1690 with open(file_path, "r") as infile:
|
|
1691 line_lst = []
|
|
1692 for line in infile:
|
|
1693 line = line.strip()
|
|
1694 if line != "":
|
|
1695 line_lst.append(line)
|
|
1696 if fst_char_only:
|
|
1697 return line_lst[0][0]
|
|
1698 return line_lst
|
|
1699
|
|
1700 def look_up_known_muts(self, gene, pos, found_mut, mut, gene_name):
|
|
1701 """
|
|
1702 This function uses the known_mutations dictionay, a gene a
|
|
1703 string with the gene key name, a gene position as integer,
|
|
1704 found_mut is given as amino acid or nucleotides, the
|
|
1705 mutation type (mut) is either "del", "ins", or "sub", and
|
|
1706 gene_name is the gene name that should be returned to the
|
|
1707 user. The function looks up if the found_mut defined by the
|
|
1708 gene, position and the found_mut string is given in the
|
|
1709 known_mutations dictionary, if it is, the resistance and
|
|
1710 the pmid are returned together with the gene_name given in
|
|
1711 the known_mutations dict. If the mutation type is "del" the
|
|
1712 deleted nucleotids are checked to be contained in any of
|
|
1713 the deletion described in the known_mutation dict.
|
|
1714 """
|
|
1715 resistence = "Unknown"
|
|
1716 pmid = "-"
|
|
1717 found_mut = found_mut.upper()
|
|
1718
|
|
1719 if mut == "del":
|
|
1720 for i, i_pos in enumerate(range(pos, pos + len(found_mut))):
|
|
1721
|
|
1722 known_indels = self.known_mutations[gene]["del"].get(i_pos, [])
|
|
1723 for known_indel in known_indels:
|
|
1724 partial_mut = found_mut[i:len(known_indel) + i]
|
|
1725
|
|
1726 # Check if part of found mut is known and check if
|
|
1727 # found mut and known mut is in the same reading
|
|
1728 # frame
|
|
1729 if(partial_mut == known_indel
|
|
1730 and len(found_mut) % 3 == len(known_indel) % 3):
|
|
1731
|
|
1732 resistence = (self.known_mutations[gene]["del"][i_pos]
|
|
1733 [known_indel]['drug'])
|
|
1734
|
|
1735 pmid = (self.known_mutations[gene]["del"][i_pos]
|
|
1736 [known_indel]['pmid'])
|
|
1737
|
|
1738 gene_name = (self.known_mutations[gene]["del"][i_pos]
|
|
1739 [known_indel]['gene_name'])
|
|
1740 break
|
|
1741 else:
|
|
1742 if pos in self.known_mutations[gene][mut]:
|
|
1743 if found_mut in self.known_mutations[gene][mut][pos]:
|
|
1744 resistence = (self.known_mutations[gene][mut][pos]
|
|
1745 [found_mut]['drug'])
|
|
1746
|
|
1747 pmid = (self.known_mutations[gene][mut][pos][found_mut]
|
|
1748 ['pmid'])
|
|
1749
|
|
1750 gene_name = (self.known_mutations[gene][mut][pos]
|
|
1751 [found_mut]['gene_name'])
|
|
1752
|
|
1753 # Check if stop codons refer resistance
|
|
1754 if "*" in found_mut and gene in self.known_stop_codon:
|
|
1755 if resistence == "Unknown":
|
|
1756 resistence = self.known_stop_codon[gene]["drug"]
|
|
1757 else:
|
|
1758 resistence += "," + self.known_stop_codon[gene]["drug"]
|
|
1759
|
|
1760 return (gene_name, resistence, pmid)
|
|
1761
|
|
1762
|
|
1763 if __name__ == '__main__':
|
|
1764
|
|
1765 ##########################################################################
|
|
1766 # PARSE COMMAND LINE OPTIONS
|
|
1767 ##########################################################################
|
|
1768
|
|
1769 parser = argparse.ArgumentParser(
|
|
1770 description="This program predicting resistance associated with \
|
|
1771 chromosomal mutations based on WGS data",
|
|
1772 prog="pointfinder.py")
|
|
1773
|
|
1774 # required arguments
|
|
1775 parser.add_argument("-i",
|
|
1776 dest="inputfiles",
|
|
1777 metavar="INFILE",
|
|
1778 nargs='+',
|
|
1779 help="Input file. fastq file(s) from one sample for \
|
|
1780 KMA or one fasta file for blastn.",
|
|
1781 required=True)
|
|
1782 parser.add_argument("-o",
|
|
1783 dest="out_path",
|
|
1784 metavar="OUTFOLDER",
|
|
1785 help="Output folder, output files will be stored \
|
|
1786 here.",
|
|
1787 required=True)
|
|
1788 parser.add_argument("-s",
|
|
1789 dest="species",
|
|
1790 metavar="SPECIES",
|
|
1791 choices=['e.coli', 'gonorrhoeae', 'campylobacter',
|
|
1792 'salmonella', 'tuberculosis'],
|
|
1793 help="Species of choice, e.coli, tuberculosis, \
|
|
1794 salmonella, campylobactor, gonorrhoeae, \
|
|
1795 klebsiella, or malaria",
|
|
1796 required=True)
|
|
1797 parser.add_argument("-m",
|
|
1798 dest="method",
|
|
1799 metavar="METHOD",
|
|
1800 choices=["kma", "blastn"],
|
|
1801 help="Method of choice, kma or blastn",
|
|
1802 required=True)
|
|
1803 parser.add_argument("-m_p",
|
|
1804 dest="method_path",
|
|
1805 help="Path to the location of blastn or kma dependent \
|
|
1806 of the chosen method",
|
|
1807 required=True)
|
|
1808 parser.add_argument("-p",
|
|
1809 dest="db_path",
|
|
1810 metavar="DATABASE",
|
|
1811 help="Path to the location of the pointfinder \
|
|
1812 database",
|
|
1813 required=True)
|
|
1814
|
|
1815 # optional arguments
|
|
1816 parser.add_argument("-t",
|
|
1817 dest="threshold",
|
|
1818 metavar="IDENTITY",
|
|
1819 help="Minimum gene identity threshold, default = 0.9",
|
|
1820 type=float,
|
|
1821 default=0.9)
|
|
1822 parser.add_argument("-l",
|
|
1823 dest="min_cov",
|
|
1824 metavar="COVERAGE",
|
|
1825 help="Minimum gene coverage threshold, \
|
|
1826 threshold = 0.6",
|
|
1827 type=float,
|
|
1828 default=0.6)
|
|
1829 parser.add_argument("-u",
|
|
1830 dest="unknown_mutations",
|
|
1831 help="Show all mutations found even if it's unknown \
|
|
1832 to the resistance database.",
|
|
1833 action='store_true',
|
|
1834 default=False)
|
|
1835 parser.add_argument("-g",
|
|
1836 dest="specific_gene",
|
|
1837 nargs='+',
|
|
1838 help="Specify genes existing in the database to \
|
|
1839 search for - if none is specified all genes are \
|
|
1840 included in the search.",
|
|
1841 default=None)
|
|
1842
|
|
1843 args = parser.parse_args()
|
|
1844
|
|
1845 # If no arguments are given print usage message and exit
|
|
1846 if len(sys.argv) == 1:
|
|
1847 sys.exit("Usage: " + parser.usage)
|
|
1848
|
|
1849 if(args.method == "blastn"):
|
|
1850 method = PointFinder.TYPE_BLAST
|
|
1851 else:
|
|
1852 method = PointFinder.TYPE_KMA
|
|
1853
|
|
1854 # Get sample name
|
|
1855 filename = args.inputfiles[0].split("/")[-1]
|
|
1856 sample_name = "".join(filename.split(".")[0:-1]) # .split("_")[0]
|
|
1857 if sample_name == "":
|
|
1858 sample_name = filename
|
|
1859
|
|
1860 # TODO: Change ilogocal var name
|
|
1861 kma_db_path = args.db_path + "/" + args.species
|
|
1862
|
|
1863 finder = PointFinder(db_path=kma_db_path, species=args.species,
|
|
1864 gene_list=args.specific_gene)
|
|
1865
|
|
1866 if method == PointFinder.TYPE_BLAST:
|
|
1867
|
|
1868 # Check that only one input file is given
|
|
1869 if len(args.inputfiles) != 1:
|
|
1870 sys.exit("Input Error: Blast was chosen as mapping method only 1 "
|
|
1871 "input file requied, not %s" % (len(args.inputfiles)))
|
|
1872
|
|
1873 finder_run = finder.blast(inputfile=args.inputfiles[0],
|
|
1874 out_path=args.out_path,
|
|
1875 min_cov=0.01,
|
|
1876 threshold=args.threshold,
|
|
1877 blast=args.method_path,
|
|
1878 cut_off=False)
|
|
1879 else:
|
|
1880 inputfile_1 = args.inputfiles[0]
|
|
1881 inputfile_2 = None
|
|
1882
|
|
1883 if(len(args.inputfiles) == 2):
|
|
1884 inputfile_2 = args.inputfiles[1]
|
|
1885 finder_run = finder.kma(inputfile_1=inputfile_1,
|
|
1886 inputfile_2=inputfile_2,
|
|
1887 out_path=args.out_path,
|
|
1888 db_path_kma=kma_db_path,
|
|
1889 databases=[args.species],
|
|
1890 min_cov=args.min_cov,
|
|
1891 threshold=args.threshold,
|
|
1892 kma_path=args.method_path,
|
|
1893 sample_name=sample_name,
|
|
1894 kma_mrs=0.5, kma_gapopen=-5, kma_gapextend=-2,
|
|
1895 kma_penalty=-3, kma_reward=1)
|
|
1896
|
|
1897 results = finder_run.results
|
|
1898
|
|
1899 if(args.specific_gene):
|
|
1900 results = PointFinder.discard_unwanted_results(
|
|
1901 results=results, wanted=args.specific_gene)
|
|
1902
|
|
1903 finder.write_results(out_path=args.out_path, result=results, min_cov=args.min_cov,
|
|
1904 res_type=method, unknown_flag=args.unknown_mutations)
|