Mercurial > repos > dcouvin > resfinder4
comparison resfinder/cge/pointfinder.py @ 0:55051a9bc58d draft default tip
Uploaded
author | dcouvin |
---|---|
date | Mon, 10 Jan 2022 20:06:07 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:55051a9bc58d |
---|---|
1 #!/usr/bin/env python3 | |
2 # | |
3 # Program: PointFinder-3.0 | |
4 # Author: Camilla Hundahl Johnsen | |
5 # | |
6 # Dependencies: KMA or NCBI-blast together with BioPython. | |
7 | |
8 import os | |
9 import re | |
10 import sys | |
11 import math | |
12 import argparse | |
13 import subprocess | |
14 import random | |
15 | |
16 from cgecore.blaster import Blaster | |
17 from cgecore.cgefinder import CGEFinder | |
18 from .output.table import TableResults | |
19 from .phenotype2genotype.feature import ResMutation | |
20 from .phenotype2genotype.res_profile import PhenoDB | |
21 | |
22 | |
23 def eprint(*args, **kwargs): | |
24 print(*args, file=sys.stderr, **kwargs) | |
25 | |
26 | |
27 class GeneListError(Exception): | |
28 """ Raise when a specified gene is not found within the gene list. | |
29 """ | |
30 def __init__(self, message, *args): | |
31 self.message = message | |
32 # allow users initialize misc. arguments as any other builtin Error | |
33 super(PanelNameError, self).__init__(message, *args) | |
34 | |
35 | |
36 class PointFinder(CGEFinder): | |
37 | |
38 def __init__(self, db_path, species, gene_list=None): | |
39 """ | |
40 """ | |
41 self.species = species | |
42 self.specie_path = db_path | |
43 self.RNA_gene_list = [] | |
44 | |
45 self.gene_list = PointFinder.get_file_content( | |
46 self.specie_path + "/genes.txt") | |
47 | |
48 if os.path.isfile(self.specie_path + "/RNA_genes.txt"): | |
49 self.RNA_gene_list = PointFinder.get_file_content( | |
50 self.specie_path + "/RNA_genes.txt") | |
51 | |
52 # Creat user defined gene_list if given | |
53 if(gene_list is not None): | |
54 self.gene_list = get_user_defined_gene_list(gene_list) | |
55 | |
56 self.known_mutations, self.drug_genes, self.known_stop_codon = ( | |
57 self.get_db_mutations(self.specie_path + "/resistens-overview.txt", | |
58 self.gene_list)) | |
59 | |
60 def get_user_defined_gene_list(self, gene_list): | |
61 genes_specified = [] | |
62 for gene in gene_list: | |
63 # Check that the genes are valid | |
64 if gene not in self.gene_list: | |
65 raise(GeneListError( | |
66 "Input Error: Specified gene not recognised " | |
67 "(%s)\nChoose one or more of the following genes:" | |
68 "\n%s" % (gene, "\n".join(self.gene_list)))) | |
69 genes_specified.append(gene) | |
70 # Change the gene_list to the user defined gene_list | |
71 return genes_specified | |
72 | |
73 #DELETE | |
74 def old_results_to_standard_output(self, result, software, version, | |
75 run_date, run_cmd, id, | |
76 unknown_mut=False, tableresults=None): | |
77 """ | |
78 """ | |
79 std_results = TableResults(software, version, run_date, run_cmd, id) | |
80 headers = [ | |
81 "template_name", | |
82 "template_length", | |
83 "aln_length", | |
84 "aln_identity", | |
85 "aln_gaps", | |
86 "aln_template_string", | |
87 "aln_query_string", | |
88 "aln_homology_string", | |
89 "query_id", | |
90 "query_start_pos", | |
91 "query_end_pos", | |
92 "template_variant", | |
93 "query_depth", | |
94 "blast_eval", | |
95 "blast_bitscore", | |
96 "pval", | |
97 "software_score", | |
98 "mutation", | |
99 "ref_codon", | |
100 "alt_codon", | |
101 "ref_aa", | |
102 "alt_aa", | |
103 "insertion", | |
104 "deletion" | |
105 ] | |
106 for db_name, db in result.items(): | |
107 if(db_name == "excluded"): | |
108 continue | |
109 | |
110 if(db == "No hit found"): | |
111 continue | |
112 | |
113 ####Added to solve PointFinder#### | |
114 if(isinstance(db, str)): | |
115 if db.startswith("Gene found with coverage"): | |
116 continue | |
117 #### | |
118 | |
119 # Start writing output string (to HTML tab file) | |
120 gene_name = PhenoDB.if_promoter_rename(db_name) | |
121 | |
122 # Find and save mis_matches in gene | |
123 sbjct_start = db["sbjct_start"] | |
124 sbjct_seq = db["sbjct_string"] | |
125 qry_seq = db["query_string"] | |
126 db["mis_matches"] = self.find_mismatches(gene=db_name, | |
127 sbjct_start=sbjct_start, | |
128 sbjct_seq=sbjct_seq, | |
129 qry_seq=qry_seq) | |
130 | |
131 known_muts = [] | |
132 unknown_muts = [] | |
133 if(len(db["mis_matches"]) > 0): | |
134 known_muts, unknown_muts = self.get_mutations( | |
135 db_name, gene_name, db['mis_matches'], True, db) | |
136 | |
137 # No mutations found | |
138 if(not (unknown_muts or known_muts)): | |
139 continue | |
140 | |
141 std_results.add_table(db_name) | |
142 std_db = std_results.long[db_name] | |
143 std_db.add_headers(headers) | |
144 std_db.lock_headers = True | |
145 | |
146 for mut in known_muts: | |
147 unique_id = mut.unique_id | |
148 std_db[unique_id] = { | |
149 "template_name": db_name, | |
150 "template_length": db["sbjct_length"], | |
151 "aln_length": db["HSP_length"], | |
152 "aln_identity": db["perc_ident"], | |
153 "aln_gaps": db["gaps"], | |
154 "aln_template_string": db["sbjct_string"], | |
155 "aln_query_string": db["query_string"], | |
156 "aln_homology_string": db["homo_string"], | |
157 "query_id": db["contig_name"], | |
158 "query_start_pos": mut.pos, | |
159 "query_end_pos": mut.end, | |
160 "query_depth": db.get("depth", "NA"), | |
161 "pval": db.get("p_value", "NA"), | |
162 "software_score": db["cal_score"], | |
163 "mutation": mut.mut_string_short, | |
164 "ref_codon": mut.ref_codon, | |
165 "alt_codon": mut.mut_codon, | |
166 "ref_aa": mut.ref_aa, | |
167 "alt_aa": mut.mut_aa, | |
168 "insertion": mut.insertion, | |
169 "deletion": mut.deletion | |
170 } | |
171 | |
172 return std_results | |
173 | |
174 @staticmethod | |
175 def get_db_names(db_root_path): | |
176 out_lst = [] | |
177 for entry in os.scandir(db_root_path): | |
178 if not entry.name.startswith('.') and entry.is_dir(): | |
179 out_lst.append(entry.name) | |
180 return tuple(out_lst) | |
181 | |
182 def results_to_str(self, res_type, results, unknown_flag, min_cov, | |
183 perc_iden): | |
184 # Initiate output stings with headers | |
185 gene_list = ', '.join(self.gene_list) | |
186 output_strings = [ | |
187 "Mutation\tNucleotide change\tAmino acid change\tResistance\tPMID", | |
188 "Chromosomal point mutations - Results\nSpecies: %s\nGenes: %s\n" | |
189 "Mapping methode: %s\n\n\nKnown Mutations\n" % (self.species, gene_list, res_type), "" | |
190 ] | |
191 # Get all drug names and add header of all drugs to prediction file | |
192 drug_lst = [drug for drug in self.drug_genes.keys()] | |
193 output_strings[2] = "\t".join(drug_lst) + "\n" | |
194 | |
195 # Define variables to write temporary output into | |
196 total_unknown_str = "" | |
197 unique_drug_list = [] | |
198 excluded_hits = {} | |
199 | |
200 if res_type == PointFinder.TYPE_BLAST: | |
201 # Patch together partial sequence hits (BLASTER does not do this) | |
202 # and removes dummy_hit_id | |
203 GENES = self.find_best_seqs(results, min_cov) | |
204 | |
205 else: | |
206 # TODO: Some databases are either genes or species, | |
207 # depending on the method applied (BLAST/KMA). | |
208 GENES = results[self.species] | |
209 | |
210 # If not hits found insert genes not found | |
211 if GENES == 'No hit found': | |
212 GENES = {} | |
213 | |
214 # KMA only gives the genes found, however all genes should be | |
215 # included | |
216 for gene in self.gene_list: | |
217 if gene not in GENES: | |
218 GENES[gene] = 'No hit found' | |
219 | |
220 # Included excluded | |
221 GENES['excluded'] = results['excluded'] | |
222 | |
223 for gene, hit in GENES.items(): | |
224 # Start writing output string (to HTML tab file) | |
225 gene_name = gene # Not perfeft can differ from specific mutation | |
226 regex = r"promoter_size_(\d+)(?:bp)" | |
227 promtr_gene_objt = re.search(regex, gene) | |
228 | |
229 if promtr_gene_objt: | |
230 gene_name = gene.split("_")[0] | |
231 | |
232 output_strings[1] += "\n%s\n" % (gene_name) | |
233 | |
234 # Ignore excluded genes | |
235 # TODO: Should all not found genes be moved to this dict? | |
236 if gene == "excluded": | |
237 continue | |
238 | |
239 # Write exclusion reason for genes not found | |
240 if isinstance(GENES[gene], str): | |
241 output_strings[1] += GENES[gene] + "\n" | |
242 continue | |
243 | |
244 if hit['perc_ident'] < perc_iden and hit['perc_coverage'] < min_cov: | |
245 GENES[gene] = ('Gene found with identity, %s, below minimum ' | |
246 'identity threshold: %s. And with coverage, %s,' | |
247 ' below minimum coverage threshold: %s.' | |
248 % (hit['perc_ident'], perc_iden, | |
249 hit['perc_coverage'], min_cov)) | |
250 output_strings[1] += GENES[gene] + "\n" | |
251 continue | |
252 | |
253 if hit['perc_ident'] < perc_iden: | |
254 GENES[gene] = ('Gene found with identity, %s, below minimum ' | |
255 'identity threshold: %s' % (hit['perc_ident'], | |
256 perc_iden)) | |
257 output_strings[1] += GENES[gene] + "\n" | |
258 continue | |
259 | |
260 if hit['perc_coverage'] < min_cov: | |
261 # Gene not found above given coverage | |
262 GENES[gene] = ('Gene found with coverage, %s, below minimum ' | |
263 'coverage threshold: %s' % (hit['perc_coverage'], | |
264 min_cov)) | |
265 output_strings[1] += GENES[gene] + "\n" | |
266 continue | |
267 | |
268 sbjct_start = hit['sbjct_start'] | |
269 sbjct_seq = hit['sbjct_string'] | |
270 qry_seq = hit['query_string'] | |
271 | |
272 # Find and save mis_matches in gene | |
273 hit['mis_matches'] = self.find_mismatches(gene, sbjct_start, | |
274 sbjct_seq, qry_seq) | |
275 | |
276 # Check if no mutations was found | |
277 if len(hit['mis_matches']) < 1: | |
278 output_strings[1] += ("No mutations found in {}\n" | |
279 .format(gene_name)) | |
280 else: | |
281 # Write mutations found to output file | |
282 total_unknown_str += "\n%s\n" % (gene_name) | |
283 | |
284 str_tuple = self.mut2str(gene, gene_name, hit['mis_matches'], | |
285 unknown_flag, hit) | |
286 | |
287 all_results = str_tuple[0] | |
288 total_known = str_tuple[1] | |
289 total_unknown = str_tuple[2] | |
290 drug_list = str_tuple[3] | |
291 | |
292 # Add results to output strings | |
293 if(all_results): | |
294 output_strings[0] += "\n" + all_results | |
295 output_strings[1] += total_known + "\n" | |
296 | |
297 # Add unknown mutations the total results of | |
298 # unknown mutations | |
299 total_unknown_str += total_unknown + "\n" | |
300 | |
301 # Add drugs to druglist | |
302 unique_drug_list += [drug.lower() for drug in drug_list] | |
303 | |
304 if unknown_flag is True: | |
305 output_strings[1] += ("\n\nUnknown Mutations \n" | |
306 + total_unknown_str) | |
307 | |
308 # Make Resistance Prediction output | |
309 | |
310 # Go throug all drugs in the database and see if prediction | |
311 # can be called. | |
312 pred_output = [] | |
313 for drug in drug_lst: | |
314 drug = drug.lower() | |
315 # Check if resistance to drug was found | |
316 if drug in unique_drug_list: | |
317 pred_output.append("1") | |
318 else: | |
319 # Check at all genes associated with the drug | |
320 # resistance where found | |
321 all_genes_found = True | |
322 for gene in self.drug_genes[drug]: | |
323 if gene not in GENES: | |
324 all_genes_found = False | |
325 | |
326 if all_genes_found is False: | |
327 pred_output.append("?") | |
328 else: | |
329 pred_output.append("0") | |
330 | |
331 output_strings[2] += "\t".join(pred_output) + "\n" | |
332 | |
333 return output_strings | |
334 | |
335 def write_results(self, out_path, result, res_type, unknown_flag, min_cov, | |
336 perc_iden): | |
337 """ | |
338 """ | |
339 | |
340 result_str = self.results_to_str(res_type=res_type, | |
341 results=result, | |
342 unknown_flag=unknown_flag, | |
343 min_cov=min_cov, | |
344 perc_iden=perc_iden) | |
345 | |
346 with open(out_path + "/PointFinder_results.txt", "w") as fh: | |
347 fh.write(result_str[0]) | |
348 with open(out_path + "/PointFinder_table.txt", "w") as fh: | |
349 fh.write(result_str[1]) | |
350 with open(out_path + "/PointFinder_prediction.txt", "w") as fh: | |
351 fh.write(result_str[2]) | |
352 | |
353 @staticmethod | |
354 def discard_unwanted_results(results, wanted): | |
355 """ | |
356 Takes a dict and a list of keys. | |
357 Returns a dict containing only the keys from the list. | |
358 """ | |
359 cleaned_results = dict() | |
360 for key, val in results.items(): | |
361 if(key in wanted): | |
362 cleaned_results[key] = val | |
363 return cleaned_results | |
364 | |
365 def blast(self, inputfile, out_path, min_cov=0.6, threshold=0.9, | |
366 blast="blastn", cut_off=True): | |
367 """ | |
368 """ | |
369 blast_run = Blaster(inputfile=inputfile, databases=self.gene_list, | |
370 db_path=self.specie_path, out_path=out_path, | |
371 min_cov=min_cov, threshold=threshold, blast=blast, | |
372 cut_off=cut_off) | |
373 | |
374 self.blast_results = blast_run.results # TODO Is this used? | |
375 return blast_run | |
376 | |
377 @staticmethod | |
378 def get_db_mutations(mut_db_path, gene_list): | |
379 """ | |
380 This function opens the file resistenss-overview.txt, and reads | |
381 the content into a dict of dicts. The dict will contain | |
382 information about all known mutations given in the database. | |
383 This dict is returned. | |
384 """ | |
385 | |
386 # Initiate variables | |
387 known_mutations = dict() | |
388 known_stop_codon = dict() | |
389 drug_genes = dict() | |
390 indelflag = False | |
391 stopcodonflag = False | |
392 | |
393 # Go throug mutation file line by line | |
394 | |
395 with open(mut_db_path, "r") as fh: | |
396 mutdb_file = fh.readlines() | |
397 mutdb_file = [line.strip() for line in mutdb_file] | |
398 | |
399 for line in mutdb_file: | |
400 # Ignore headers and check where the indel section starts | |
401 if line.startswith("#"): | |
402 if "indel" in line.lower(): | |
403 indelflag = True | |
404 elif "stop codon" in line.lower(): | |
405 stopcodonflag = True | |
406 else: | |
407 stopcodonflag = False | |
408 continue | |
409 | |
410 mutation = [data.strip() for data in line.split("\t")] | |
411 | |
412 gene_ID = mutation[0] | |
413 | |
414 # Only consider mutations in genes found in the gene list | |
415 if gene_ID in gene_list: | |
416 gene_name = mutation[1] | |
417 mut_pos = int(mutation[2]) | |
418 ref_codon = mutation[3] # Ref_nuc (1 or 3?) | |
419 ref_aa = mutation[4] # Ref_codon | |
420 alt_aa = mutation[5].split(",") # Res_codon | |
421 res_drug = mutation[6].replace("\t", " ") | |
422 pmid = mutation[7].split(",") | |
423 | |
424 # Check if stop codons are predictive for resistance | |
425 if stopcodonflag is True: | |
426 if gene_ID not in known_stop_codon: | |
427 known_stop_codon[gene_ID] = {"pos": [], | |
428 "drug": res_drug} | |
429 known_stop_codon[gene_ID]["pos"].append(mut_pos) | |
430 | |
431 # Add genes associated with drug resistance to drug_genes dict | |
432 drug_lst = res_drug.split(",") | |
433 drug_lst = [d.strip().lower() for d in drug_lst] | |
434 for drug in drug_lst: | |
435 if drug not in drug_genes: | |
436 drug_genes[drug] = [] | |
437 if gene_ID not in drug_genes[drug]: | |
438 drug_genes[drug].append(gene_ID) | |
439 | |
440 # Initiate empty dict to store relevant mutation information | |
441 mut_info = dict() | |
442 | |
443 # Save need mutation info with pmid cooresponding to the amino | |
444 # acid change | |
445 for i in range(len(alt_aa)): | |
446 try: | |
447 mut_info[alt_aa[i]] = {"gene_name": gene_name, | |
448 "drug": res_drug, | |
449 "pmid": pmid[i]} | |
450 except IndexError: | |
451 mut_info[alt_aa[i]] = {"gene_name": gene_name, | |
452 "drug": res_drug, | |
453 "pmid": "-"} | |
454 | |
455 # Add all possible types of mutations to the dict | |
456 if gene_ID not in known_mutations: | |
457 known_mutations[gene_ID] = {"sub": dict(), "ins": dict(), | |
458 "del": dict()} | |
459 # Check for the type of mutation | |
460 if indelflag is False: | |
461 mutation_type = "sub" | |
462 else: | |
463 mutation_type = ref_aa | |
464 | |
465 # Save mutations positions with required information given in | |
466 # mut_info | |
467 if mut_pos not in known_mutations[gene_ID][mutation_type]: | |
468 known_mutations[gene_ID][mutation_type][mut_pos] = dict() | |
469 for amino in alt_aa: | |
470 if (amino in known_mutations[gene_ID][mutation_type] | |
471 [mut_pos]): | |
472 stored_mut_info = (known_mutations[gene_ID] | |
473 [mutation_type] | |
474 [mut_pos][amino]) | |
475 if stored_mut_info["drug"] != mut_info[amino]["drug"]: | |
476 stored_mut_info["drug"] += "," + (mut_info[amino] | |
477 ["drug"]) | |
478 if stored_mut_info["pmid"] != mut_info[amino]["pmid"]: | |
479 stored_mut_info["pmid"] += ";" + (mut_info[amino] | |
480 ["pmid"]) | |
481 | |
482 (known_mutations[gene_ID][mutation_type] | |
483 [mut_pos][amino]) = stored_mut_info | |
484 else: | |
485 (known_mutations[gene_ID][mutation_type] | |
486 [mut_pos][amino]) = mut_info[amino] | |
487 | |
488 # Check that all genes in the gene list has known mutations | |
489 for gene in gene_list: | |
490 if gene not in known_mutations: | |
491 known_mutations[gene] = {"sub": dict(), "ins": dict(), | |
492 "del": dict()} | |
493 return known_mutations, drug_genes, known_stop_codon | |
494 | |
495 def find_best_seqs(self, blast_results, min_cov): | |
496 """ | |
497 This function finds gene sequences with the largest coverage based on | |
498 the blast results. If more hits covering different sequences parts | |
499 exists it concatinates partial gene hits into one hit. | |
500 If different overlap sequences occurs they are saved in the list | |
501 alternative_overlaps. The function returns a new results dict where | |
502 each gene has an inner dict with hit information corresponding to | |
503 the new concatinated hit. If no gene is found the value is a string | |
504 with info of why the gene wasn't found. | |
505 """ | |
506 | |
507 GENES = dict() | |
508 for gene, hits in blast_results.items(): | |
509 | |
510 # Save all hits in the list 'hits_found' | |
511 hits_found = [] | |
512 GENES[gene] = {} | |
513 | |
514 # Check for gene has any hits in blast results | |
515 if gene == 'excluded': | |
516 GENES[gene] = hits | |
517 continue | |
518 elif isinstance(hits, dict) and len(hits) > 0: | |
519 GENES[gene]['found'] = 'partially' | |
520 GENES[gene]['hits'] = hits | |
521 else: | |
522 # Gene not found! go to next gene | |
523 GENES[gene] = 'No hit found' | |
524 continue | |
525 | |
526 # Check coverage for each hit, patch together partial genes hits | |
527 for hit_id, hit in hits.items(): | |
528 | |
529 # Save hits start and end positions in subject, total subject | |
530 # len, and subject and query sequences of each hit | |
531 hits_found += [(hit['sbjct_start'], hit['sbjct_end'], | |
532 hit['sbjct_string'], hit['query_string'], | |
533 hit['sbjct_length'], hit['homo_string'], | |
534 hit_id)] | |
535 | |
536 # If coverage is 100% change found to yes | |
537 hit_coverage = hit['perc_coverage'] | |
538 if hit_coverage == 1.0: | |
539 GENES[gene]['found'] = 'yes' | |
540 | |
541 # Sort positions found | |
542 hits_found = sorted(hits_found, key=lambda x: x[0]) | |
543 | |
544 # Find best hit by concatenating sequences if more hits exist | |
545 | |
546 # Get information from the first hit found | |
547 all_start = hits_found[0][0] | |
548 current_end = hits_found[0][1] | |
549 final_sbjct = hits_found[0][2] | |
550 final_qry = hits_found[0][3] | |
551 sbjct_len = hits_found[0][4] | |
552 final_homol = hits_found[0][5] | |
553 first_hit_id = hits_found[0][6] | |
554 | |
555 alternative_overlaps = [] | |
556 contigs = [hit['contig_name']] | |
557 scores = [str(hit['cal_score'])] | |
558 | |
559 # Check if more then one hit was found within the same gene | |
560 for i in range(len(hits_found) - 1): | |
561 | |
562 # Save information from previous hit | |
563 pre_block_start = hits_found[i][0] | |
564 pre_block_end = hits_found[i][1] | |
565 pre_sbjct = hits_found[i][2] | |
566 pre_qry = hits_found[i][3] | |
567 pre_homol = hits_found[i][5] | |
568 pre_id = hits_found[i][6] | |
569 | |
570 # Save information from next hit | |
571 next_block_start = hits_found[i + 1][0] | |
572 next_block_end = hits_found[i + 1][1] | |
573 next_sbjct = hits_found[i + 1][2] | |
574 next_qry = hits_found[i + 1][3] | |
575 next_homol = hits_found[i + 1][5] | |
576 next_id = hits_found[i + 1][6] | |
577 | |
578 contigs.append(hits[next_id]['contig_name']) | |
579 scores.append(str(hits[next_id]['cal_score'])) | |
580 | |
581 # Check for overlapping sequences, collaps them and save | |
582 # alternative overlaps if any | |
583 if next_block_start <= current_end: | |
584 | |
585 # Find overlap start and take gaps into account | |
586 pos_count = 0 | |
587 overlap_pos = pre_block_start | |
588 for i in range(len(pre_sbjct)): | |
589 | |
590 # Stop loop if overlap_start position is reached | |
591 if overlap_pos == next_block_start: | |
592 overlap_start = pos_count | |
593 break | |
594 if pre_sbjct[i] != "-": | |
595 overlap_pos += 1 | |
596 pos_count += 1 | |
597 | |
598 # Find overlap len and add next sequence to final sequence | |
599 if len(pre_sbjct[overlap_start:]) > len(next_sbjct): | |
600 # <---------> | |
601 # <---> | |
602 overlap_len = len(next_sbjct) | |
603 overlap_end_pos = next_block_end | |
604 else: | |
605 # <---------> | |
606 # <---------> | |
607 overlap_len = len(pre_sbjct[overlap_start:]) | |
608 overlap_end_pos = pre_block_end | |
609 | |
610 # Update current end | |
611 current_end = next_block_end | |
612 | |
613 # Use the entire previous sequence and add the last | |
614 # part of the next sequence | |
615 final_sbjct += next_sbjct[overlap_len:] | |
616 final_qry += next_qry[overlap_len:] | |
617 final_homol += next_homol[overlap_len:] | |
618 | |
619 # Find query overlap sequences | |
620 pre_qry_overlap = pre_qry[overlap_start: (overlap_start | |
621 + overlap_len)] | |
622 next_qry_overlap = next_qry[:overlap_len] | |
623 sbjct_overlap = next_sbjct[:overlap_len] | |
624 | |
625 # If alternative query overlap excist save it | |
626 if pre_qry_overlap != next_qry_overlap: | |
627 eprint("OVERLAP WARNING:") | |
628 eprint("{}\n{}" | |
629 .format(pre_qry_overlap, next_qry_overlap)) | |
630 | |
631 # Save alternative overlaps | |
632 alternative_overlaps += [(next_block_start, | |
633 overlap_end_pos, | |
634 sbjct_overlap, | |
635 next_qry_overlap)] | |
636 | |
637 elif next_block_start > current_end: | |
638 # <-------> | |
639 # <-------> | |
640 gap_size = next_block_start - current_end - 1 | |
641 final_qry += "N" * gap_size | |
642 final_sbjct += "N" * gap_size | |
643 final_homol += "-" * gap_size | |
644 current_end = next_block_end | |
645 final_sbjct += next_sbjct | |
646 final_qry += next_qry | |
647 final_homol += next_homol | |
648 | |
649 # Calculate new coverage | |
650 no_call = final_qry.upper().count("N") | |
651 coverage = ((current_end - all_start + 1 - no_call) | |
652 / float(sbjct_len)) | |
653 | |
654 # Calculate identity | |
655 equal = 0 | |
656 not_equal = 0 | |
657 for i in range(len(final_qry)): | |
658 if final_qry[i].upper() != "N": | |
659 if final_qry[i].upper() == final_sbjct[i].upper(): | |
660 equal += 1 | |
661 else: | |
662 not_equal += 1 | |
663 identity = equal / float(equal + not_equal) | |
664 if coverage >= min_cov: | |
665 GENES[gene]['perc_coverage'] = coverage | |
666 GENES[gene]['perc_ident'] = identity | |
667 GENES[gene]['sbjct_string'] = final_sbjct | |
668 GENES[gene]['query_string'] = final_qry | |
669 GENES[gene]['homo_string'] = final_homol | |
670 GENES[gene]['contig_name'] = ", ".join(contigs) | |
671 GENES[gene]['HSP_length'] = len(final_qry) | |
672 GENES[gene]['sbjct_start'] = all_start | |
673 GENES[gene]['sbjct_end'] = current_end | |
674 GENES[gene]['sbjct_length'] = sbjct_len | |
675 GENES[gene]['cal_score'] = ", ".join(scores) | |
676 GENES[gene]['gaps'] = no_call | |
677 GENES[gene]['alternative_overlaps'] = alternative_overlaps | |
678 GENES[gene]['mis_matches'] = [] | |
679 | |
680 else: | |
681 # Gene not found above given coverage | |
682 GENES[gene] = ('Gene found with coverage, %f, below ' | |
683 'minimum coverage threshold: %s' % (coverage, | |
684 min_cov)) | |
685 return GENES | |
686 | |
687 def find_mismatches(self, gene, sbjct_start, sbjct_seq, qry_seq): | |
688 """ | |
689 This function finds mis matches between two sequeces. Depending | |
690 on the the sequence type either the function | |
691 find_codon_mismatches or find_nucleotid_mismatches are called, | |
692 if the sequences contains both a promoter and a coding region | |
693 both functions are called. The function can also call itself if | |
694 alternative overlaps is give. All found mismatches are returned | |
695 """ | |
696 | |
697 # Initiate the mis_matches list that will store all found mis matcehs | |
698 mis_matches = [] | |
699 | |
700 # Find mis matches in RNA genes | |
701 if gene in self.RNA_gene_list: | |
702 mis_matches += PointFinder.find_nucleotid_mismatches(sbjct_start, | |
703 sbjct_seq, | |
704 qry_seq) | |
705 else: | |
706 # Check if the gene sequence is with a promoter | |
707 regex = r"promoter_size_(\d+)(?:bp)" | |
708 promtr_gene_objt = re.search(regex, gene) | |
709 | |
710 # Check for promoter sequences | |
711 if promtr_gene_objt: | |
712 # Get promoter length | |
713 promtr_len = int(promtr_gene_objt.group(1)) | |
714 | |
715 # Extract promoter sequence, while considering gaps | |
716 # --------agt-->---- | |
717 # ---->? | |
718 if sbjct_start <= promtr_len: | |
719 # Find position in sbjct sequence where promoter ends | |
720 promtr_end = 0 | |
721 nuc_count = sbjct_start - 1 | |
722 | |
723 for i in range(len(sbjct_seq)): | |
724 promtr_end += 1 | |
725 | |
726 if sbjct_seq[i] != "-": | |
727 nuc_count += 1 | |
728 | |
729 if nuc_count == promtr_len: | |
730 break | |
731 | |
732 # Check if only a part of the promoter is found | |
733 # --------agt-->---- | |
734 # ---- | |
735 promtr_sbjct_start = -1 | |
736 if nuc_count < promtr_len: | |
737 promtr_sbjct_start = nuc_count - promtr_len | |
738 | |
739 # Get promoter part of subject and query | |
740 sbjct_promtr_seq = sbjct_seq[:promtr_end] | |
741 qry_promtr_seq = qry_seq[:promtr_end] | |
742 | |
743 # For promoter part find nucleotide mis matches | |
744 mis_matches += PointFinder.find_nucleotid_mismatches( | |
745 promtr_sbjct_start, sbjct_promtr_seq, qry_promtr_seq, | |
746 promoter=True) | |
747 | |
748 # Check if gene is also found | |
749 # --------agt-->---- | |
750 # ----------- | |
751 if((sbjct_start + len(sbjct_seq.replace("-", ""))) | |
752 > promtr_len): | |
753 sbjct_gene_seq = sbjct_seq[promtr_end:] | |
754 qry_gene_seq = qry_seq[promtr_end:] | |
755 sbjct_gene_start = 1 | |
756 | |
757 # Find mismatches in gene part | |
758 mis_matches += PointFinder.find_codon_mismatches( | |
759 sbjct_gene_start, sbjct_gene_seq, qry_gene_seq) | |
760 | |
761 # No promoter, only gene is found | |
762 # --------agt-->---- | |
763 # ----- | |
764 else: | |
765 sbjct_gene_start = sbjct_start - promtr_len | |
766 | |
767 # Find mismatches in gene part | |
768 mis_matches += PointFinder.find_codon_mismatches( | |
769 sbjct_gene_start, sbjct_seq, qry_seq) | |
770 | |
771 else: | |
772 # Find mismatches in gene | |
773 mis_matches += PointFinder.find_codon_mismatches( | |
774 sbjct_start, sbjct_seq, qry_seq) | |
775 | |
776 return mis_matches | |
777 | |
778 @staticmethod | |
779 def find_nucleotid_mismatches(sbjct_start, sbjct_seq, qry_seq, | |
780 promoter=False): | |
781 """ | |
782 This function takes two alligned sequence (subject and query), | |
783 and the position on the subject where the alignment starts. The | |
784 sequences are compared one nucleotide at a time. If mis matches | |
785 are found they are saved. If a gap is found the function | |
786 find_nuc_indel is called to find the entire indel and it is | |
787 also saved into the list mis_matches. If promoter sequences are | |
788 given as arguments, these are reversed the and the absolut | |
789 value of the sequence position used, but when mutations are | |
790 saved the negative value and det reverse sequences are saved in | |
791 mis_mathces. | |
792 """ | |
793 | |
794 # Initiate the mis_matches list that will store all found | |
795 # mismatcehs | |
796 mis_matches = [] | |
797 | |
798 sbjct_start = abs(sbjct_start) | |
799 seq_pos = sbjct_start | |
800 | |
801 # Set variables depending on promoter status | |
802 factor = 1 | |
803 mut_prefix = "r." | |
804 | |
805 if promoter is True: | |
806 factor = (-1) | |
807 mut_prefix = "n." | |
808 # Reverse promoter sequences | |
809 sbjct_seq = sbjct_seq[::-1] | |
810 qry_seq = qry_seq[::-1] | |
811 | |
812 # Go through sequences one nucleotide at a time | |
813 shift = 0 | |
814 for index in range(sbjct_start - 1, len(sbjct_seq)): | |
815 mut_name = mut_prefix | |
816 mut = "" | |
817 # Shift index according to gaps | |
818 i = index + shift | |
819 | |
820 # If the end of the sequence is reached, stop | |
821 if i == len(sbjct_seq): | |
822 break | |
823 | |
824 sbjct_nuc = sbjct_seq[i] | |
825 qry_nuc = qry_seq[i] | |
826 | |
827 # Check for mis matches | |
828 if sbjct_nuc.upper() != qry_nuc.upper(): | |
829 | |
830 # check for insertions and deletions | |
831 if sbjct_nuc == "-" or qry_nuc == "-": | |
832 if sbjct_nuc == "-": | |
833 mut = "ins" | |
834 indel_start_pos = (seq_pos - 1) * factor | |
835 indel_end_pos = seq_pos * factor | |
836 indel = PointFinder.find_nuc_indel(sbjct_seq[i:], | |
837 qry_seq[i:]) | |
838 else: | |
839 mut = "del" | |
840 indel_start_pos = seq_pos * factor | |
841 indel = PointFinder.find_nuc_indel(qry_seq[i:], | |
842 sbjct_seq[i:]) | |
843 indel_end_pos = (seq_pos + len(indel) - 1) * factor | |
844 seq_pos += len(indel) - 1 | |
845 | |
846 # Shift the index to the end of the indel | |
847 shift += len(indel) - 1 | |
848 | |
849 # Write mutation name, depending on sequnce | |
850 if len(indel) == 1 and mut == "del": | |
851 mut_name += str(indel_start_pos) + mut + indel | |
852 else: | |
853 if promoter is True: | |
854 # Reverse the sequence and the start and | |
855 # end positions | |
856 indel = indel[::-1] | |
857 temp = indel_start_pos | |
858 indel_start_pos = indel_end_pos | |
859 indel_end_pos = temp | |
860 | |
861 mut_name += (str(indel_start_pos) + "_" | |
862 + str(indel_end_pos) + mut + indel) | |
863 | |
864 mis_matches += [[mut, seq_pos * factor, seq_pos * factor, | |
865 indel, mut_name, mut, indel]] | |
866 | |
867 # Check for substitutions mutations | |
868 else: | |
869 mut = "sub" | |
870 mut_name += (str(seq_pos * factor) + sbjct_nuc + ">" | |
871 + qry_nuc) | |
872 | |
873 mis_matches += [[mut, seq_pos * factor, seq_pos * factor, | |
874 qry_nuc, mut_name, sbjct_nuc, qry_nuc]] | |
875 | |
876 # Increment sequence position | |
877 if mut != "ins": | |
878 seq_pos += 1 | |
879 | |
880 return mis_matches | |
881 | |
882 @staticmethod | |
883 def find_nuc_indel(gapped_seq, indel_seq): | |
884 """ | |
885 This function finds the entire indel missing in from a gapped | |
886 sequence compared to the indel_seqeunce. It is assumes that the | |
887 sequences start with the first position of the gap. | |
888 """ | |
889 ref_indel = indel_seq[0] | |
890 for j in range(1, len(gapped_seq)): | |
891 if gapped_seq[j] == "-": | |
892 ref_indel += indel_seq[j] | |
893 else: | |
894 break | |
895 return ref_indel | |
896 | |
897 @staticmethod | |
898 def aa(codon): | |
899 """ | |
900 This function converts a codon to an amino acid. If the codon | |
901 is not valid an error message is given, or else, the amino acid | |
902 is returned. | |
903 | |
904 Potential future issue: If species are added that utilizes | |
905 alternative translation tables. | |
906 """ | |
907 codon = codon.upper() | |
908 aa = {"ATT": "I", "ATC": "I", "ATA": "I", | |
909 "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", "TTA": "L", | |
910 "TTG": "L", | |
911 "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", | |
912 "TTT": "F", "TTC": "F", | |
913 "ATG": "M", | |
914 "TGT": "C", "TGC": "C", | |
915 "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", | |
916 "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", | |
917 "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", | |
918 "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", | |
919 "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "AGT": "S", | |
920 "AGC": "S", | |
921 "TAT": "Y", "TAC": "Y", | |
922 "TGG": "W", | |
923 "CAA": "Q", "CAG": "Q", | |
924 "AAT": "N", "AAC": "N", | |
925 "CAT": "H", "CAC": "H", | |
926 "GAA": "E", "GAG": "E", | |
927 "GAT": "D", "GAC": "D", | |
928 "AAA": "K", "AAG": "K", | |
929 "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R", | |
930 "AGG": "R", | |
931 "TAA": "*", "TAG": "*", "TGA": "*"} | |
932 # Translate valid codon | |
933 try: | |
934 amino_a = aa[codon] | |
935 except KeyError: | |
936 amino_a = "?" | |
937 return amino_a | |
938 | |
939 @staticmethod | |
940 def get_codon(seq, codon_no, start_offset): | |
941 """ | |
942 This function takes a sequece and a codon number and returns | |
943 the codon found in the sequence at that position | |
944 """ | |
945 seq = seq.replace("-", "") | |
946 codon_start_pos = int(codon_no - 1) * 3 - start_offset | |
947 codon = seq[codon_start_pos:codon_start_pos + 3] | |
948 return codon | |
949 | |
950 @staticmethod | |
951 def name_insertion(sbjct_seq, codon_no, sbjct_nucs, aa_alt, start_offset): | |
952 """ | |
953 This function is used to name a insertion mutation based on the | |
954 HGVS recommendation. | |
955 """ | |
956 start_codon_no = codon_no - 1 | |
957 | |
958 if len(sbjct_nucs) == 3: | |
959 start_codon_no = codon_no | |
960 | |
961 start_codon = PointFinder.get_codon(sbjct_seq, start_codon_no, | |
962 start_offset) | |
963 end_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset) | |
964 pos_name = "p.%s%d_%s%dins%s" % (PointFinder.aa(start_codon), | |
965 start_codon_no, | |
966 PointFinder.aa(end_codon), | |
967 codon_no, aa_alt) | |
968 return pos_name | |
969 | |
970 @staticmethod | |
971 def name_deletion(sbjct_seq, sbjct_rf_indel, sbjct_nucs, codon_no, aa_alt, | |
972 start_offset, mutation="del"): | |
973 """ | |
974 This function is used to name a deletion mutation based on the | |
975 HGVS recommendation. If combination of a deletion and an | |
976 insertion is identified the argument 'mutation' is set to | |
977 'delins' and the mutation name will indicate that the mutation | |
978 is a delins mutation. | |
979 """ | |
980 del_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset) | |
981 pos_name = "p.%s%d" % (PointFinder.aa(del_codon), codon_no) | |
982 | |
983 # This has been changed | |
984 if len(sbjct_rf_indel) == 3 and mutation == "del": | |
985 return pos_name + mutation | |
986 | |
987 end_codon_no = codon_no + math.ceil(len(sbjct_nucs) / 3) - 1 | |
988 end_codon = PointFinder.get_codon(sbjct_seq, end_codon_no, | |
989 start_offset) | |
990 pos_name += "_%s%d%s" % (PointFinder.aa(end_codon), end_codon_no, | |
991 mutation) | |
992 if mutation == "delins": | |
993 pos_name += aa_alt | |
994 return pos_name | |
995 | |
996 @staticmethod | |
997 def name_indel_mutation(sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel, | |
998 codon_no, mut, start_offset): | |
999 """ | |
1000 This function serves to name the individual mutations | |
1001 dependently on the type of the mutation. | |
1002 """ | |
1003 # Get the subject and query sequences without gaps | |
1004 sbjct_nucs = sbjct_rf_indel.replace("-", "") | |
1005 qry_nucs = qry_rf_indel.replace("-", "") | |
1006 | |
1007 # Translate nucleotides to amino acids | |
1008 aa_ref = "" | |
1009 aa_alt = "" | |
1010 | |
1011 for i in range(0, len(sbjct_nucs), 3): | |
1012 aa_ref += PointFinder.aa(sbjct_nucs[i:i + 3]) | |
1013 | |
1014 for i in range(0, len(qry_nucs), 3): | |
1015 aa_alt += PointFinder.aa(qry_nucs[i:i + 3]) | |
1016 | |
1017 # Identify the gapped sequence | |
1018 if mut == "ins": | |
1019 gapped_seq = sbjct_rf_indel | |
1020 else: | |
1021 gapped_seq = qry_rf_indel | |
1022 | |
1023 gap_size = gapped_seq.count("-") | |
1024 | |
1025 # Write mutation names | |
1026 if gap_size < 3 and len(sbjct_nucs) == 3 and len(qry_nucs) == 3: | |
1027 # Write mutation name for substitution mutation | |
1028 mut_name = "p.%s%d%s" % (PointFinder.aa(sbjct_nucs), codon_no, | |
1029 PointFinder.aa(qry_nucs)) | |
1030 elif len(gapped_seq) == gap_size: | |
1031 | |
1032 if mut == "ins": | |
1033 # Write mutation name for insertion mutation | |
1034 mut_name = PointFinder.name_insertion(sbjct_seq, codon_no, | |
1035 sbjct_nucs, aa_alt, | |
1036 start_offset) | |
1037 aa_ref = mut | |
1038 else: | |
1039 # Write mutation name for deletion mutation | |
1040 mut_name = PointFinder.name_deletion(sbjct_seq, sbjct_rf_indel, | |
1041 sbjct_nucs, codon_no, | |
1042 aa_alt, start_offset, | |
1043 mutation="del") | |
1044 aa_alt = mut | |
1045 # Check for delins - mix of insertion and deletion | |
1046 else: | |
1047 # Write mutation name for a mixed insertion and deletion | |
1048 # mutation | |
1049 mut_name = PointFinder.name_deletion(sbjct_seq, | |
1050 sbjct_rf_indel, sbjct_nucs, | |
1051 codon_no, aa_alt, | |
1052 start_offset, | |
1053 mutation="delins") | |
1054 | |
1055 # Check for frameshift | |
1056 if gapped_seq.count("-") % 3 != 0: | |
1057 # Add the frameshift tag to mutation name | |
1058 mut_name += " - Frameshift" | |
1059 | |
1060 return mut_name, aa_ref, aa_alt | |
1061 | |
1062 @staticmethod | |
1063 def get_inframe_gap(seq, nucs_needed=3): | |
1064 """ | |
1065 This funtion takes a sequnece starting with a gap or the | |
1066 complementary seqeuence to the gap, and the number of | |
1067 nucleotides that the seqeunce should contain in order to | |
1068 maintain the correct reading frame. The sequence is gone | |
1069 through and the number of non-gap characters are counted. When | |
1070 the number has reach the number of needed nucleotides the indel | |
1071 is returned. If the indel is a 'clean' insert or deletion that | |
1072 starts in the start of a codon and can be divided by 3, then | |
1073 only the gap is returned. | |
1074 """ | |
1075 nuc_count = 0 | |
1076 gap_indel = "" | |
1077 nucs = "" | |
1078 | |
1079 for i in range(len(seq)): | |
1080 # Check if the character is not a gap | |
1081 if seq[i] != "-": | |
1082 # Check if the indel is a 'clean' | |
1083 # i.e. if the insert or deletion starts at the first | |
1084 # nucleotide in the codon and can be divided by 3 | |
1085 | |
1086 if(gap_indel.count("-") == len(gap_indel) | |
1087 and gap_indel.count("-") >= 3 and len(gap_indel) != 0): | |
1088 return gap_indel | |
1089 | |
1090 nuc_count += 1 | |
1091 gap_indel += seq[i] | |
1092 | |
1093 # if the number of nucleotides in the indel equals the amount | |
1094 # needed for the indel, the indel is returned. | |
1095 if nuc_count == nucs_needed: | |
1096 return gap_indel | |
1097 | |
1098 # This will only happen if the gap is in the very end of a sequence | |
1099 return gap_indel | |
1100 | |
1101 @staticmethod | |
1102 def get_indels(sbjct_seq, qry_seq, start_pos): | |
1103 """ | |
1104 This function uses regex to find inserts and deletions in | |
1105 sequences given as arguments. A list of these indels are | |
1106 returned. The list includes, type of mutations(ins/del), | |
1107 subject codon no of found mutation, subject sequence position, | |
1108 insert/deletions nucleotide sequence, and the affected qry | |
1109 codon no. | |
1110 """ | |
1111 | |
1112 seqs = [sbjct_seq, qry_seq] | |
1113 indels = [] | |
1114 gap_obj = re.compile(r"-+") | |
1115 for i in range(len(seqs)): | |
1116 for match in gap_obj.finditer(seqs[i]): | |
1117 pos = int(match.start()) | |
1118 gap = match.group() | |
1119 | |
1120 # Find position of the mutation corresponding to the | |
1121 # subject sequence | |
1122 sbj_pos = len(sbjct_seq[:pos].replace("-", "")) + start_pos | |
1123 | |
1124 # Get indel sequence and the affected sequences in | |
1125 # sbjct and qry in the reading frame | |
1126 indel = seqs[abs(i - 1)][pos:pos + len(gap)] | |
1127 | |
1128 # Find codon number for mutation | |
1129 codon_no = int(math.ceil((sbj_pos) / 3)) | |
1130 qry_pos = len(qry_seq[:pos].replace("-", "")) + start_pos | |
1131 qry_codon = int(math.ceil((qry_pos) / 3)) | |
1132 | |
1133 if i == 0: | |
1134 mut = "ins" | |
1135 else: | |
1136 mut = "del" | |
1137 | |
1138 indels.append([mut, codon_no, sbj_pos, indel, qry_codon]) | |
1139 | |
1140 # Sort indels based on codon position and sequence position | |
1141 indels = sorted(indels, key=lambda x: (x[1], x[2])) | |
1142 | |
1143 return indels | |
1144 | |
1145 @staticmethod | |
1146 def find_codon_mismatches(sbjct_start, sbjct_seq, qry_seq): | |
1147 """ | |
1148 This function takes two alligned sequence (subject and query), | |
1149 and the position on the subject where the alignment starts. The | |
1150 sequences are compared codon by codon. If a mis matches is | |
1151 found it is saved in 'mis_matches'. If a gap is found the | |
1152 function get_inframe_gap is used to find the indel sequence and | |
1153 keep the sequence in the correct reading frame. The function | |
1154 translate_indel is used to name indel mutations and translate | |
1155 the indels to amino acids The function returns a list of tuples | |
1156 containing all needed informations about the mutation in order | |
1157 to look it up in the database dict known mutation and the with | |
1158 the output files the the user. | |
1159 """ | |
1160 mis_matches = [] | |
1161 | |
1162 # Find start pos of first codon in frame, i_start | |
1163 codon_offset = (sbjct_start - 1) % 3 | |
1164 i_start = 0 | |
1165 | |
1166 if codon_offset != 0: | |
1167 i_start = 3 - codon_offset | |
1168 | |
1169 sbjct_start = sbjct_start + i_start | |
1170 | |
1171 # Set sequences in frame | |
1172 sbjct_seq = sbjct_seq[i_start:] | |
1173 qry_seq = qry_seq[i_start:] | |
1174 | |
1175 # Find codon number of the first codon in the sequence, start | |
1176 # at 0 | |
1177 codon_no = int((sbjct_start - 1) / 3) # 1,2,3 start on 0 | |
1178 | |
1179 # s_shift and q_shift are used when gaps appears | |
1180 q_shift = 0 | |
1181 s_shift = 0 | |
1182 mut_no = 0 | |
1183 | |
1184 # Find inserts and deletions in sequence | |
1185 indel_no = 0 | |
1186 indels = PointFinder.get_indels(sbjct_seq, qry_seq, sbjct_start) | |
1187 | |
1188 # Go through sequence and save mutations when found | |
1189 for index in range(0, len(sbjct_seq), 3): | |
1190 # Count codon number | |
1191 codon_no += 1 | |
1192 | |
1193 # Shift index according to gaps | |
1194 s_i = index + s_shift | |
1195 q_i = index + q_shift | |
1196 | |
1197 # Get codons | |
1198 sbjct_codon = sbjct_seq[s_i:s_i + 3] | |
1199 qry_codon = qry_seq[q_i:q_i + 3] | |
1200 | |
1201 if(len(sbjct_seq[s_i:].replace("-", "")) | |
1202 + len(qry_codon[q_i:].replace("-", "")) < 6): | |
1203 break | |
1204 | |
1205 # Check for mutations | |
1206 if sbjct_codon.upper() != qry_codon.upper(): | |
1207 | |
1208 # Check for codon insertions and deletions and | |
1209 # frameshift mutations | |
1210 if "-" in sbjct_codon or "-" in qry_codon: | |
1211 | |
1212 # Get indel info | |
1213 try: | |
1214 indel_data = indels[indel_no] | |
1215 except IndexError: | |
1216 sys.exit("indel_data list is out of range, bug!") | |
1217 | |
1218 mut = indel_data[0] | |
1219 codon_no_indel = indel_data[1] | |
1220 seq_pos = indel_data[2] + sbjct_start - 1 | |
1221 indel = indel_data[3] | |
1222 indel_no += 1 | |
1223 | |
1224 # Get the affected sequence in frame for both for | |
1225 # sbjct and qry | |
1226 if mut == "ins": | |
1227 sbjct_rf_indel = PointFinder.get_inframe_gap( | |
1228 sbjct_seq[s_i:], 3) | |
1229 qry_rf_indel = PointFinder.get_inframe_gap( | |
1230 qry_seq[q_i:], | |
1231 int(math.floor(len(sbjct_rf_indel) / 3) * 3)) | |
1232 else: | |
1233 qry_rf_indel = PointFinder.get_inframe_gap( | |
1234 qry_seq[q_i:], 3) | |
1235 sbjct_rf_indel = PointFinder.get_inframe_gap( | |
1236 sbjct_seq[s_i:], | |
1237 int(math.floor(len(qry_rf_indel) / 3) * 3)) | |
1238 | |
1239 mut_name, aa_ref, aa_alt = PointFinder.name_indel_mutation( | |
1240 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel, | |
1241 codon_no, mut, sbjct_start - 1) | |
1242 | |
1243 # Set index to the correct reading frame after the | |
1244 # indel gap | |
1245 shift_diff_before = abs(s_shift - q_shift) | |
1246 s_shift += len(sbjct_rf_indel) - 3 | |
1247 q_shift += len(qry_rf_indel) - 3 | |
1248 shift_diff = abs(s_shift - q_shift) | |
1249 | |
1250 if shift_diff_before != 0 and shift_diff % 3 == 0: | |
1251 | |
1252 if s_shift > q_shift: | |
1253 nucs_needed = (int((len(sbjct_rf_indel) / 3) * 3) | |
1254 + shift_diff) | |
1255 pre_qry_indel = qry_rf_indel | |
1256 qry_rf_indel = PointFinder.get_inframe_gap( | |
1257 qry_seq[q_i:], nucs_needed) | |
1258 q_shift += len(qry_rf_indel) - len(pre_qry_indel) | |
1259 | |
1260 elif q_shift > s_shift: | |
1261 nucs_needed = (int((len(qry_rf_indel) / 3) * 3) | |
1262 + shift_diff) | |
1263 pre_sbjct_indel = sbjct_rf_indel | |
1264 sbjct_rf_indel = PointFinder.get_inframe_gap( | |
1265 sbjct_seq[s_i:], nucs_needed) | |
1266 s_shift += (len(sbjct_rf_indel) | |
1267 - len(pre_sbjct_indel)) | |
1268 | |
1269 mut_name, aa_ref, aa_alt = ( | |
1270 PointFinder.name_indel_mutation( | |
1271 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel, | |
1272 codon_no, mut, sbjct_start - 1) | |
1273 ) | |
1274 if "Frameshift" in mut_name: | |
1275 mut_name = (mut_name.split("-")[0] | |
1276 + "- Frame restored") | |
1277 if mut_name is "p.V940delins - Frame restored": | |
1278 sys.exit() | |
1279 mis_matches += [[mut, codon_no_indel, seq_pos, indel, | |
1280 mut_name, sbjct_rf_indel, qry_rf_indel, | |
1281 aa_ref, aa_alt]] | |
1282 | |
1283 # Check if the next mutation in the indels list is | |
1284 # in the current codon. | |
1285 # Find the number of individul gaps in the | |
1286 # evaluated sequence. | |
1287 no_of_indels = (len(re.findall("\-\w", sbjct_rf_indel)) | |
1288 + len(re.findall("\-\w", qry_rf_indel))) | |
1289 | |
1290 if no_of_indels > 1: | |
1291 | |
1292 for j in range(indel_no, indel_no + no_of_indels - 1): | |
1293 try: | |
1294 indel_data = indels[j] | |
1295 except IndexError: | |
1296 sys.exit("indel_data list is out of range, " | |
1297 "bug!") | |
1298 mut = indel_data[0] | |
1299 codon_no_indel = indel_data[1] | |
1300 seq_pos = indel_data[2] + sbjct_start - 1 | |
1301 indel = indel_data[3] | |
1302 indel_no += 1 | |
1303 mis_matches += [[mut, codon_no_indel, seq_pos, | |
1304 indel, mut_name, sbjct_rf_indel, | |
1305 qry_rf_indel, aa_ref, aa_alt]] | |
1306 | |
1307 # Set codon number, and save nucleotides from out | |
1308 # of frame mutations | |
1309 if mut == "del": | |
1310 codon_no += int((len(sbjct_rf_indel) - 3) / 3) | |
1311 # If evaluated insert is only gaps codon_no should | |
1312 # not increment | |
1313 elif sbjct_rf_indel.count("-") == len(sbjct_rf_indel): | |
1314 codon_no -= 1 | |
1315 | |
1316 # Check of point mutations | |
1317 else: | |
1318 mut = "sub" | |
1319 aa_ref = PointFinder.aa(sbjct_codon) | |
1320 aa_alt = PointFinder.aa(qry_codon) | |
1321 | |
1322 if aa_ref != aa_alt: | |
1323 # End search for mutation if a premature stop | |
1324 # codon is found | |
1325 mut_name = "p." + aa_ref + str(codon_no) + aa_alt | |
1326 | |
1327 mis_matches += [[mut, codon_no, codon_no, aa_alt, | |
1328 mut_name, sbjct_codon, qry_codon, | |
1329 aa_ref, aa_alt]] | |
1330 # If a Premature stop codon occur report it an stop the | |
1331 # loop | |
1332 | |
1333 try: | |
1334 if mis_matches[-1][-1] == "*": | |
1335 mut_name += " - Premature stop codon" | |
1336 mis_matches[-1][4] = (mis_matches[-1][4].split("-")[0] | |
1337 + " - Premature stop codon") | |
1338 break | |
1339 except IndexError: | |
1340 pass | |
1341 | |
1342 # Sort mutations on position | |
1343 mis_matches = sorted(mis_matches, key=lambda x: x[1]) | |
1344 | |
1345 return mis_matches | |
1346 | |
1347 @staticmethod | |
1348 def mutstr2mutdict(m): | |
1349 out_dict = {} | |
1350 | |
1351 # Protein / Amino acid mutations | |
1352 # Ex.: "p.T83I" | |
1353 if(m.startswith("p.")): | |
1354 out_dict["nucleotide"] = False | |
1355 | |
1356 # Remove frameshift tag | |
1357 frameshift_match = re.search(r"(.+) - Frameshift.*$", m) | |
1358 if(frameshift_match): | |
1359 m = frameshift_match.group(1) | |
1360 out_dict["frameshift"] = True | |
1361 | |
1362 # Remove frame restored tag | |
1363 framerestored_match = re.search(r"(.+) - Frame restored.*$", m) | |
1364 if(framerestored_match): | |
1365 m = framerestored_match.group(1) | |
1366 out_dict["frame restored"] = True | |
1367 | |
1368 # Remove premature stop tag | |
1369 prem_stop_match = re.search(r"(.+) - Premature stop codon.*$", m) | |
1370 if(prem_stop_match): | |
1371 m = prem_stop_match.group(1) | |
1372 out_dict["prem_stop"] = True | |
1373 | |
1374 # TODO: premature or frameshift tag adds too many whitespaces | |
1375 m = m.strip() | |
1376 | |
1377 # Delins | |
1378 multi_delins_match = re.search( | |
1379 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins(\S+)$", m) | |
1380 single_delins_match = re.search( | |
1381 r"^p.(\D{1})(\d+)delins(\S+)$", m) | |
1382 # TODO: is both necessary? | |
1383 multi_delins_match2 = re.search( | |
1384 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins$", m) | |
1385 single_delins_match2 = re.search( | |
1386 r"^p.(\D{1})(\d+)delins$", m) | |
1387 multi_ins_match = re.search( | |
1388 r"^p.(\D{1})(\d+)_(\D{1})(\d+)ins(\D*)$", m) | |
1389 if(multi_delins_match or single_delins_match): | |
1390 out_dict["deletion"] = True | |
1391 out_dict["insertion"] = True | |
1392 if(single_delins_match): | |
1393 out_dict["ref_aa"] = single_delins_match.group(1) | |
1394 out_dict["pos"] = single_delins_match.group(2) | |
1395 out_dict["mut_aa"] = single_delins_match.group(3) | |
1396 else: | |
1397 out_dict["ref_aa"] = multi_delins_match.group(1) | |
1398 out_dict["pos"] = multi_delins_match.group(2) | |
1399 out_dict["ref_aa_right"] = multi_delins_match.group(3) | |
1400 out_dict["mut_end"] = multi_delins_match.group(4) | |
1401 out_dict["mut_aa"] = multi_delins_match.group(5) | |
1402 # Deletions | |
1403 elif(m[-3:] == "del"): | |
1404 single_del_match = re.search( | |
1405 r"^p.(\D{1})(\d+)del$", m) | |
1406 multi_del_match = re.search( | |
1407 r"^p.(\D{1})(\d+)_(\D{1})(\d+)del$", m) | |
1408 out_dict["deletion"] = True | |
1409 if(single_del_match): | |
1410 out_dict["ref_aa"] = single_del_match.group(1) | |
1411 out_dict["pos"] = single_del_match.group(2) | |
1412 else: | |
1413 out_dict["ref_aa"] = multi_del_match.group(1) | |
1414 out_dict["pos"] = multi_del_match.group(2) | |
1415 out_dict["ref_aa_right"] = multi_del_match.group(3) | |
1416 out_dict["mut_end"] = multi_del_match.group(4) | |
1417 # Insertions | |
1418 elif(multi_ins_match): | |
1419 out_dict["insertion"] = True | |
1420 out_dict["ref_aa"] = multi_ins_match.group(1).lower() | |
1421 out_dict["pos"] = multi_ins_match.group(2) | |
1422 out_dict["ref_aa_right"] = multi_ins_match.group(3).lower() | |
1423 if(multi_ins_match.group(4)): | |
1424 out_dict["mut_aa"] = multi_ins_match.group(4).lower() | |
1425 # Substitutions | |
1426 else: | |
1427 sub_match = re.search( | |
1428 r"^p.(\D{1})(\d+)(\D{1})$", m) | |
1429 out_dict["ref_aa"] = sub_match.group(1).lower() | |
1430 out_dict["pos"] = sub_match.group(2) | |
1431 out_dict["mut_aa"] = sub_match.group(3).lower() | |
1432 | |
1433 # Nucleotide mutations | |
1434 # Ex. sub: n.-42T>C | |
1435 # Ex. ins: n.-13_-14insG (TODO: Verify) | |
1436 # Ex. del: n.42delT (TODO: Verify) | |
1437 # Ex. del: n.42_45del (TODO: Verify) | |
1438 elif(m.startswith("n.") or m.startswith("r.")): | |
1439 out_dict["nucleotide"] = True | |
1440 | |
1441 sub_match = re.search( | |
1442 r"^[nr]{1}\.(-{0,1}\d+)(\D{1})>(\D{1})$", m) | |
1443 ins_match = re.search( | |
1444 r"^[nr]{1}\.(-{0,1}\d+)_(-{0,1}\d+)ins(\S+)$", m) | |
1445 # r.541delA | |
1446 del_match = re.search(( | |
1447 r"^[nr]{1}\.(-{0,1}\d+)_{0,1}(-{0,1}\d*)del(\S*)$"), m) | |
1448 if(sub_match): | |
1449 out_dict["pos"] = sub_match.group(1) | |
1450 out_dict["ref_nuc"] = sub_match.group(2) | |
1451 out_dict["mut_nuc"] = sub_match.group(3) | |
1452 elif(ins_match): | |
1453 out_dict["insertion"] = True | |
1454 out_dict["pos"] = ins_match.group(1) | |
1455 out_dict["mut_end"] = ins_match.group(2) | |
1456 out_dict["mut_nuc"] = ins_match.group(3) | |
1457 elif(del_match): | |
1458 out_dict["deletion"] = True | |
1459 out_dict["pos"] = del_match.group(1) | |
1460 if(del_match.group(2)): | |
1461 out_dict["mut_end"] = del_match.group(2) | |
1462 if(del_match.group(3)): | |
1463 out_dict["ref_nuc"] = del_match.group(3) | |
1464 else: | |
1465 sys.exit("ERROR: Nucleotide deletion did not contain any " | |
1466 "reference sequence. mut string: {}".format(m)) | |
1467 | |
1468 return out_dict | |
1469 | |
1470 def get_mutations(self, gene, gene_name, mis_matches, unknown_flag, hit): | |
1471 RNA = False | |
1472 if gene in self.RNA_gene_list: | |
1473 RNA = True | |
1474 | |
1475 known_muts = [] | |
1476 unknown_muts = [] | |
1477 | |
1478 # Go through each mutation | |
1479 for i in range(len(mis_matches)): | |
1480 m_type = mis_matches[i][0] | |
1481 pos = mis_matches[i][1] # sort on pos? | |
1482 look_up_pos = mis_matches[i][2] | |
1483 look_up_mut = mis_matches[i][3] | |
1484 mut_name = mis_matches[i][4] | |
1485 # nuc_ref = mis_matches[i][5] | |
1486 # nuc_alt = mis_matches[i][6] | |
1487 ref = mis_matches[i][-2] | |
1488 alt = mis_matches[i][-1] | |
1489 mut_dict = PointFinder.mutstr2mutdict(mut_name) | |
1490 | |
1491 mut_id = ("{gene}_{pos}_{alt}" | |
1492 .format(gene=gene_name, pos=pos, alt=alt.lower())) | |
1493 | |
1494 ref_aa = mut_dict.get("ref_aa", None) | |
1495 ref_aa_right = mut_dict.get("ref_aa_right", None) | |
1496 mut_aa = mut_dict.get("mut_aa", None) | |
1497 ref_nuc = mut_dict.get("ref_nuc", None) | |
1498 mut_nuc = mut_dict.get("mut_nuc", None) | |
1499 is_nuc = mut_dict.get("nucleotide", None) | |
1500 is_ins = mut_dict.get("insertion", None) | |
1501 is_del = mut_dict.get("deletion", None) | |
1502 mut_end = mut_dict.get("mut_end", None) | |
1503 prem_stop = mut_dict.get("prem_stop", False) | |
1504 | |
1505 mut = ResMutation(unique_id=mut_id, seq_region=gene_name, pos=pos, | |
1506 hit=hit, ref_codon=ref_nuc, mut_codon=mut_nuc, | |
1507 ref_aa=ref_aa, ref_aa_right=ref_aa_right, | |
1508 mut_aa=mut_aa, insertion=is_ins, deletion=is_del, | |
1509 end=mut_end, nuc=is_nuc, | |
1510 premature_stop=prem_stop, ab_class=None) | |
1511 | |
1512 if "Premature stop codon" in mut_name: | |
1513 sbjct_len = hit['sbjct_length'] | |
1514 qry_len = pos * 3 | |
1515 perc_trunc = round( | |
1516 ((float(sbjct_len) - qry_len) | |
1517 / float(sbjct_len)) | |
1518 * 100, 2 | |
1519 ) | |
1520 mut.premature_stop = perc_trunc | |
1521 | |
1522 # Check if mutation is known | |
1523 gene_mut_name, resistence, pmid = self.look_up_known_muts( | |
1524 gene, look_up_pos, look_up_mut, m_type, gene_name) | |
1525 | |
1526 # Collect known mutations | |
1527 if resistence != "Unknown": | |
1528 known_muts.append(mut) | |
1529 # Collect unknown mutations | |
1530 else: | |
1531 unknown_muts.append(mut) | |
1532 | |
1533 # TODO: Use ResMutation class to make sure identical mutations are | |
1534 # not kept. | |
1535 | |
1536 return (known_muts, unknown_muts) | |
1537 | |
1538 def mut2str(self, gene, gene_name, mis_matches, unknown_flag, hit): | |
1539 """ | |
1540 This function takes a gene name a list of mis matches found | |
1541 between subject and query of this gene, the dictionary of | |
1542 known mutation in the point finder database, and the flag | |
1543 telling weather the user wants unknown mutations to be | |
1544 reported. All mis matches are looked up in the known | |
1545 mutation dict to se if the mutation is known, and in this | |
1546 case what drug resistence it causes. The funtions return 3 | |
1547 strings that are used as output to the users. One string is | |
1548 only tab seperated and contains the mutations listed line | |
1549 by line. If the unknown flag is set to true it will contain | |
1550 both known and unknown mutations. The next string contains | |
1551 only known mutation and are given in in a format that is | |
1552 easy to convert to HTML. The last string is the HTML tab | |
1553 sting from the unknown mutations. | |
1554 """ | |
1555 known_header = ("Mutation\tNucleotide change\tAmino acid change\t" | |
1556 "Resistance\tPMID\n") | |
1557 | |
1558 unknown_header = "Mutation\tNucleotide change\tAmino acid change\n" | |
1559 | |
1560 RNA = False | |
1561 if gene in self.RNA_gene_list: | |
1562 RNA = True | |
1563 known_header = "Mutation\tNucleotide change\tResistance\tPMID\n" | |
1564 unknown_header = "Mutation\tNucleotide change\n" | |
1565 | |
1566 known_lst = [] | |
1567 unknown_lst = [] | |
1568 all_results_lst = [] | |
1569 output_mut = [] | |
1570 stop_codons = [] | |
1571 | |
1572 # Go through each mutation | |
1573 for i in range(len(mis_matches)): | |
1574 m_type = mis_matches[i][0] | |
1575 pos = mis_matches[i][1] # sort on pos? | |
1576 look_up_pos = mis_matches[i][2] | |
1577 look_up_mut = mis_matches[i][3] | |
1578 mut_name = mis_matches[i][4] | |
1579 nuc_ref = mis_matches[i][5] | |
1580 nuc_alt = mis_matches[i][6] | |
1581 ref = mis_matches[i][-2] | |
1582 alt = mis_matches[i][-1] | |
1583 | |
1584 # First index in list indicates if mutation is known | |
1585 output_mut += [[]] | |
1586 | |
1587 # Define output vaiables | |
1588 codon_change = nuc_ref + " -> " + nuc_alt | |
1589 aa_change = ref + " -> " + alt | |
1590 | |
1591 if RNA is True: | |
1592 aa_change = "RNA mutations" | |
1593 elif pos < 0: | |
1594 aa_change = "Promoter mutations" | |
1595 | |
1596 # Check if mutation is known | |
1597 gene_mut_name, resistence, pmid = self.look_up_known_muts( | |
1598 gene, look_up_pos, look_up_mut, m_type, gene_name) | |
1599 | |
1600 gene_mut_name = gene_mut_name + " " + mut_name | |
1601 | |
1602 output_mut[i] = [gene_mut_name, codon_change, aa_change, | |
1603 resistence, pmid] | |
1604 | |
1605 # Add mutation to output strings for known mutations | |
1606 if resistence != "Unknown": | |
1607 if RNA is True: | |
1608 # don't include the amino acid change field for | |
1609 # RNA mutations | |
1610 known_lst += ["\t".join(output_mut[i][:2]) + "\t" | |
1611 + "\t".join(output_mut[i][3:])] | |
1612 else: | |
1613 known_lst += ["\t".join(output_mut[i])] | |
1614 | |
1615 all_results_lst += ["\t".join(output_mut[i])] | |
1616 | |
1617 # Add mutation to output strings for unknown mutations | |
1618 else: | |
1619 if RNA is True: | |
1620 unknown_lst += ["\t".join(output_mut[i][:2])] | |
1621 else: | |
1622 unknown_lst += ["\t".join(output_mut[i][:3])] | |
1623 | |
1624 if unknown_flag is True: | |
1625 all_results_lst += ["\t".join(output_mut[i])] | |
1626 | |
1627 # Check that you do not print two equal lines (can happen | |
1628 # if two indels occure in the same codon) | |
1629 if len(output_mut) > 1: | |
1630 if output_mut[i] == output_mut[i - 1]: | |
1631 | |
1632 if resistence != "Unknown": | |
1633 known_lst = known_lst[:-1] | |
1634 all_results_lst = all_results_lst[:-1] | |
1635 else: | |
1636 unknown_lst = unknown_lst[:-1] | |
1637 | |
1638 if unknown_flag is True: | |
1639 all_results_lst = all_results_lst[:-1] | |
1640 | |
1641 if "Premature stop codon" in mut_name: | |
1642 sbjct_len = hit['sbjct_length'] | |
1643 qry_len = pos * 3 | |
1644 prec_truckat = round( | |
1645 ((float(sbjct_len) - qry_len) | |
1646 / float(sbjct_len)) | |
1647 * 100, 2 | |
1648 ) | |
1649 perc = "%" | |
1650 stop_codons.append("Premature stop codon in %s, %.2f%s lost" | |
1651 % (gene, prec_truckat, perc)) | |
1652 | |
1653 # Creat final strings | |
1654 if(all_results_lst): | |
1655 all_results = "\n".join(all_results_lst) | |
1656 else: | |
1657 all_results = "" | |
1658 total_known_str = "" | |
1659 total_unknown_str = "" | |
1660 | |
1661 # Check if there are only unknown mutations | |
1662 resistence_lst = [] | |
1663 for mut in output_mut: | |
1664 for res in mut[3].split(","): | |
1665 resistence_lst.append(res) | |
1666 | |
1667 # Save known mutations | |
1668 unknown_no = resistence_lst.count("Unknown") | |
1669 if unknown_no < len(resistence_lst): | |
1670 total_known_str = known_header + "\n".join(known_lst) | |
1671 else: | |
1672 total_known_str = "No known mutations found in %s" % gene_name | |
1673 | |
1674 # Save unknown mutations | |
1675 if unknown_no > 0: | |
1676 total_unknown_str = unknown_header + "\n".join(unknown_lst) | |
1677 else: | |
1678 total_unknown_str = "No unknown mutations found in %s" % gene_name | |
1679 | |
1680 return (all_results, total_known_str, total_unknown_str, | |
1681 resistence_lst + stop_codons) | |
1682 | |
1683 @staticmethod | |
1684 def get_file_content(file_path, fst_char_only=False): | |
1685 """ | |
1686 This function opens a file, given as the first argument | |
1687 file_path and returns the lines of the file in a list or the | |
1688 first character of the file if fst_char_only is set to True. | |
1689 """ | |
1690 with open(file_path, "r") as infile: | |
1691 line_lst = [] | |
1692 for line in infile: | |
1693 line = line.strip() | |
1694 if line != "": | |
1695 line_lst.append(line) | |
1696 if fst_char_only: | |
1697 return line_lst[0][0] | |
1698 return line_lst | |
1699 | |
1700 def look_up_known_muts(self, gene, pos, found_mut, mut, gene_name): | |
1701 """ | |
1702 This function uses the known_mutations dictionay, a gene a | |
1703 string with the gene key name, a gene position as integer, | |
1704 found_mut is given as amino acid or nucleotides, the | |
1705 mutation type (mut) is either "del", "ins", or "sub", and | |
1706 gene_name is the gene name that should be returned to the | |
1707 user. The function looks up if the found_mut defined by the | |
1708 gene, position and the found_mut string is given in the | |
1709 known_mutations dictionary, if it is, the resistance and | |
1710 the pmid are returned together with the gene_name given in | |
1711 the known_mutations dict. If the mutation type is "del" the | |
1712 deleted nucleotids are checked to be contained in any of | |
1713 the deletion described in the known_mutation dict. | |
1714 """ | |
1715 resistence = "Unknown" | |
1716 pmid = "-" | |
1717 found_mut = found_mut.upper() | |
1718 | |
1719 if mut == "del": | |
1720 for i, i_pos in enumerate(range(pos, pos + len(found_mut))): | |
1721 | |
1722 known_indels = self.known_mutations[gene]["del"].get(i_pos, []) | |
1723 for known_indel in known_indels: | |
1724 partial_mut = found_mut[i:len(known_indel) + i] | |
1725 | |
1726 # Check if part of found mut is known and check if | |
1727 # found mut and known mut is in the same reading | |
1728 # frame | |
1729 if(partial_mut == known_indel | |
1730 and len(found_mut) % 3 == len(known_indel) % 3): | |
1731 | |
1732 resistence = (self.known_mutations[gene]["del"][i_pos] | |
1733 [known_indel]['drug']) | |
1734 | |
1735 pmid = (self.known_mutations[gene]["del"][i_pos] | |
1736 [known_indel]['pmid']) | |
1737 | |
1738 gene_name = (self.known_mutations[gene]["del"][i_pos] | |
1739 [known_indel]['gene_name']) | |
1740 break | |
1741 else: | |
1742 if pos in self.known_mutations[gene][mut]: | |
1743 if found_mut in self.known_mutations[gene][mut][pos]: | |
1744 resistence = (self.known_mutations[gene][mut][pos] | |
1745 [found_mut]['drug']) | |
1746 | |
1747 pmid = (self.known_mutations[gene][mut][pos][found_mut] | |
1748 ['pmid']) | |
1749 | |
1750 gene_name = (self.known_mutations[gene][mut][pos] | |
1751 [found_mut]['gene_name']) | |
1752 | |
1753 # Check if stop codons refer resistance | |
1754 if "*" in found_mut and gene in self.known_stop_codon: | |
1755 if resistence == "Unknown": | |
1756 resistence = self.known_stop_codon[gene]["drug"] | |
1757 else: | |
1758 resistence += "," + self.known_stop_codon[gene]["drug"] | |
1759 | |
1760 return (gene_name, resistence, pmid) | |
1761 | |
1762 | |
1763 if __name__ == '__main__': | |
1764 | |
1765 ########################################################################## | |
1766 # PARSE COMMAND LINE OPTIONS | |
1767 ########################################################################## | |
1768 | |
1769 parser = argparse.ArgumentParser( | |
1770 description="This program predicting resistance associated with \ | |
1771 chromosomal mutations based on WGS data", | |
1772 prog="pointfinder.py") | |
1773 | |
1774 # required arguments | |
1775 parser.add_argument("-i", | |
1776 dest="inputfiles", | |
1777 metavar="INFILE", | |
1778 nargs='+', | |
1779 help="Input file. fastq file(s) from one sample for \ | |
1780 KMA or one fasta file for blastn.", | |
1781 required=True) | |
1782 parser.add_argument("-o", | |
1783 dest="out_path", | |
1784 metavar="OUTFOLDER", | |
1785 help="Output folder, output files will be stored \ | |
1786 here.", | |
1787 required=True) | |
1788 parser.add_argument("-s", | |
1789 dest="species", | |
1790 metavar="SPECIES", | |
1791 choices=['e.coli', 'gonorrhoeae', 'campylobacter', | |
1792 'salmonella', 'tuberculosis'], | |
1793 help="Species of choice, e.coli, tuberculosis, \ | |
1794 salmonella, campylobactor, gonorrhoeae, \ | |
1795 klebsiella, or malaria", | |
1796 required=True) | |
1797 parser.add_argument("-m", | |
1798 dest="method", | |
1799 metavar="METHOD", | |
1800 choices=["kma", "blastn"], | |
1801 help="Method of choice, kma or blastn", | |
1802 required=True) | |
1803 parser.add_argument("-m_p", | |
1804 dest="method_path", | |
1805 help="Path to the location of blastn or kma dependent \ | |
1806 of the chosen method", | |
1807 required=True) | |
1808 parser.add_argument("-p", | |
1809 dest="db_path", | |
1810 metavar="DATABASE", | |
1811 help="Path to the location of the pointfinder \ | |
1812 database", | |
1813 required=True) | |
1814 | |
1815 # optional arguments | |
1816 parser.add_argument("-t", | |
1817 dest="threshold", | |
1818 metavar="IDENTITY", | |
1819 help="Minimum gene identity threshold, default = 0.9", | |
1820 type=float, | |
1821 default=0.9) | |
1822 parser.add_argument("-l", | |
1823 dest="min_cov", | |
1824 metavar="COVERAGE", | |
1825 help="Minimum gene coverage threshold, \ | |
1826 threshold = 0.6", | |
1827 type=float, | |
1828 default=0.6) | |
1829 parser.add_argument("-u", | |
1830 dest="unknown_mutations", | |
1831 help="Show all mutations found even if it's unknown \ | |
1832 to the resistance database.", | |
1833 action='store_true', | |
1834 default=False) | |
1835 parser.add_argument("-g", | |
1836 dest="specific_gene", | |
1837 nargs='+', | |
1838 help="Specify genes existing in the database to \ | |
1839 search for - if none is specified all genes are \ | |
1840 included in the search.", | |
1841 default=None) | |
1842 | |
1843 args = parser.parse_args() | |
1844 | |
1845 # If no arguments are given print usage message and exit | |
1846 if len(sys.argv) == 1: | |
1847 sys.exit("Usage: " + parser.usage) | |
1848 | |
1849 if(args.method == "blastn"): | |
1850 method = PointFinder.TYPE_BLAST | |
1851 else: | |
1852 method = PointFinder.TYPE_KMA | |
1853 | |
1854 # Get sample name | |
1855 filename = args.inputfiles[0].split("/")[-1] | |
1856 sample_name = "".join(filename.split(".")[0:-1]) # .split("_")[0] | |
1857 if sample_name == "": | |
1858 sample_name = filename | |
1859 | |
1860 # TODO: Change ilogocal var name | |
1861 kma_db_path = args.db_path + "/" + args.species | |
1862 | |
1863 finder = PointFinder(db_path=kma_db_path, species=args.species, | |
1864 gene_list=args.specific_gene) | |
1865 | |
1866 if method == PointFinder.TYPE_BLAST: | |
1867 | |
1868 # Check that only one input file is given | |
1869 if len(args.inputfiles) != 1: | |
1870 sys.exit("Input Error: Blast was chosen as mapping method only 1 " | |
1871 "input file requied, not %s" % (len(args.inputfiles))) | |
1872 | |
1873 finder_run = finder.blast(inputfile=args.inputfiles[0], | |
1874 out_path=args.out_path, | |
1875 min_cov=0.01, | |
1876 threshold=args.threshold, | |
1877 blast=args.method_path, | |
1878 cut_off=False) | |
1879 else: | |
1880 inputfile_1 = args.inputfiles[0] | |
1881 inputfile_2 = None | |
1882 | |
1883 if(len(args.inputfiles) == 2): | |
1884 inputfile_2 = args.inputfiles[1] | |
1885 finder_run = finder.kma(inputfile_1=inputfile_1, | |
1886 inputfile_2=inputfile_2, | |
1887 out_path=args.out_path, | |
1888 db_path_kma=kma_db_path, | |
1889 databases=[args.species], | |
1890 min_cov=args.min_cov, | |
1891 threshold=args.threshold, | |
1892 kma_path=args.method_path, | |
1893 sample_name=sample_name, | |
1894 kma_mrs=0.5, kma_gapopen=-5, kma_gapextend=-2, | |
1895 kma_penalty=-3, kma_reward=1) | |
1896 | |
1897 results = finder_run.results | |
1898 | |
1899 if(args.specific_gene): | |
1900 results = PointFinder.discard_unwanted_results( | |
1901 results=results, wanted=args.specific_gene) | |
1902 | |
1903 finder.write_results(out_path=args.out_path, result=results, min_cov=args.min_cov, | |
1904 res_type=method, unknown_flag=args.unknown_mutations) |