Mercurial > repos > cstrittmatter > ss2v110
comparison SeqSero2_update_kmer_database.py @ 0:fc22ec8e924e draft
planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
| author | cstrittmatter |
|---|---|
| date | Tue, 21 Apr 2020 12:45:34 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:fc22ec8e924e |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 import argparse | |
| 4 import os,subprocess | |
| 5 import pickle | |
| 6 | |
| 7 ### SeqSero Kmer | |
| 8 def parse_args(): | |
| 9 "Parse the input arguments, use '-h' for help." | |
| 10 parser = argparse.ArgumentParser(usage='Just type "SeqSero2_update_kmer_database.py", it will update kmer database automatically') | |
| 11 return parser.parse_args() | |
| 12 | |
| 13 def reverse_complement(sequence): | |
| 14 complement = { | |
| 15 'A': 'T', | |
| 16 'C': 'G', | |
| 17 'G': 'C', | |
| 18 'T': 'A', | |
| 19 'N': 'N', | |
| 20 'M': 'K', | |
| 21 'R': 'Y', | |
| 22 'W': 'W', | |
| 23 'S': 'S', | |
| 24 'Y': 'R', | |
| 25 'K': 'M', | |
| 26 'V': 'B', | |
| 27 'H': 'D', | |
| 28 'D': 'H', | |
| 29 'B': 'V' | |
| 30 } | |
| 31 return "".join(complement[base] for base in reversed(sequence)) | |
| 32 | |
| 33 def multifasta_dict(multifasta): | |
| 34 multifasta_list = [ | |
| 35 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0 | |
| 36 ] | |
| 37 headers = [i for i in multifasta_list if i[0] == '>'] | |
| 38 multifasta_dict = {} | |
| 39 for h in headers: | |
| 40 start = multifasta_list.index(h) | |
| 41 for element in multifasta_list[start + 1:]: | |
| 42 if element[0] == '>': | |
| 43 break | |
| 44 else: | |
| 45 if h[1:] in multifasta_dict: | |
| 46 multifasta_dict[h[1:]] += element | |
| 47 else: | |
| 48 multifasta_dict[h[1:]] = element | |
| 49 return multifasta_dict | |
| 50 | |
| 51 def createKmerDict_reads(list_of_strings, kmer): | |
| 52 kmer_table = {} | |
| 53 for string in list_of_strings: | |
| 54 sequence = string.strip('\n') | |
| 55 for i in range(len(sequence) - kmer + 1): | |
| 56 new_mer = sequence[i:i + kmer].upper() | |
| 57 new_mer_rc = reverse_complement(new_mer) | |
| 58 if new_mer in kmer_table: | |
| 59 kmer_table[new_mer.upper()] += 1 | |
| 60 else: | |
| 61 kmer_table[new_mer.upper()] = 1 | |
| 62 if new_mer_rc in kmer_table: | |
| 63 kmer_table[new_mer_rc.upper()] += 1 | |
| 64 else: | |
| 65 kmer_table[new_mer_rc.upper()] = 1 | |
| 66 return kmer_table | |
| 67 | |
| 68 def multifasta_to_kmers_dict(multifasta): | |
| 69 multi_seq_dict = multifasta_dict(multifasta) | |
| 70 lib_dict = {} | |
| 71 for h in multi_seq_dict: | |
| 72 lib_dict[h] = set( | |
| 73 [k for k in createKmerDict_reads([multi_seq_dict[h]], 27)]) | |
| 74 return lib_dict | |
| 75 | |
| 76 def get_salmid_invA_database(ex_dir): | |
| 77 # read invA kmer and return it | |
| 78 a = open(ex_dir + '/invA_mers_dict', 'rb') | |
| 79 invA_dict = pickle.load(a) | |
| 80 try: | |
| 81 del invA_dict['version'] | |
| 82 except: | |
| 83 pass | |
| 84 return invA_dict | |
| 85 | |
| 86 def get_salmid_rpoB_database(ex_dir): | |
| 87 # read invA kmer and return it | |
| 88 a = open(ex_dir + '/rpoB_mers_dict', 'rb') | |
| 89 rpoB_dict = pickle.load(a) | |
| 90 try: | |
| 91 del rpoB_dict['version'] | |
| 92 except: | |
| 93 pass | |
| 94 return rpoB_dict | |
| 95 | |
| 96 def main(): | |
| 97 args = parse_args() | |
| 98 ex_dir = os.path.dirname(os.path.realpath(__file__)) | |
| 99 lib_dict = multifasta_to_kmers_dict(ex_dir + '/H_and_O_and_specific_genes.fasta') | |
| 100 invA_dict=get_salmid_invA_database(ex_dir) | |
| 101 #rpoB_dict=get_salmid_rpoB_database(ex_dir) | |
| 102 lib_dict_new = lib_dict.copy() | |
| 103 #print(len(lib_dict_new)) | |
| 104 lib_dict_new.update(invA_dict) | |
| 105 #print(len(lib_dict_new)) | |
| 106 #lib_dict_new.update(rpoB_dict) | |
| 107 #print(len(lib_dict_new)) | |
| 108 f = open(ex_dir + '/antigens.pickle', "wb") | |
| 109 pickle.dump(lib_dict_new, f) | |
| 110 f.close() | |
| 111 | |
| 112 if __name__ == '__main__': | |
| 113 main() |
